Commit 0bb31239 authored by Andrey Filippov's avatar Andrey Filippov

4 images with CDP

parent 095bd8c2
......@@ -107,6 +107,7 @@ public class GPUTileProcessor {
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
// pass some defines to gpu source code with #ifdef JCUDA
......@@ -168,6 +169,8 @@ public class GPUTileProcessor {
private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction SET_TILES_OFFSETS_kernel = null;
private CUfunction GPU_IMCLT_ALL_kernel = null;
// CPU arrays of pointers to GPU memory
// These arrays may go to methods, they are here just to be able to free GPU memory if needed
......@@ -186,6 +189,7 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_corrs = new CUdeviceptr(); // allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
private CUdeviceptr gpu_textures = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
private CUdeviceptr gpu_clt = new CUdeviceptr();
private CUdeviceptr gpu_4_images = new CUdeviceptr();
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER
......@@ -466,7 +470,8 @@ public class GPUTileProcessor {
GPU_TEXTURES_NAME,
GPU_RBGA_NAME,
GPU_ROT_DERIV,
SET_TILES_OFFSETS
SET_TILES_OFFSETS,
GPU_IMCLT_ALL_NAME
};
CUfunction[] functions = createFunctions(kernelSources,
func_names,
......@@ -479,7 +484,7 @@ public class GPUTileProcessor {
GPU_RBGA_kernel= functions[4];
GPU_ROT_DERIV_kernel = functions[5];
SET_TILES_OFFSETS_kernel = functions[6];
GPU_IMCLT_ALL_kernel = functions[7];
System.out.println("GPU kernel functions initialized");
System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
......@@ -531,10 +536,13 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_kernel_offsets, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_bayer, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_clt, NUM_CAMS * Sizeof.POINTER);
cuMemAlloc(gpu_4_images, NUM_CAMS * Sizeof.POINTER);
long [] gpu_kernels_l = new long [NUM_CAMS];
long [] gpu_kernel_offsets_l = new long [NUM_CAMS];
long [] gpu_bayer_l = new long [NUM_CAMS];
long [] gpu_clt_l = new long [NUM_CAMS];
long [] gpu_4_images_l = new long [NUM_CAMS];
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_kernels_l[ncam] = getPointerAddress(gpu_kernels_h[ncam]);
cuMemcpyHtoD(gpu_kernels, Pointer.to(gpu_kernels_l), NUM_CAMS * Sizeof.POINTER);
......@@ -548,6 +556,9 @@ public class GPUTileProcessor {
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_clt_l[ncam] = getPointerAddress(gpu_clt_h[ncam]);
cuMemcpyHtoD(gpu_clt, Pointer.to(gpu_clt_l), NUM_CAMS * Sizeof.POINTER);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_4_images_l[ncam] = getPointerAddress(gpu_corr_images_h[ncam]);
cuMemcpyHtoD(gpu_4_images, Pointer.to(gpu_4_images_l), NUM_CAMS * Sizeof.POINTER);
// Set GeometryCorrection data
cuMemAlloc(gpu_geometry_correction, GeometryCorrection.arrayLength(NUM_CAMS) * Sizeof.FLOAT);
cuMemAlloc(gpu_rByRDist, RBYRDIST_LEN * Sizeof.FLOAT);
......@@ -1093,9 +1104,12 @@ public class GPUTileProcessor {
Pointer.to(gpu_clt),
Pointer.to(new int[] { mclt_stride }),
Pointer.to(new int[] { num_task_tiles }),
// move lpf to 4-image generator kernel
// Pointer.to(new int[] { 7 }) // lpf_mask ??? (C-code has it 0)
Pointer.to(new int[] { 0 }) // lpf_mask ??? (C-code has it 0)
// move lpf to 4-image generator kernel - DONE
Pointer.to(new int[] { 0 }), // lpf_mask
Pointer.to(new int[] { IMG_WIDTH}), // int woi_width,
Pointer.to(new int[] { IMG_HEIGHT}), // int woi_height,
Pointer.to(new int[] { KERNELS_HOR}), // int kernels_hor,
Pointer.to(new int[] { KERNELS_VERT}) // int kernels_vert);
);
cuCtxSynchronize();
......@@ -1132,10 +1146,12 @@ public class GPUTileProcessor {
Pointer.to(gpu_clt_h[ncam]),
Pointer.to(gpu_corr_images_h[ncam]),
Pointer.to(new int[] { apply_lpf }),
Pointer.to(new int[] { is_mono ? 1 : 0 }),
Pointer.to(new int[] { is_mono ? 1 : NUM_COLORS }), // now - NUM_COLORS
Pointer.to(new int[] { color }),
Pointer.to(new int[] { v_offs }),
Pointer.to(new int[] { h_offs }),
Pointer.to(new int[] { tilesX }),
Pointer.to(new int[] { tilesY }),
Pointer.to(new int[] { imclt_stride }) // lpf_mask
);
cuCtxSynchronize();
......@@ -1152,6 +1168,39 @@ public class GPUTileProcessor {
cuCtxSynchronize();
}
public void execImcltRbgAll(
boolean is_mono
) {
if (GPU_IMCLT_ALL_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_ALL_kernel");
return;
}
int apply_lpf = 1;
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
int [] ThreadsFullWarps = {1, 1, 1};
int [] GridFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt), // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_4_images), // float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
Pointer.to(new int[] { apply_lpf }), // int apply_lpf,
Pointer.to(new int[] { is_mono ? 1 : NUM_COLORS }), // int colors,
Pointer.to(new int[] { tilesX }), // int woi_twidth,
Pointer.to(new int[] { tilesY }), // int woi_theight,
Pointer.to(new int[] { imclt_stride }) // const size_t dstride); // in floats (pixels)
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_IMCLT_ALL_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize();
}
public void execCorr2D(
double [] scales,
double fat_zero,
......
......@@ -2119,7 +2119,8 @@ public class TwoQuadCLT {
// run imclt;
long startIMCLT=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
}
long endImcltTime = System.nanoTime();
// run correlation
......
......@@ -732,8 +732,6 @@ __device__ void convertCorrectTile(
const int lpf_mask,
const float centerX,
const float centerY,
// const short tx,
// const short ty,
const int txy,
const size_t dstride, // in floats (pixels)
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
......@@ -742,7 +740,11 @@ __device__ void convertCorrectTile(
float residual_shift [2],
float window_hor_cos [2*DTT_SIZE],
float window_hor_sin [2*DTT_SIZE],
float window_vert_cos [2*DTT_SIZE]);
float window_vert_cos [2*DTT_SIZE],
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
__device__ void debug_print_lpf(
float * lpf_tile);
......@@ -1551,7 +1553,11 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert)
{
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
......@@ -1615,280 +1621,18 @@ __global__ void convert_correct_tiles(
residual_shift[tile_in_block], // float frac_topleft [NUM_COLORS][2],
window_hor_cos[tile_in_block], // float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_sin[tile_in_block], //float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_vert_cos[tile_in_block]); //float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
window_vert_cos[tile_in_block], //float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert); //int kernels_vert)
__syncthreads();// __syncwarp();
}
}
}
//#undef USE_textures_gen
#ifdef USE_textures_gen
extern "C"
__global__ void textures_gen(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
// int diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
float weights[3] = {weight0, weight1, weight2};
// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
int camera_num = threadIdx.y;
int tile_indx = blockIdx.x; // * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if (tile_indx >= num_texture_tiles){
return; // nothing to do
}
// get number of tile
int tile_code = gpu_texture_indices[tile_indx];
if ((tile_code & (1 << CORR_TEXTURE_BIT)) == 0){
return; // nothing to do
}
int tile_num = tile_code >> CORR_NTILE_SHIFT;
__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
__shared__ union {
float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // NUM_CAMS == 4
float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; // to align with clt_tiles
} shr;
__shared__ union {
float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// add more
} shr1;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
// __shared__ float color_avg [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
__shared__ float port_offsets[NUM_CAMS][2];
__shared__ float ports_rgb [NUM_CAMS][NUM_COLORS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float max_diff [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
if (threadIdx.x < 2){
port_offsets[camera_num][threadIdx.x] = * (gpu_port_offsets + 2 * camera_num + threadIdx.x);
}
#ifdef DBG_TILE
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile = %d\n",tile_num);
// debug_print_clt1(clt_tile1, color, 0xf); //
// printf("\textures_gen tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
// debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// serially for each color, parallel for each camera
// copy clt (frequency domain data)
for (int color = 0; color < colors; color++){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
float * clt_tile = ((float *) shr.clt_tiles[camera_num][color]); // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
float * clt_tilei = clt_tile + threadIdx.x;
float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * mclt_tile = (float *) mclt_tiles [camera_num][color];
float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
float * mclt_tmp = (float *) shr1.mclt_tmp[camera_num][color];
// float scale = 0.25;
#pragma unroll
for (int q = 0; q < 4; q++) {
float *lpf = lpf_data[(colors > 1)? color : 3] + threadIdx.x; // lpf_data[3] - mono
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
// *clt_tilei = *gpu_tile * (*lpf) * scale;
*clt_tilei = *gpu_tile * (*lpf);
clt_tilei += DTT_SIZE1;
gpu_tile += DTT_SIZE;
lpf += DTT_SIZE;
}
}
__syncthreads();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen LPF for color = %d\n",color);
debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
printf("\ntextures_gen tile = %d, color = %d \n",tile_num, color);
debug_print_clt_scaled(clt_tile, color, 0xf, 0.25); //
}
__syncthreads();// __syncwarp();
#endif
// perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color);
}
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#pragma unroll
for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
float * msp = mclt_tile + threadIdx.x + n;
float * dst = mclt_dst + threadIdx.x + n;
#pragma unroll
for (int row = 0; row < DTT_SIZE2; row++){
*dst = *msp;
msp += DTT_SIZE21;
dst += DTT_SIZE21;
}
}
__syncthreads();
}
#ifdef DEBUG77
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
debug_print_mclt(
mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][0], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
} // for (int color = 0; color < colors; color++)
__syncthreads(); // __syncwarp();
/// return;
#ifdef DEBUG77
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
}
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG77
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
float * gpu_texture_tile = gpu_texture_tiles + tile_indx * texture_stride;
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
int gi = row * DTT_SIZE2 + col;
float * gpu_texture_tile_gi = gpu_texture_tile + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
// always copy 3 (1) colors + alpha
if (colors == 3){
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1 ; ncol++) { // 12
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
} else { // assuming colors = 1
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1 + NUM_CAMS + 1 + 1 ; ncol++) { // 8
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
}
}
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
__syncthreads();// __syncwarp();
#endif
}
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
int * woi, // x, y, width,height
......@@ -2300,31 +2044,71 @@ __global__ void textures_accumulate(
} // textures_accumulate()
extern "C"
__global__ void imclt_rbg_all(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
int colors,
int woi_twidth,
int woi_theight,
const size_t dstride) // in floats (pixels)
{
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
if (threadIdx.x == 0) { // anyway 1,1,1
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < colors; color++) {
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
int tilesy_half = (woi_theight + (v_offs ^ 1)) >> 1;
int tilesx_half = (woi_twidth + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf,
colors, // int colors, // defines lpf filter
color, // int color, // defines location of clt data
v_offs, // int v_offset,
h_offs, // int h_offset,
woi_twidth, // int woi_twidth, // will increase by DTT_SIZE (todo - cut away?)
woi_theight, // int woi_theight, // will increase by DTT_SIZE (todo - cut away?)
dstride); // const size_t dstride); // in floats (pixels)
cudaDeviceSynchronize();
}
}
}
}
}
}
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono,
int colors, // was mono
int color,
int v_offset,
int h_offset,
int woi_twidth, // will increase by DTT_SIZE (todo - cut away?)
int woi_theight, // will increase by DTT_SIZE (todo - cut away?)
const size_t dstride) // in floats (pixels)
{
float *color_plane = gpu_rbg + dstride * (IMG_HEIGHT + DTT_SIZE) * color;
float *color_plane = gpu_rbg + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color;
int pass = (v_offset << 1) + h_offset; // 0..3 to correctly acummulate 16x16 tiles stride 8
int tile_in_block = threadIdx.y;
int tile_num = blockIdx.x * IMCLT_TILES_PER_BLOCK + tile_in_block;
// if (tile_num >= (TILESY * TILESX)) {
// return; // just testing with a single tile
// }
// int tilesy_half = (TILESY + (v_offset ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offset ^ 1)) >> 1;
int tilesx_half = (woi_twidth + (h_offset ^ 1)) >> 1;
int tileY_half = tile_num / tilesx_half;
int tileX_half = tile_num - tileY_half * tilesx_half;
int tileY = (tileY_half << 1) + v_offset;
int tileX = (tileX_half << 1) + h_offset;
if (tileY >= TILESY) {
if (tileY >= woi_theight) {
return; // just testing with a single tile
}
#ifdef DEBUG4
......@@ -2351,14 +2135,13 @@ __global__ void imclt_rbg(
// copy clt (frequency domain data)
float * clt_tile = ((float *) clt_tiles) + tile_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
// float * gpu_tile = ((float *) gpu_clt) + ((DBG_TILE_Y * TILESX + DBG_TILE_X) * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
float * gpu_tile = ((float *) gpu_clt) + ((tileY * TILESX + tileX) * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
float * gpu_tile = ((float *) gpu_clt) + ((tileY * woi_twidth + tileX) * colors + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
clt_tile += column + thr3; // first 2 rows
gpu_tile += column; // first 2 rows
if (apply_lpf) {
// lpf - covers 2 rows, as there there are 16 threads
float *lpf0 = lpf_data[mono? 3 :color] + threadIdx.x; // lpf_data[3] - mono
float *lpf0 = lpf_data[(colors == 1)? 3 :color] + threadIdx.x; // lpf_data[3] - mono
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf0;
......@@ -2787,16 +2570,19 @@ __device__ void convertCorrectTile(
float residual_shift [2],
float window_hor_cos [2*DTT_SIZE],
float window_hor_sin [2*DTT_SIZE],
float window_vert_cos [2*DTT_SIZE])
float window_vert_cos [2*DTT_SIZE],
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert)
{
// TODO: pass these values instead of constants to handle EO/LWIR
int max_px = IMG_WIDTH - 1; // odd
int max_py = IMG_HEIGHT - 1; // odd
int max_px = woi_width - 1; // IMG_WIDTH - 1; // odd
int max_py = woi_height - 1; // IMG_HEIGHT - 1; // odd
int max_pxm1 = max_px - 1; // even
int max_pym1 = max_py - 1; // even
int max_kernel_hor = KERNELS_HOR -1;
int max_kernel_vert = KERNELS_VERT-1;
int max_kernel_hor = kernels_hor - 1; // KERNELS_HOR -1;
int max_kernel_vert = kernels_vert - 1; // KERNELS_VERT-1;
int ktileX, ktileY;
int kernel_index; // common for all coors
......@@ -2806,7 +2592,7 @@ __device__ void convertCorrectTile(
ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0/KERNELS_STEP)+1))));
kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
kernel_index = (ktileX + ktileY * KERNELS_HOR) * NUM_COLORS;
kernel_index = (ktileX + ktileY * kernels_hor) * NUM_COLORS;
}
// broadcast kernel_index
kernel_index = __shfl_sync(
......@@ -3157,7 +2943,6 @@ __device__ void convertCorrectTile(
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG3
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after LPF, color = %d\n",color);
......@@ -3165,33 +2950,22 @@ __device__ void convertCorrectTile(
printf("\nDTT All done\n");
}
__syncthreads();// __syncwarp();
#endif
#endif
}
// const int tx = txy & 0xffff; // slow again
// const int ty = txy >> 16;
int offset_src = threadIdx.x;
// int offset_dst = ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
// int offset_dst = ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
#ifdef USE_UMUL24
int offset_dst = __umul24( __umul24( __umul24(txy >> 16, TILESX) + (txy & 0xfff) , NUM_COLORS) + color , 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
#else
int offset_dst = (((txy >> 16) * TILESX + (txy & 0xfff))*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
#endif
float * clt_src = clt_tile + offset_src; // threadIdx.x;
float * clt_dst = gpu_clt + offset_dst; // ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE1) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
//#ifndef NOICLT
#ifdef DBG_TILE
#ifdef DEBUG3
if ((threadIdx.x) == 0){
printf("clt_src = 0x%lx\n",clt_src);
printf("clt_dst = 0x%lx\n",clt_dst);
}
#endif
#endif
......
......@@ -51,7 +51,12 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask); // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......@@ -104,6 +109,16 @@ extern "C" __global__ void textures_accumulate(
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C"
__global__ void imclt_rbg_all(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
int colors,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C" __global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
......@@ -112,6 +127,8 @@ extern "C" __global__ void imclt_rbg(
int color, // defines location of clt data
int v_offset,
int h_offset,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C"
......@@ -144,5 +161,3 @@ __global__ void generate_RBGA(
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
......@@ -114,9 +114,9 @@ struct gc {
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
// };
// float rad_coeff [7];
// };
};
float rad_coeff [7];
};
#endif
// parameters, common for all sensors
float elevation; // degrees, up - positive;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment