Commit f3521f7d authored by Andrey Filippov

Version for rectilinear images

parent 9d222e8c
......@@ -1010,7 +1010,7 @@ extern "C" __global__ void correlate2D(
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
* ones with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* gpu_corr_indices will include sums
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs bit array of selected pairs, sized to hold all pairs (4 ints for 16 cameras, which have 120 pairs)
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
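The pair selection is a packed bit mask; below is a minimal host-side sketch of sizing and probing it. Only the 4-int/120-pair figure comes from the comment above; the LSB-first bit packing is an assumption.

```c
#include <stdio.h>

// Sizing and probing the pair-selection bit mask (packing order assumed LSB-first).
int main(void) {
    int num_cams  = 16;
    int num_pairs = num_cams * (num_cams - 1) / 2;     // C(16,2) = 120 pairs
    int sel_len   = (num_pairs + 31) / 32;             // 32 bits per int -> 4 ints
    unsigned int sel_pairs[4] = {0};
    int np = 37;                                       // some pair index to select
    sel_pairs[np >> 5] |= 1u << (np & 31);             // set the selection bit for pair np
    int selected = (sel_pairs[np >> 5] >> (np & 31)) & 1;
    printf("pairs=%d, ints=%d, pair %d selected=%d\n", num_pairs, sel_len, np, selected);
    return 0;
}
```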
......@@ -2660,7 +2660,7 @@ __global__ void index_correlate(
* Helper kernel for correlateInter2D() - generates a dense list of correlation tasks
* for inter-scene correlation. One correlation output for each selected sensor
* plus a sum of them all, so with all 16 sensors selected the output will have 17
* 2D correlations (with some being the l;ast one)
* 2D correlations (with sum being the last one)
* All pairs for the same tile will always be in the same order: increasing sensor numbers
* with sum being the last. Sum will be marked by 0xff in the LSB.
* With the quad camera each tile may generate up to 6 pairs (int array elements)
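A short sketch of decoding one gpu_corr_indices element under the conventions stated above (sensor/pair number in the low byte, 0xff marking the sum). Placing the tile number in the bits above the low byte is an assumption; the exact shift is not given in this hunk.

```c
#include <stdio.h>

// Decode one correlation-index entry (layout of the upper bits is assumed).
int main(void) {
    int entry  = (123 << 8) | 0xff;    // hypothetical entry: the sum correlation of tile 123
    int sensor = entry & 0xff;         // low byte: sensor number, or 0xff for the sum
    int tile   = entry >> 8;           // assumed: remaining bits hold the tile index
    if (sensor == 0xff) {
        printf("tile %d: sum of all selected per-sensor correlations\n", tile);
    } else {
        printf("tile %d: correlation for sensor %d\n", tile, sensor);
    }
    return 0;
}
```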
......@@ -2770,8 +2770,8 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
convert_correct_tiles<<<grid_tp,threads_tp>>>(
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
gpu_kernel_offsets, // float ** gpu_kernel_offsets, // [num_cams],
gpu_kernels, // float ** gpu_kernels, // [num_cams],
((kernels_hor>0)?gpu_kernel_offsets:0), // float ** gpu_kernel_offsets, // [num_cams],
((kernels_hor>0)?gpu_kernels:0), // float ** gpu_kernels, // [num_cams],
gpu_images, // float ** gpu_images, // [num_cams],
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
......@@ -2862,8 +2862,8 @@ extern "C" __global__ void erase_clt_tiles_inner(
__global__ void convert_correct_tiles(
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float ** gpu_kernel_offsets, // [num_cams],
float ** gpu_kernels, // [num_cams],
float ** gpu_kernel_offsets, // [num_cams], ###
float ** gpu_kernels, // [num_cams], ###
float ** gpu_images, // [num_cams],
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
......@@ -2929,38 +2929,72 @@ __global__ void convert_correct_tiles(
// process each camera, each color in series (to reduce shared memory)
for (int ncam = 0; ncam < num_cams; ncam++){
for (int color = 0; color < num_colors; color++){
convertCorrectTile(
// TODO: remove debug when done
if (gpu_kernels) {
convertCorrectTile(
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
num_colors, // int num_colors, //*
#endif
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets,
gpu_kernels[ncam], // float * gpu_kernels,
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
__syncthreads();
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets, ####
gpu_kernels[ncam], // float * gpu_kernels, ####
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
} else {
convertCorrectTile(
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
#endif
(struct CltExtra*) 0, // struct CltExtra* gpu_kernel_offsets, ####
(float*) 0, // float * gpu_kernels, ####
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
0, // kernels_hor, // int kernels_hor,
0, // kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
}
__syncthreads();
}
}
}
......@@ -4394,8 +4428,8 @@ __device__ void normalizeTileAmplitude(
*/
__device__ void convertCorrectTile(
int num_colors, //*
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
float * gpu_kernels, // [tileY][tileX][color]
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color] if null, will not calculate kernel-related offsets
float * gpu_kernels, // [tileY][tileX][color] // null w/o kernels
float * gpu_images,
float * gpu_clt,
const int color,
......@@ -4406,7 +4440,7 @@ __device__ void convertCorrectTile(
const float tscale,
const size_t dstride, // in floats (pixels)
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_kernels, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_kernels, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports // null w/o kernels
int int_topleft [2],
float residual_shift [2],
float window_hor_cos [2*DTT_SIZE],
......@@ -4430,78 +4464,85 @@ __device__ void convertCorrectTile(
int max_py = woi_height - 1; // IMG-HEIGHT - 1; // odd
int max_pxm1 = max_px - 1; // even
int max_pym1 = max_py - 1; // even
int max_kernel_hor = kernels_hor - 1; // KERNELS_HOR -1;
int max_kernel_vert = kernels_vert - 1; // KERNELS_VERT-1;
int ktileX, ktileY;
int kernel_index; // common for all colors
float kdx, kdy;
if (threadIdx.x == 0){
// ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX * (1.0/KERNELS_STEP)+1))));
// ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0/KERNELS_STEP)+1))));
// kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// From ImageDttCPU.java: extract_correct_tile() (modified 2022/05/12):
// int kernel_pitch = width/(clt_kernels[chn_kernel][0].length - 2);
if (gpu_kernels) {
int ktileX, ktileY;
int max_kernel_hor = kernels_hor - 1; // KERNELS_HOR -1;
int max_kernel_vert = kernels_vert - 1; // KERNELS_VERT-1;
if (threadIdx.x == 0){
// ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX * (1.0/KERNELS_STEP)+1))));
// ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0/KERNELS_STEP)+1))));
// kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// From ImageDttCPU.java: extract_correct_tile() (modified 2022/05/12):
// int kernel_pitch = width/(clt_kernels[chn_kernel][0].length - 2);
// 1. find closest kernel
// ktileX = (int) Math.round(centerX/kernel_pitch) + 1;
// ktileY = (int) Math.round(centerY/kernel_pitch) + 1;
// if (ktileY < 0) ktileY = 0;
// else if (ktileY >= clt_kernels[chn_kernel].length) ktileY = clt_kernels[chn_kernel].length-1;
// if (ktileX < 0) ktileX = 0;
// else if (ktileX >= clt_kernels[chn_kernel][ktileY].length) ktileX = clt_kernels[chn_kernel][ktileY].length-1;
// extract center offset data stored with each kernel tile
// CltExtra ce = new CltExtra (clt_kernels[chn_kernel][ktileY][ktileX][4]);
// 2. calculate correction for center of the kernel offset
// double kdx = centerX - (ktileX -1 +0.5) * kernel_pitch; // difference in pixel
// double kdy = centerY - (ktileY -1 +0.5) * kernel_pitch;
int kernel_pitch = woi_width / (kernels_hor - 2);
ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX /kernel_pitch + 1))));
ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY /kernel_pitch + 1))));
kdx = centerX - (ktileX - 0.5) * kernel_pitch; // difference in pixel
kdy = centerY - (ktileY - 0.5) * kernel_pitch; //
kernel_index = (ktileX + ktileY * kernels_hor) * num_colors;
// ktileX = (int) Math.round(centerX/kernel_pitch) + 1;
// ktileY = (int) Math.round(centerY/kernel_pitch) + 1;
// if (ktileY < 0) ktileY = 0;
// else if (ktileY >= clt_kernels[chn_kernel].length) ktileY = clt_kernels[chn_kernel].length-1;
// if (ktileX < 0) ktileX = 0;
// else if (ktileX >= clt_kernels[chn_kernel][ktileY].length) ktileX = clt_kernels[chn_kernel][ktileY].length-1;
// extract center offset data stored with each kernel tile
// CltExtra ce = new CltExtra (clt_kernels[chn_kernel][ktileY][ktileX][4]);
// 2. calculate correction for center of the kernel offset
// double kdx = centerX - (ktileX -1 +0.5) * kernel_pitch; // difference in pixel
// double kdy = centerY - (ktileY -1 +0.5) * kernel_pitch;
int kernel_pitch = woi_width / (kernels_hor - 2);
ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX /kernel_pitch + 1))));
ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY /kernel_pitch + 1))));
kdx = centerX - (ktileX - 0.5) * kernel_pitch; // difference in pixel
kdy = centerY - (ktileY - 0.5) * kernel_pitch; //
kernel_index = (ktileX + ktileY * kernels_hor) * num_colors;
}
// broadcast kernel_index
kernel_index = __shfl_sync(
0xffffffff, // unsigned mask,
kernel_index, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdx = __shfl_sync(
0xffffffff, // unsigned mask,
kdx, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdy = __shfl_sync(
0xffffffff, // unsigned mask,
kdy, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
__syncthreads();// __syncwarp(); // is it needed?
}
// broadcast kernel_index
kernel_index = __shfl_sync(
0xffffffff, // unsigned mask,
kernel_index, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdx = __shfl_sync(
0xffffffff, // unsigned mask,
kdx, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdy = __shfl_sync(
0xffffffff, // unsigned mask,
kdy, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
__syncthreads();// __syncwarp(); // is it needed?
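kernel_index, kdx and kdy are computed by thread 0 of the tile only and then broadcast to the other threads with __shfl_sync(). A minimal standalone illustration of that warp-broadcast pattern (not the project's code):

```cuda
#include <cstdio>

// Lane 0 of the warp computes a value; __shfl_sync() broadcasts it to every lane.
__global__ void broadcast_demo() {
    int value = 0;
    if (threadIdx.x == 0) {
        value = 42;                              // only lane 0 has the result so far
    }
    value = __shfl_sync(0xffffffff, value, 0);   // all lanes read lane 0's copy
    printf("thread %d sees %d\n", threadIdx.x, value);
}

int main() {
    broadcast_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}
```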
float px, py;
// copy kernel
int kernel_full_index = kernel_index + color;
float * kernel_src = gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float * kernelp = clt_kernels;
kernel_src += threadIdx.x; // lsb;
kernelp += threadIdx.x; // lsb;
int kernel_full_index;
if (gpu_kernels) { // only copy kernels if they exist
// copy kernel
kernel_full_index = kernel_index + color;
float * kernel_src = gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float * kernelp = clt_kernels;
kernel_src += threadIdx.x; // lsb;
kernelp += threadIdx.x; // lsb;
#pragma unroll
for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
// shared memory kernels use DTT_SIZE1 (same as image data)
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
// shared memory kernels use DTT_SIZE1 (same as image data)
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
}
}
px = centerX - DTT_SIZE; // fractional left corner (w/o kernel)
py = centerY - DTT_SIZE; // fractional top corner (w/o kernel)
if (gpu_kernel_offsets){ // skipped in the no-kernel version for rectilinear images
struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
px -= clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy ;
py -= clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy ;
}
// Calculate offsets and prepare windows (all colors):
struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy) ; // fractional left corner
// struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
// px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy) ; // fractional left corner
int itlx = (int) floorf(px +0.5f);
int_topleft [0] = itlx;
float shift_hor = itlx - px;
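The kernel-tile selection in this hunk maps a fractional tile center onto the coarser aberration-kernel grid and keeps the residual (kdx, kdy) for the CltExtra center-offset correction above. A host-side mirror of that arithmetic, for illustration only; the 2592-pixel width and 164-column kernel grid are assumed quad-EO defaults, giving a 16-pixel kernel pitch.

```c
#include <math.h>
#include <stdio.h>

// Host-side mirror of the device-side kernel-tile selection (illustration only).
static void kernel_tile_for_center(float centerX, int woi_width, int kernels_hor,
                                   int *ktileX, float *kdx) {
    int kernel_pitch   = woi_width / (kernels_hor - 2);      // e.g. 2592 / 162 = 16
    int max_kernel_hor = kernels_hor - 1;
    int kt = (int) lrintf(centerX / kernel_pitch + 1);       // nearest kernel column (grid has a one-tile margin)
    if (kt < 0)              kt = 0;                          // clamp to the kernel grid
    if (kt > max_kernel_hor) kt = max_kernel_hor;
    *ktileX = kt;
    *kdx    = centerX - (kt - 0.5f) * kernel_pitch;           // residual offset from the kernel center
}

int main(void) {
    int ktileX;
    float kdx;
    kernel_tile_for_center(1100.0f, 2592, 164, &ktileX, &kdx); // assumed quad-EO geometry
    printf("ktileX=%d, kdx=%f\n", ktileX, kdx);                // ktileX=70, kdx=-12 for this center
    return 0;
}
```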
......@@ -4536,7 +4577,7 @@ __device__ void convertCorrectTile(
i1++;
}
py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner
// py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner
int itly = (int) floorf(py +0.5f);
int_topleft[1] = itly;
......@@ -4591,7 +4632,7 @@ __device__ void convertCorrectTile(
if (dbg_tile && (threadIdx.x) == 0){
printf("COLOR=%d\n",color);
printf("centerX=%f, centerY=%f\n",centerX, centerY);
printf("ktileX=%d, ktileY=%d\n", ktileX, ktileY);
// printf("ktileX=%d, ktileY=%d\n", ktileX, ktileY);
printf("kdx=%f, kdy=%f\n", kdx, kdy);
printf("int_topleft[%d][0]=%d, int_topleft[%d][1]=%d\n",i,int_topleft[0],i,int_topleft[1]);
printf("residual_shift[%d][0]=%f, residual_shift[%d][1]=%f\n",i,residual_shift[0],i,residual_shift[1]);
......@@ -4896,28 +4937,32 @@ __device__ void convertCorrectTile(
} // else { // if (is_mono) {
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x) == 0){
printf("\nDTT Tiles after before convolving, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf);
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after before convolving, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
if (gpu_kernels) {
// convolve first, then rotate to match Java and make it easier to verify
// verify it works w/o kernels
convolveTiles(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
clt_kernels); // float kernel [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
__syncthreads();// __syncwarp();
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after convolution, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // all colors, all quadrants
}
__syncthreads();// __syncwarp();
#endif
}
// convolve first, then rotate to match Java and make it easier to verify
convolveTiles(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
clt_kernels); // float kernel [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
__syncthreads();// __syncwarp();
#ifdef DEBUG2
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after convolution, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // all colors, all quadrants
}
__syncthreads();// __syncwarp();
#endif
// rotate phases: first horizontal, then vertical
shiftTileHor(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
......@@ -4947,8 +4992,8 @@ __device__ void convertCorrectTile(
#ifdef DBG_TILE
#ifdef DEBUG3
if ((threadIdx.x) == 0){
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after vertical shift, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G
printf("\nDTT All done\n");
......
......@@ -30,13 +30,17 @@
** -----------------------------------------------------------------------------**
*/
// #define NOCORR
// #define NOCORR_TD
// #define NOTEXTURES
// #define NOTEXTURE_RGBA
// all of the next 5 were disabled
#define NOCORR
#define NOCORR_TD
#define NOTEXTURES
#define NOTEXTURE_RGBA
#define NOTEXTURE_RGBAXXX
#define SAVE_CLT
#define NO_DP
#define CORR_INTER_SELF 1
/// #define CORR_INTER_SELF 1
#include <stdio.h>
......@@ -1442,7 +1446,7 @@ int main(int argc, char **argv)
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
0, // KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
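Passing 0 here exercises the new no-kernel path: with kernels_hor <= 0, convert_direct() forwards null kernel pointers, and convertCorrectTile() then skips the kernel copy, the CltExtra center-offset correction and the convolution. A sketch of the lines a caller might use inside main() to switch between the two modes (use_kernels is a hypothetical flag, and the KERNELS_HOR/KERNELS_VERT macros of the test program are assumed to be in scope):

```c
int use_kernels  = 0;                              // hypothetical flag: 0 selects the rectilinear / no-kernel path
int kernels_hor  = use_kernels ? KERNELS_HOR : 0;  // kernels_hor > 0 is what gates the kernel pointers
int kernels_vert = KERNELS_VERT;                   // left unchanged; only kernels_hor is tested in convert_direct()
```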
......
......@@ -147,7 +147,7 @@
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
//#define DEBUG210 1
////#define DEBUG30 1
#define DEBUG30 1
//#define DEBUG22 1
//#define DEBUG23 1
......