Commit 28e55dce authored by Andrey Filippov's avatar Andrey Filippov

Trying interscene correlation

parent 0dcd2dee
...@@ -1041,6 +1041,15 @@ __global__ void index_correlate( ...@@ -1041,6 +1041,15 @@ __global__ void index_correlate(
int * gpu_corr_indices, // array of correlation tasks int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void index_inter_correlate(
int num_cams,
int sel_sensors,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
extern "C" __global__ void create_nonoverlap_list( extern "C" __global__ void create_nonoverlap_list(
int num_cams, int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...@@ -1068,6 +1077,27 @@ __global__ void convert_correct_tiles( ...@@ -1068,6 +1077,27 @@ __global__ void convert_correct_tiles(
int kernels_vert, //); int kernels_vert, //);
int tilesx); int tilesx);
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
int num_cams,
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
int num_cams, int num_cams,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...@@ -1220,6 +1250,331 @@ extern "C" __global__ void correlate2D( ...@@ -1220,6 +1250,331 @@ extern "C" __global__ void correlate2D(
} }
} }
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
* ones with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param tilesx number of tile rows
* @param gpu_corr_indices allocated array for per-tile correlation tasks (4 bytes per tile)
* @param pnum_corr_tiles allocated space for pointer to a number of number of correlation tiles to process
* @param corr_stride, stride (in floats) for correlation outputs.
* @param gpu_corrs) allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/
extern "C" __global__ void correlate2D_inter( // only results in TD
int num_cams,
int sel_sensors,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
if (num_sel_sensors > 0){
*pnum_corr_tiles = 0;
index_inter_correlate<<<blocks0,threads0>>>(
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
num_tiles, // int num_tiles, // number of tiles in task
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
int num_corr_tiles = (*pnum_corr_tiles) * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data (either pixel domain or transform domain
}
}
}
/**
* Used for interscene correlations (for motion vector calculation).
* Calculate sum of selected correlation (in TD) and place it after individual (per-sensor) correlations.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_task_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_sel_sensors number of sensors to correlate
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* should be calculated by correlate2D_inter_inner() leaving gaps for sums, calculated here
*
*/
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{
int corr_in_block = threadIdx.y;
int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
return;
}
// __syncthreads();// __syncwarp();
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int isens = 0; isens < num_sel_sensors; isens++){
float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*clt) += (*mem_corr);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
corr_offset++;
}
// Now corr_offset points to the sum of correlations
float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
corr_offset++;
}
/**
* Calculate interscene 2D phase correlation pairs from CLT representation.
* This is an inner kernel that is called from correlate2D_inter.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_corr_tiles_excluding_sums + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_cams number of cameras
* @param num_sel_sensors number of sensors to correlate
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param gpu_clt_ref array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* gpu_clt_ref and gpu_clt_ref correspond to two scenes, the reference is the first in correlation.
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* will be calculated here leaving gaps for sums, calculated by combine_inter()
*/
extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
int num_cams,
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
int tile_index = corr_num / num_sel_sensors;
int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
if (corr_offset >= num_corr_tiles){
return; // nothing to do
}
// get number of pair and number of tile
int corr_sensor = gpu_corr_indices[corr_offset]; // corr_num];
int tile_num = corr_sensor >> CORR_NTILE_SHIFT;
corr_sensor &= (corr_sensor & ((1 << CORR_NTILE_SHIFT) - 1));
if (corr_sensor >= num_cams){
return; // BUG - should not happen
}
__syncthreads();// __syncwarp();
__shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int color = 0; color < colors; color++){
// copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
float * clt_tile2 = ((float *) clt_tiles2) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
int offs = (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * gpu_tile1 = ((float *) gpu_clt_ref[corr_sensor]) + offs;
float * gpu_tile2 = ((float *) gpu_clt [corr_sensor]) + offs;
float * clt_tile1i = clt_tile1 + threadIdx.x;
float * clt_tile2i = clt_tile2 + threadIdx.x;
#pragma unroll
#define USE_LOG
#ifdef USE_LOG
// Apply high-pass filter to correlation inputs to reduce dynamic range before multiplication
for (int q = 0; q < 4; q++){
float *log = LoG_corr + threadIdx.x;
for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= (*gpu_tile1) * (*log);
*clt_tile2i= (*gpu_tile2) * (*log);
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
log += DTT_SIZE;
}
}
#else
for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= *gpu_tile1;
*clt_tile2i= *gpu_tile2;
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
}
#endif //USE_LOG
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA1\n",tile_num, corr_sensor,color);
debug_print_clt1(clt_tile1, color, 0xf); //
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA2\n",tile_num, corr_sensor,color);
debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles(
scales[color], // float scale, // scale correlation
clt_tile1, // float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
clt_tile2, // float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D, color = %d CORRELATION\n", color);
debug_print_clt1(clt_corr, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
if (color == 1){ // LPF only after B (nothing in mono)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF for RB correlation\n");
debug_print_lpf(lpf_rb_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf_rb = lpf_rb_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf_rb);
clt += DTT_SIZE1;
lpf_rb += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION RB LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
__syncthreads();// __syncwarp();
}
/** /**
* Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called * Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called
* from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>. * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
...@@ -1234,9 +1589,9 @@ extern "C" __global__ void correlate2D( ...@@ -1234,9 +1589,9 @@ extern "C" __global__ void correlate2D(
* @param fat_zero2 add this value squared to the sum of squared components before normalization * @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride, stride (in floats) for correlation outputs. * @param corr_stride stride (in floats) for correlation outputs.
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization * @param corr_radius radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
* @param gpu_corrs) allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/ */
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
int num_cams, int num_cams,
...@@ -1504,6 +1859,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1504,6 +1859,7 @@ extern "C" __global__ void correlate2D_inner(
} }
/** /**
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile, * Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* and separately the two diagonals into another single one * and separately the two diagonals into another single one
...@@ -2515,7 +2871,6 @@ extern "C" __global__ void create_nonoverlap_list( ...@@ -2515,7 +2871,6 @@ extern "C" __global__ void create_nonoverlap_list(
* @param num_cams number of cameras <= NUM_CAMS * @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs). * @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 * @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing * @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list * @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length * @param pnum_corr_tiles single-element integer array return generated list length
...@@ -2564,23 +2919,60 @@ __global__ void index_correlate( ...@@ -2564,23 +2919,60 @@ __global__ void index_correlate(
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b; gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
} }
} }
/* }
int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm != 0) { /**
int nb = __popc (cm); // number of non-zero bits * Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
int indx = atomicAdd(pnum_corr_tiles, nb); * For interscene correlation. One correlation output for each selected sensor
int txy = gpu_tasks[num_tile].txy; * plus a sum of them all. So for all 16 sensors selected ooutput will have 17
int tx = txy & 0xffff; * 2D correlations (with some being the l;ast one)
int ty = txy >> 16; * All pairs for the same tile will always be in the same order: increasing sensor numbers
// int nt = ty * TILES-X + tx; * with sum being the last. Sum will be marked by 0xff in the LSB.
* With the quad camera each tile may generate up to 6 pairs (int array elements)
* Tiles are not ordered, but the correlation pairs for each tile are.
*
* @param num_cams number of cameras <= NUM_CAMS <32
* @param sel_sensors array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length
*/
__global__ void index_inter_correlate(
int num_cams,
int sel_sensors,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles) // pointer to the length of correlation tasks array
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
// int task_size = get_task_size(num_cams);
int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?
return;
}
int nb = __popc (sel_sensors); // number of non-zero bits
if (nb > 0){
int indx = atomicAdd(pnum_corr_tiles, nb+1);
int task_txy = get_task_txy(num_tile, gpu_ftasks, num_cams);
int tx = task_txy & 0xffff;
int ty = task_txy >> 16;
int nt = ty * width + tx; int nt = ty * width + tx;
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) { // for (int b = 0; b < pair_list_len; b++) if ((cm & (1 << b)) != 0) {
for (int b = 0; b < num_cams; b++) if ((sel_sensors & (1 << (b & 31))) != 0) {
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b; gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
} }
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | 0xff; // will be used for sum
} }
*/
} }
/** /**
* Direct MCLT transform and aberration correction with space-variant deconvolution * Direct MCLT transform and aberration correction with space-variant deconvolution
* kernels. Results are used to output aberration-corrected images, textures and * kernels. Results are used to output aberration-corrected images, textures and
......
...@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D( ...@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D(
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats size_t corr_stride, // in floats
// int corr_stride, // in floats // int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
extern "C" __global__ void correlate2D_inter( // only results in TD
int num_cams,
int sel_sensors,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize( extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats const size_t corr_stride_td, // in floats
......
...@@ -36,7 +36,7 @@ ...@@ -36,7 +36,7 @@
// #define NOTEXTURE_RGBA // #define NOTEXTURE_RGBA
#define SAVE_CLT #define SAVE_CLT
//#define NO_DP //#define NO_DP
#define CORR_INTER_SELF 1
#include <stdio.h> #include <stdio.h>
...@@ -685,6 +685,7 @@ int main(int argc, char **argv) ...@@ -685,6 +685,7 @@ int main(int argc, char **argv)
const char* result_corr_quad_file = "clt/aux_corr-quad.corr"; const char* result_corr_quad_file = "clt/aux_corr-quad.corr";
const char* result_corr_td_norm_file = "clt/aux_corr-td-norm.corr"; const char* result_corr_td_norm_file = "clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "clt/aux_corr-cross.corr"; /// const char* result_corr_cross_file = "clt/aux_corr-cross.corr";
const char* result_inter_td_norm_file = "clt/aux_inter-td-norm.corr";
const char* result_textures_file = "clt/aux_texture_nodp.rgba"; const char* result_textures_file = "clt/aux_texture_nodp.rgba";
const char* result_diff_rgb_combo_file ="clt/aux_diff_rgb_combo_nodp.drbg"; const char* result_diff_rgb_combo_file ="clt/aux_diff_rgb_combo_nodp.drbg";
const char* result_textures_rgba_file = "clt/aux_texture_rgba_nodp.rgba"; const char* result_textures_rgba_file = "clt/aux_texture_rgba_nodp.rgba";
...@@ -752,6 +753,7 @@ int main(int argc, char **argv) ...@@ -752,6 +753,7 @@ int main(int argc, char **argv)
const char* result_corr_quad_file = "clt/main_corr-quad.corr"; const char* result_corr_quad_file = "clt/main_corr-quad.corr";
const char* result_corr_td_norm_file = "clt/aux_corr-td-norm.corr"; const char* result_corr_td_norm_file = "clt/aux_corr-td-norm.corr";
/// const char* result_corr_cross_file = "clt/main_corr-cross.corr"; /// const char* result_corr_cross_file = "clt/main_corr-cross.corr";
const char* result_inter_td_norm_file = "clt/aux_inter-td-norm.corr";
const char* result_textures_file = "clt/main_texture_nodp.rgba"; const char* result_textures_file = "clt/main_texture_nodp.rgba";
const char* result_diff_rgb_combo_file ="clt/main_diff_rgb_combo_nodp.drbg"; const char* result_diff_rgb_combo_file ="clt/main_diff_rgb_combo_nodp.drbg";
const char* result_textures_rgba_file = "clt/main_texture_rgba_nodp.rgba"; const char* result_textures_rgba_file = "clt/main_texture_rgba_nodp.rgba";
...@@ -1790,6 +1792,142 @@ int main(int argc, char **argv) ...@@ -1790,6 +1792,142 @@ int main(int argc, char **argv)
#endif // ifndef NOCORR_TD #endif // ifndef NOCORR_TD
// Testing "interframe" correlation with itself, assuming direct convert already ran
#ifdef CORR_INTER_SELF
int sel_sensors = 0xffff;
int num_sel_senosrs = 16;
num_pairs = num_sel_senosrs+1;
num_corr_indices = num_pairs * num_tiles;
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
// int num_corr_combo_inter;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerINTERSELF);
sdkStartTimer(&timerINTERSELF);
}
correlate2D_inter<<<1,1>>>( // only results in TD
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt, // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
num_colors, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D_inter");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D_inter-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
30.0 * 30.0, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerINTERSELF);
float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
sdkDeleteTimer(&timerINTERSELF);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs);
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
// int num_pairs = 120;
// int sel_sensors = 0xffff;
// int num_sel_senosrs = 16;
// int corr_size = 2 * CORR_OUT_RAD + 1; // 15
// int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
// int num_corr_indices = num_pairs * num_tiles;
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_senosrs;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
#ifndef NSAVE_CORR
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
result_inter_td_norm_file); // const char * path) // file path
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // #ifdef CORR_INTER_SELF
// ----------------- // -----------------
#ifndef NOTEXTURES #ifndef NOTEXTURES
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment