Commit b4d88911 authored by Andrey Filippov's avatar Andrey Filippov

Updated GPU code

parent e156dbea
...@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed ...@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
0.32891038f, 0.30456742f, 0.26124917f, 0.20786692f, 0.15413642f, 0.10818204f, 0.07536856f, 0.05845371f, 0.32891038f, 0.30456742f, 0.26124917f, 0.20786692f, 0.15413642f, 0.10818204f, 0.07536856f, 0.05845371f,
0.22914618f, 0.21218686f, 0.18200779f, 0.14481729f, 0.10738418f, 0.07536856f, 0.05250797f, 0.04072369f, 0.22914618f, 0.21218686f, 0.18200779f, 0.14481729f, 0.10738418f, 0.07536856f, 0.05250797f, 0.04072369f,
0.17771927f, 0.16456610f, 0.14116007f, 0.11231618f, 0.08328412f, 0.05845371f, 0.04072369f, 0.03158414f 0.17771927f, 0.16456610f, 0.14116007f, 0.11231618f, 0.08328412f, 0.05845371f, 0.04072369f, 0.03158414f
}; };
__constant__ float lpf_corr[64]={ // modify if needed __constant__ float lpf_corr[64]={ // modify if needed
1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f, 1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
...@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed ...@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f, 0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f, 0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f 0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
}; };
__constant__ float LoG_corr[64]={ // modify if needed high-pass filter before correlation to fit into float range __constant__ float LoG_corr[64]={ // modify if needed high-pass filter before correlation to fit into float range
...@@ -1041,6 +1043,15 @@ __global__ void index_correlate( ...@@ -1041,6 +1043,15 @@ __global__ void index_correlate(
int * gpu_corr_indices, // array of correlation tasks int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void index_inter_correlate(
int num_cams,
int sel_sensors,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
extern "C" __global__ void create_nonoverlap_list( extern "C" __global__ void create_nonoverlap_list(
int num_cams, int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...@@ -1068,6 +1079,27 @@ __global__ void convert_correct_tiles( ...@@ -1068,6 +1079,27 @@ __global__ void convert_correct_tiles(
int kernels_vert, //); int kernels_vert, //);
int tilesx); int tilesx);
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
int num_cams,
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
int num_cams, int num_cams,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...@@ -1220,6 +1252,291 @@ extern "C" __global__ void correlate2D( ...@@ -1220,6 +1252,291 @@ extern "C" __global__ void correlate2D(
} }
} }
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
* ones with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param tilesx number of tile rows
* @param gpu_corr_indices allocated array for per-tile correlation tasks (4 bytes per tile)
* @param pnum_corr_tiles allocated space for pointer to a number of number of correlation tiles to process
* @param corr_stride, stride (in floats) for correlation outputs.
* @param gpu_corrs) allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/
extern "C" __global__ void correlate2D_inter( // only results in TD
int num_cams,
int sel_sensors,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0;
index_inter_correlate<<<blocks0,threads0>>>(
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
num_tiles, // int num_tiles, // number of tiles in task
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
int num_corr_tiles_with_sum = (*pnum_corr_tiles);
int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair NOT USED
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data (either pixel domain or transform domain
}
}
}
/**
* Used for interscene correlations (for motion vector calculation).
* Calculate sum of selected correlation (in TD) and place it after individual (per-sensor) correlations.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_task_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_sel_sensors number of sensors to correlate
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* should be calculated by correlate2D_inter_inner() leaving gaps for sums, calculated here
*
*/
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{
int corr_in_block = threadIdx.y;
int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
return;
}
// __syncthreads();// __syncwarp();
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int isens = 0; isens < num_sel_sensors; isens++){
float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*clt) += (*mem_corr);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
corr_offset++;
}
// Now corr_offset points to the sum of correlations
float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
}
/**
* Calculate interscene 2D phase correlation pairs from CLT representation.
* This is an inner kernel that is called from correlate2D_inter.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_corr_tiles_excluding_sums + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_cams number of cameras
* @param num_sel_sensors number of sensors to correlate
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param gpu_clt_ref array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* gpu_clt_ref and gpu_clt_ref correspond to two scenes, the reference is the first in correlation.
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* will be calculated here leaving gaps for sums, calculated by combine_inter()
*/
extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
int num_cams,
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
int tile_index = corr_num / num_sel_sensors;
int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
if (corr_offset >= num_corr_tiles){
return; // nothing to do
}
// get number of pair and number of tile
int corr_sensor = gpu_corr_indices[corr_offset]; // corr_num];
int tile_num = corr_sensor >> CORR_NTILE_SHIFT;
corr_sensor &= (corr_sensor & ((1 << CORR_NTILE_SHIFT) - 1));
if (corr_sensor >= num_cams){
return; // BUG - should not happen
}
__syncthreads();// __syncwarp();
__shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
// __shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
// float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int color = 0; color < colors; color++){
// copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
float * clt_tile2 = ((float *) clt_tiles2) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
int offs = (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * gpu_tile1 = ((float *) gpu_clt_ref[corr_sensor]) + offs;
float * gpu_tile2 = ((float *) gpu_clt [corr_sensor]) + offs;
float * clt_tile1i = clt_tile1 + threadIdx.x;
float * clt_tile2i = clt_tile2 + threadIdx.x;
#pragma unroll
#define USE_LOG
#ifdef USE_LOG
// Apply high-pass filter to correlation inputs to reduce dynamic range before multiplication
for (int q = 0; q < 4; q++){
float *log = LoG_corr + threadIdx.x;
for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= (*gpu_tile1) * (*log);
*clt_tile2i= (*gpu_tile2) * (*log);
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
log += DTT_SIZE;
}
}
#else
for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= *gpu_tile1;
*clt_tile2i= *gpu_tile2;
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
}
#endif //USE_LOG
__syncthreads();
// each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles(
scales[color], // float scale, // scale correlation
clt_tile1, // float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
clt_tile2, // float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads();
if (color == 1){ // LPF only after B (nothing in mono)
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf_rb = lpf_rb_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf_rb);
clt += DTT_SIZE1;
lpf_rb += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
__syncthreads();// __syncwarp();
float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
__syncthreads();// __syncwarp();
}
/** /**
* Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called * Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called
* from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>. * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
...@@ -1234,9 +1551,9 @@ extern "C" __global__ void correlate2D( ...@@ -1234,9 +1551,9 @@ extern "C" __global__ void correlate2D(
* @param fat_zero2 add this value squared to the sum of squared components before normalization * @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride, stride (in floats) for correlation outputs. * @param corr_stride stride (in floats) for correlation outputs.
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization * @param corr_radius radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
* @param gpu_corrs) allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/ */
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
int num_cams, int num_cams,
...@@ -1504,11 +1821,12 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1504,11 +1821,12 @@ extern "C" __global__ void correlate2D_inner(
} }
/** /**
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile, * Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* and separately the two diagonals into another single one * and separately the two diagonals into another single one
* When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped. * When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
* when combining tho diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated * when combining two diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
* (corresponds to a vertical flip). * (corresponds to a vertical flip).
* Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations). * Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
* This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>> * This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>
...@@ -1695,10 +2013,10 @@ extern "C" __global__ void corr2D_combine_inner( ...@@ -1695,10 +2013,10 @@ extern "C" __global__ void corr2D_combine_inner(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>> * that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
* *
* @param num_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain). * @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain * @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it) * @param corr_weights null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs. * @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared) * @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
...@@ -1708,7 +2026,7 @@ extern "C" __global__ void corr2D_normalize( ...@@ -1708,7 +2026,7 @@ extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain float * gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per correlation tile weight (fat_zero2 will be divided by it) float * corr_weights, // null or per correlation tile weight (fat_zero2 will be divided by it), length = num_corr_tile
const size_t corr_stride, // in floats const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero2, // here - absolute, squared float fat_zero2, // here - absolute, squared
...@@ -1733,14 +2051,14 @@ extern "C" __global__ void corr2D_normalize( ...@@ -1733,14 +2051,14 @@ extern "C" __global__ void corr2D_normalize(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* kernel that is called from corr2D_normalize. * kernel that is called from corr2D_normalize.
* *
* @param num_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain). * @param corr_stride_td stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain * @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it) * @param corr_weights null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs. * @param corr_stride stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization * @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15) * @param corr_radius radius of the output correlation (maximal 7 for 15x15)
*/ */
extern "C" __global__ void corr2D_normalize_inner( extern "C" __global__ void corr2D_normalize_inner(
...@@ -2515,7 +2833,6 @@ extern "C" __global__ void create_nonoverlap_list( ...@@ -2515,7 +2833,6 @@ extern "C" __global__ void create_nonoverlap_list(
* @param num_cams number of cameras <= NUM_CAMS * @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs). * @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 * @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing * @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list * @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length * @param pnum_corr_tiles single-element integer array return generated list length
...@@ -2564,23 +2881,60 @@ __global__ void index_correlate( ...@@ -2564,23 +2881,60 @@ __global__ void index_correlate(
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b; gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
} }
} }
/* }
int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm != 0) { /**
int nb = __popc (cm); // number of non-zero bits * Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
int indx = atomicAdd(pnum_corr_tiles, nb); * For interscene correlation. One correlation output for each selected sensor
int txy = gpu_tasks[num_tile].txy; * plus a sum of them all. So for all 16 sensors selected ooutput will have 17
int tx = txy & 0xffff; * 2D correlations (with some being the l;ast one)
int ty = txy >> 16; * All pairs for the same tile will always be in the same order: increasing sensor numbers
// int nt = ty * TILES-X + tx; * with sum being the last. Sum will be marked by 0xff in the LSB.
* With the quad camera each tile may generate up to 6 pairs (int array elements)
* Tiles are not ordered, but the correlation pairs for each tile are.
*
* @param num_cams number of cameras <= NUM_CAMS <32
* @param sel_sensors array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length
*/
__global__ void index_inter_correlate(
int num_cams,
int sel_sensors,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles) // pointer to the length of correlation tasks array
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
// int task_size = get_task_size(num_cams);
int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?
return;
}
int nb = __popc (sel_sensors); // number of non-zero bits
if (nb > 0){
int indx = atomicAdd(pnum_corr_tiles, nb+1);
int task_txy = get_task_txy(num_tile, gpu_ftasks, num_cams);
int tx = task_txy & 0xffff;
int ty = task_txy >> 16;
int nt = ty * width + tx; int nt = ty * width + tx;
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) { // for (int b = 0; b < pair_list_len; b++) if ((cm & (1 << b)) != 0) {
for (int b = 0; b < num_cams; b++) if ((sel_sensors & (1 << (b & 31))) != 0) {
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b; gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
} }
gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | 0xff; // will be used for sum
} }
*/
} }
/** /**
* Direct MCLT transform and aberration correction with space-variant deconvolution * Direct MCLT transform and aberration correction with space-variant deconvolution
* kernels. Results are used to output aberration-corrected images, textures and * kernels. Results are used to output aberration-corrected images, textures and
...@@ -3917,6 +4271,7 @@ __device__ void correlateAccumulateTiles( ...@@ -3917,6 +4271,7 @@ __device__ void correlateAccumulateTiles(
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0] float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0] float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0] float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
// __syncthreads(); // *** TESTING ***
//#pragma unroll //#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){ for (int i = 0; i < DTT_SIZE; i++){
// k=0 // k=0
...@@ -3960,6 +4315,8 @@ __device__ void correlateAccumulateTiles( ...@@ -3960,6 +4315,8 @@ __device__ void correlateAccumulateTiles(
corr_tile_j1 ++; corr_tile_j1 ++;
corr_tile_j2 ++; corr_tile_j2 ++;
corr_tile_j3 ++; corr_tile_j3 ++;
// __syncthreads(); // *** TESTING ***
} }
} }
......
...@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D( ...@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D(
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats size_t corr_stride, // in floats
// int corr_stride, // in floats // int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
extern "C" __global__ void correlate2D_inter( // only results in TD
int num_cams,
int sel_sensors,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize( extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats const size_t corr_stride_td, // in floats
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment