Commit 75fa734d authored by Andrey Filippov's avatar Andrey Filippov

adding more to correlation

parent bbcae7a3
...@@ -815,7 +815,7 @@ __device__ void tile_combine_rgba( ...@@ -815,7 +815,7 @@ __device__ void tile_combine_rgba(
// boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing) // boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float * chn_weights, // color channel weights, sum == 1.0 float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differes much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights and rms after A in RGBA (weights are always calculated, not so for the crms) int keep_weights, // return channel weights and rms after A in RGBA (weights are always calculated, not so for the crms)
int debug); int debug);
...@@ -1003,7 +1003,8 @@ extern "C" __global__ void correlate2D( ...@@ -1003,7 +1003,8 @@ extern "C" __global__ void correlate2D(
/** /**
* Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called * Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called
* from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>> * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
* If corr_radius==0, skip normalization and inverse transform, output transform domain tiles
* *
* @param gpu_clt array of NUM_CAMS pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] * @param gpu_clt array of NUM_CAMS pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param colors number of colors used: 3 for RGB or 1 for monochrome * @param colors number of colors used: 3 for RGB or 1 for monochrome
...@@ -1014,7 +1015,7 @@ extern "C" __global__ void correlate2D( ...@@ -1014,7 +1015,7 @@ extern "C" __global__ void correlate2D(
* @param num_corr_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride, stride (in floats) for correlation outputs. * @param corr_stride, stride (in floats) for correlation outputs.
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15) * @param corr_radius, radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2) * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2)
*/ */
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
...@@ -1028,7 +1029,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1028,7 +1029,7 @@ extern "C" __global__ void correlate2D_inner(
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data float * gpu_corrs) // correlation output data (either pixel domain or transform domain)
{ {
float scales[3] = {scale0, scale1, scale2}; float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
...@@ -1136,10 +1137,13 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1136,10 +1137,13 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
} // if (color == 1){ // LPF only after B (nothing in mono) } // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){ } // for (int color = 0; color < colors; color++){
// Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
if (corr_radius > 0) {
normalizeTileAmplitude( normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?) // Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
...@@ -1151,7 +1155,6 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1151,7 +1155,6 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
#endif #endif
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){ if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
...@@ -1176,6 +1179,8 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1176,6 +1179,8 @@ extern "C" __global__ void correlate2D_inner(
} }
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){ if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
...@@ -1224,7 +1229,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1224,7 +1229,7 @@ extern "C" __global__ void correlate2D_inner(
int corr_tile_offset = + corr_stride * corr_num; int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_tile_offset; float *mem_corr = gpu_corrs + corr_tile_offset;
#pragma unroll #pragma unroll
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread // for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs]; mem_corr[offs] = mclt_corr[offs];
} }
...@@ -1234,11 +1239,27 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1234,11 +1239,27 @@ extern "C" __global__ void correlate2D_inner(
#ifdef DEBUG6 #ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){ if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after copy to main memory\n"); printf("\ncorrelate2D after copy to main memory\n");
// debug_print_clt1(clt_corr, -1, 0xf); // debug_print_clt1(clt_corr, -1, 0xf);
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif #endif
#endif #endif
} else { // if (corr_radius > 0) { transform domain output
// int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*mem_corr) = (*clt);
clt += DTT_SIZE1;
mem_corr += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
} // if (corr_radius > 0) ... else
} }
/** /**
...@@ -1791,6 +1812,7 @@ __global__ void create_nonoverlap_list( ...@@ -1791,6 +1812,7 @@ __global__ void create_nonoverlap_list(
/** /**
* Helper kernel for correlate2D() - generates dense list of correlation tasks. * Helper kernel for correlate2D() - generates dense list of correlation tasks.
* With the quad camera each tile may generate up to 6 pairs (int array elements) * With the quad camera each tile may generate up to 6 pairs (int array elements)
* Tiles are not ordered, but the correlation pairs for each tile are
* *
* @param gpu_tasks array of per-tile tasks (struct tp_task) * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing * @param num_tiles number of tiles in gpu_tasks array prepared for processing
...@@ -2401,8 +2423,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) ...@@ -2401,8 +2423,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads(); // _syncthreads();1 __syncthreads(); // _syncthreads();1
// return either only 4 slices (RGBA) or all 12 (with weights and rms) if keep_weights // return either only 4 slices (RGBA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]; // float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride; // size_t texture_tile_offset = + tile_indx * texture_stride;
...@@ -3805,7 +3825,7 @@ __device__ void tile_combine_rgba( ...@@ -3805,7 +3825,7 @@ __device__ void tile_combine_rgba(
float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
// int port_mask, // which port to use, 0xf - all 4 (will modify as local variable) // int port_mask, // which port to use, 0xf - all 4 (will modify as local variable)
float diff_sigma, // pixel value/pixel change float diff_sigma, // pixel value/pixel change
float diff_threshold,// pixel value/pixel change float diff_threshold,// pixel value/pixel change - never used
// next not used // next not used
// boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing) // boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
...@@ -3817,7 +3837,7 @@ __device__ void tile_combine_rgba( ...@@ -3817,7 +3837,7 @@ __device__ void tile_combine_rgba(
float * alpha = rgba + (colors * (DTT_SIZE2*DTT_SIZE21)); float * alpha = rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21); float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
float * crms = port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // calculated only if keep_weights float * crms = port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // calculated only if keep_weights
float threshold2 = diff_sigma * diff_threshold; float threshold2 = diff_sigma * diff_threshold; // never used?
threshold2 *= threshold2; // squared to compare with diff^2 threshold2 *= threshold2; // squared to compare with diff^2
float pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once ! float pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once !
int pair_ports[NUM_CAMS*(NUM_CAMS-1)/2][2]; // int [][] pair_ports = new int [ports*(ports-1)/2][2]; int pair_ports[NUM_CAMS*(NUM_CAMS-1)/2][2]; // int [][] pair_ports = new int [ports*(ports-1)/2][2];
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment