Commit 13b9ba89 authored by Andrey Filippov's avatar Andrey Filippov

minor bug fix, removed some debug code to clean up, still has problem in

interscene
parent 28e55dce
...@@ -1292,7 +1292,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1292,7 +1292,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
if (threadIdx.x == 0) { // only 1 thread, 1 block if (threadIdx.x == 0) { // only 1 thread, 1 block
int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
if (num_sel_sensors > 0){ if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0; *pnum_corr_tiles = 0;
index_inter_correlate<<<blocks0,threads0>>>( index_inter_correlate<<<blocks0,threads0>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
...@@ -1321,7 +1321,6 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1321,7 +1321,6 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
corr_stride, // size_t corr_stride, // in floats corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr>>>( // combine per-senor interscene correlations combine_inter<<<grid_combine,threads_corr>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all) num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum) num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
...@@ -1455,10 +1454,10 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1455,10 +1454,10 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
__shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1]; __shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1]; __shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1]; __shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation // __shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros // set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0 float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1); // float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr); resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed __syncthreads(); /// ***** Was not here: probably not needed
for (int color = 0; color < colors; color++){ for (int color = 0; color < colors; color++){
...@@ -1497,17 +1496,6 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1497,17 +1496,6 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
} }
#endif //USE_LOG #endif //USE_LOG
__syncthreads(); __syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA1\n",tile_num, corr_sensor,color);
debug_print_clt1(clt_tile1, color, 0xf); //
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA2\n",tile_num, corr_sensor,color);
debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// each thread should get the same pointers here, offsets are inside // each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles( correlateAccumulateTiles(
scales[color], // float scale, // scale correlation scales[color], // float scale, // scale correlation
...@@ -1516,27 +1504,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1516,27 +1504,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads(); __syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D, color = %d CORRELATION\n", color);
debug_print_clt1(clt_corr, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
if (color == 1){ // LPF only after B (nothing in mono) if (color == 1){ // LPF only after B (nothing in mono)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF for RB correlation\n");
debug_print_lpf(lpf_rb_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
float *clt = clt_corr + threadIdx.x; float *clt = clt_corr + threadIdx.x;
#pragma unroll #pragma unroll
for (int q = 0; q < 4; q++){ for (int q = 0; q < 4; q++){
...@@ -1549,18 +1517,9 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1549,18 +1517,9 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
} }
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION RB LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
} // if (color == 1){ // LPF only after B (nothing in mono) } // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){ } // for (int color = 0; color < colors; color++){
float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x; float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
float *clt = clt_corr + threadIdx.x; float *clt = clt_corr + threadIdx.x;
#pragma unroll #pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){ for (int i = 0; i < DTT_SIZE4; i++){
...@@ -2051,10 +2010,10 @@ extern "C" __global__ void corr2D_combine_inner( ...@@ -2051,10 +2010,10 @@ extern "C" __global__ void corr2D_combine_inner(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>> * that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
* *
* @param num_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain). * @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain * @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it) * @param corr_weights null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs. * @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared) * @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
...@@ -2064,7 +2023,7 @@ extern "C" __global__ void corr2D_normalize( ...@@ -2064,7 +2023,7 @@ extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain float * gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per correlation tile weight (fat_zero2 will be divided by it) float * corr_weights, // null or per correlation tile weight (fat_zero2 will be divided by it), length = num_corr_tile
const size_t corr_stride, // in floats const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain float * gpu_corrs, // correlation output data (either pixel domain or transform domain
float fat_zero2, // here - absolute, squared float fat_zero2, // here - absolute, squared
...@@ -2089,14 +2048,14 @@ extern "C" __global__ void corr2D_normalize( ...@@ -2089,14 +2048,14 @@ extern "C" __global__ void corr2D_normalize(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* kernel that is called from corr2D_normalize. * kernel that is called from corr2D_normalize.
* *
* @param num_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain). * @param corr_stride_td stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain * @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it) * @param corr_weights null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs. * @param corr_stride stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2 * @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization * @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15) * @param corr_radius radius of the output correlation (maximal 7 for 15x15)
*/ */
extern "C" __global__ void corr2D_normalize_inner( extern "C" __global__ void corr2D_normalize_inner(
...@@ -4309,6 +4268,7 @@ __device__ void correlateAccumulateTiles( ...@@ -4309,6 +4268,7 @@ __device__ void correlateAccumulateTiles(
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0] float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0] float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0] float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
// __syncthreads(); // *** TESTING ***
//#pragma unroll //#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){ for (int i = 0; i < DTT_SIZE; i++){
// k=0 // k=0
...@@ -4352,6 +4312,8 @@ __device__ void correlateAccumulateTiles( ...@@ -4352,6 +4312,8 @@ __device__ void correlateAccumulateTiles(
corr_tile_j1 ++; corr_tile_j1 ++;
corr_tile_j2 ++; corr_tile_j2 ++;
corr_tile_j3 ++; corr_tile_j3 ++;
// __syncthreads(); // *** TESTING ***
} }
} }
......
...@@ -1798,8 +1798,8 @@ int main(int argc, char **argv) ...@@ -1798,8 +1798,8 @@ int main(int argc, char **argv)
#ifdef CORR_INTER_SELF #ifdef CORR_INTER_SELF
int sel_sensors = 0xffff; int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_senosrs = 16; int num_sel_senosrs = 16; // 15; // 16;
num_pairs = num_sel_senosrs+1; num_pairs = num_sel_senosrs+1;
num_corr_indices = num_pairs * num_tiles; num_corr_indices = num_pairs * num_tiles;
StopWatchInterface *timerINTERSELF = 0; StopWatchInterface *timerINTERSELF = 0;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment