Fixed interscene correlation bug

f72f2a9c · Andrey Filippov · 13b9ba89 · f72f2a9c · f72f2a9c
Commit f72f2a9c authored May 17, 2022 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 102 additions and 30 deletions

TileProcessor.cuh src/TileProcessor.cuh +26 -24

test_tp.cu src/test_tp.cu +76 -6

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1303,30 +1303,31 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 					pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
 			cudaDeviceSynchronize();
-			int num_corr_tiles = (*pnum_corr_tiles) * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
+			int num_corr_tiles_with_sum = (*pnum_corr_tiles);
+			int num_corr_tiles_wo_sum =   num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
 			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-			dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+			dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 			correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
-					num_cams,           // int               num_cams,
+					num_cams,                // int          num_cams,
-					num_sel_sensors,    // int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
-					gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+					gpu_clt,                 // float     ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-					gpu_clt_ref,        // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+					gpu_clt_ref,             // float     ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-					colors,             // int               colors,             // number of colors (3/1)
+					colors,                  // int          colors,             // number of colors (3/1)
-					scale0,             // float             scale0,             // scale for R
+					scale0,                  // float        scale0,             // scale for R
-					scale1,             // float             scale1,             // scale for B
+					scale1,                  // float        scale1,             // scale for B
-					scale2,             // float             scale2,             // scale for G
+					scale2,                  // float        scale2,             // scale for G
-					num_corr_tiles,     // int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum for compatibility with intra format)
+					num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum for compatibility with intra format)
-					gpu_corr_indices,   // int             * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
+					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
-					corr_stride,        // size_t            corr_stride,        // in floats
+					corr_stride,             // size_t       corr_stride,        // in floats
-					gpu_corrs);         // float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
+					gpu_corrs);              // float      * gpu_corrs)          // correlation output data (either pixel domain or transform domain
 			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 			combine_inter<<<grid_combine,threads_corr>>>(     // combine per-senor interscene correlations
-					num_sel_sensors,    // int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
-					num_corr_tiles,     // int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+					num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
-					gpu_corr_indices,   // int             * gpu_corr_indices,   // packed tile+pair
+					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile+pair NOT USED
-					corr_stride,        // size_t            corr_stride,        // in floats
+					corr_stride,             // size_t       corr_stride,        // in floats
-					gpu_corrs);         // float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
+					gpu_corrs);              // float      * gpu_corrs);          // correlation output data (either pixel domain or transform domain
 		}
 	}
 }
@@ -1519,6 +1520,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
        	__syncthreads();// __syncwarp();
        } // if (color == 1){ // LPF only after B (nothing in mono)
    } // for (int color = 0; color < colors; color++){
+	__syncthreads();// __syncwarp();
    float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
    float *clt = clt_corr + threadIdx.x;
 #pragma unroll
@@ -2901,11 +2903,11 @@ __global__ void index_correlate(
 __global__ void index_inter_correlate(
 		int               num_cams,
 		int               sel_sensors,
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		float           * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-		int                num_tiles,         // number of tiles in task
+		int               num_tiles,         // number of tiles in task
-		int                width,                // number of tiles in a row
+		int               width,                // number of tiles in a row
-		int *              gpu_corr_indices,  // array of correlation tasks
+		int *             gpu_corr_indices,  // array of correlation tasks
-		int *              pnum_corr_tiles)   // pointer to the length of correlation tasks array
+		int *             pnum_corr_tiles)   // pointer to the length of correlation tasks array
 {
 	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
 	if (num_tile >= num_tiles){

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -1116,6 +1116,7 @@ int main(int argc, char **argv)
 	float * corr_img; //  = (float *)malloc(corr_img_size * sizeof(float));
 	float * cpu_corr; //  = (float *)malloc(rslt_corr_size * sizeof(float));
+	float * cpu_corr_td;
 	int * cpu_corr_indices; //  = (int *) malloc(num_corr_indices * sizeof(int));
@@ -1799,8 +1800,8 @@ int main(int argc, char **argv)
 #ifdef CORR_INTER_SELF
    int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
-    int num_sel_senosrs = 16; // 15; // 16;
+    int num_sel_sensors = 16; // 15; // 16;
-    num_pairs = num_sel_senosrs+1;
+    num_pairs = num_sel_sensors+1;
    num_corr_indices = num_pairs * num_tiles;
    StopWatchInterface *timerINTERSELF = 0;
    sdkCreateTimer(&timerINTERSELF);
@@ -1838,7 +1839,6 @@ int main(int argc, char **argv)
    			sizeof(int),
    			cudaMemcpyDeviceToHost));
    	checkCudaErrors(cudaDeviceSynchronize());
    	corr2D_normalize<<<1,1>>>(
    			num_corrs, //tp_task_size,           // int               num_corr_tiles,     // number of correlation tiles to process
 				dstride_corr_td/sizeof(float),       // const size_t      corr_stride_td,     // in floats
@@ -1884,7 +1884,7 @@ int main(int argc, char **argv)
    }
 //  int num_pairs = 120;
 //  int sel_sensors = 0xffff;
-//  int num_sel_senosrs = 16;
+//  int num_sel_sensors = 16;
 //	int corr_size =        2 * CORR_OUT_RAD + 1; // 15
 //	int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
 //	int num_corr_indices = num_pairs * num_tiles;
@@ -1893,7 +1893,7 @@ int main(int argc, char **argv)
    	int ctt = ( cpu_corr_indices[ict] >>  CORR_NTILE_SHIFT);
    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
    	if (cpair == 0xff){
-    		cpair = num_sel_senosrs;
+    		cpair = num_sel_sensors;
    	}
    	int ty = ctt / TILESX;
    	int tx = ctt % TILESX;
@@ -1907,7 +1907,6 @@ int main(int argc, char **argv)
    		}
    	}
    }
 #ifndef NSAVE_CORR
    printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
    		result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
@@ -1916,6 +1915,77 @@ int main(int argc, char **argv)
 			corr_img_size,             // int           size, // length in elements
 			result_inter_td_norm_file); // 			   const char *  path) // file path
 #endif
+#if 1
+	int rslt_corr_size_td =   num_corrs * DTT_SIZE2*DTT_SIZE2;
+	cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
+    checkCudaErrors(cudaMemcpy2D(
+    		cpu_corr_td,
+			(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
+			gpu_corrs_td,
+			dstride_corr_td,
+			(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
+			num_corrs,
+			cudaMemcpyDeviceToHost));
+// Reuse the same corr_img buffer for the TD images: each TD tile also occupies a 16x16 cell (pixel-domain correlation tiles were 15x15 plus a one-pixel gap)
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
+    	if (cpair == 0xff){
+    		cpair = num_sel_sensors;
+    	}
+    	int ty = ctt / TILESX;
+    	int tx = ctt % TILESX;
+    	int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
+    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);
+    	for (int iquad = 0; iquad < 4; iquad ++) {
+    		int iqy = (iquad >> 1) & 1;
+    		int iqx = (iquad >> 0) & 1;
+    		for (int iy = 0; iy < DTT_SIZE; iy++){
+    			int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
+    			int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
+    			for (int ix = 0; ix < DTT_SIZE; ix++){
+    				corr_img[dst_offs++] = cpu_corr_td[src_offs++];
+    			}
+    		}
+    	}
+    }
+#ifndef NSAVE_CORR
+    printf("Writing interscene phase correlation TD data");
+    writeFloatsToFile(
+    		corr_img,                  // float *       data, // allocated array
+			corr_img_size,             // int           size, // length in elements
+			"clt/aux_interscene-TD.raw"); // 			   const char *  path) // file path
+#endif
+    int corr_index_img_length = TILESX*TILESY*(num_sel_sensors+1) ;
+    float *corr_index_img =  (float *)malloc(corr_index_img_length * sizeof(float));
+    for (int i = 0; i < corr_index_img_length; i++){
+    	corr_index_img[i] = NAN;
+    }
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
+    	if (cpair == 0xff){
+    		cpair = num_sel_sensors;
+    	}
+    	int ty = ctt / TILESX;
+    	int tx = ctt % TILESX;
+    	corr_index_img[cpair*TILESX*TILESY + TILESX*ty + tx] =  ict; // cpu_corr_indices[ict];
+    }
+    printf("Writing interscene indices\n");
+    writeFloatsToFile(
+    		corr_index_img,                  // float *       data, // allocated array
+			corr_index_img_length,           // int           size, // length in elements
+			"clt/aux_inter-indices.raw"); // 			   const char *  path) // file path
+    free (corr_index_img);
+    free (cpu_corr_td);
+#endif
    free (cpu_corr);
    free (cpu_corr_indices);
    free (corr_img);