fixed inter-scene correlation

8612a64e · Andrey Filippov · d8e9a454 · 8612a64e · 8612a64e
Commit 8612a64e authored Jul 22, 2025 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 7 additions and 11 deletions

TileProcessor.cu src/TileProcessor.cu +6 -11

test_tp.cu src/test_tp.cu +1 -0

No files found.
--- a/src/TileProcessor.cu
+++ b/src/TileProcessor.cu
@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		float           * gpu_corrs)          // correlation output data
 {
 	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
-	dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);

 	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-//	dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-	dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 	dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);

 	if (threadIdx.x == 0) { // only 1 thread, 1 block
@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		if (num_sel_sensors > 0){
 // try with null tp_tasks to use same sequence from GPU memory
 			*pnum_corr_tiles = 0;
+			dim3 grid_corr((num_tiles * num_sel_sensors + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 			index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 					num_cams,            // int               num_cams,
 					sel_sensors,         // int               sel_sensors,
@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					tilesx,              // int                width,                // number of tiles in a row
 					gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 					pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
-///			cudaDeviceSynchronize();
-///			__device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
-///			int num_corr_tiles_wo_sum =   num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
-///			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-///			dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-			correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
+
+			correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalizations and back conversion
 					num_cams,                // int          num_cams,
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
 					gpu_clt,                 // float     ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
 					corr_stride,             // size_t       corr_stride,        // in floats
 					gpu_corrs);              // float      * gpu_corrs)          // correlation output data (either pixel domain or transform domain
-///			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+
 			combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>(     // combine per-senor interscene correlations
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
 					pnum_corr_tiles, //num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 		 //__device__
 		 *pnum_active_tiles = 0; // already _device_
 		 int task_size = get_task_size(num_cams);
-		 index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
+		 index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				 task_size,          // int                task_size,        // flattened task size in 4-byte floats
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				 num_tiles,           //int                num_tiles,          // number of tiles in task

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -46,6 +46,7 @@
 #include <cstdlib>
 #include <cstdio>
 #include <cuda_runtime.h>     // cudaFree
+//#define __CUDA_RUNTIME_H__  // turn on, then off to fix missing findCudaDevice()
 #include <helper_cuda.h>      // for checkCudaErrors
 #include <helper_functions.h> // timer functions