Fixed inter-scene correlation, updated 12.6.0 kernel sources

4e5e1f22 · Andrey Filippov · e08db32b · 4e5e1f22 · 4e5e1f22 · 4e5e1f22
Commit 4e5e1f22 authored Jul 22, 2025 by Andrey Filippov
3 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -78,7 +78,8 @@ import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
 	public static String  CUDA_VERSION = JCudaVersion.get(); 
 	public static boolean USE_CUDA12 = CUDA_VERSION.startsWith("12.");
-	public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
+	public static boolean USE_DS_DP = USE_CUDA12; // false; // Use Dynamic Shared memory with Dynamic Parallelism TODO: Split for DP2/shared memory
+	public static String  SOURCE_FOLDER = ""; // where the sources were loaded from - debug feature
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	// Can be downloaded and twice extracted from
 	// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
@@ -321,6 +322,7 @@ public class GPUTileProcessor {
                		file = new File(src_dir.getPath(), src_file);
                		System.out.println("Loading resource "+file);
                	}
+                	SOURCE_FOLDER = file.getPath();
 //                	System.out.println(file.getAbsolutePath());
                	String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
                	String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
--- a/src/main/resources/kernels/12.6.0/TileProcessor.cu
+++ b/src/main/resources/kernels/12.6.0/TileProcessor.cu
@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		float           * gpu_corrs)          // correlation output data
 {
 	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
-	dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);

 	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-//	dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-	dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 	dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);

 	if (threadIdx.x == 0) { // only 1 thread, 1 block
@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		if (num_sel_sensors > 0){
 // try with null tp_tasks to use same sequence from GPU memory
 			*pnum_corr_tiles = 0;
+			dim3 grid_corr((num_tiles * num_sel_sensors + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 			index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 					num_cams,            // int               num_cams,
 					sel_sensors,         // int               sel_sensors,
@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					tilesx,              // int                width,                // number of tiles in a row
 					gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 					pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
-///			cudaDeviceSynchronize();
-///			__device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
-///			int num_corr_tiles_wo_sum =   num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
-///			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-///			dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-			correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
+
+			correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalizations and back conversion
 					num_cams,                // int          num_cams,
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
 					gpu_clt,                 // float     ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
 					corr_stride,             // size_t       corr_stride,        // in floats
 					gpu_corrs);              // float      * gpu_corrs)          // correlation output data (either pixel domain or transform domain
-///			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+
 			combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>(     // combine per-senor interscene correlations
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
 					pnum_corr_tiles, //num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 		 //__device__
 		 *pnum_active_tiles = 0; // already _device_
 		 int task_size = get_task_size(num_cams);
-		 index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
+		 index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				 task_size,          // int                task_size,        // flattened task size in 4-byte floats
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				 num_tiles,           //int                num_tiles,          // number of tiles in task