Trying interscene correlation

28e55dce · Andrey Filippov · 0dcd2dee · 28e55dce · 28e55dce · 28e55dce
Commit 28e55dce authored May 15, 2022 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 565 additions and 17 deletions

TileProcessor.cuh src/TileProcessor.cuh +407 -15

TileProcessor.h src/TileProcessor.h +19 -1

test_tp.cu src/test_tp.cu +139 -1

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1041,6 +1041,15 @@ __global__ void index_correlate(
 		int *              gpu_corr_indices,  // array of correlation tasks
 		int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array

+__global__ void index_inter_correlate(
+		int               num_cams,
+		int               sel_sensors,
+		float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,         // number of tiles in task
+		int                width,             // number of tiles in a row
+		int *              gpu_corr_indices,  // array of correlation tasks
+		int *              pnum_corr_tiles);  // pointer to the length of correlation tasks array
+
 extern "C" __global__ void create_nonoverlap_list(
 		int                num_cams,
 		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
@@ -1068,6 +1077,27 @@ __global__ void convert_correct_tiles(
 		int                kernels_vert, //);
 		int                tilesx);

+extern "C" __global__ void combine_inter(     // combine per-senor interscene correlations
+		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * gpu_corr_indices,   // packed tile+pair
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
+
+extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
+		int               num_cams,
+		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * gpu_corr_indices,   // packed tile+pair
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
+
 extern "C" __global__ void correlate2D_inner(
 		int               num_cams,
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -1220,6 +1250,331 @@ extern "C" __global__ void correlate2D(
 	 }
 }

+/**
+ * Calculate  2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
+ * ones with CDP, this one should be configured as correlate2D<<<1,1>>>
+ *
+ * @param num_cams         number of cameras <= NUM_CAMS
+ * @param sel_pairs        array of length to accommodate all pairs (4  for 16 cameras, 120 pairs).
+ * @param gpu_clt          array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+ * @param colors           number of colors used:  3 for RGB or 1 for monochrome
+ * @param scale0           scale red (or mono) component before mixing
+ * @param scale1           scale blue (if colors = 3) component before mixing
+ * @param scale2           scale green (if colors = 3) component before mixing
+ * @param fat_zero2        add this value squared to the sum of squared components before normalization (squared)
+ * @param gpu_ftasks           flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+ * @param num_tiles        number of tiles int gpu_tasks array prepared for processing
+ * @param tilesx           number of tile rows
+ * @param gpu_corr_indices allocated array for per-tile correlation tasks (4 bytes per tile)
+ * @param pnum_corr_tiles  allocated space for pointer to a number of number of correlation tiles to process
+ * @param corr_stride,     stride (in floats) for correlation outputs.
+ * @param gpu_corrs)       allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
+ */
+extern "C" __global__ void correlate2D_inter( // only results in TD
+		int               num_cams,
+		int               sel_sensors,
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int               num_tiles,          // number of tiles in task
+		int               tilesx,             // number of tile rows
+		int             * gpu_corr_indices,   // packed tile+pair
+		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs)          // correlation output data
+{
+	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	if (threadIdx.x == 0) { // only 1 thread, 1 block
+		int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
+		if (num_sel_sensors > 0){
+
+			*pnum_corr_tiles = 0;
+			index_inter_correlate<<<blocks0,threads0>>>(
+					num_cams,            // int               num_cams,
+					sel_sensors,         // int               sel_sensors,
+					gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+					num_tiles,           // int                num_tiles,          // number of tiles in task
+					tilesx,              // int                width,                // number of tiles in a row
+					gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
+					pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
+			cudaDeviceSynchronize();
+			int num_corr_tiles = (*pnum_corr_tiles) * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
+			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+			dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+
+			correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
+					num_cams,           // int               num_cams,
+					num_sel_sensors,    // int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+					gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+					gpu_clt_ref,        // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+					colors,             // int               colors,             // number of colors (3/1)
+					scale0,             // float             scale0,             // scale for R
+					scale1,             // float             scale1,             // scale for B
+					scale2,             // float             scale2,             // scale for G
+					num_corr_tiles,     // int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum for compatibility with intra format)
+					gpu_corr_indices,   // int             * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
+					corr_stride,        // size_t            corr_stride,        // in floats
+					gpu_corrs);         // float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
+			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+
+			combine_inter<<<grid_combine,threads_corr>>>(     // combine per-senor interscene correlations
+					num_sel_sensors,    // int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+					num_corr_tiles,     // int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+					gpu_corr_indices,   // int             * gpu_corr_indices,   // packed tile+pair
+					corr_stride,        // size_t            corr_stride,        // in floats
+					gpu_corrs);         // float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
+		}
+	}
+}
+
+/**
+ * Used for interscene correlations (for motion vector calculation).
+ * Calculate sum of selected correlation (in TD) and place it after individual (per-sensor) correlations.
+ * Configuration
+ *    threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
+ *    grids:   dim3 ((number_of_task_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
+ *
+ * @param num_sel_sensors  number of sensors to correlate
+ * @param num_corr_tiles   number of correlation tiles to process (here it includes sum)
+ * @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number  << 8), low byte
+ *                         is now sensor number or 0xff (last one for each tile) for the sum of all individual
+ *                         correlations. Entries for each tile go in the same order (increasing sensor number)
+ *                         followed by the sum of all the selected correlations. Entries for different tiles
+ *                         are not ordered.
+ * @param corr_stride      stride (in floats) for correlation outputs.
+ * @param gpu_corrs        allocated array for the correlation output data, first num_sel_sensors for each tile
+ *                         should be calculated by correlate2D_inter_inner() leaving gaps for sums, calculated here
+ *
+ */
+extern "C" __global__ void combine_inter(     // combine per-senor interscene correlations
+		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * gpu_corr_indices,   // packed tile+pair
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
+{
+	int corr_in_block = threadIdx.y;
+	int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
+	int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
+	if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
+		return;
+	}
+//    __syncthreads();// __syncwarp();
+    __shared__ float clt_corrs   [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    float * clt_corr =  ((float *) clt_corrs) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
+    resetCorrelation(clt_corr);
+    __syncthreads(); /// ***** Was not here: probably not needed
+    for (int isens = 0; isens < num_sel_sensors; isens++){
+        float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
+        float *clt = clt_corr + threadIdx.x;
+    #pragma unroll
+        for (int i = 0; i < DTT_SIZE4; i++){
+        	(*clt) += (*mem_corr);
+        	clt        += DTT_SIZE1;
+        	mem_corr   += DTT_SIZE;
+        }
+        corr_offset++;
+    }
+    // Now corr_offset points to the sum of correlations
+    float *mem_corr = gpu_corrs + corr_stride * corr_offset + threadIdx.x;
+    float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+    for (int i = 0; i < DTT_SIZE4; i++){
+    	(*mem_corr) = (*clt);
+    	clt        += DTT_SIZE1;
+    	mem_corr   += DTT_SIZE;
+    }
+    corr_offset++;
+
+}
+
+/**
+ * Calculate interscene 2D phase correlation pairs from CLT representation.
+ * This is an inner kernel that is called from correlate2D_inter.
+ * Configuration
+ *    threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
+ *    grids:   dim3 ((number_of_corr_tiles_excluding_sums + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
+ *
+ * @param num_cams         number of cameras
+ * @param num_sel_sensors  number of sensors to correlate
+ * @param gpu_clt          array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+ * @param gpu_clt_ref      array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+ *                         gpu_clt_ref and gpu_clt_ref correspond to two scenes, the reference is the first in correlation.
+ * @param colors           number of colors used:  3 for RGB or 1 for monochrome
+ * @param scale0           scale red (or mono) component before mixing
+ * @param scale1           scale blue (if colors = 3) component before mixing
+ * @param scale2           scale green (if colors = 3) component before mixing
+ * @param num_corr_tiles   number of correlation tiles to process (here it includes sum)
+ * @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number  << 8), low byte
+ *                         is now sensor number or 0xff (last one for each tile) for the sum of all individual
+ *                         correlations. Entries for each tile go in the same order (increasing sensor number)
+ *                         followed by the sum of all the selected correlations. Entries for different tiles
+ *                         are not ordered.
+ * @param corr_stride      stride (in floats) for correlation outputs.
+ * @param gpu_corrs        allocated array for the correlation output data, first num_sel_sensors for each tile
+ *                         will be calculated here leaving gaps for sums, calculated by combine_inter()
+ */
+extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, no normalisations and back conversion
+		int               num_cams,
+		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * gpu_corr_indices,   // packed tile+pair
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
+{
+	float scales[3] = {scale0, scale1, scale2};
+	int corr_in_block = threadIdx.y;
+	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
+	int tile_index =  corr_num / num_sel_sensors;
+	int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
+	if (corr_offset >= num_corr_tiles){
+		return; // nothing to do
+	}
+
+	// get number of pair and number of tile
+	int corr_sensor = gpu_corr_indices[corr_offset]; // corr_num];
+
+	int tile_num = corr_sensor >> CORR_NTILE_SHIFT;
+	corr_sensor &= (corr_sensor & ((1 << CORR_NTILE_SHIFT) - 1));
+	if (corr_sensor >= num_cams){
+		return; // BUG - should not happen
+	}
+
+    __syncthreads();// __syncwarp();
+    __shared__ float clt_tiles1  [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float clt_tiles2  [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float clt_corrs   [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float mlt_corrs   [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
+    // set clt_corr to all zeros
+    float * clt_corr =  ((float *) clt_corrs) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
+    float * mclt_corr = ((float *) mlt_corrs) +  corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
+    resetCorrelation(clt_corr);
+    __syncthreads(); /// ***** Was not here: probably not needed
+    for (int color = 0; color < colors; color++){
+        // copy clt (frequency domain data)
+        float * clt_tile1 = ((float *) clt_tiles1) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
+        float * clt_tile2 = ((float *) clt_tiles2) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
+        int offs = (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
+        float * gpu_tile1 = ((float *) gpu_clt_ref[corr_sensor]) + offs;
+        float * gpu_tile2 = ((float *) gpu_clt    [corr_sensor]) + offs;
+		float * clt_tile1i = clt_tile1 + threadIdx.x;
+		float * clt_tile2i = clt_tile2 + threadIdx.x;
+#pragma unroll
+#define USE_LOG
+#ifdef 	USE_LOG
+		// Apply high-pass filter to correlation inputs to reduce dynamic range before multiplication
+		for (int q = 0; q < 4; q++){
+    		float *log = LoG_corr + threadIdx.x;
+			for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
+				*clt_tile1i= (*gpu_tile1) * (*log);
+				*clt_tile2i= (*gpu_tile2) * (*log);
+				clt_tile1i += DTT_SIZE1;
+				clt_tile2i += DTT_SIZE1;
+				gpu_tile1 += DTT_SIZE;
+				gpu_tile2 += DTT_SIZE;
+				log +=       DTT_SIZE;
+			}
+		}
+#else
+		for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
+			*clt_tile1i= *gpu_tile1;
+			*clt_tile2i= *gpu_tile2;
+			clt_tile1i += DTT_SIZE1;
+			clt_tile2i += DTT_SIZE1;
+			gpu_tile1 += DTT_SIZE;
+			gpu_tile2 += DTT_SIZE;
+    	}
+#endif //USE_LOG
+		__syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D tile = %d, pair=%d, color = %d   CAMERA1\n",tile_num, corr_sensor,color);
+    	debug_print_clt1(clt_tile1, color,  0xf); //
+        printf("\ncorrelate2D tile = %d, pair=%d, color = %d   CAMERA2\n",tile_num, corr_sensor,color);
+    	debug_print_clt1(clt_tile2, color,  0xf); //
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+		// each thread should get the same pointers here, offsets are inside
+        correlateAccumulateTiles(
+        		scales[color], // float  scale,     // scale correlation
+				clt_tile1, // float* clt_tile1, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
+				clt_tile2, // float* clt_tile2, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
+				clt_corr); // float* corr_tile) //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+        __syncthreads();
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+        if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
+        	printf("\ncorrelate2D, color = %d CORRELATION\n", color);
+        	debug_print_clt1(clt_corr, color,  0xf);
+        }
+        __syncthreads();// __syncwarp();
+#endif
+#endif
+        if (color == 1){ // LPF only after B (nothing in mono)
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+        	if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
+        		printf("\ncorrelate2D LPF for RB correlation\n");
+        		debug_print_lpf(lpf_rb_corr);
+        	}
+        	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+        	float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+        	for (int q = 0; q < 4; q++){
+        		float *lpf_rb = lpf_rb_corr + threadIdx.x;
+#pragma unroll
+        		for (int i = 0; i < DTT_SIZE; i++){
+        			(*clt) *= (*lpf_rb);
+        			clt    += DTT_SIZE1;
+        			lpf_rb += DTT_SIZE;
+        		}
+        	}
+        	__syncthreads();// __syncwarp();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+        	if ((tile_num == DBG_TILE) && (corr_sensor == 0) && (threadIdx.x == 0)){
+        		printf("\ncorrelate2D CORRELATION RB LPF-ed\n");
+        		debug_print_clt1(clt_corr, -1,  0xf);
+        	}
+        	__syncthreads();// __syncwarp();
+#endif
+#endif
+        } // if (color == 1){ // LPF only after B (nothing in mono)
+    } // for (int color = 0; color < colors; color++){
+    float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
+    float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+    for (int i = 0; i < DTT_SIZE4; i++){
+    	(*mem_corr) = (*clt);
+    	clt        += DTT_SIZE1;
+    	mem_corr   += DTT_SIZE;
+    }
+    __syncthreads();// __syncwarp();
+}
+
+
+
+
+
 /**
 * Calculate  2D phase correlation pairs from CLT representation. This is an inner kernel that is called
 * from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
@@ -1234,9 +1589,9 @@ extern "C" __global__ void correlate2D(
 * @param fat_zero2        add this value squared to the sum of squared components before normalization
 * @param num_corr_tiles   number of correlation tiles to process
 * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
- * @param corr_stride,     stride (in floats) for correlation outputs.
- * @param corr_radius,     radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
- * @param gpu_corrs)       allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
+ * @param corr_stride      stride (in floats) for correlation outputs.
+ * @param corr_radius      radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
+ * @param gpu_corrs        allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
 */
 extern "C" __global__ void correlate2D_inner(
 		int               num_cams,
@@ -1504,6 +1859,7 @@ extern "C" __global__ void correlate2D_inner(

 }

+
 /**
 * Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
 * and separately the two diagonals into another single one
@@ -2515,7 +2871,6 @@ extern "C" __global__ void create_nonoverlap_list(
 * @param num_cams           number of cameras <= NUM_CAMS
 * @param sel_pairs          array of length to accommodate all pairs (4  for 16 cameras, 120 pairs).
 * @param gpu_ftasks         flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-// * @param gpu_tasks        array of per-tile tasks (struct tp_task)
 * @param num_tiles          number of tiles int gpu_tasks array prepared for processing
 * @param gpu_corr_indices   integer array to place the generated list
 * @param pnum_corr_tiles    single-element integer array return generated list length
@@ -2564,23 +2919,60 @@ __global__ void index_correlate(
 			gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
 		}
 	}
-	/*
-	int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
-	if (cm != 0) {
-		int nb = __popc (cm); // number of non-zero bits
-		int indx = atomicAdd(pnum_corr_tiles, nb);
-		int txy = gpu_tasks[num_tile].txy;
-		int tx = txy & 0xffff;
-		int ty = txy >> 16;
-//		int nt = ty * TILES-X + tx;
+}
+
+/**
+ * Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
+ * For interscene correlation. One correlation output for each selected sensor
+ * plus a sum of them all. So for all 16 sensors selected ooutput will have 17
+ * 2D correlations (with some being the l;ast one)
+ * All pairs for the same tile will always be in the same order: increasing sensor numbers
+ * with sum being the last. Sum will be marked by 0xff in the LSB.
+ * With the quad camera each tile may generate up to 6 pairs (int array elements)
+ * Tiles are not ordered, but the correlation pairs for each tile are.
+ *
+ * @param num_cams           number of cameras <= NUM_CAMS <32
+ * @param sel_sensors        array of length to accommodate all pairs (4  for 16 cameras, 120 pairs).
+ * @param gpu_ftasks         flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+ * @param num_tiles          number of tiles int gpu_tasks array prepared for processing
+ * @param gpu_corr_indices   integer array to place the generated list
+ * @param pnum_corr_tiles    single-element integer array return generated list length
+ */
+
+__global__ void index_inter_correlate(
+		int               num_cams,
+		int               sel_sensors,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,         // number of tiles in task
+		int                width,                // number of tiles in a row
+		int *              gpu_corr_indices,  // array of correlation tasks
+		int *              pnum_corr_tiles)   // pointer to the length of correlation tasks array
+{
+	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	if (num_tile >= num_tiles){
+		return;
+	}
+	//	int task_size = get_task_size(num_cams);
+	int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
+	if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?
+		return;
+	}
+	int nb = __popc (sel_sensors); // number of non-zero bits
+	if (nb > 0){
+		int indx = atomicAdd(pnum_corr_tiles, nb+1);
+		int task_txy = get_task_txy(num_tile, gpu_ftasks, num_cams);
+		int tx = task_txy & 0xffff;
+		int ty = task_txy >> 16;
 		int nt = ty * width + tx;
-		for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
+		//		for (int b = 0; b < pair_list_len; b++) if ((cm & (1 << b)) != 0) {
+		for (int b = 0; b < num_cams; b++) if ((sel_sensors & (1 << (b & 31))) != 0) {
 			gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b;
 		}
+		gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | 0xff; // will be used for sum
 	}
-	 */
 }

+
 /**
 * Direct MCLT transform and aberration correction with space-variant deconvolution
 * kernels. Results are used to output aberration-corrected images, textures and

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D(
 		int               tilesx,             // number of tile rows
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
-		size_t      corr_stride,        // in floats
+		size_t            corr_stride,        // in floats
 //		int               corr_stride,        // in floats
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data

+extern "C" __global__ void correlate2D_inter( // only results in TD
+		int               num_cams,
+		int               sel_sensors,
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int               num_tiles,          // number of tiles in task
+		int               tilesx,             // number of tile rows
+		int             * gpu_corr_indices,   // packed tile+pair
+		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+		size_t            corr_stride,        // in floats
+		float           * gpu_corrs);         // correlation output data
+
+
 extern "C" __global__ void corr2D_normalize(
 		int               num_corr_tiles,     // number of correlation tiles to process
 		const size_t      corr_stride_td,     // in floats

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -36,7 +36,7 @@
 // #define NOTEXTURE_RGBA
 #define SAVE_CLT
 //#define NO_DP
-
+#define CORR_INTER_SELF 1


 #include <stdio.h>
@@ -685,6 +685,7 @@ int main(int argc, char **argv)
    const char* result_corr_quad_file =     "clt/aux_corr-quad.corr";
    const char* result_corr_td_norm_file =  "clt/aux_corr-td-norm.corr";
 ///    const char* result_corr_cross_file = "clt/aux_corr-cross.corr";
+    const char* result_inter_td_norm_file = "clt/aux_inter-td-norm.corr";
    const char* result_textures_file =      "clt/aux_texture_nodp.rgba";
    const char* result_diff_rgb_combo_file ="clt/aux_diff_rgb_combo_nodp.drbg";
    const char* result_textures_rgba_file = "clt/aux_texture_rgba_nodp.rgba";
@@ -752,6 +753,7 @@ int main(int argc, char **argv)
    const char* result_corr_quad_file =  "clt/main_corr-quad.corr";
    const char* result_corr_td_norm_file =  "clt/aux_corr-td-norm.corr";
 ///    const char* result_corr_cross_file = "clt/main_corr-cross.corr";
+    const char* result_inter_td_norm_file =  "clt/aux_inter-td-norm.corr";
    const char* result_textures_file =       "clt/main_texture_nodp.rgba";
    const char* result_diff_rgb_combo_file ="clt/main_diff_rgb_combo_nodp.drbg";
    const char* result_textures_rgba_file = "clt/main_texture_rgba_nodp.rgba";
@@ -1790,6 +1792,142 @@ int main(int argc, char **argv)
 #endif // ifndef NOCORR_TD


+// Testing "interframe" correlation with itself, assuming direct convert already ran
+
+
+
+
+#ifdef CORR_INTER_SELF
+    int sel_sensors = 0xffff;
+    int num_sel_senosrs = 16;
+    num_pairs = num_sel_senosrs+1;
+    num_corr_indices = num_pairs * num_tiles;
+    StopWatchInterface *timerINTERSELF = 0;
+    sdkCreateTimer(&timerINTERSELF);
+//    int num_corr_combo_inter;
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerINTERSELF);
+    		sdkStartTimer(&timerINTERSELF);
+    	}
+    	correlate2D_inter<<<1,1>>>( // only results in TD
+    			num_cams,                      // int               num_cams,
+				sel_sensors,                   // int               sel_sensors,
+				gpu_clt,                       // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				gpu_clt,                       // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				num_colors,                    // int               colors,             // number of colors (3/1)
+				color_weights[0], // 0.25,     // float             scale0,             // scale for R
+				color_weights[1], // 0.25,     // float             scale1,             // scale for B
+				color_weights[2], // 0.5,      // float             scale2,             // scale for G
+				gpu_ftasks,                    // float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+				tp_task_size,                  // int               num_tiles) // number of tiles in task
+				TILESX,                        // int               tilesx,             // number of tile rows
+				gpu_corr_indices,              // int             * gpu_corr_indices,   // packed tile+pair
+				gpu_num_corr_tiles,            // int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+				dstride_corr_td/sizeof(float), // const size_t      corr_stride,        // in floats
+				gpu_corrs_td);                 // float           * gpu_corrs);         // correlation output data
+        getLastCudaError("Kernel failure:correlate2D_inter");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("correlate2D_inter-TD pass: %d\n",i);
+    	checkCudaErrors(cudaMemcpy(
+    			&num_corrs,
+    			gpu_num_corr_tiles,
+    			sizeof(int),
+    			cudaMemcpyDeviceToHost));
+    	checkCudaErrors(cudaDeviceSynchronize());
+
+    	corr2D_normalize<<<1,1>>>(
+    			num_corrs, //tp_task_size,           // int               num_corr_tiles,     // number of correlation tiles to process
+				dstride_corr_td/sizeof(float),       // const size_t      corr_stride_td,     // in floats
+				gpu_corrs_td,                        // float           * gpu_corrs_td,       // correlation tiles in transform domain
+				(float *) 0, // corr_weights,        // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
+				dstride_corr/sizeof(float),          // const size_t      corr_stride,        // in floats
+				gpu_corrs,                           // float           * gpu_corrs,          // correlation output data (pixel domain)
+				30.0 * 30.0,                         // float             fat_zero2,           // here - absolute
+				CORR_OUT_RAD);                       // int               corr_radius);        // radius of the output correlation (7 for 15x15)
+    	getLastCudaError("Kernel failure:corr2D_normalize");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("corr2D_normalize pass: %d\n",i);
+    }
+    sdkStopTimer(&timerINTERSELF);
+    float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
+    sdkDeleteTimer(&timerINTERSELF);
+    printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n",  avgTimeINTERSELF, num_corrs);
+
+
+	rslt_corr_size =   num_corrs * corr_size * corr_size;
+	corr_img_size = num_corr_indices * 16*16; // NAN
+	corr_img = (float *)malloc(corr_img_size * sizeof(float));
+	cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
+	cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
+
+    checkCudaErrors(cudaMemcpy2D(
+    		cpu_corr,
+			(corr_size * corr_size) * sizeof(float),
+			gpu_corrs,
+			dstride_corr,
+			(corr_size * corr_size) * sizeof(float),
+			num_corrs,
+			cudaMemcpyDeviceToHost));
+
+    checkCudaErrors(cudaMemcpy(
+    		cpu_corr_indices,
+			gpu_corr_indices,
+			num_corr_indices * sizeof(int),
+			cudaMemcpyDeviceToHost));
+
+    for (int i = 0; i < corr_img_size; i++){
+    	corr_img[i] = NAN;
+    }
+//  int num_pairs = 120;
+//  int sel_sensors = 0xffff;
+//  int num_sel_senosrs = 16;
+//	int corr_size =        2 * CORR_OUT_RAD + 1; // 15
+//	int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
+//	int num_corr_indices = num_pairs * num_tiles;
+
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
+    	if (cpair == 0xff){
+    		cpair = num_sel_senosrs;
+    	}
+    	int ty = ctt / TILESX;
+    	int tx = ctt % TILESX;
+    	int src_offs0 = ict * corr_size * corr_size;
+    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);
+    	for (int iy = 0; iy < corr_size; iy++){
+    		int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
+    		int dst_offs = dst_offs0 + iy * (TILESX * 16);
+    		for (int ix = 0; ix < corr_size; ix++){
+    			corr_img[dst_offs++] = cpu_corr[src_offs++];
+    		}
+    	}
+    }
+
+#ifndef NSAVE_CORR
+    printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
+    		result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
+    writeFloatsToFile(
+    		corr_img,                  // float *       data, // allocated array
+			corr_img_size,             // int           size, // length in elements
+			result_inter_td_norm_file); // 			   const char *  path) // file path
+#endif
+    free (cpu_corr);
+    free (cpu_corr_indices);
+    free (corr_img);
+#endif    // #ifdef CORR_INTER_SELF
+
+
+
+
+
+
+
+

 // -----------------
 #ifndef NOTEXTURES