Commit b9ca98e0 authored by Andrey Filippov's avatar Andrey Filippov

implemented/tested intrascene

parent 47b1fb86
......@@ -919,6 +919,18 @@ extern "C" __global__ void corr2D_normalize_inner(
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine_inner(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_accumulate(
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
......@@ -1268,6 +1280,194 @@ extern "C" __global__ void correlate2D_inner(
}
/**
 * Combine multiple correlation pairs for a quad (square) camera: 2 or 4 ortho pairs into a
 * single CLT tile, and separately the two diagonals into another single one.
 * When adding vertical pairs to the horizontal ones, each quadrant is transposed and Q1/Q2
 * are swapped; when combining the two diagonals (down-right and up-right), quadrants Q2 and
 * Q3 are negated (corresponds to a vertical flip).
 * Data can be accumulated onto existing output (e.g. inter-scene accumulation of compatible
 * correlations).
 * Outer kernel: launches corr2D_combine_inner via CDP. Configure as corr2D_combine<<<1,1>>>.
 *
 * @param num_tiles         number of tiles to process (each with num_pairs)
 * @param num_pairs         number of pairs per tile (should be the same for all tiles)
 * @param init_output       !=0 - reset output tiles to zero before accumulating
 * @param pairs_mask        selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @param gpu_corr_indices  packed tile+pair indices
 * @param gpu_combo_indices output if not null: packed tile+pairs_mask (points to the first used pair)
 * @param corr_stride       (in floats) stride for the input TD correlations
 * @param gpu_corrs         input correlation tiles
 * @param corr_stride_combo (in floats) stride for the output TD correlations (same as input)
 * @param gpu_corrs_combo   combined correlation output (one per tile)
 */
extern "C" __global__ void corr2D_combine(
		int               num_tiles,         // number of tiles to process (each with num_pairs)
		int               num_pairs,         // num pairs per tile (should be the same)
		int               init_output,       // !=0 - reset output tiles to zero before accumulating
		int               pairs_mask,        // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
		int             * gpu_corr_indices,  // packed tile+pair
		int             * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
		const size_t      corr_stride,       // (in floats) stride for the input TD correlations
		float           * gpu_corrs,         // input correlation tiles
		const size_t      corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
		float           * gpu_corrs_combo)   // combined correlation output (one per tile)
{
	if (threadIdx.x != 0) {
		return; // configured as <<<1,1>>>: only one thread performs the CDP launch
	}
	// CORR_THREADS_PER_TILE threads per tile, CORR_TILES_PER_BLOCK_COMBINE tiles per block
	dim3 combine_threads(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_COMBINE, 1);
	// ceil-div so a partial last block still covers all tiles
	int  combine_blocks = (num_tiles + CORR_TILES_PER_BLOCK_COMBINE - 1) / CORR_TILES_PER_BLOCK_COMBINE;
	dim3 combine_grid(combine_blocks, 1, 1);
	corr2D_combine_inner<<<combine_grid, combine_threads>>>(
			num_tiles,
			num_pairs,
			init_output,
			pairs_mask,
			gpu_corr_indices,
			gpu_combo_indices,
			corr_stride,
			gpu_corrs,
			corr_stride_combo,
			gpu_corrs_combo);
}
//#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 8?
#define PAIRS_HOR_DIAG_MAIN 0x13
#define PAIRS_VERT 0x0c
#define PAIRS_DIAG_OTHER 0x20
/**
 * Combine multiple correlation pairs for a quad (square) camera: 2 or 4 ortho pairs into a
 * single CLT tile, and separately the two diagonals into another single one.
 * When adding vertical pairs to the horizontal ones, each quadrant is transposed and Q1/Q2
 * are swapped; when combining the two diagonals (down-right and up-right), quadrants Q2 and
 * Q3 are negated (corresponds to a vertical flip).
 * Data can be accumulated onto existing output (e.g. inter-scene accumulation of compatible
 * correlations).
 * Inner kernel, launched from corr2D_combine with
 * blockDim = (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_COMBINE, 1).
 *
 * @param num_tiles         number of tiles to process (each with num_pairs)
 * @param num_pairs         number of pairs per tile (should be the same for all tiles)
 * @param init_output       !=0 - reset output tiles to zero before accumulating
 * @param pairs_mask        selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @param gpu_corr_indices  packed tile+pair indices
 * @param gpu_combo_indices output if not null: packed tile+pairs_mask (points to the first used pair)
 * @param corr_stride       (in floats) stride for the input TD correlations
 * @param gpu_corrs         input correlation tiles
 * @param corr_stride_combo (in floats) stride for the output TD correlations (same as input)
 * @param gpu_corrs_combo   combined correlation output (one per tile)
 */
extern "C" __global__ void corr2D_combine_inner(
		int               num_tiles,         // number of tiles to process (each with num_pairs)
		int               num_pairs,         // num pairs per tile (should be the same)
		int               init_output,       // !=0 - reset output tiles to zero before accumulating
		int               pairs_mask,        // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
		int             * gpu_corr_indices,  // packed tile+pair
		int             * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
		const size_t      corr_stride,       // (in floats) stride for the input TD correlations
		float           * gpu_corrs,         // input correlation tiles
		const size_t      corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
		float           * gpu_corrs_combo)   // combined correlation output (one per tile)
{
	int tile_in_block = threadIdx.y; // which of CORR_TILES_PER_BLOCK_COMBINE tiles in this block
	int tile_index = blockIdx.x * CORR_TILES_PER_BLOCK_COMBINE + tile_in_block;
	if (tile_index >= num_tiles){
		return; // nothing to do for this tile slot (partial last block)
	}
	int corr_tile_index0 = tile_index * num_pairs; // first pair of this tile in gpu_corr_indices
	if (gpu_combo_indices != 0){
		// Re-pack the index for the combined tile: keep the tile number from the first pair,
		// replace the pair number with the mask of combined pairs.
		int corr_pair = gpu_corr_indices[corr_tile_index0];
		gpu_combo_indices[tile_index] = ((corr_pair >> CORR_NTILE_SHIFT) << CORR_NTILE_SHIFT) | pairs_mask;
	}
	// NOTE(review): removed unused local 'scale' (1.0/__popc(pairs_mask)) - it was never referenced.
	// NOTE(review): threads of out-of-range tile slots exit before this barrier; relies on
	// __syncthreads() not waiting for exited threads - confirm acceptable for target architectures.
	__syncthreads();// __syncwarp();
	// Per-tile 4-quadrant CLT scratch; rows padded to DTT_SIZE1 floats (bank-conflict padding).
	__shared__ float clt_corrs [CORR_TILES_PER_BLOCK_COMBINE][4][DTT_SIZE][DTT_SIZE1];
	// start of the block in shared memory
	float *clt_corr = ((float *) clt_corrs) + tile_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
	float *clt = clt_corr + threadIdx.x;
	float *mem_corr = gpu_corrs_combo + corr_stride_combo * tile_index + threadIdx.x;
	if (init_output != 0){ // reset combo accumulator
#pragma unroll
		for (int i = 0; i < DTT_SIZE4; i++){
			(*clt) = 0.0f;
			clt += DTT_SIZE1;
		}
	} else { // accumulate: pre-load previous result from device memory
#pragma unroll
		for (int i = 0; i < DTT_SIZE4; i++){
			(*clt) = (*mem_corr);
			clt += DTT_SIZE1;
			mem_corr += DTT_SIZE;
		}
	}
	__syncthreads();// __syncwarp();
	for (int ipair = 0; ipair < num_pairs; ipair++){ // only selected
		int corr_tile_index = corr_tile_index0 + ipair;
		// extract pair number from the low bits of the packed tile+pair index
		int corr_pair = gpu_corr_indices[corr_tile_index];
		// simplified from 'corr_pair &= (corr_pair & mask)' - same result, the extra AND was redundant
		corr_pair &= (1 << CORR_NTILE_SHIFT) - 1;
		int pair_bit = 1 << corr_pair;
		if ((pairs_mask & pair_bit) != 0) {
			// All threads of a tile see the same pair_bit, so these if-s do not diverge within a tile slice.
			if (PAIRS_HOR_DIAG_MAIN & pair_bit){ // just accumulate as-is
				clt = clt_corr + threadIdx.x;
				// BUGFIX: input tiles are strided by corr_stride, not corr_stride_combo
				// (previously worked only because the caller passes identical strides).
				mem_corr = gpu_corrs + corr_stride * corr_tile_index + threadIdx.x;
#pragma unroll
				for (int i = 0; i < DTT_SIZE4; i++){
					(*clt) += (*mem_corr);
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
			} else if (PAIRS_VERT & pair_bit) { // transpose each quadrant and swap Q1 and Q2
				for (int q = 0; q < 4; q++){
					int qr = ((q & 1) << 1) | ((q >> 1) & 1); // swap the two quadrant-index bits: 1<->2, 0 and 3 unchanged
					clt = clt_corr + qr * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x;
					// read a column of the source quadrant (transpose): thread x walks row threadIdx.x
					mem_corr = gpu_corrs + corr_stride * corr_tile_index + q * (DTT_SIZE * DTT_SIZE) + DTT_SIZE * threadIdx.x;
#pragma unroll
					for (int i = 0; i < DTT_SIZE; i++){
						(*clt) += (*mem_corr);
						clt += DTT_SIZE1;
						mem_corr += 1;
					}
				}
			} else if (PAIRS_DIAG_OTHER & pair_bit) { // other diagonal: negate SC/SS quadrants (vertical flip)
				clt = clt_corr + threadIdx.x;
				mem_corr = gpu_corrs + corr_stride * corr_tile_index + threadIdx.x;
#pragma unroll
				for (int i = 0; i < DTT_SIZE2; i++){ // CC, CS
					(*clt) += (*mem_corr);
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
#pragma unroll
				for (int i = 0; i < DTT_SIZE2; i++){ // SC, SS
					(*clt) -= (*mem_corr); // negate
					clt += DTT_SIZE1;
					mem_corr += DTT_SIZE;
				}
			} //PAIRS_DIAG_OTHER
		}
	} //for (int ipair = 0; ipair < num_pairs; ipair++){ // only selected
	__syncthreads();// __syncwarp();
	// copy the combined tile back to device memory
	clt = clt_corr + threadIdx.x;
	mem_corr = gpu_corrs_combo + corr_stride_combo * tile_index + threadIdx.x;
#pragma unroll
	for (int i = 0; i < DTT_SIZE4; i++){
		(*mem_corr) = (*clt);
		clt += DTT_SIZE1;
		mem_corr += DTT_SIZE;
	}
	__syncthreads();// __syncwarp();
}
/**
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
......@@ -1290,7 +1490,7 @@ extern "C" __global__ void corr2D_normalize(
int corr_radius) // radius of the output correlation (7 for 15x15)
{
if (threadIdx.x == 0) { // only 1 thread, 1 block
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK_NORMALIZE, 1);
dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK_NORMALIZE-1) / CORR_TILES_PER_BLOCK_NORMALIZE,1,1);
corr2D_normalize_inner<<<grid_corr,threads_corr>>>(
num_corr_tiles, // int num_corr_tiles, // number of correlation tiles to process
......
......@@ -86,6 +86,18 @@ extern "C" __global__ void corr2D_normalize(
float fat_zero, // here - absolute
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if not null: packed tile+pairs_mask (points to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
......
......@@ -287,6 +287,8 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
#endif
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_corr_quad_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-quad.corr";
const char* result_corr_cross_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-cross.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
......@@ -316,7 +318,7 @@ int main(int argc, char **argv)
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
/// int corr_indices [NUM_PAIRS*TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
......@@ -452,7 +454,7 @@ int main(int argc, char **argv)
CORR_SIZE, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu(
gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
TILESX * TILESY); // int height);
......@@ -517,7 +519,8 @@ int main(int argc, char **argv)
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
// build corr_indices - not needed anymore?
/*
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
......@@ -536,6 +539,11 @@ int main(int argc, char **argv)
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
*/
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, NUM_PAIRS * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
//
// build texture_indices
num_textures = 0;
......@@ -973,7 +981,7 @@ int main(int argc, char **argv)
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
&num_corrs,
......@@ -1005,6 +1013,109 @@ int main(int argc, char **argv)
free(cpu_corr);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORRTD = 0;
sdkCreateTimer(&timerCORRTD);
int num_corr_combo;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORRTD);
sdkStartTimer(&timerCORRTD);
}
correlate2D<<<1,1>>>( // output TD tiles, no normalization
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R
color_weights[1], // 0.25, // float scale1, // scale for B
color_weights[2], // 0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
0, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
num_corr_combo = num_corrs/NUM_PAIRS;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
NUM_PAIRS, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations
gpu_corrs_td, // float * gpu_corrs, // input correlation tiles
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile)
getLastCudaError("Kernel failure:corr2D_combine");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_combine pass: %d\n",i);
corr2D_normalize<<<1,1>>>(
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
30.0, // float fat_zero, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
gpu_corrs_combo,
dstride_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
cudaMemcpyDeviceToHost));
// const char* result_corr_quad_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-quad.corr";
// const char* result_corr_cross_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr-cross.corr";
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_quad_file);
writeFloatsToFile(
cpu_corr_combo, // float * data, // allocated array
rslt_corr_size_combo, // int size, // length in elements
result_corr_quad_file); // const char * path) // file path
#endif
free(cpu_corr_combo);
#endif // ifndef NOCORR_TD
// -----------------
......@@ -1223,7 +1334,12 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_clt));
checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corrs_td));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_corrs_combo));
checkCudaErrors(cudaFree(gpu_corrs_combo_td));
checkCudaErrors(cudaFree(gpu_corrs_combo_indices));
checkCudaErrors(cudaFree(gpu_num_corr_tiles));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
......
......@@ -53,7 +53,8 @@
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment