testCorrelate2DIntraTD

3610b7a6 · Andrey Filippov · 6f9c7399 · 3610b7a6 · 3610b7a6 · 3610b7a6
Commit 3610b7a6 authored Apr 10, 2025 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 352 additions and 142 deletions

TpHostGpu.cu src/TpHostGpu.cu +331 -117

TpHostGpu.h src/TpHostGpu.h +5 -2

test_tp.cu src/test_tp.cu +16 -23

No files found.
--- a/src/TpHostGpu.cu
+++ b/src/TpHostGpu.cu
@@ -549,51 +549,6 @@ void TpHostGpu::testConvertDirect       (int num_runs){ // 608
 			m_gpu_clt_h);             // float **     gpu_clt_h);
 }

-void TpHostGpu::saveClt(
-		const char ** paths,    // tpPaths.ports_clt_file
-		const char * prompt,    // "CLT data"
-		float **     gpu_clt_h){ // m_gpu_clt_h
-	if (!paths)	return;
-    int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
-    float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
-    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && m_gpu_clt_h[ncam]){
-    	checkCudaErrors(cudaMemcpy( // segfault
-    			cpu_clt,
-				m_gpu_clt_h[ncam],
-				rslt_size * sizeof(float),
-    			cudaMemcpyDeviceToHost));
-        printf("Writing %s to %s\n",  prompt, paths[ncam]);
-    	writeFloatsToFile(cpu_clt, // float *       data, // allocated array
-    			rslt_size, // int           size, // length in elements
-				paths[ncam]); // 			   const char *  path) // file path
-    }
-    hfree(cpu_clt);
-}
-
-void TpHostGpu::saveRgb(
-		const char ** paths,  // m_tpPaths.result_rbg_file
-		const char * prompt,  // "RBG data"
-		float **     gpu_corr_images_h){
-	if (!paths)	return;
-	int rslt_img_size =       m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size) * (m_tpParams.img_width + m_tpParams.dtt_size);
-	float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
-    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_corr_images_h[ncam]){
-    	checkCudaErrors(cudaMemcpy2D( // segfault
-    			cpu_corr_image,
-				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
-				gpu_corr_images_h[ncam],
-				dstride_rslt,
-				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
-				m_tpParams.num_colors* (m_tpParams.img_height + m_tpParams.dtt_size),
-    			cudaMemcpyDeviceToHost));
-        printf("Writing %s to %s\n",  prompt, paths[ncam]);
-    	writeFloatsToFile( // will have margins
-    			cpu_corr_image, // float *       data, // allocated array
-				rslt_img_size,  // int           size, // length in elements
-				paths[ncam]);   // 			   const char *  path) // file path
-    }
-    free(cpu_corr_image);
-}

 /*
 // not implemented
@@ -716,10 +671,9 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			m_gpu_corrs,                // float *      gpu_corrs,
 			m_gpu_corr_indices,         // int *        gpu_corr_indices)
 			16);                        //int          num_sel_sensors) { // only for interscene
-
 }

-void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
+void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 =            m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -748,7 +702,7 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
    dim3 grid_tp(1, 1, 1);
    float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; //  [tpParams.num_cams]  [NUM_CAMS];

-    // use gpu_images and convert to gpu_clt_ref
+    // use gpu_images and convert to gpu_clt_ref was 1152:
 	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
 			m_tpParams.num_cams,              // int                num_cams,           // actual number of cameras
 			m_tpParams.num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
@@ -865,6 +819,267 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
    gfree(gpu_clt_ref);
 }

+void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886 - 1123
+	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
+	int i0 =            m_tpParams.debug_tile ? 0 : -1;
+	// check/replace names
+	if (!m_gpu_clt){
+		throw std::runtime_error("Error: m_gpu_clt is not initialized ");
+	}
+	if (!m_gpu_corrs_td){
+		throw std::runtime_error("Error: m_gpu_corrs_td is not initialized ");
+	}
+	if (!m_gpu_ftasks){
+		throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
+	}
+	if (!m_gpu_corr_indices){
+		throw std::runtime_error("Error: m_gpu_corr_indices is not initialized ");
+	}
+	if (!m_gpu_num_corr_tiles){
+		throw std::runtime_error("Error: m_gpu_num_corr_tiles is not initialized ");
+	}
+	if (!m_gpu_corrs_combo_td){
+		throw std::runtime_error("Error: m_gpu_corrs_combo_td is not initialized ");
+	}
+	if (!m_gpu_corrs_combo_indices){
+		throw std::runtime_error("Error: m_gpu_corrs_combo_indices is not initialized ");
+	}
+	if (!m_gpu_corrs_combo){
+		throw std::runtime_error("Error: m_gpu_corrs_combo is not initialized ");
+	}
+	//m_gpu_corrs_combo
+	// testing corr
+	StopWatchInterface *timerCORRTD = 0;
+	sdkCreateTimer(&timerCORRTD);
+	int num_corr_combo{};
+    int num_corrs{}; // will get data from the gpu memory
+
+	for (int i = i0; i < numIterations; i++) {
+		if (i == 0) {
+			checkCudaErrors(cudaDeviceSynchronize());
+			sdkResetTimer(&timerCORRTD);
+			sdkStartTimer(&timerCORRTD);
+		}
+		// FIXME: provide sel_pairs
+		correlate2D<<<1,1>>>( // output TD tiles, no normalization
+				m_tpParams.num_cams,           // int               num_cams,
+				m_tpParams.sel_pairs[0],       // int               sel_pairs0           // unused bits should be 0
+				m_tpParams.sel_pairs[1],       // int               sel_pairs1,           // unused bits should be 0
+				m_tpParams.sel_pairs[2],       // int               sel_pairs2,           // unused bits should be 0
+				m_tpParams.sel_pairs[3],       // int               sel_pairs3,           // unused bits should be 0
+				m_gpu_clt,                     // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				m_tpParams.num_colors,         // int               colors,             // number of colors (3/1)
+				m_tpParams.color_weights[0],   // 0.25,  // float             scale0,             // scale for R
+				m_tpParams.color_weights[1],   // 0.25,  // float             scale1,             // scale for B
+				m_tpParams.color_weights[2],   // 0.5,   // float             scale2,             // scale for G
+				m_tpParams.fat_zero * m_tpParams.fat_zero,                // float             fat_zero2,           // here - absolute
+				m_gpu_ftasks,                  // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+				m_tpParams.tp_tasks_size,      // int               num_tiles) // number of tiles in task
+				m_tpParams.tilesx,             // int               tilesx,             // number of tile rows
+				m_gpu_corr_indices,            // int             * gpu_corr_indices,   // packed tile+pair
+				m_gpu_num_corr_tiles,          // int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+				dstride_corr_td/sizeof(float), // const size_t      corr_stride,        // in floats
+				0,                             // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+				m_gpu_corrs_td);               // float           * gpu_corrs);         // correlation output data
+		getLastCudaError("Kernel failure:correlate2D");
+		checkCudaErrors(cudaDeviceSynchronize());
+		printf("correlate2D-TD pass: %d\n",i);
+		checkCudaErrors(cudaMemcpy(
+				&num_corrs,
+				m_gpu_num_corr_tiles,
+				sizeof(int),
+				cudaMemcpyDeviceToHost));
+		if (quad_combine) {
+			num_corr_combo = num_corrs/m_tpParams.num_pairs;
+			corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
+					num_corr_combo, // tp_task_size,      // int               num_tiles,          // number of tiles to process (each with num_pairs)
+					m_tpParams.num_pairs,                 // int               num_pairs,          // num pairs per tile (should be the same)
+					1,                                    // int               init_output,        // !=0 - reset output tiles to zero before accumulating
+					0x0f,                                 // int               pairs_mask,         // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+					m_gpu_corr_indices,                   // int             * gpu_corr_indices,   // packed tile+pair
+					m_gpu_corrs_combo_indices,            // int             * gpu_combo_indices,  // output if noty null: packed tile+pairs_mask (will point to the first used pair
+					dstride_corr_td/sizeof(float),        // const size_t      corr_stride,        // (in floats) stride for the input TD correlations
+					m_gpu_corrs_td,                       // float           * gpu_corrs,          // input correlation tiles
+					dstride_corr_combo_td/sizeof(float),  // const size_t      corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
+					m_gpu_corrs_combo_td);                  // float           * gpu_corrs_combo);   // combined correlation output (one per tile)
+			getLastCudaError("Kernel failure:corr2D_combine");
+			checkCudaErrors(cudaDeviceSynchronize());
+			corr2D_normalize<<<1,1>>>(
+					num_corr_combo, //tp_task_size,            // int               num_corr_tiles,     // number of correlation tiles to process
+					dstride_corr_combo_td/sizeof(float),       // const size_t      corr_stride_td,     // in floats
+					m_gpu_corrs_combo_td,                      // float           * gpu_corrs_td,       // correlation tiles in transform domain
+					(float *) 0,                               // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
+					dstride_corr_combo/sizeof(float),          // const size_t      corr_stride,        // in floats
+					m_gpu_corrs_combo,                         // float           * gpu_corrs,          // correlation output data (pixel domain)
+					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
+			printf("corr2D_combine pass: %d\n",i);
+		}else { // if (quad_combine) {
+			checkCudaErrors(cudaDeviceSynchronize());
+			corr2D_normalize<<<1,1>>>(
+					num_corrs, //tp_task_size,                 // int               num_corr_tiles,     // number of correlation tiles to process
+					dstride_corr_td/sizeof(float),             // const size_t      corr_stride_td,     // in floats
+					m_gpu_corrs_td,                            // float           * gpu_corrs_td,       // correlation tiles in transform domain
+					(float *) 0, // corr_weights,              // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
+					dstride_corr/sizeof(float),                // const size_t      corr_stride,        // in floats
+					m_gpu_corrs,                               // float           * gpu_corrs,          // correlation output data (pixel domain)
+					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
+		} // if (quad_combine) {
+
+		getLastCudaError("Kernel failure:corr2D_normalize");
+		checkCudaErrors(cudaDeviceSynchronize());
+		printf("corr2D_normalize pass: %d\n",i);
+	}
+	sdkStopTimer(&timerCORRTD);
+	float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
+	sdkDeleteTimer(&timerCORRTD);
+	printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n",  avgTimeCORRTD, num_corrs); // 981
+	if (quad_combine) {
+	    int corr_size_combo =        2 * CORR_OUT_RAD + 1;
+	    int rslt_corr_size_combo =   num_corr_combo * corr_size_combo * corr_size_combo;
+	    float * cpu_corr_combo =    (float *)malloc(rslt_corr_size_combo * sizeof(float));
+
+	    checkCudaErrors(cudaMemcpy2D(
+	    		cpu_corr_combo,
+				(corr_size_combo * corr_size_combo) * sizeof(float),
+				m_gpu_corrs_combo,
+				dstride_corr_combo,
+				(corr_size_combo * corr_size_combo) * sizeof(float),
+				num_corr_combo,
+				cudaMemcpyDeviceToHost));
+	    printf("Writing phase correlation data to %s\n",  m_tpPaths.result_corr_quad_file);
+	    writeFloatsToFile(
+	    		cpu_corr_combo,                 // float *       data, // allocated array
+				rslt_corr_size_combo,              // int           size, // length in elements
+				m_tpPaths.result_corr_quad_file); // 			     const char *  path) // file path
+	    free(cpu_corr_combo);
+	} else { // if (quad_combine) { // 1006
+	    // Reading / formatting / saving 	correlate2D(TD) + corr2D_normalize /1007
+	    checkCudaErrors(cudaMemcpy(
+	    		&num_corrs,
+				m_gpu_num_corr_tiles,
+				sizeof(int),
+				cudaMemcpyDeviceToHost));
+	  //  printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n",  avgTimeCORR, num_corrs);
+
+	//    int corr_size =        2 * CORR_OUT_RAD + 1;
+	//    int rslt_corr_size =   num_corrs * corr_size * corr_size;
+	//    float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
+
+//	    int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles;
+		int  num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
+		int rslt_corr_size =   num_corrs * m_tpParams.corr_length; // corr_size * corr_size;
+		int corr_img_size = num_corr_indices * 16*16; // NAN
+//		float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
+		float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
+		int *   cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
+	    checkCudaErrors(cudaMemcpy2D(
+	    		cpu_corr,
+				m_tpParams.corr_length * sizeof(float),
+				m_gpu_corrs,
+				dstride_corr,
+				m_tpParams.corr_length * sizeof(float),
+				num_corrs,
+				cudaMemcpyDeviceToHost));
+	    checkCudaErrors(cudaMemcpy(
+	    		cpu_corr_indices,
+				m_gpu_corr_indices,
+				num_corr_indices * sizeof(int),
+				cudaMemcpyDeviceToHost));
+	    // Reading / formatting / saving 	correlate2D(TD) + corr2D_normalize
+		float * corr_img = 	getCorrImg(
+				corr_img_size,    // int     corr_img_size,
+				num_corr_indices, //int     num_corr_indices,
+				cpu_corr_indices, // int *   cpu_corr_indices,
+				cpu_corr,         // float * cpu_corr,
+				16); // num_sel_sensors); // int     num_sel_sensors) // Will not be used
+	    printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
+	    		m_tpPaths.result_corr_td_norm_file, (m_tpParams.tilesx * 16),(m_tpParams.tilesya*16),
+				m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
+	    writeFloatsToFile(
+	    		corr_img,                  // float *       data, // allocated array
+				corr_img_size,             // int           size, // length in elements
+				m_tpPaths.result_corr_td_norm_file); // 			   const char *  path) // file path
+	    // export TD intra // 1076
+	    int intra_corr_size_td =   num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2; //  DTT_SIZE2*DTT_SIZE2;
+	    float * cpu_corr_td = (float *) malloc(intra_corr_size_td * sizeof(float));
+	    checkCudaErrors(cudaMemcpy2D(
+	    		cpu_corr_td,
+				(m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float),
+				m_gpu_corrs_td,
+				dstride_corr_td,
+				(m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float),
+				num_corrs,
+				cudaMemcpyDeviceToHost));
+	    float * corr_img_td = getCorrTdImg(
+	    		corr_img_size,    // int     corr_img_size,
+				num_corr_indices, //int     num_corr_indices,
+				cpu_corr_indices, // int *   cpu_corr_indices,
+				m_gpu_corrs_td,   // float * cpu_corr,
+				16);              // num_sel_sensors); // int     num_sel_sensors) // // Will not be used
+
+	    printf("Writing intrascene phase correlation TD data tp %s\n", m_tpPaths.result_intrascene_td);
+	    writeFloatsToFile(
+	    		corr_img_td,                  // float *       data, // allocated array
+				corr_img_size,             // int           size, // length in elements
+				m_tpPaths.result_intrascene_td); // "clt/aux_intrascene-TD.raw"); // 			   const char *  path) // file path
+	    free (cpu_corr_td);
+	    free (cpu_corr);
+	    free (cpu_corr_indices);
+	    free (corr_img);
+	    free (corr_img_td);
+	} // if (quad_combine) {
+}
+
+
+
+void TpHostGpu::saveClt(
+		const char ** paths,    // tpPaths.ports_clt_file
+		const char * prompt,    // "CLT data"
+		float **     gpu_clt_h){ // m_gpu_clt_h
+	if (!paths)	return;
+    int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
+    float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
+    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && m_gpu_clt_h[ncam]){
+    	checkCudaErrors(cudaMemcpy( // segfault
+    			cpu_clt,
+				m_gpu_clt_h[ncam],
+				rslt_size * sizeof(float),
+    			cudaMemcpyDeviceToHost));
+        printf("Writing %s to %s\n",  prompt, paths[ncam]);
+    	writeFloatsToFile(cpu_clt, // float *       data, // allocated array
+    			rslt_size, // int           size, // length in elements
+				paths[ncam]); // 			   const char *  path) // file path
+    }
+    hfree(cpu_clt);
+}
+
+void TpHostGpu::saveRgb(
+		const char ** paths,  // m_tpPaths.result_rbg_file
+		const char * prompt,  // "RBG data"
+		float **     gpu_corr_images_h){
+	if (!paths)	return;
+	int rslt_img_size =       m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size) * (m_tpParams.img_width + m_tpParams.dtt_size);
+	float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
+    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_corr_images_h[ncam]){
+    	checkCudaErrors(cudaMemcpy2D( // segfault
+    			cpu_corr_image,
+				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
+				gpu_corr_images_h[ncam],
+				dstride_rslt,
+				(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
+				m_tpParams.num_colors* (m_tpParams.img_height + m_tpParams.dtt_size),
+    			cudaMemcpyDeviceToHost));
+        printf("Writing %s to %s\n",  prompt, paths[ncam]);
+    	writeFloatsToFile( // will have margins
+    			cpu_corr_image, // float *       data, // allocated array
+				rslt_img_size,  // int           size, // length in elements
+				paths[ncam]);   // 			   const char *  path) // file path
+    }
+    free(cpu_corr_image);
+}
+
 void TpHostGpu::saveIntraCorrFile(
 		const char * path,
 		const char * prompt,
@@ -919,72 +1134,6 @@ void TpHostGpu::saveIntraCorrFile(
 			tpPaths.result_inter_td_norm_file); // 			   const char *  path) // file path
 */

-float * TpHostGpu::getCorrImg(
-		int     corr_img_size,
-		int     num_corr_indices,
-		int *   cpu_corr_indices,
-		float * cpu_corr,
-		int     num_sel_sensors){
-	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
-	for (int i = 0; i < corr_img_size; i++){
-		corr_img[i] = NAN;
-	}
-    for (int ict = 0; ict < num_corr_indices; ict++){
-    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
-    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
-    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
-    		cpair = num_sel_sensors;
-    	}
-    	int ty = ctt / m_tpParams.tilesx;
-    	int tx = ctt % m_tpParams.tilesx;
-    	int src_offs0 = ict * m_tpParams.corr_length;
-    	int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) +  (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
-    	for (int iy = 0; iy < m_tpParams.corr_size; iy++){
-    		int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
-    		int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
-    		for (int ix = 0; ix < m_tpParams.corr_size; ix++){
-    			corr_img[dst_offs++] = cpu_corr[src_offs++];
-    		}
-    	}
-    }
-	return corr_img;
-}
-
-float * TpHostGpu::getCorrTdImg(
-		int     corr_img_size,
-		int     num_corr_indices,
-		int *   cpu_corr_indices,
-		float * cpu_corr_td,
-		int     num_sel_sensors){
-	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
-	for (int i = 0; i < corr_img_size; i++){
-		corr_img[i] = NAN;
-	}
-    for (int ict = 0; ict < num_corr_indices; ict++){
-    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
-    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
-    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
-    		cpair = num_sel_sensors;
-    	}
-    	int ty = ctt / m_tpParams.tilesx;
-    	int tx = ctt % m_tpParams.tilesx;
-    	int src_offs0 = ict * m_tpParams.corr_length;
-    	int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) +  (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
-
-    	for (int iquad = 0; iquad < 4; iquad ++) {
-    		int iqy = (iquad >> 1) & 1;
-    		int iqx = (iquad >> 0) & 1;
-    		for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
-    			int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
-    			int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
-    			for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
-    				corr_img[dst_offs++] = cpu_corr_td[src_offs++];
-    			}
-    		}
-    	}
-    }
-	return corr_img;
-}



@@ -1016,7 +1165,7 @@ void TpHostGpu::saveInterCorrFile(
 			gpu_corr_indices,
 			num_corr_indices * sizeof(int),
 			cudaMemcpyDeviceToHost));
-	float * corr_img = 	getCorrTdImg(
+	float * corr_img_td = 	getCorrTdImg(
 			corr_img_size,    // int     corr_img_size,
 			num_corr_indices, //int     num_corr_indices,
 			cpu_corr_indices, // int *   cpu_corr_indices,
@@ -1024,11 +1173,11 @@ void TpHostGpu::saveInterCorrFile(
 			num_sel_sensors); // int     num_sel_sensors)
    printf("Writing %s TD data to %s\n", prompt, path);
    writeFloatsToFile(
-    		corr_img,        // float *       data, // allocated array
+    		corr_img_td,        // float *       data, // allocated array
 			corr_img_size,   // int           size, // length in elements
 			path);           // 			   const char *  path) // file path
    free (cpu_corr_indices);
-	free (corr_img);
+	free (corr_img_td);
    free (cpu_corr_td);
 }

@@ -1069,7 +1218,72 @@ void TpHostGpu::saveInterCorrIndicesFile(
    free (cpu_corr_indices);
 }

+float * TpHostGpu::getCorrImg(
+		int     corr_img_size,
+		int     num_corr_indices,
+		int *   cpu_corr_indices,
+		float * cpu_corr,
+		int     num_sel_sensors){
+	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
+	for (int i = 0; i < corr_img_size; i++){
+		corr_img[i] = NAN;
+	}
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
+    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
+    		cpair = num_sel_sensors;
+    	}
+    	int ty = ctt / m_tpParams.tilesx;
+    	int tx = ctt % m_tpParams.tilesx;
+    	int src_offs0 = ict * m_tpParams.corr_length;
+    	int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) +  (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
+    	for (int iy = 0; iy < m_tpParams.corr_size; iy++){
+    		int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
+    		int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
+    		for (int ix = 0; ix < m_tpParams.corr_size; ix++){
+    			corr_img[dst_offs++] = cpu_corr[src_offs++];
+    		}
+    	}
+    }
+	return corr_img;
+}

+float * TpHostGpu::getCorrTdImg(
+		int     corr_img_size,
+		int     num_corr_indices,
+		int *   cpu_corr_indices,
+		float * cpu_corr_td,
+		int     num_sel_sensors){
+	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
+	for (int i = 0; i < corr_img_size; i++){
+		corr_img[i] = NAN;
+	}
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
+    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
+    		cpair = num_sel_sensors;
+    	}
+    	int ty = ctt / m_tpParams.tilesx;
+    	int tx = ctt % m_tpParams.tilesx;
+    	int src_offs0 = ict * m_tpParams.corr_length;
+    	int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) +  (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
+
+    	for (int iquad = 0; iquad < 4; iquad ++) {
+    		int iqy = (iquad >> 1) & 1;
+    		int iqx = (iquad >> 0) & 1;
+    		for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
+    			int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
+    			int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
+    			for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
+    				corr_img[dst_offs++] = cpu_corr_td[src_offs++];
+    			}
+    		}
+    	}
+    }
+	return corr_img;
+}

 void TpHostGpu::hfree(float *              p) {if (p) free(p); p = {};}
 void TpHostGpu::hfree(struct CltExtra *    p) {if (p) free(p); p = {};}

--- a/src/TpHostGpu.h
+++ b/src/TpHostGpu.h
@@ -129,8 +129,11 @@ public:
 //	void testImclt               (int num_runs); // 682 // not implemented
 	void testImcltRbgAll         (int num_runs); // 701

-	void testCorrelate2DIntra    (int num_runs);
-	void testCorrelate2DInterSelf(int num_runs);
+	void testCorrelate2DIntra    (int num_runs); // 762 - 885
+	void testCorrelate2DInterSelf(int num_runs); // 1136 - 1411
+	void testCorrelate2DIntraTD  (int num_runs, int quad_combine); // 886 - 1123
+
+

 	void saveClt(const char ** paths,  const char * prompt, float **     gpu_clt_h);
 	void saveRgb(const char ** paths,  const char * prompt, float **     gpu_corr_images_h);

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -32,7 +32,7 @@

 // all of the next 5 were disabled
 //#define NOCORR
-#define NOCORR_TD
+//#define NOCORR_TD
 #define NOTEXTURES //
 #define NOTEXTURE_RGBA //
 //#define NOTEXTURE_RGBAXXX //
@@ -883,10 +883,8 @@ int main(int argc, char **argv)
    free (corr_img);
 #endif // ifndef NOCORR

-
-
-
 #ifndef NOCORR_TD
+//#define QUAD_COMBINE
 //    cudaProfilerStart();
 // testing corr
    StopWatchInterface *timerCORRTD = 0;
@@ -903,10 +901,10 @@ int main(int argc, char **argv)
    	// FIXME: provide sel_pairs
        correlate2D<<<1,1>>>( // output TD tiles, no normalization
        		tpParams.num_cams,                      // int               num_cams,
-				TpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
-				TpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
-				TpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
-				TpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
+				tpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				tpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				tpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				tpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
        		gpu_clt,                       // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				tpParams.num_colors,                    // int               colors,             // number of colors (3/1)
 				tpParams.color_weights[0], // 0.25,     // float             scale0,             // scale for R
@@ -1019,20 +1017,18 @@ int main(int argc, char **argv)
 //    int rslt_corr_size =   num_corrs * corr_size * corr_size;
 //    float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));

-	rslt_corr_size =   num_corrs * corr_length * corr_length;
+	rslt_corr_size =   num_corrs * tpParams.corr_length; // corr_size * corr_size;
 	corr_img_size = num_corr_indices * 16*16; // NAN
 	corr_img = (float *)malloc(corr_img_size * sizeof(float));
 	cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
 	cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));

-
-
    checkCudaErrors(cudaMemcpy2D(
    		cpu_corr,
-			(corr_length * corr_length) * sizeof(float),
+			tpParams.corr_length * sizeof(float),
 			gpu_corrs,
 			dstride_corr,
-			(corr_length * corr_length) * sizeof(float),
+			tpParams.corr_length * sizeof(float),
 			num_corrs,
 			cudaMemcpyDeviceToHost));
    //    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        num_pairs * TILESX * TILESY*sizeof(int)));
@@ -1056,13 +1052,13 @@ int main(int argc, char **argv)
    	int ty = ctt / TILESX;
    	int tx = ctt % TILESX;
    	//		int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
-    	int src_offs0 = ict * corr_length * corr_length;
+    	int src_offs0 = ict * tpParams.corr_length;
    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);

-    	for (int iy = 0; iy < corr_length; iy++){
-    		int src_offs = src_offs0 + iy * corr_length; // ict * tpParams.num_pairs * corr_size * corr_size;
+    	for (int iy = 0; iy < tpParams.corr_size; iy++){
+    		int src_offs = src_offs0 + iy * tpParams.corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
    		int dst_offs = dst_offs0 + iy * (TILESX * 16);
-    		for (int ix = 0; ix < corr_length; ix++){
+    		for (int ix = 0; ix < tpParams.corr_size; ix++){
    			corr_img[dst_offs++] = cpu_corr[src_offs++];
    		}
    	}
@@ -1116,15 +1112,9 @@ int main(int argc, char **argv)
 			"clt/aux_intrascene-TD.raw"); // 			   const char *  path) // file path
 #endif

-
-
    free (cpu_corr_td);
 #endif  // if 1
    // reuse image, export TD data
-
-
-
-
    free (cpu_corr);
    free (cpu_corr_indices);
    free (corr_img);
@@ -1135,6 +1125,9 @@ int main(int argc, char **argv)



+
+
+
 // Testing "interframe" correlation with itself, assuming direct convert already ran