More refactoring

dc090454 · Andrey Filippov · 67816dbf · dc090454 · dc090454 · dc090454
Commit dc090454 authored Apr 07, 2025 by Andrey Filippov
6 changed files
--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -126,7 +126,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
 		size_t            corr_stride,        // in floats
-		float           * gpu_corrs);         // correlation output data
+		float           * gpu_corrs);          // correlation output data


 extern "C" __global__ void corr2D_normalize(

--- a/src/TpHostGpu.cu
+++ b/src/TpHostGpu.cu
@@ -119,6 +119,35 @@ void TpHostGpu::setImgBuffers(){
    m_gpu_images =         copyalloc_pointers_gpu (m_gpu_images_h,      m_tpParams.num_cams); // NUM_CAMS);
 }

+// Re-reads every camera image from its file, shifts it on the host by
+// (image_dx, image_dy) and uploads it into the GPU buffer already referenced
+// by m_gpu_images_h[ncam]. No new GPU image is allocated here: replacing
+// m_gpu_images_h[ncam] would orphan the buffer that was just updated.
+// NOTE(review): dstride is passed unscaled here but as dstride/sizeof(float) to convert_direct - confirm units.
+void TpHostGpu::setImgBuffersShifted(int is_bayer, int image_dx, int image_dy) {
+    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
+        readFloatsFromFile(
+        		m_host_kern_buf, // float * data, // allocated array
+				m_tpPaths.image_files[ncam]); // 			   char *  path) // file path
+        shift_image (
+        		m_host_kern_buf,       // float * image,
+				m_tpParams.img_width,  // int width,
+				m_tpParams.img_height, // int height,
+				is_bayer,              // int bayer,
+				image_dx,              // int dx,
+				image_dy);             // int dy);
+
+        update_image_gpu(
+        		m_host_kern_buf,          // float * image_host,
+				m_gpu_images_h[ncam], // float * image_gpu,
+				dstride,                // size_t  dstride, // in floats !
+				m_tpParams.img_width, // IMG_WIDTH,     // int width,
+				m_tpParams.img_height); // IMG_HEIGHT);   // int height);
+    }
+}
+
+
+
 void TpHostGpu::setGeometryCorrectionBuffers() {
    readFloatsFromFile(
    		(float *) &m_fgeometry_correction, // float * data, // allocated array, no need to free
@@ -251,7 +280,26 @@ void TpHostGpu::setRGBA(){
    		(m_tpParams.num_colors + 1) * sizeof(float)));
 }

+void TpHostGpu::testRotMatrices         (int num_runs){ // 424 - placeholder with an empty body, num_runs is unused (number presumably refers to a line in test_tp.cu - TODO confirm)
+}
+
+void TpHostGpu::testReverseDistortions  (int num_runs){ // 468 - placeholder with an empty body, num_runs is unused
+}
+
+void TpHostGpu::testGeomCorrect         (int num_runs){ // 534 - placeholder with an empty body, num_runs is unused
+}
+
+void TpHostGpu::testConvertDirect       (int num_runs){ // 608 - placeholder with an empty body, num_runs is unused
+}
+
+void TpHostGpu::testImclt               (int num_runs){ // 682 - placeholder with an empty body, num_runs is unused
+}
+
+void TpHostGpu::testImcltRbgAll         (int num_runs){ // 701 - placeholder with an empty body, num_runs is unused
+}
+
 void TpHostGpu::testCorrelate2DIntra(int num_runs){
+	int  num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 =            m_tpParams.debug_tile ? 0 : -1;
 	if (!m_gpu_clt){
@@ -311,27 +359,174 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
    saveIntraCorrFile(
    		m_tpPaths.result_corr_file, // const char * path,
 			"phase correlation data",   // const char * prompt,
-			num_corrs, // int          num_corrs,
-			m_gpu_corrs, // float *      gpu_corrs,
-			m_gpu_corr_indices, // int *        gpu_corr_indices)
-			16); // //int          num_sel_sensors) { // only for interscene
+			num_corrs,                  // int          num_corrs,
+			num_corr_indices,           // int          num_corr_indices,
+			m_gpu_corrs,                // float *      gpu_corrs,
+			m_gpu_corr_indices,         // int *        gpu_corr_indices)
+			16);                        //int          num_sel_sensors) { // only for interscene

 }

+void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // interscene self-test: correlates the shifted images (m_gpu_clt) against the CLT of the unshifted ones (gpu_clt_ref), then saves the TD correlations and the indices
+	int numIterations = m_tpParams.debug_tile ? 1 : num_runs; // a single pass when debug_tile is set
+	int i0 =            m_tpParams.debug_tile ? 0 : -1; // -1 adds one extra pass that runs before the timer is started (see i == 0 below)
+	// check/replace names; the three buffers tested below must be non-null, otherwise std::runtime_error is thrown
+	if (!m_gpu_clt){
+		throw std::runtime_error("Error: m_gpu_clt is not initialized ");
+	}
+	if (!m_gpu_ftasks) {
+		throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
+	}
+	if (!m_gpu_corrs) {
+		throw std::runtime_error("Error: m_gpu_corrs is not initialized ");
+	}
+    int sel_sensors =      0xffff; // 0x7fff; // 0xffff;
+    int num_sel_sensors =  16; // 15; // 16;
+    int num_pairs_inter =  num_sel_sensors+1;
+    int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles; // local count handed to the save*File() calls at the end
+    int is_bayer =         0; // is_bayer, image_dx and image_dy are passed to setImgBuffersShifted() below
+    int image_dx =         2;
+    int image_dy =         0;
+    float            * gpu_clt_ref_h            [m_tpParams.num_cams]; // host-side array of per-camera pointers; NOTE(review): a variable-length array unless num_cams is a compile-time constant - confirm
+    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
+       gpu_clt_ref_h[ncam] = alloc_kernel_gpu(m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size); // per-camera reference CLT buffer, released with gfree() at the end of this function
+    }
+    float           ** gpu_clt_ref =      copyalloc_pointers_gpu (gpu_clt_ref_h,  m_tpParams.num_cams); // NUM_CAMS); released with gfree() at the end of this function
+    dim3 threads_tp(1, 1, 1); // NOTE(review): 1x1x1 launch, while the launch comments below mention CONVERT_DIRECT_INDEXING_THREADS threads - confirm
+    dim3 grid_tp(1, 1, 1);
+    float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; //  [tpParams.num_cams]  [NUM_CAMS];
+
+    // use gpu_images (as currently loaded) and convert to gpu_clt_ref - the reference passed to correlate2D_inter below
+	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+			m_tpParams.num_cams,              // int                num_cams,           // actual number of cameras
+			m_tpParams.num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
+			m_gpu_kernels,           // float           ** gpu_kernels,
+			m_gpu_images,            // float           ** gpu_images,
+			m_gpu_ftasks,            // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			gpu_clt_ref, //******    // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+			dstride/sizeof(float),   // size_t             dstride, // for gpu_images
+			m_tpParams.tp_tasks_size,// int                num_tiles) // number of tiles in task
+			0,                       // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
+			m_tpParams.img_width,    // int                woi_width,
+			m_tpParams.img_height,   // int                woi_height,
+			m_tpParams.kernels_hor,  // int                kernels_hor,
+			m_tpParams.kernels_vert, //,          // int                kernels_vert);
+			m_gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			m_gpu_num_active,        // int *              pnum_active_tiles);  //  indices to gpu_tasks
+			m_tpParams.tilesx);      // int                tilesx)
+	getLastCudaError("Kernel execution failed");
+	checkCudaErrors (cudaDeviceSynchronize());
+	// re-read same images. shift them, update gpu_images and convert to gpu_clt;
+	setImgBuffersShifted(
+			is_bayer,   // int is_bayer,
+			image_dx,   // int image_dx,
+			image_dy); // int image_dy)
+	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+			m_tpParams.num_cams,      // int                num_cams,           // actual number of cameras
+			m_tpParams.num_colors,    // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			fgpu_kernel_offsets,      // struct CltExtra ** gpu_kernel_offsets,
+			m_gpu_kernels,            // float           ** gpu_kernels,
+			m_gpu_images,             // float           ** gpu_images,
+			m_gpu_ftasks,             // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			m_gpu_clt,     //******   // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+			dstride/sizeof(float),    // size_t             dstride, // for gpu_images
+			m_tpParams.tp_tasks_size, // int                num_tiles) // number of tiles in task
+			0,                        // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
+			m_tpParams.img_width,     // int                woi_width,
+			m_tpParams.img_height,    // int                woi_height,
+			m_tpParams.kernels_hor,   // int                kernels_hor,
+			m_tpParams.kernels_vert,  //,          // int                kernels_vert);
+			m_gpu_active_tiles,       // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			m_gpu_num_active,         // int *              pnum_active_tiles);  //  indices to gpu_tasks
+			m_tpParams.tilesx);       // int                tilesx)
+	getLastCudaError("Kernel execution failed");
+	checkCudaErrors(cudaDeviceSynchronize());
+    StopWatchInterface *timerINTERSELF = 0;
+    sdkCreateTimer(&timerINTERSELF);
+    int num_corrs{}; // will get data from the gpu memory
+    for (int i = i0; i < numIterations; i++)   {
+    	if (i == 0){ // timing starts at pass 0; an earlier pass (i == -1) is not timed
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerINTERSELF);
+    		sdkStartTimer(&timerINTERSELF);
+    	}
+    	correlate2D_inter<<<1,1>>>(             // only results in TD
+    			m_tpParams.num_cams,            // int                num_cams,           // actual number of cameras
+				sel_sensors,                    // int               sel_sensors,
+				m_gpu_clt,                      // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				gpu_clt_ref, // *********       // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				m_tpParams.num_colors,          // int               colors,             // number of colors (3/1)
+				m_tpParams.color_weights[0],    // 0.25,  // float             scale0,             // scale for R
+				m_tpParams.color_weights[1],    // 0.25,  // float             scale1,             // scale for B
+				m_tpParams.color_weights[2],    // 0.5,   // float             scale2,             // scale for G
+				m_gpu_ftasks,                   // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+				m_tpParams.tp_tasks_size,       // int               num_tiles) // number of tiles in task
+				m_tpParams.tilesx,              // int               tilesx,             // number of tile rows
+				m_gpu_corr_indices,             // int             * gpu_corr_indices,   // packed tile+pair
+				m_gpu_num_corr_tiles,           // int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+				dstride_corr_td/sizeof(float),  // const size_t      corr_stride,        // in floats
+				m_gpu_corrs_td);                // float           * gpu_corrs);         // correlation output data
+        getLastCudaError("Kernel failure:correlate2D_inter");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("correlate2D_inter-TD pass: %d\n",i);
+    	checkCudaErrors(cudaMemcpy( // read back the tile count from m_gpu_num_corr_tiles; it is passed to corr2D_normalize below
+    			&num_corrs,
+				m_gpu_num_corr_tiles,
+    			sizeof(int),
+    			cudaMemcpyDeviceToHost));
+    	checkCudaErrors(cudaDeviceSynchronize());
+       	corr2D_normalize<<<1,1>>>(
+        			num_corrs, //tp_task_size,                 // int               num_corr_tiles,     // number of correlation tiles to process
+    				dstride_corr_td/sizeof(float),             // const size_t      corr_stride_td,     // in floats
+					m_gpu_corrs_td,                            // float           * gpu_corrs_td,       // correlation tiles in transform domain
+    				(float *) 0, // corr_weights,              // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
+    				dstride_corr/sizeof(float),                // const size_t      corr_stride,        // in floats
+					m_gpu_corrs,                               // float           * gpu_corrs,          // correlation output data (pixel domain)
+					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
+     	getLastCudaError("Kernel failure:corr2D_normalize");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("corr2D_normalize pass: %d\n",i);
+    }
+    sdkStopTimer(&timerINTERSELF);
+    float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations; // averaged over the passes with i >= 0 only
+    sdkDeleteTimer(&timerINTERSELF);
+    printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n",  avgTimeINTERSELF, num_corrs);
+    saveInterCorrFile(
+    		m_tpPaths.result_interscene_td,      // const char * path,   // "clt/aux_interscene-TD.raw"     m_tpPaths.result_interscene_td
+			"interscene phase correlation",      // const char * prompt, // "interscene phase correlation"
+			num_corrs,                           // int          num_corrs,
+			num_corr_indices,                    // int          num_corr_indices,
+			m_gpu_corrs_td,                      // float      * gpu_corrs_td,
+			m_gpu_corr_indices,                  // int        * gpu_corr_indices,
+			num_sel_sensors);                    // int          num_sel_sensors);
+    saveInterCorrIndicesFile(
+    		m_tpPaths.result_interscene_indices, // const char * path,   //  "clt/aux_inter-indices.raw" m_tpPaths.result_interscene_indices
+			"interscene indices",                // const char * prompt, //  "interscene indices"
+			num_corr_indices,                    // int          num_corr_indices,
+			m_gpu_corr_indices,                  // int *        gpu_corr_indices,
+			num_sel_sensors);                    // int          num_sel_sensors)
+    for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) { // release the per-camera reference CLT buffers allocated above
+    	gfree(gpu_clt_ref_h[ncam]);
+    }
+    gfree(gpu_clt_ref);
+}

 void TpHostGpu::saveIntraCorrFile(
 		const char * path,
 		const char * prompt,
 		int          num_corrs,
+		int          num_corr_indices,
 		float *      gpu_corrs,
 		int *        gpu_corr_indices,
 		int          num_sel_sensors) { // only for interscene
 	if (!path)	return;
 	int rslt_corr_length =   num_corrs * m_tpParams.corr_length;
-	int corr_img_size =    m_tpParams.num_corr_indices * 16 * 16; // NAN
+	int corr_img_size =      num_corr_indices * 16 * 16; // NAN
 //	float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
 	float * cpu_corr = (float *)malloc(rslt_corr_length * sizeof(float));
-	int *   cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
+	int *   cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
 	checkCudaErrors(cudaMemcpy2D(
 			cpu_corr,
 			m_tpParams.corr_length * sizeof(float),
@@ -343,12 +538,13 @@ void TpHostGpu::saveIntraCorrFile(
 	checkCudaErrors(cudaMemcpy(
 			cpu_corr_indices,
 			gpu_corr_indices,
-			m_tpParams.num_corr_indices * sizeof(int),
+			num_corr_indices * sizeof(int),
 			cudaMemcpyDeviceToHost));
 	float * corr_img = 	getCorrImg(
-			corr_img_size, // int     corr_img_size,
+			corr_img_size,    // int     corr_img_size,
+			num_corr_indices, //int     num_corr_indices,
 			cpu_corr_indices, // int *   cpu_corr_indices,
-			cpu_corr, // float * cpu_corr,
+			cpu_corr,         // float * cpu_corr,
 			num_sel_sensors); // int     num_sel_sensors)
    printf("Writing %s to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
    		prompt, path, (m_tpParams.tilesx * 16),(m_tpParams.tilesya * 16), m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
@@ -373,6 +569,7 @@ void TpHostGpu::saveIntraCorrFile(

 float * TpHostGpu::getCorrImg(
 		int     corr_img_size,
+		int     num_corr_indices,
 		int *   cpu_corr_indices,
 		float * cpu_corr,
 		int     num_sel_sensors){
@@ -380,7 +577,7 @@ float * TpHostGpu::getCorrImg(
 	for (int i = 0; i < corr_img_size; i++){
 		corr_img[i] = NAN;
 	}
-    for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
+    for (int ict = 0; ict < num_corr_indices; ict++){
    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
@@ -403,6 +600,7 @@ float * TpHostGpu::getCorrImg(

 float * TpHostGpu::getCorrTdImg(
 		int     corr_img_size,
+		int     num_corr_indices,
 		int *   cpu_corr_indices,
 		float * cpu_corr_td,
 		int     num_sel_sensors){
@@ -410,7 +608,7 @@ float * TpHostGpu::getCorrTdImg(
 	for (int i = 0; i < corr_img_size; i++){
 		corr_img[i] = NAN;
 	}
-    for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
+    for (int ict = 0; ict < num_corr_indices; ict++){
    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
    	if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
@@ -442,15 +640,16 @@ void TpHostGpu::saveInterCorrFile(
 		const char * path,   // "clt/aux_interscene-TD.raw"
 		const char * prompt, // "interscene phase correlation"
 		int          num_corrs,
+		int          num_corr_indices,
 		float      * gpu_corrs_td,
 		int        * gpu_corr_indices,
 		int          num_sel_sensors){
 	if (!path)	return;
-	int corr_img_size =    m_tpParams.num_corr_indices * 16 * 16; // NAN
+	int corr_img_size =    num_corr_indices * 16 * 16; // NAN
 	int rslt_corr_size_td =   num_corrs *  m_tpParams.dtt_size2 * m_tpParams.dtt_size2;
 	float * cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
 	int     dtile_bytes = (m_tpParams.dtt_size2*m_tpParams.dtt_size2) * sizeof(float);
-	int *   cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
+	int *   cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));

    checkCudaErrors(cudaMemcpy2D(
    		cpu_corr_td,
@@ -463,11 +662,11 @@ void TpHostGpu::saveInterCorrFile(
    checkCudaErrors(cudaMemcpy(
    		cpu_corr_indices,
 			gpu_corr_indices,
-			m_tpParams.num_corr_indices * sizeof(int),
+			num_corr_indices * sizeof(int),
 			cudaMemcpyDeviceToHost));
-
 	float * corr_img = 	getCorrTdImg(
 			corr_img_size,    // int     corr_img_size,
+			num_corr_indices, //int     num_corr_indices,
 			cpu_corr_indices, // int *   cpu_corr_indices,
-			gpu_corrs_td,     // float * cpu_corr,
+			cpu_corr_td,      // float * cpu_corr_td, // host copy filled by cudaMemcpy2D above; gpu_corrs_td is a device pointer and must not be read on the host
 			num_sel_sensors); // int     num_sel_sensors)
@@ -476,7 +675,7 @@ void TpHostGpu::saveInterCorrFile(
    		corr_img,        // float *       data, // allocated array
 			corr_img_size,   // int           size, // length in elements
 			path);           // 			   const char *  path) // file path
-    free(cpu_corr_indices);
+    free (cpu_corr_indices);
 	free (corr_img);
    free (cpu_corr_td);
 }
@@ -484,21 +683,22 @@ void TpHostGpu::saveInterCorrFile(
 void TpHostGpu::saveInterCorrIndicesFile(
 		const char * path,   //  "clt/aux_inter-indices.raw"
 		const char * prompt, //  "interscene indices"
-		int * gpu_corr_indices,
-		int num_sel_sensors){
+		int          num_corr_indices,
+		int *        gpu_corr_indices,
+		int          num_sel_sensors){
 	if (!path)	return;
-	int *   cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
+	int *   cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
    checkCudaErrors(cudaMemcpy(
    		cpu_corr_indices,
 			gpu_corr_indices,
-			m_tpParams.num_corr_indices * sizeof(int),
+			num_corr_indices * sizeof(int),
 			cudaMemcpyDeviceToHost));
    int corr_index_img_length = m_tpParams.tilesx * m_tpParams.tilesy * (num_sel_sensors+1) ;
    float *corr_index_img =  (float *)malloc(corr_index_img_length * sizeof(float));
    for (int i = 0; i < corr_index_img_length; i++){
    	corr_index_img[i] = NAN;
    }
-    for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
+    for (int ict = 0; ict < num_corr_indices; ict++){
    	int ctt = ( cpu_corr_indices[ict] >>  m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
    	int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
    	if (cpair == 0xff){
@@ -514,7 +714,7 @@ void TpHostGpu::saveInterCorrIndicesFile(
 			corr_index_img_length,           // int           size, // length in elements
-			"clt/aux_inter-indices.raw"); // 			   const char *  path) // file path
+			path);                           // const char *  path) // file path: use the caller-supplied path (already null-checked above) instead of a hard-coded name
    free (corr_index_img);
-    free(cpu_corr_indices);
+    free (cpu_corr_indices);
 }


@@ -527,5 +727,6 @@ void TpHostGpu::gfree(struct CltExtra *    p) {if (p) checkCudaErrors(cudaFree(p
 void TpHostGpu::gfree(struct gc *          p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
 void TpHostGpu::gfree(struct corr_vector * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
 void TpHostGpu::gfree(struct trot_deriv  * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
+void TpHostGpu::gfree(float **             p) {if (p) checkCudaErrors(cudaFree(p));p = {};} // frees p with cudaFree when non-null; p is passed by value, so "p = {}" clears only the local copy, not the caller's pointer


--- a/src/TpHostGpu.h
+++ b/src/TpHostGpu.h
@@ -115,20 +115,32 @@ public:
 	void setCltBuffers();
 	void setCorrImgBuffers();
 	void setImgBuffers();
+	void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy);
 	void setGeometryCorrectionBuffers();
 	void setCorrelationBuffers();
 	void setTasks(const float target_disparity, const float scale);
 	void setTextures();
 	void setRGBA();
-	void testCorrelate2DIntra(int num_runs);
+
+	void testRotMatrices         (int num_runs); // 424
+	void testReverseDistortions  (int num_runs); // 468
+	void testGeomCorrect         (int num_runs); // 534
+	void testConvertDirect       (int num_runs); // 608
+	void testImclt               (int num_runs); // 682
+	void testImcltRbgAll         (int num_runs); // 701
+
+
+	void testCorrelate2DIntra    (int num_runs);
+	void testCorrelate2DInterSelf(int num_runs);
+
 	// for both intra and inter!
-	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrIndicesFile(const char * path, const char * prompt, int * gpu_corr_indices, int num_sel_sensors);
+	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);

 private:
-	float * getCorrImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
-	float * getCorrTdImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
+	float * getCorrImg  (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr,   int num_sel_sensors);
+	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
 	void hfree(float * p); // {if (p) free (p);}
 	void hfree(struct CltExtra * p);
 	void gfree(float * p);
@@ -138,6 +150,7 @@ private:
 	void gfree(struct gc * p);
 	void gfree(struct corr_vector * p);
 	void gfree(struct trot_deriv * p);
+	void gfree(float ** p);

 };


--- a/src/TpParams.cu
+++ b/src/TpParams.cu
@@ -39,7 +39,7 @@ TpParams::TpParams(int lwir){
 	corr_size =  2 * corr_out_rad + 1;
 	corr_length =  corr_size * corr_size;
    num_tiles =  tp_tasks_size;
-    num_corr_indices = num_pairs * num_tiles;
+//    num_corr_indices = num_pairs * num_tiles;
 }


--- a/src/TpParams.h
+++ b/src/TpParams.h
@@ -22,7 +22,7 @@ public:
 	static constexpr int img_width =                IMG_WIDTH;
 	static constexpr int img_height =               IMG_HEIGHT;
 	static constexpr int kernels_hor =              KERNELS_HOR;
-	static constexpr int kernel_vert =              KERNELS_VERT;
+	static constexpr int kernels_vert =             KERNELS_VERT;

 	static constexpr int task_inter_en =            TASK_INTER_EN; //          10 // Task bit to enable interscene correlation
 	static constexpr int task_corr_en =             TASK_CORR_EN;  //           9 // Task bit to enable intrascene correlation (pairs defined separately)
@@ -90,7 +90,7 @@ public:
    int num_tiles{};
 	int corr_size{};
    int corr_length{};
-    int num_corr_indices{};
+//    int num_corr_indices{}; // removing - different length for intra/inter
 //    std::vector<float[2]> m_port_offsets;



--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -463,6 +463,8 @@ int main(int argc, char **argv)

 #endif // TEST_ROT_MATRICES

+
+
 #define TEST_REVERSE_DISTORTIONS
 #ifdef  TEST_REVERSE_DISTORTIONS
    dim3 threads_rd(3,3,3);