Commit 67816dbf authored by Andrey Filippov

More refactoring

parent 68491042
@@ -4,9 +4,528 @@
* Created on: Apr 2, 2025
* Author: elphel
*/
#include "TpParams.h"
#include <stdexcept>
#include <helper_cuda.h> // for checkCudaErrors
#include <cuda_runtime.h> // cudaFree
#include <helper_functions.h> // timer functions
//#include "TpParams.h" // TpHostGpu.h has it
#include "tp_paths.h"
#include "tp_files.h"
#include "tp_utils.h" // for copyalloc_kernel_gpu
#include "TpHostGpu.h"
TpHostGpu::~TpHostGpu(){
hfree(m_host_kern_buf);
hfree(m_ftask_data);
hfree(m_ftask_data1);
hfree(m_correction_vector);
hfree(m_rByRDist);
gfree(m_gpu_geometry_correction);
gfree(m_gpu_correction_vector);
gfree(m_gpu_rByRDist);
gfree(m_gpu_rot_deriv);
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// Free used host memory, initialized as arrays of pointers
hfree(m_gpu_kernels_h[ncam]);
hfree(m_gpu_kernel_offsets_h[ncam]);
hfree(m_gpu_images_h[ncam]);
hfree(m_gpu_clt_h[ncam]);
hfree(m_gpu_corr_images_h[ncam]);
// Free used GPU memory initialized as **
if (m_gpu_kernels) gfree(m_gpu_kernels[ncam]);
if (m_gpu_kernel_offsets) gfree(m_gpu_kernel_offsets[ncam]);
if (m_gpu_images) gfree(m_gpu_images[ncam]);
if (m_gpu_clt) gfree(m_gpu_clt[ncam]);
if (m_gpu_corr_images) gfree(m_gpu_corr_images[ncam]);
}
gfree(m_gpu_corrs);
gfree(m_gpu_corrs_td);
gfree(m_gpu_corrs_combo);
gfree(m_gpu_corrs_combo_td);
gfree(m_gpu_corr_indices);
gfree(m_gpu_corrs_combo_indices);
gfree(m_gpu_ftasks);
gfree(m_gpu_active_tiles);
gfree(m_gpu_num_active);
gfree(m_gpu_num_corr_tiles);
gfree(m_gpu_texture_indices);
gfree(m_gpu_woi);
gfree(m_gpu_twh);
gfree(m_gpu_num_texture_tiles);
gfree(m_gpu_port_offsets);
gfree(m_gpu_color_weights);
gfree(m_gpu_generate_RBGA_params);
gfree(m_gpu_textures);
gfree(m_gpu_diff_rgb_combo);
gfree(m_gpu_textures_rbga);
return;
};
void TpHostGpu::setImageKernels(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.kernel_file[ncam]); // char * path) // file path
m_gpu_kernels_h[ncam] = copyalloc_kernel_gpu(m_host_kern_buf, m_tpParams.kern_size);
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.kernel_offs_file[ncam]); // char * path) // file path
m_gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
m_host_kern_buf,
m_tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
}
m_gpu_kernels = copyalloc_pointers_gpu (m_gpu_kernels_h, m_tpParams.num_cams); // NUM_CAMS);
m_gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) m_gpu_kernel_offsets_h, m_tpParams.num_cams); // NUM_CAMS);
}
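// Size bookkeeping for the uploads above (values set in TpParams::TpParams()):
//   kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors;  // kernel tiles per camera
//   kern_size  = kern_tiles * 4 * 64;                      // floats per camera (4 * 64 floats per tile)
// so each camera uploads kern_size floats of kernels plus
// kern_tiles * sizeof(struct CltExtra) / sizeof(float) floats of per-tile offsets.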
void TpHostGpu::setCltBuffers(){
const int slice_size {m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size};
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// will get results back
//gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
m_gpu_clt_h[ncam] = alloc_kernel_gpu(slice_size);
printf("Allocating GPU memory, 0x%x floats\n", slice_size) ;
}
m_gpu_clt = copyalloc_pointers_gpu (m_gpu_clt_h, m_tpParams.num_cams); // NUM_CAMS);
}
void TpHostGpu::setCorrImgBuffers(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// allocate result images (3x height to accommodate 3 colors)
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
m_gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
m_tpParams.img_width + m_tpParams.dtt_size, // int width,
m_tpParams.num_colors*(m_tpParams.img_height + m_tpParams.dtt_size)); // int height);
}
m_gpu_corr_images = copyalloc_pointers_gpu (m_gpu_corr_images_h, m_tpParams.num_cams); // NUM_CAMS);
}
void TpHostGpu::setImgBuffers(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.image_files[ncam]); // char * path) // file path
m_gpu_images_h[ncam] = copyalloc_image_gpu(
m_host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
m_tpParams.img_width, // IMG_WIDTH, // int width,
m_tpParams.img_height); // IMG_HEIGHT); // int height);
}
m_gpu_images = copyalloc_pointers_gpu (m_gpu_images_h, m_tpParams.num_cams); // NUM_CAMS);
}
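// m_host_kern_buf (kern_size floats) is reused here as a staging buffer for the image files,
// which assumes kern_size >= img_width * img_height (see the matching comment in
// setCorrelationBuffers()).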
void TpHostGpu::setGeometryCorrectionBuffers() {
readFloatsFromFile(
(float *) &m_fgeometry_correction, // float * data, // allocated array, no need to free
m_tpPaths.geometry_correction_file); // char * path) // file path
m_rByRDist = readAllFloatsFromFile(
m_tpPaths.rByRDist_file, // const char * path,
&m_rByRDist_length); // int * len_in_floats)
m_correction_vector = readAllFloatsFromFile(
m_tpPaths.correction_vector_file, // const char * path,
&m_correction_vector_length); // int * len_in_floats)
m_gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &m_fgeometry_correction,
sizeof(m_fgeometry_correction)/sizeof(float));
m_gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
m_correction_vector,
m_correction_vector_length);
m_gpu_rByRDist = copyalloc_kernel_gpu(
m_rByRDist,
m_rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&m_gpu_rot_deriv, sizeof(trot_deriv)));
}
void TpHostGpu::setCorrelationBuffers(){
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
m_gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
m_tpParams.corr_length, // int width,
m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
m_gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width,
m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height);
m_gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
m_tpParams.corr_length, // int width,
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
m_gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width,
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
// just allocate
checkCudaErrors (cudaMalloc((void **)&m_gpu_corr_indices, m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&m_gpu_corrs_combo_indices, m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int)));
}
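// Buffer shapes allocated above (one row per correlation tile, pitched by the dstride_* values):
//   m_gpu_corrs / m_gpu_corrs_combo       : corr_length floats wide (15x15 pixel-domain tile)
//   m_gpu_corrs_td / m_gpu_corrs_combo_td : 4 * dtt_size * dtt_size floats wide (transform domain)
//   heights: num_pairs * tilesx * tilesy for the per-pair buffers, tilesx * tilesy for the combined ones.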
void TpHostGpu::setTasks(const float target_disparity, const float scale){
// allocate m_ftask_data
m_ftask_data = (float *) malloc( m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.task_size * sizeof(float));
// float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
// tasks for all tiles
for (int ty = 0; ty < m_tpParams.tilesy ; ty++){
for (int tx = 0; tx < m_tpParams.tilesx ; tx++){
int nt = ty * m_tpParams.tilesx + tx;
int task_task = (1 << m_tpParams.task_inter_en) | (1 << m_tpParams.task_corr_en) | (1 << m_tpParams.task_text_en); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
// float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = m_ftask_data + m_tpParams.task_size * nt;
*(tp + m_tpParams.tp_task_task_offset) = *(float *) &task_task;
*(tp + m_tpParams.tp_task_txy_offset) = *(float *) &task_txy;
*(tp + m_tpParams.tp_task_disparity_offset) = target_disparity;
// tp += 2; // skip centerX, centerY
*(tp + m_tpParams.tp_task_scale_offset) = scale; // 0; // 0.5f; // ,0; // scale, 0 - old way, just set
tp+= m_tpParams.tp_task_xy_offset;
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
*(tp++) = m_tile_coords_h[ncam][nt][0];
*(tp++) = m_tile_coords_h[ncam][nt][1];
}
}
}
m_gpu_ftasks = (float *) copyalloc_kernel_gpu(m_ftask_data, m_tpParams.tp_tasks_size * m_tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
}
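// Per-tile task record written above (task_size floats per tile, offsets from TpParams):
//   [tp_task_task_offset      = 0]  task bits (1<<TASK_INTER_EN)|(1<<TASK_CORR_EN)|(1<<TASK_TEXT_EN), bit-cast to float
//   [tp_task_txy_offset       = 1]  packed tile coordinates tx | (ty << 16), bit-cast to float
//   [tp_task_disparity_offset = 2]  target disparity
//   [tp_task_centerxy_offset  = 3]  centerX, centerY (not set here; presumably filled by calculate_tiles_offsets)
//   [tp_task_scale_offset     = 5]  scale (0 - old way, just set)
//   [tp_task_xy_offset        = 6]  num_cams pairs of per-camera x,y from m_tile_coords_h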
void TpHostGpu::setTextures(){
if (!m_ftask_data) { // ftask_data is not initialized
throw std::runtime_error("Error: m_ftask_data is not initialized ");
}
m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0));
m_tile_texture_size = m_tile_texture_layers *256;
// task data should be initialized
m_num_textures=0;
for (int ty = 0; ty < m_tpParams.tilesy; ty++){
for (int tx = 0; tx < m_tpParams.tilesx; tx++){
int nt = ty * m_tpParams.tilesx + tx;
float *tp = m_ftask_data + m_tpParams.task_size * nt;
int cm = (*(int *) tp) & m_tpParams.task_texture_bits; // non-zero any of 4 lower task bits
if (cm){
m_texture_indices[m_num_textures++] = (nt << m_tpParams.text_ntile_shift) | (1 << m_tpParams.list_texture_bit); // setting 0x80 in texture indices
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
m_gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) m_texture_indices,
m_num_textures,
m_tpParams.tilesx * m_tpParams.tilesya); // number of rows - multiple of 4
m_gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
m_tile_texture_size, // int width (floats),
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
}
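// Texture index packing used above: each selected tile is stored as
//   (nt << text_ntile_shift) | (1 << list_texture_bit)
// so the tile number is recovered with (index >> text_ntile_shift) and the
// "listed" flag with (index >> list_texture_bit) & 1.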
void TpHostGpu::setRGBA(){
/*
// appears in setTextures
m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0));
m_tile_texture_size = m_tile_texture_layers *256;
*/
m_rgba_width = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
m_rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
m_rbga_slices = m_tpParams.texture_colors + 1; // 4/1
if (m_tpParams.keep_texture_weights & 2){
m_rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
}
checkCudaErrors(cudaMalloc((void **)&m_gpu_woi, 4 * sizeof(float))); // RGBA
checkCudaErrors(cudaMalloc((void **)&m_gpu_twh, 2 * sizeof(float))); // RGBA, dynamic
checkCudaErrors(cudaMalloc((void **)&m_gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border, RGBA all
m_gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.port_offsets, m_tpParams.num_cams * 2); // num_ports * 2);
m_gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.color_weights, sizeof(m_tpParams.color_weights));
m_gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.generate_RBGA_params, sizeof(m_tpParams.generate_RBGA_params));
m_gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
m_rgba_width, // int width (floats),
m_rgba_height * m_rbga_slices); // int height);
checkCudaErrors(cudaMalloc((void **)&m_gpu_diff_rgb_combo, m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.num_cams *
(m_tpParams.num_colors + 1) * sizeof(float)));
}
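// RGBA buffer geometry set above:
//   m_rgba_width  = (tilesx + 1) * dtt_size
//   m_rgba_height = (tilesy + 1) * dtt_size
//   m_rbga_slices = texture_colors + 1 (4 for RGB, 2 for mono)
//                   + texture_colors * num_cams when (keep_texture_weights & 2) is set;
// m_gpu_textures_rbga holds m_rgba_width x (m_rgba_height * m_rbga_slices) floats,
// pitched by dstride_textures_rbga.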
void TpHostGpu::testCorrelate2DIntra(int num_runs){
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_ftasks) {
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_corrs) {
throw std::runtime_error("Error: m_gpu_corrs is not initialized ");
}
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
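// Loop structure: without debug_tile the loop starts at i = -1 as a warm-up pass; the timer is
// reset/started only at i == 0, so the average below covers exactly numIterations timed passes.
// With debug_tile a single pass (i = 0) is run.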
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
correlate2D<<<1,1>>>(
m_tpParams.num_cams, // int num_cams,
m_tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
m_tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
m_tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
m_tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_tpParams.num_colors, // int colors, // number of colors (3/1)
m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
m_tpParams.tilesx, // int tilesx, // number of tile rows
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
m_tpParams.corr_out_rad, // int corr_radius, // radius of the output correlation (7 for 15x15)
m_gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkStopTimer(&timerCORR);
sdkDeleteTimer(&timerCORR);
int num_corrs{}; // will get data from the gpu memory
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
saveIntraCorrFile(
m_tpPaths.result_corr_file, // const char * path,
"phase correlation data", // const char * prompt,
num_corrs, // int num_corrs,
m_gpu_corrs, // float * gpu_corrs,
m_gpu_corr_indices, // int * gpu_corr_indices)
16); // //int num_sel_sensors) { // only for interscene
}
void TpHostGpu::saveIntraCorrFile(
const char * path,
const char * prompt,
int num_corrs,
float * gpu_corrs,
int * gpu_corr_indices,
int num_sel_sensors) { // only for interscene
if (!path) return;
int rslt_corr_length = num_corrs * m_tpParams.corr_length;
int corr_img_size = m_tpParams.num_corr_indices * 16 * 16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr = (float *)malloc(rslt_corr_length * sizeof(float));
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
m_tpParams.corr_length * sizeof(float),
gpu_corrs,
dstride_corr,
m_tpParams.corr_length * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
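// The cudaMemcpy2D above repacks the pitched device rows (one correlation tile per row, padded
// to dstride_corr bytes by alloc_image_gpu) into a contiguous host array:
//   cudaMemcpy2D(dst, dst_pitch, src, src_pitch, row_bytes, rows, kind)
// with dst_pitch == row_bytes == corr_length * sizeof(float) and rows == num_corrs.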
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
float * corr_img = getCorrImg(
corr_img_size, // int corr_img_size,
cpu_corr_indices, // int * cpu_corr_indices,
cpu_corr, // float * cpu_corr,
num_sel_sensors); // int num_sel_sensors)
printf("Writing %s to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
prompt, path, (m_tpParams.tilesx * 16),(m_tpParams.tilesya * 16), m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
path); // const char * path) // file path
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
}
/*
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_inter_td_norm_file); // const char * path) // file path
*/
float * TpHostGpu::getCorrImg(
int corr_img_size,
int * cpu_corr_indices,
float * cpu_corr,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iy = 0; iy < m_tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
for (int ix = 0; ix < m_tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
return corr_img;
}
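// Correlation index unpacking used above (corr_ntile_shift = CORR_NTILE_SHIFT = 8):
//   tile number: ctt   = cpu_corr_indices[ict] >> corr_ntile_shift
//   pair number: cpair = cpu_corr_indices[ict] & ((1 << corr_ntile_shift) - 1)  // 0xff is remapped to num_sel_sensors (interscene)
// Each tile is written into a 16x16 slot of a tilesx*16 wide, tilesy*16 tall plane, one plane per pair index.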
float * TpHostGpu::getCorrTdImg(
int corr_img_size,
int * cpu_corr_indices,
float * cpu_corr_td,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
return corr_img;
}
void TpHostGpu::saveInterCorrFile(
const char * path, // "clt/aux_interscene-TD.raw"
const char * prompt, // "interscene phase correlation"
int num_corrs,
float * gpu_corrs_td,
int * gpu_corr_indices,
int num_sel_sensors){
if (!path) return;
int corr_img_size = m_tpParams.num_corr_indices * 16 * 16; // NAN
int rslt_corr_size_td = num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2;
float * cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
int dtile_bytes = (m_tpParams.dtt_size2*m_tpParams.dtt_size2) * sizeof(float);
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
dtile_bytes,
gpu_corrs_td,
dstride_corr_td,
dtile_bytes,
num_corrs,
cudaMemcpyDeviceToHost));
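// Transform-domain tiles copied above are dtt_size2 * dtt_size2 floats each (presumably four
// dtt_size x dtt_size quadrants, as unpacked in getCorrTdImg()), so
// rslt_corr_size_td = num_corrs * dtt_size2 * dtt_size2 and dtile_bytes is the contiguous
// host-side pitch passed to cudaMemcpy2D.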
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
float * corr_img = getCorrTdImg(
corr_img_size, // int corr_img_size,
cpu_corr_indices, // int * cpu_corr_indices,
cpu_corr_td, // float * cpu_corr_td,
num_sel_sensors); // int num_sel_sensors)
printf("Writing %s TD data to %s\n", prompt, path);
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
path); // const char * path) // file path
free(cpu_corr_indices);
free (corr_img);
free (cpu_corr_td);
}
void TpHostGpu::saveInterCorrIndicesFile(
const char * path, // "clt/aux_inter-indices.raw"
const char * prompt, // "interscene indices"
int * gpu_corr_indices,
int num_sel_sensors){
if (!path) return;
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
int corr_index_img_length = m_tpParams.tilesx * m_tpParams.tilesy * (num_sel_sensors+1) ;
float *corr_index_img = (float *)malloc(corr_index_img_length * sizeof(float));
for (int i = 0; i < corr_index_img_length; i++){
corr_index_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
corr_index_img[cpair * m_tpParams.tilesx * m_tpParams.tilesy + m_tpParams.tilesx * ty + tx] = ict; // cpu_corr_indices[ict];
}
printf("Writing %s to %s\n",prompt,path);
writeFloatsToFile(
corr_index_img, // float * data, // allocated array
corr_index_img_length, // int size, // length in elements
"clt/aux_inter-indices.raw"); // const char * path) // file path
free (corr_index_img);
free(cpu_corr_indices);
}
void TpHostGpu::hfree(float * p) {if (p) free(p); p = {};}
void TpHostGpu::hfree(struct CltExtra * p) {if (p) free(p); p = {};}
void TpHostGpu::gfree(float * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(int * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct CltExtra * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct gc * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct corr_vector * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct trot_deriv * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
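// Note: these helpers take the pointer by value, so the trailing "p = {};" only clears the
// local copy - the caller's member pointer is unchanged after free()/cudaFree().
// A by-reference sketch (an assumption, requiring matching header declarations) could look like:
//   void TpHostGpu::hfree(float *& p) { if (p) free(p); p = nullptr; }
//   void TpHostGpu::gfree(float *& p) { if (p) checkCudaErrors(cudaFree(p)); p = nullptr; }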
@@ -7,17 +7,139 @@
#ifndef SRC_TPHOSTGPU_H_
#define SRC_TPHOSTGPU_H_
#include "geometry_correction.h"
#include "TpParams.h"
#include "TileProcessor.h"
class TpHostGpu{
static constexpr int m_num_cams_lwir = 16;
static constexpr int m_num_cams_rgb = 4;
static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)}; // always 16 elements, RGB uses only the first 4
/*
*
*/
public:
TpParams& m_tpParams;
TpPaths& m_tpPaths;
private:
// no need to free
float m_tile_coords_h [max_num_cams][TpParams::tilesx*TpParams::tilesy][2]; // [TILESX * TILESY][2];
struct gc m_fgeometry_correction;
int m_correction_vector_length{};
int m_rByRDist_length{};
int m_texture_indices [TpParams::tilesx * TpParams::tilesya]; // [TILESX*TILESYA];
int m_num_textures{};
int m_tile_texture_layers{};
int m_tile_texture_size{};
int m_rgba_width{}; // = (TILESX+1) * DTT_SIZE;
int m_rgba_height{}; // = (TILESY+1) * DTT_SIZE;
int m_rbga_slices{}; // = tpParams.texture_colors + 1; // 4/1
// need to free
float * m_host_kern_buf{};
float * m_ftask_data{};
float * m_ftask_data1{};
float * m_gpu_ftasks{};
int * m_gpu_active_tiles{}; // tasks
int * m_gpu_num_active{}; // tasks
int * m_gpu_num_corr_tiles{}; // correlations
// host memory
// dstride* - size in byte to be passed to the GPU kernels
size_t dstride{};
size_t dstride_rslt{}; // in bytes !
size_t dstride_corr{}; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
// in the future, dstride_corr can reuse that of dstride_corr_td?
size_t dstride_corr_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_corr_combo{}; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_corr_combo_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_textures{}; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga{}; // in bytes ! for one rgba/ya 16x16 tile
float * m_gpu_kernels_h[max_num_cams]{};
struct CltExtra * m_gpu_kernel_offsets_h[max_num_cams]{};
float * m_gpu_images_h[max_num_cams]{};
float * m_gpu_clt_h[max_num_cams]{};
float * m_gpu_corr_images_h[max_num_cams]{};
float * m_correction_vector{};
float * m_rByRDist{};
int * m_gpu_texture_indices{};
// GPU memory
float ** m_gpu_kernels{};
struct CltExtra ** m_gpu_kernel_offsets{};
float ** m_gpu_images{};
float ** m_gpu_clt{};
float ** m_gpu_corr_images{};
struct gc * m_gpu_geometry_correction{};
struct corr_vector * m_gpu_correction_vector{};
float * m_gpu_rByRDist{};
struct trot_deriv * m_gpu_rot_deriv{};
// correlations device memory
float * m_gpu_corrs{}; // correlation tiles (per tile, per pair) in pixel domain
float * m_gpu_corrs_td{}; // correlation tiles (per tile, per pair) in transform domain
float * m_gpu_corrs_combo{}; // correlation tiles combined (1 per tile), pixel domain
float * m_gpu_corrs_combo_td{}; // correlation tiles combined (1 per tile), transform domain
int * m_gpu_corr_indices{}; // shared by gpu_corrs gpu_corrs_td
int * m_gpu_corrs_combo_indices{}; // shared by gpu_corrs_combo and gpu_corrs_combo_td
float * m_gpu_textures{};
float * m_gpu_diff_rgb_combo{};
float * m_gpu_textures_rbga{};
int * m_gpu_woi{};
int * m_gpu_twh{};
int * m_gpu_num_texture_tiles{};
float * m_gpu_port_offsets{};
float * m_gpu_color_weights{};
float * m_gpu_generate_RBGA_params{};
public:
TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
:m_tpParams{tpParams}
,m_tpPaths{tpPaths}
,m_host_kern_buf{(float *) malloc(tpParams.kern_size * sizeof(float))}
{};
};
~TpHostGpu();
void setImageKernels();
void setCltBuffers();
void setCorrImgBuffers();
void setImgBuffers();
void setGeometryCorrectionBuffers();
void setCorrelationBuffers();
void setTasks(const float target_disparity, const float scale);
void setTextures();
void setRGBA();
void testCorrelate2DIntra(int num_runs);
// for both intra and inter!
void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrIndicesFile(const char * path, const char * prompt, int * gpu_corr_indices, int num_sel_sensors);
private:
float * getCorrImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
float * getCorrTdImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
void hfree(float * p); // {if (p) free (p);}
void hfree(struct CltExtra * p);
void gfree(float * p);
void gfree(int * p);
void gfree(struct CltExtra * p);
void gfree(struct gc * p);
void gfree(struct corr_vector * p);
void gfree(struct trot_deriv * p);
};
@@ -36,10 +36,10 @@ TpParams::TpParams(int lwir){
texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors; // NUM_COLORS;
kern_size = kern_tiles * 4 * 64;
corr_size = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
corr_size = 2 * corr_out_rad + 1;
corr_length = corr_size * corr_size;
num_tiles = tp_tasks_size;
num_corr_indices = num_pairs * num_tiles;
}
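// With corr_out_rad = CORR_OUT_RAD = 7 the derived sizes above are:
//   corr_size        = 2 * 7 + 1 = 15    // side of one output correlation tile
//   corr_length      = 15 * 15   = 225   // floats per correlation tile
//   num_tiles        = tp_tasks_size (= tilesx * tilesy)
//   num_corr_indices = num_pairs * num_tiles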
@@ -8,8 +8,54 @@
#ifndef SRC_TPPARAMS_H_
#define SRC_TPPARAMS_H_
#include <math.h>
#include "dtt8x8.h"
#include "tp_defines.h"
#include "geometry_correction.h" // TP_TASK_TASK_*
#include "TileProcessor.h"
class TpParams{
static constexpr int m_num_cams_lwir = 16;
public:
static constexpr int tilesx = TILESX;
static constexpr int tilesy = TILESY;
static constexpr int tilesya = TILESYA;
static constexpr int dtt_size = DTT_SIZE;
static constexpr int dtt_size2 = DTT_SIZE2;
static constexpr int img_width = IMG_WIDTH;
static constexpr int img_height = IMG_HEIGHT;
static constexpr int kernels_hor = KERNELS_HOR;
static constexpr int kernel_vert = KERNELS_VERT;
static constexpr int task_inter_en = TASK_INTER_EN; // 10 // Task bit to enable interscene correlation
static constexpr int task_corr_en = TASK_CORR_EN; // 9 // Task bit to enable intrascene correlation (pairs defined separately)
static constexpr int task_text_en = TASK_TEXT_EN; // 8 // task bit to enable texture generation
static constexpr int list_texture_bit = LIST_TEXTURE_BIT; // 8 // 7 // bit to request texture calculation
static constexpr int text_ntile_shift = TEXT_NTILE_SHIFT; // 9 // 8 // tile number shift for texture calculation (will be different from CORR_NTILE_SHIFT!)
static constexpr int task_texture_bits = TASK_TEXTURE_BITS; // TileProcessor.h
static constexpr int corr_ntile_shift = CORR_NTILE_SHIFT; // 8 // higher bits - number of a pair, other bits tile number
static constexpr int corr_out_rad = CORR_OUT_RAD; // 7
//???
// static constexpr int tp_task_size = TASK_TEXTURE_BITS; // TileProcessor.h
// int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
static constexpr int tp_tasks_size = tilesx * tilesy; //
static constexpr int tp_task_task_offset = TP_TASK_TASK_OFFSET;// 0
static constexpr int tp_task_txy_offset = TP_TASK_TXY_OFFSET;// 1
static constexpr int tp_task_disparity_offset = TP_TASK_DISPARITY_OFFSET;// 2
static constexpr int tp_task_centerxy_offset = TP_TASK_CENTERXY_OFFSET;// 3
static constexpr int tp_task_scale_offset = TP_TASK_SCALE_OFFSET;// 5
static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
#ifdef DBG_TILE
static constexpr int debug_tile{1};
#else
static constexpr int debug_tile{0};
#endif
private:
static constexpr int m_num_cams_lwir = 16; // refactor to s_
static constexpr int m_num_colors_lwir = 1;
static constexpr int m_num_pairs_lwir = 120;
static constexpr int m_num_cams_rgb = 4;
@@ -38,10 +84,13 @@ public:
float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
int keep_texture_weights {3}; // 0; // 1; // try with 0 also
int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles;
int kern_size;
int corr_size;
int texture_colors{}; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles{};
int kern_size{};
int num_tiles{};
int corr_size{};
int corr_length{};
int num_corr_indices{};
// std::vector<float[2]> m_port_offsets;
@@ -31,7 +31,7 @@
*/
// all of the next 5 were disabled
#define NOCORR
//#define NOCORR
#define NOCORR_TD
#define NOTEXTURES //
#define NOTEXTURE_RGBA //
@@ -129,6 +129,8 @@ int main(int argc, char **argv)
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost();
// return 0;
float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
@@ -264,7 +266,7 @@ int main(int argc, char **argv)
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
@@ -275,7 +277,7 @@ int main(int argc, char **argv)
gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu(
@@ -309,7 +311,7 @@ int main(int argc, char **argv)
int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY;
float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
@@ -325,15 +327,18 @@ int main(int argc, char **argv)
}
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
@@ -353,10 +358,11 @@ int main(int argc, char **argv)
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_twh, 2 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
@@ -410,7 +416,7 @@ int main(int argc, char **argv)
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
float * cpu_corr_td;
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
@@ -540,16 +546,6 @@ int main(int argc, char **argv)
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
/*
get_tiles_offsets<<<grid_geom,threads_geom>>> (
tpParams.num_cams, // int num_cams,
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
tpParams.num_cams, // int num_cams,
@@ -778,10 +774,10 @@ int main(int argc, char **argv)
}
correlate2D<<<1,1>>>(
tpParams.num_cams, // int num_cams,
TpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
TpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
TpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
TpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
@@ -886,6 +882,9 @@ int main(int argc, char **argv)
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
// cudaProfilerStart();
// testing corr
@@ -1019,7 +1018,7 @@ int main(int argc, char **argv)
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
rslt_corr_size = num_corrs * corr_length * corr_length;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
@@ -1029,10 +1028,10 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
@@ -1056,13 +1055,13 @@ int main(int argc, char **argv)
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int src_offs0 = ict * corr_length * corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
for (int iy = 0; iy < corr_length; iy++){
int src_offs = src_offs0 + iy * corr_length; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
for (int ix = 0; ix < corr_length; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
@@ -33,6 +33,9 @@ public:
const char* rByRDist_file;
const char* correction_vector_file;
const char* geometry_correction_file;
const char* result_interscene_td = "clt/aux_interscene-TD.raw";
const char* result_intrascene_td = "clt/aux_intrascene-TD.raw";
const char* result_interscene_indices = "clt/aux_inter-indices.raw";
private:
const char * m_kernel_file_lwir[16] ={