More refactoring

68491042 · Andrey Filippov · 46256d14 · 68491042 · 68491042 · 68491042
Commit 68491042 authored Apr 03, 2025 by Andrey Filippov
7 changed files
--- a/src/TpHostGpu.cu
+++ b/src/TpHostGpu.cu
+/*
+ * TpHostGpu.cu
+ *
+ *  Created on: Apr 2, 2025
+ *      Author: elphel
+ */
+#include "TpParams.h"
+#include "tp_paths.h"
+#include "TpHostGpu.h"
+
+
+
--- a/src/TpHostGpu.h
+++ b/src/TpHostGpu.h
+/*
+ * TpHostGpu.h
+ *
+ *  Created on: Apr 2, 2025
+ *      Author: elphel
+ */
+
+#ifndef SRC_TPHOSTGPU_H_
+#define SRC_TPHOSTGPU_H_
+#include "TpParams.h"
+class TpHostGpu{
+public:
+	TpParams& m_tpParams;
+	TpPaths& m_tpPaths;
+	TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
+	:m_tpParams{tpParams}
+	,m_tpPaths{tpPaths}
+	{};
+};
+
+
+
+
+#endif /* SRC_TPHOSTGPU_H_ */
--- a/src/TpParams.cu
+++ b/src/TpParams.cu
+/*
+ * TpParams.cu
+ *
+ *  Created on: Mar 31, 2025
+ *      Author: elphel
+ */
+#include <math.h>
+#include "tp_defines.h"   // was not here
+#include "geometry_correction.h"
+#include "TpParams.h"
+TpParams::TpParams(int lwir){
+	m_lwir = lwir;
+	for (int i = 0; i < sizeof(color_weights)/sizeof(color_weights[0]); i++) {
+		color_weights[i] =              lwir? m_color_weights_lwir[i] :        m_color_weights_rgb[i];
+	}
+	for (int i = 0; i < sizeof(generate_RBGA_params)/sizeof(generate_RBGA_params[0]); i++) {
+		generate_RBGA_params[i] =       lwir? m_generate_RBGA_params_lwir[i] : m_generate_RBGA_params_rgb[i];
+	}
+	num_cams =                   lwir? m_num_cams_lwir :             m_num_cams_rgb;
+	num_colors =                 lwir? m_num_colors_lwir :           m_num_colors_rgb;
+	num_pairs =                  lwir? m_num_pairs_lwir :            m_num_pairs_rgb;
+	sel_pairs =                  lwir? m_sel_pairs_lwir :            m_sel_pairs_rgb;
+	task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+    if (num_cams == 4){
+    	for (int ncam = 0; ncam < 4; ncam++){
+    		port_offsets[ncam][0]= port_offsets4[ncam][0];
+    		port_offsets[ncam][1]= port_offsets4[ncam][1];
+    	}
+    } else {
+		 for (int ncam = 0; ncam < num_cams; ncam++) {
+			 double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
+			 port_offsets[ncam][0] =  0.5 * sin((alpha));
+			 port_offsets[ncam][1] = -0.5 * cos((alpha));
+		 }
+    }
+    texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
+    kern_tiles = KERNELS_HOR *  KERNELS_VERT * num_colors; // NUM_COLORS;
+    kern_size =  kern_tiles * 4 * 64;
+    corr_size =  (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
+
+
+
+}
+
+
--- a/src/TpParams.h
+++ b/src/TpParams.h
+/*
+ * TpParams.h
+ *
+ *  Created on: Mar 31, 2025
+ *      Author: elphel
+ */
+
+#ifndef SRC_TPPARAMS_H_
+#define SRC_TPPARAMS_H_
+#include <math.h>
+class TpParams{
+	static constexpr int m_num_cams_lwir =   16;
+	static constexpr int m_num_colors_lwir =  1;
+	static constexpr int m_num_pairs_lwir = 120;
+	static constexpr int m_num_cams_rgb =     4;
+	static constexpr int m_num_colors_rgb =   3;
+	static constexpr int m_num_pairs_rgb =    6;
+	static constexpr unsigned int m_sel_pairs_lwir[4] = {0xffffffff,0xffffffff,0xffffffff,0x00ffffff};
+	static constexpr unsigned int m_sel_pairs_rgb[4] =  {0x3f,0,0,0};
+	static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)}; // it is always 16 element, RGB uses only first 4
+public:
+	TpParams(int lwir);
+	int     m_lwir;
+	float   color_weights[3]{};
+	float   generate_RBGA_params[5]{};
+    int     num_cams{};
+    int     num_colors{};
+    int     num_pairs{};
+    const unsigned int   * sel_pairs{};
+    int     task_size;
+    float port_offsets4[4][2] {// used only in textures to scale differences
+			{-0.5, -0.5},
+			{ 0.5, -0.5},
+			{-0.5,  0.5},
+			{ 0.5,  0.5}};
+
+    // Could not make it variable port_offsets[?][2], 16 is maximal size
+
+    float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
+    int keep_texture_weights {3}; // 0; // 1; // try with 0 also
+    int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
+    int kern_tiles;
+    int kern_size;
+    int corr_size;
+//    std::vector<float[2]> m_port_offsets;
+
+
+private:
+    float  m_color_weights_lwir[3] = {
+    		1.0f,          // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
+			1.0f,          // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
+			1.0f};         // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
+    float  m_generate_RBGA_params_lwir[5] = {
+    		10.0f,         // float             min_shot,           // 10.0
+			3.0f,          // float             scale_shot,         // 3.0
+			10.0f, // 1.5f,// float             diff_sigma,         // pixel value/pixel change
+			10.0f,         // float             diff_threshold,     // pixel value/pixel change
+			12.0f}; // 3.0 // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    float m_color_weights_rgb [3] = {
+    		0.294118f,     // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
+			0.117647f,     // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
+			0.588235f};    // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
+    float m_generate_RBGA_params_rgb[5] = {
+    		10.0f,         // float             min_shot,           // 10.0
+			3.0f,          // float             scale_shot,         // 3.0
+			1.5f,          // float             diff_sigma,         // pixel value/pixel change
+			10.0f,         // float             diff_threshold,     // pixel value/pixel change
+			3.0f};         // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+};
+#endif /* SRC_TPPARAMS_H_ */
--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -65,6 +65,8 @@
 #include "tp_files.h"
 //#include "tp_paths.cuh"
 #include "tp_paths.h"
+#include "TpParams.h"
+#include "TpHostGpu.h"
 #include "GenerateRgbaHost.h"
 /*
 #if TEST_LWIR
@@ -85,283 +87,6 @@
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
 #define TILESYA       ((TILESY +3) & (~3))
 */
-//#include "generate_RGBA_host.inc"
-#if 0
-void generate_RBGA_host(
-		int                num_cams,           // number of cameras used
-		// Parameters to generate texture tasks
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task   * gpu_tasks,
-		int                num_tiles,          // number of tiles in task list
-		// declare arrays in device code?
-		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		int              * gpu_num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
-		int              * gpu_woi,                // x,y,width,height of the woi
-		int                width,  // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
-		int                height, // <= TILES-Y, use for faster processing of LWIR images
-		// Parameters for the texture generation
-		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-		// TODO: use geometry_correction rXY !
-		struct gc       * gpu_geometry_correction,
-		int               colors,             // number of colors (3/1)
-		int               is_lwir,            // do not perform shot correction
-		const float       cpu_params[5],      // mitigating CUDA_ERROR_INVALID_PTX
-		const float       weights[3],         // scale for R,B,G should be host_array, not gpu
-		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
-		const size_t      texture_rbga_stride,     // in floats
-		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
-{
-	int               cpu_woi[4];
-	int               cpu_num_texture_tiles[8];
-	float             min_shot = cpu_params[0];           // 10.0
-	float             scale_shot = cpu_params[1];         // 3.0
-	float             diff_sigma = cpu_params[2];         // pixel value/pixel change
-	float             diff_threshold = cpu_params[3];     // pixel value/pixel change
-	float             min_agree = cpu_params[4];          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-	int               tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
-	dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
-    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
-    dim3 blocks0 (blocks_x, height, 1);
-    clear_texture_list<<<blocks0,threads0>>>(
-    		gpu_texture_indices,
-			width,
-			height);
-	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
-    dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
-    int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
-    dim3 blocks(blocks_t, 1, 1);
-    // mark used tiles in gpu_texture_indices memory
-    mark_texture_tiles <<<blocks,threads>>>(
-    		num_cams,           // int                num_cams,
-			gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-			num_tiles,          // number of tiles in task list
-			width,              // number of tiles in a row
-			gpu_texture_indices); // packed tile + bits (now only (1 << 7)
-	checkCudaErrors(cudaDeviceSynchronize());
-    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
-    checkCudaErrors(cudaMemcpy(
-    		(float * ) cpu_woi,
-			gpu_woi,
-			4  * sizeof(float),
-			cudaMemcpyDeviceToHost));
-    cpu_woi[0] = width;
-    cpu_woi[1] = height;
-    cpu_woi[2] = 0;
-    cpu_woi[3] = 0;
-    checkCudaErrors(cudaMemcpy(
-    		gpu_woi,
-			cpu_woi,
-			4 * sizeof(float),
-			cudaMemcpyHostToDevice));
-    /*
-    *(woi + 0) = width;  // TILES-X;
-    *(woi + 1) = height; // TILES-Y;
-    *(woi + 2) = 0; // maximal x
-    *(woi + 3) = 0; // maximal y
-    int * gpu_woi = (int  *) copyalloc_kernel_gpu(
-    		(float * ) woi,
-			4); // number of elements
-   */
-    // TODO: create gpu_woi to pass	(copy from woi)
-    // set lower 4 bits in each gpu_ftasks task
-    mark_texture_neighbor_tiles <<<blocks,threads>>>(
-    		num_cams,            // int                num_cams,
-			gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-			num_tiles,           // number of tiles in task list
-			width,               // number of tiles in a row
-			height,              // number of tiles rows
-			gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-			gpu_woi);                // min_x, min_y, max_x, max_y
-
-	checkCudaErrors(cudaDeviceSynchronize());
-	/*
-    checkCudaErrors(cudaMemcpy( //
-    		(float * ) cpu_num_texture_tiles,
-			gpu_num_texture_tiles,
-			8 * sizeof(float),
-			cudaMemcpyDeviceToHost));
-	*/
-    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
-    for (int i = 0; i <8; i++){
-    	cpu_num_texture_tiles[i] = 0;
-    }
-    /*
-    *(num_texture_tiles+0) = 0;
-    *(num_texture_tiles+1) = 0;
-    *(num_texture_tiles+2) = 0;
-    *(num_texture_tiles+3) = 0;
-    *(num_texture_tiles+4) = 0;
-    *(num_texture_tiles+5) = 0;
-    *(num_texture_tiles+6) = 0;
-    *(num_texture_tiles+7) = 0;
-    */
-    // copy zeroed  num_texture_tiles
-//    int * gpu_num_texture_tiles = (int  *) copyalloc_kernel_gpu(
-//   		(float * ) num_texture_tiles,
-//			8); // number of elements
-    checkCudaErrors(cudaMemcpy(
-    		gpu_num_texture_tiles,
-			cpu_num_texture_tiles,
-			8 * sizeof(float),
-			cudaMemcpyHostToDevice));
-
-    gen_texture_list <<<blocks,threads>>>(
-    		num_cams,            // int                num_cams,
-			gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-			num_tiles,           // number of tiles in task list
-			width,               // number of tiles in a row
-			height,              // int                height,               // number of tiles rows
-			gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-			gpu_num_texture_tiles,   // number of texture tiles to process
-			gpu_woi);                // x,y, here woi[2] = max_X, woi[3] - max-Y
-	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
-
-    // copy gpu_woi back to host woi
-    checkCudaErrors(cudaMemcpy(
-    		(float * ) cpu_woi,
-			gpu_woi,
-			4  * sizeof(float),
-			cudaMemcpyDeviceToHost));
-//    *(cpu_woi + 2) += 1 - *(cpu_woi + 0); // width  (was min and max)
-//    *(cpu_woi + 3) += 1 - *(cpu_woi + 1); // height (was min and max)
-      cpu_woi[2] += 1 - cpu_woi[0]; // width  (was min and max)
-      cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
-    // copy host-modified data back to GPU
-    checkCudaErrors(cudaMemcpy(
-    		gpu_woi,
-			cpu_woi,
-			4 * sizeof(float),
-			cudaMemcpyHostToDevice));
-
-    // copy gpu_num_texture_tiles back to host num_texture_tiles
-    checkCudaErrors(cudaMemcpy(
-    		(float * ) cpu_num_texture_tiles,
-			gpu_num_texture_tiles,
-			8 * sizeof(float),
-			cudaMemcpyDeviceToHost));
-
-// Zero output textures. Trim
-// texture_rbga_stride
-//	 int texture_width =        (*(cpu_woi + 2) + 1) * DTT_SIZE;
-//	 int texture_tiles_height = (*(cpu_woi + 3) + 1) * DTT_SIZE;
-	 int texture_width =        (cpu_woi[2] + 1) * DTT_SIZE;
-	 int texture_tiles_height = (cpu_woi[3] + 1) * DTT_SIZE;
-	 int texture_slices =       colors + 1;
-	 if (keep_weights & 2){
-		 texture_slices += colors * num_cams;
-	 }
-
-	 dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
-	 int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
-	 dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
-
-
-#ifdef DEBUG8A
-	 int                cpu_texture_indices      [TILESX*TILESYA];
-	 checkCudaErrors(cudaMemcpy(
-			 (float * ) cpu_texture_indices,
-			 gpu_texture_indices,
-			 TILESX*TILESYA * sizeof(float),
-			 cudaMemcpyDeviceToHost));
-	 for (int i = 0; i < 256; i++){
-		 int indx = cpu_texture_indices[i];
-		 printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
-	 }
-#endif // #ifdef DEBUG8A
-
-
-	 clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
-			 texture_width,
-			 texture_tiles_height * texture_slices, // int               texture_slice_height,
-			 texture_rbga_stride,                   // const size_t      texture_rbga_stride,     // in floats 8*stride
-			 gpu_texture_tiles) ;                   // float           * gpu_texture_tiles);
-	 // Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
-	 // oversize (border 16x 16 tiles overhang by 4 pixels)
-
- 	checkCudaErrors(cudaDeviceSynchronize());  // not needed yet, just for testing
-	 for (int pass = 0; pass < 8; pass++){
-		 int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
-		 dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-
-		 int border_tile =  pass >> 2;
-		 int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
-		 int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
-		 dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
-
-		 /* before CDP2
-		 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
-		 if (border_tile){
-			 ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
-		 }
-		 */
-		 // for CDP2
-		 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
-		 if (border_tile){
-//	    	ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
-			 ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
-			 ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
-		 }
-
-#ifdef DEBUG8A
-		 printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
-				 pass, border_tile,ti_offset, ntt);
-		 printf("\ngenerate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
-				 (void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
-		 printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
-				 grid_texture.x, grid_texture.y, grid_texture.z);
-		 printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
-				 threads_texture.x, threads_texture.y, threads_texture.z);
-		 printf("\n");
-
-
-#endif
-		 /* */
-		 int shared_size = host_get_textures_shared_size( // in bytes
-				 num_cams,     // int                num_cams,     // actual number of cameras
-				 colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
-				 0);           // int *              offsets);     // in floats
-    	 printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
-		 cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
-		 cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
-		 textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
-				 num_cams,                        // int               num_cams,           // number of cameras used
-				 gpu_woi,                             // int             * woi,                // x, y, width,height
-				 gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				 pntt, // ntt,                    // int *             num_texture_tiles,  // number of texture tiles to process
-				 ti_offset,                       //                gpu_texture_indices_offset,// add to gpu_texture_indices
-				 gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-//				 gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-				 gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
-				 colors,                          // int               colors,             // number of colors (3/1)
-				 is_lwir,                         // int               is_lwir,            // do not perform shot correction
-				 min_shot,                        // float             min_shot,           // 10.0
-				 scale_shot,                      // float             scale_shot,         // 3.0
-				 diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
-				 diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
-				 min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-				 weights,                         // float             weights[3],         // scale for R,B,G
-				 dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-				 keep_weights,                    // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
-				 // combining both non-overlap and overlap (each calculated if pointer is not null )
-				 texture_rbga_stride,             // size_t      texture_rbg_stride, // in floats
-				 gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
-				 0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
-				 (float *) 0, // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-				 0, // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
-				 (float *)0, //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
-				 width);
-	    	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
-		 /* */
-
-	 }
-
-//	 checkCudaErrors(cudaFree(gpu_woi));
-//	 checkCudaErrors(cudaFree(gpu_num_texture_tiles));
-	 //	 __syncthreads();
-}
-#endif // 0
-

 /**
 **************************************************************************
@@ -391,95 +116,22 @@ int main(int argc, char **argv)
 #else
    int use_lwir= 0;
 #endif
+    class YourClass
+    {
+        std::vector<int> myVector;
+        // ...
+    };

+    TpParams tpParams(use_lwir);
    TpPaths tpPaths(use_lwir);
-    GenerateRgbaHost generateRgbaHost{}; //  = new GenerateRgbaHost();

-#if TEST_LWIR
-
-    float color_weights [] = {
-            1.0,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
-            1.0,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
-            1.0};             // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
-	float generate_RBGA_params[]={
-			10.0,                  // float             min_shot,           // 10.0
-            3.0,                   // float             scale_shot,         // 3.0
-            10.0, // 1.5f,                  // float             diff_sigma,         // pixel value/pixel change
-            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
-            12.0    // 3.0         // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-	};
-
-#else
-    float color_weights [] = {
-            0.294118,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
-            0.117647,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
-            0.588235};              // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
-	float generate_RBGA_params[]={
-			10.0,                  // float             min_shot,           // 10.0
-            3.0,                   // float             scale_shot,         // 3.0
-            1.5f,                  // float             diff_sigma,         // pixel value/pixel change
-            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
-            3.0                    // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-	};
+    TpHostGpu tpHostGpu(tpParams,tpPaths);

-#endif
-
-
-
-	[[maybe_unused]] int sel_pairs[4];
-
-#if TEST_LWIR
-    // testing with 16 LWIR
-    int num_cams =   16;
-    int num_colors = 1;
-    sel_pairs[0] =   0xffffffff;
-    sel_pairs[1] =   0xffffffff;
-    sel_pairs[2] =   0xffffffff;
-    sel_pairs[3] =   0x00ffffff;
-    int num_pairs = 120;
-#else
-    // testing with quad RGB
-    int num_cams =   4;
-    int num_colors = 3;
-    sel_pairs[0] = 0x3f;
-    sel_pairs[1] =   0;
-    sel_pairs[2] =   0;
-    sel_pairs[3] =   0;
-    int num_pairs =  6;
-#endif
-
-	int task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
-
-	// FIXME: update to use new correlations and num_cams
-    float port_offsets4[4][2] =  {// used only in textures to scale differences
-			{-0.5, -0.5},
-			{ 0.5, -0.5},
-			{-0.5,  0.5},
-			{ 0.5,  0.5}};
-
-    float port_offsets[NUM_CAMS][2];
-    if (num_cams == 4){
-    	for (int ncam = 0; ncam < 4; ncam++){
-    		port_offsets[ncam][0]= port_offsets4[ncam][0];
-    		port_offsets[ncam][1]= port_offsets4[ncam][1];
-    	}
-    } else {
-		 for (int ncam = 0; ncam < num_cams; ncam++) {
-			 double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
-			 port_offsets[ncam][0] =  0.5 * sin((alpha));
-			 port_offsets[ncam][1] = -0.5 * cos((alpha));
-		 }
-    }
-
-    int keep_texture_weights = 3; // 0; // 1; // try with 0 also
-    int texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
+    GenerateRgbaHost generateRgbaHost{}; //  = new GenerateRgbaHost();

-    int KERN_TILES = KERNELS_HOR *  KERNELS_VERT * num_colors; // NUM_COLORS;
-    int KERN_SIZE =  KERN_TILES * 4 * 64;
-    int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);
-    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
-    float * ftask_data  =  (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
-    float * ftask_data1  = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
+    float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
+    float * ftask_data  =   (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
+    float * ftask_data1  =  (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));

    trot_deriv  rot_deriv;

@@ -488,12 +140,12 @@ int main(int argc, char **argv)


    // host array of pointers to GPU memory
-    float            * gpu_kernels_h        [num_cams];
-    struct CltExtra  * gpu_kernel_offsets_h [num_cams];
-    float            * gpu_images_h         [num_cams];
-    float              tile_coords_h        [num_cams][TILESX * TILESY][2];
-    float            * gpu_clt_h            [num_cams];
-    float            * gpu_corr_images_h    [num_cams];
+    float            * gpu_kernels_h        [tpParams.num_cams];
+    struct CltExtra  * gpu_kernel_offsets_h [tpParams.num_cams];
+    float            * gpu_images_h         [tpParams.num_cams];
+    float              tile_coords_h        [tpParams.num_cams][TILESX * TILESY][2];
+    float            * gpu_clt_h            [tpParams.num_cams];
+    float            * gpu_corr_images_h    [tpParams.num_cams];

    float            * gpu_corrs;               // correlation tiles (per tile, per pair) in pixel domain
    float            * gpu_corrs_td;            // correlation tiles (per tile, per pair) in transform domain
@@ -582,21 +234,21 @@ int main(int argc, char **argv)
    checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));

 ///    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
 				tpPaths.kernel_file[ncam]); // 			   char *  path) // file path
-        gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);
+        gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, tpParams.kern_size);

        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
 				tpPaths.kernel_offs_file[ncam]); // 			   char *  path) // file path
        gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
        		host_kern_buf,
-				KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
+				tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
        // will get results back
-        gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE);
-        printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE)) ;
+        gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
+        printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE)) ;
        // allocate result images (3x height to accommodate 3 colors

        // Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
@@ -605,23 +257,25 @@ int main(int argc, char **argv)
        		&dstride_rslt,                // size_t* dstride, // in bytes!!
 				IMG_WIDTH + DTT_SIZE,         // int width,
 //				3*(IMG_HEIGHT + DTT_SIZE));   // int height);
-				num_colors*(IMG_HEIGHT + DTT_SIZE));   // int height);
+				tpParams.num_colors*(IMG_HEIGHT + DTT_SIZE));   // int height);
    }
+
+
    // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
    gpu_corrs = alloc_image_gpu(
    		&dstride_corr,                  // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
-			CORR_SIZE,                      // int width,
-			num_pairs * TILESX * TILESY);   // int height);
+			tpParams.corr_size,                      // int width,
+			tpParams.num_pairs * TILESX * TILESY);   // int height);
    // read channel images (assuming host_kern_buf size > image size, reusing it)
 // allocate all other correlation data, some may be
    gpu_corrs_td = alloc_image_gpu(
    		&dstride_corr_td,               // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
 			4 * DTT_SIZE * DTT_SIZE,         // int width,
-			num_pairs * TILESX * TILESY);    // int height);
+			tpParams.num_pairs * TILESX * TILESY);    // int height);

    gpu_corrs_combo = alloc_image_gpu(
    		&dstride_corr_combo,             // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
-			CORR_SIZE,                       // int width,
+			tpParams.corr_size,                       // int width,
 			TILESX * TILESY);                // int height);

    gpu_corrs_combo_td = alloc_image_gpu(
@@ -630,7 +284,7 @@ int main(int argc, char **argv)
 			TILESX * TILESY);                // int height);


-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
 				tpPaths.image_files[ncam]); // 			   char *  path) // file path
@@ -643,47 +297,48 @@ int main(int argc, char **argv)
 //#define DBG_TILE  (174*324 +118)

 //  for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
        readFloatsFromFile(
 			    (float *) &tile_coords_h[ncam],
 				tpPaths.ports_offs_xy_file[ncam]); // 			   char *  path) // file path
    }

+    // tasks for all tiles
    for (int ty = 0; ty < TILESY; ty++){
        for (int tx = 0; tx < TILESX; tx++){
            int nt = ty * TILESX + tx;
            int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
            int task_txy = tx + (ty << 16);
            float task_target_disparity = DBG_DISPARITY;
-            float * tp = ftask_data + task_size * nt;
+            float * tp = ftask_data + tpParams.task_size * nt;
            *(tp + TP_TASK_TASK_OFFSET) =    *(float *) &task_task;
            *(tp + TP_TASK_TXY_OFFSET) =      *(float *) &task_txy;
            *(tp + TP_TASK_DISPARITY_OFFSET) =  task_target_disparity;
 //            tp += 2; // skip centerX, centerY
            *(tp + TP_TASK_SCALE_OFFSET) =  0; // 0.5f; // ,0; // scale, 0 - old way, just set
            tp+= TP_TASK_XY_OFFSET;
-            for (int ncam = 0; ncam < num_cams; ncam++) {
+            for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
            	*(tp++) = tile_coords_h[ncam][nt][0];
            	*(tp++) = tile_coords_h[ncam][nt][1];
            }
        }
    }

-    int tp_task_size =  TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
+    int tp_task_size =  TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
    int num_active_tiles; // will be calculated by convert_direct
 	int rslt_corr_size;
 	int corr_img_size;

-    gpu_ftasks = (float  *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
+    gpu_ftasks = (float  *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));

    // just allocate
-    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        num_pairs * TILESX * TILESY*sizeof(int)));
+    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
    checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices,             TILESX * TILESY*sizeof(int)));
    num_textures = 0;
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
    		int nt = ty * TILESX + tx;
-            float *tp = ftask_data + task_size * nt;
+            float *tp = ftask_data + tpParams.task_size * nt;
    		int cm = (*(int *) tp) & TASK_TEXTURE_BITS; // non-zero any of 4 lower task bits
    		if (cm){
    			texture_indices[num_textures++] = (nt << TEXT_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // setting  0x80 in texture indices
@@ -704,16 +359,14 @@ int main(int argc, char **argv)

    checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
    // number of border tiles
-
    // copy port indices to gpu
+	gpu_port_offsets =         (float *) copyalloc_kernel_gpu((float * ) tpParams.port_offsets, tpParams.num_cams * 2); // num_ports * 2);
+    gpu_color_weights =        (float *) copyalloc_kernel_gpu((float * ) tpParams.color_weights, sizeof(tpParams.color_weights));
+    gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) tpParams.generate_RBGA_params, sizeof(tpParams.generate_RBGA_params));

-	gpu_port_offsets =         (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_cams * 2); // num_ports * 2);
-    gpu_color_weights =        (float *) copyalloc_kernel_gpu((float * ) color_weights, sizeof(color_weights));
-    gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) generate_RBGA_params, sizeof(generate_RBGA_params));
-
-///    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
+///    int tile_texture_size = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (NUM_CAMS + tpParams.texture_colors + 1): 0)) *256;
    // in Java always allocated as for keep_texture_weights = 1;
-    int tile_texture_layers = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0));
+    int tile_texture_layers = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (tpParams.num_cams + tpParams.texture_colors + 1): 0));
    int tile_texture_size = tile_texture_layers *256;

    gpu_textures = alloc_image_gpu(
@@ -723,9 +376,9 @@ int main(int argc, char **argv)

    int rgba_width =   (TILESX+1) * DTT_SIZE;
    int rgba_height =  (TILESY+1) * DTT_SIZE;
-    int rbga_slices =  texture_colors + 1; // 4/1
-    if (keep_texture_weights & 2){
-    	rbga_slices += texture_colors * num_cams;
+    int rbga_slices =  tpParams.texture_colors + 1; // 4/1
+    if (tpParams.keep_texture_weights & 2){
+    	rbga_slices += tpParams.texture_colors * tpParams.num_cams;
    }

    gpu_textures_rbga = alloc_image_gpu(
@@ -733,14 +386,14 @@ int main(int argc, char **argv)
 			rgba_width,              // int width (floats),
 			rgba_height * rbga_slices);               // int height);
 ///    checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo,  TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float)));
-    checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo,  TILESX * TILESY * num_cams * (num_colors + 1) * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo,  TILESX * TILESY * tpParams.num_cams * (tpParams.num_colors + 1) * sizeof(float)));

    // Now copy arrays of per-camera pointers to GPU memory to GPU itself
-    gpu_kernels =        copyalloc_pointers_gpu (gpu_kernels_h,     num_cams); // NUM_CAMS);
-    gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, num_cams); // NUM_CAMS);
-    gpu_images =         copyalloc_pointers_gpu (gpu_images_h,      num_cams); // NUM_CAMS);
-    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         num_cams); // NUM_CAMS);
-    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, num_cams); // NUM_CAMS);
+    gpu_kernels =        copyalloc_pointers_gpu (gpu_kernels_h,     tpParams.num_cams); // NUM_CAMS);
+    gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, tpParams.num_cams); // NUM_CAMS);
+    gpu_images =         copyalloc_pointers_gpu (gpu_images_h,      tpParams.num_cams); // NUM_CAMS);
+    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         tpParams.num_cams); // NUM_CAMS);
+    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, tpParams.num_cams); // NUM_CAMS);

 #ifdef DBG_TILE
    const int numIterations = 1; //0;
@@ -752,7 +405,7 @@ int main(int argc, char **argv)

 	int corr_size =        2 * CORR_OUT_RAD + 1;
 	int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
-	int num_corr_indices = num_pairs * num_tiles;
+	int num_corr_indices = tpParams.num_pairs * num_tiles;

 	float * corr_img; //  = (float *)malloc(corr_img_size * sizeof(float));
 	float * cpu_corr; //  = (float *)malloc(rslt_corr_size * sizeof(float));
@@ -766,7 +419,7 @@ int main(int argc, char **argv)
 #ifdef  TEST_ROT_MATRICES
    dim3 threads_rot(3,3,3);
 ///    dim3 grid_rot   (NUM_CAMS, 1, 1);
-    dim3 grid_rot   (num_cams, 1, 1);
+    dim3 grid_rot   (tpParams.num_cams, 1, 1);

    printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
    printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
@@ -782,7 +435,7 @@ int main(int argc, char **argv)
    	}

    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
-    			num_cams,                // int                  num_cams,
+    			tpParams.num_cams,                // int                  num_cams,
    			gpu_correction_vector ,  // struct corr_vector * gpu_correction_vector,
    			gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);

@@ -808,7 +461,7 @@ int main(int argc, char **argv)
 #ifdef  TEST_REVERSE_DISTORTIONS
    dim3 threads_rd(3,3,3);
    dim3 grid_rd   (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
-//    dim3 grid_rd   (num_cams, 1, 1);
+//    dim3 grid_rd   (tpParams.num_cams, 1, 1);

    printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
    printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
@@ -873,7 +526,7 @@ int main(int argc, char **argv)
 #define TEST_GEOM_CORR
 #ifdef  TEST_GEOM_CORR
 ///    dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
-    dim3 threads_geom(num_cams,TILES_PER_BLOCK_GEOM, 1);
+    dim3 threads_geom(tpParams.num_cams,TILES_PER_BLOCK_GEOM, 1);
    dim3 grid_geom   ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
    printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
    printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
@@ -889,7 +542,7 @@ int main(int argc, char **argv)
    	}
 /*
    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
-    			num_cams,                // int                  num_cams,
+    			tpParams.num_cams,                // int                  num_cams,
    			gpu_tasks,               // struct tp_task     * gpu_tasks,
 				tp_task_size,            // int                  num_tiles,          // number of tiles in task list
 				gpu_geometry_correction, //	struct gc          * gpu_geometry_correction,
@@ -899,7 +552,7 @@ int main(int argc, char **argv)
 				*/
    	calculate_tiles_offsets<<<1,1>>> (
    			1,                       // int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
-    			num_cams,                // int                  num_cams,
+				tpParams.num_cams,                // int                  num_cams,
 				gpu_ftasks,              // float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //    			gpu_tasks,               // struct tp_task     * gpu_tasks,
 				tp_task_size,            // int                  num_tiles,          // number of tiles in task list
@@ -927,7 +580,7 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
 			ftask_data1,
 			gpu_ftasks,
-			tp_task_size * task_size *sizeof(float),
+			tp_task_size * tpParams.task_size *sizeof(float),
 			cudaMemcpyDeviceToHost));

 //task_size
@@ -936,19 +589,19 @@ int main(int argc, char **argv)
 	struct tp_task * new_task = &task_data1[DBG_TILE];
 #endif
 #ifdef 	DBG_TILE
-    printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
-    printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
-        for (int ncam = 0; ncam < num_cams; ncam++){
+    printf("old_task txy = 0x%x\n", *(int *) (ftask_data + tpParams.task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
+    printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + tpParams.task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
+        for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
            printf("camera %d pX old %f new %f diff = %f\n", ncam,
-            		 *(ftask_data  + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
-            		 *(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
-            		 (*(ftask_data + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
-            		 (*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
+            		 *(ftask_data  + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
+            		 *(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
+            		 (*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
+            		 (*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
            printf("camera %d pY old %f new %f diff = %f\n", ncam,
-           		 *(ftask_data  +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
-           		 *(ftask_data1 +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
-           		 (*(ftask_data +  task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
-           		 (*(ftask_data1 + task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
+           		 *(ftask_data  +  tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
+           		 *(ftask_data1 +  tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
+           		 (*(ftask_data +  tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
+           		 (*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
        }
 #endif //#ifdef 	DBG_TILE
 #endif // TEST_GEOM_CORR
@@ -964,7 +617,7 @@ int main(int argc, char **argv)
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);

    ///    cudaProfilerStart();
-    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //  [num_cams]  [NUM_CAMS];
+    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //  [tpParams.num_cams]  [NUM_CAMS];

    for (int i = i0; i < numIterations; i++)
    {
@@ -975,8 +628,8 @@ int main(int argc, char **argv)
    		sdkStartTimer(&timerTP);
    	}
    	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-    			num_cams,              // int                num_cams,           // actual number of cameras
-				num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+    			tpParams.num_cams,              // int                num_cams,           // actual number of cameras
+				tpParams.num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
    			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
 				gpu_kernels,           // float           ** gpu_kernels,
 				gpu_images,            // float           ** gpu_images,
@@ -1012,10 +665,10 @@ int main(int argc, char **argv)


 #ifdef SAVE_CLT
-    int rslt_size = (TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE);
+    int rslt_size = (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
    float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
 //  for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
    	checkCudaErrors(cudaMemcpy( // segfault
    			cpu_clt,
 				gpu_clt_h[ncam],
@@ -1036,7 +689,7 @@ int main(int argc, char **argv)
    	printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    	printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
 //      for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-   	    for (int ncam = 0; ncam < num_cams; ncam++) {
+   	    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
    		test_imclt<<<grid_imclt,threads_imclt>>>(
    				gpu_clt_h[ncam], // ncam]); //                //       float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
 					ncam);                                        // int             ncam); // just for debug print
@@ -1063,11 +716,11 @@ int main(int argc, char **argv)
        printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
        printf("grid_imclt_all=   (%d, %d, %d)\n",grid_imclt_all.x,   grid_imclt_all.y,   grid_imclt_all.z);
        imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
-        		num_cams,                    // int                num_cams,
+        		tpParams.num_cams,                    // int                num_cams,
        		gpu_clt,                     // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
 				gpu_corr_images,             // float           ** gpu_corr_images,    // [num_cams][WIDTH, 3 * HEIGHT]
 				1,                           // int               apply_lpf,
-				num_colors,                  // int               colors,               // defines lpf filter
+				tpParams.num_colors,                  // int               colors,               // defines lpf filter
 				TILESX,                      // int               woi_twidth,
 				TILESY,                      // int               woi_theight,
 				dstride_rslt/sizeof(float)); // const size_t      dstride);            // in floats (pixels)
@@ -1083,13 +736,13 @@ int main(int argc, char **argv)
    sdkDeleteTimer(&timerIMCLT);
    printf("Average IMCLT run time =%f ms\n",  avgTimeIMCLT);

-    int rslt_img_size =       num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
+    int rslt_img_size =       tpParams.num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
    float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));



 //  for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
    	checkCudaErrors(cudaMemcpy2D( // segfault
    			cpu_corr_image,
 				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
@@ -1097,7 +750,7 @@ int main(int argc, char **argv)
 				dstride_rslt,
 				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
 //				3* (IMG_HEIGHT + DTT_SIZE),
-				num_colors* (IMG_HEIGHT + DTT_SIZE),
+				tpParams.num_colors* (IMG_HEIGHT + DTT_SIZE),
    			cudaMemcpyDeviceToHost));
        printf("Writing RBG data to %s\n",  tpPaths.result_rbg_file[ncam]);
    	writeFloatsToFile( // will have margins
@@ -1124,16 +777,16 @@ int main(int argc, char **argv)
    		sdkStartTimer(&timerCORR);
    	}
    	correlate2D<<<1,1>>>(
-    			num_cams,                      // int               num_cams,
-				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
-				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
-				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
-				sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
+    			tpParams.num_cams,                      // int               num_cams,
+				TpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				TpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				TpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				TpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
 				gpu_clt,                    // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-				num_colors,                 // int               colors,             // number of colors (3/1)
-				color_weights[0], // 0.25,  // float             scale0,             // scale for R
-				color_weights[1], // 0.25,  // float             scale1,             // scale for B
-				color_weights[2], // 0.5,   // float             scale2,             // scale for G
+				tpParams.num_colors,                 // int               colors,             // number of colors (3/1)
+				tpParams.color_weights[0], // 0.25,  // float             scale0,             // scale for R
+				tpParams.color_weights[1], // 0.25,  // float             scale1,             // scale for B
+				tpParams.color_weights[2], // 0.5,   // float             scale2,             // scale for G
 				fat_zero * fat_zero,                // float             fat_zero2,           // here - absolute
 				gpu_ftasks,                 // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				tp_task_size,               // int               num_tiles) // number of tiles in task
@@ -1216,7 +869,7 @@ int main(int argc, char **argv)

 #ifndef NSAVE_CORR
    printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
-    		tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
+    		tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
    /*
    		writeFloatsToFile(
    				cpu_corr,    // float *       data, // allocated array
@@ -1249,16 +902,16 @@ int main(int argc, char **argv)
    	}
    	// FIXME: provide sel_pairs
        correlate2D<<<1,1>>>( // output TD tiles, no normalization
-        		num_cams,                      // int               num_cams,
-				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
-				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
-				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
-				sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
+        		tpParams.num_cams,                      // int               num_cams,
+				TpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				TpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				TpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				TpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
        		gpu_clt,                       // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-				num_colors,                    // int               colors,             // number of colors (3/1)
-				color_weights[0], // 0.25,     // float             scale0,             // scale for R
-				color_weights[1], // 0.25,     // float             scale1,             // scale for B
-				color_weights[2], // 0.5,      // float             scale2,             // scale for G
+				tpParams.num_colors,                    // int               colors,             // number of colors (3/1)
+				tpParams.color_weights[0], // 0.25,     // float             scale0,             // scale for R
+				tpParams.color_weights[1], // 0.25,     // float             scale1,             // scale for B
+				tpParams.color_weights[2], // 0.5,      // float             scale2,             // scale for G
 				fat_zero*fat_zero,                     // float             fat_zero2,          // here - absolute (squared)
 				gpu_ftasks,                    // float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				tp_task_size,                  // int               num_tiles) // number of tiles in task
@@ -1279,10 +932,10 @@ int main(int argc, char **argv)
    			cudaMemcpyDeviceToHost));

 #ifdef QUAD_COMBINE
-    	num_corr_combo = num_corrs/num_pairs;
+    	num_corr_combo = num_corrs/tpParams.num_pairs;
    	corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
    			num_corr_combo, // tp_task_size,     // int               num_tiles,          // number of tiles to process (each with num_pairs)
-				num_pairs,                           // int               num_pairs,          // num pairs per tile (should be the same)
+				tpParams.num_pairs,                  // int               num_pairs,          // num pairs per tile (should be the same)
    			1,                                   // int               init_output,        // !=0 - reset output tiles to zero before accumulating
    			0x0f,                                // int               pairs_mask,         // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 				gpu_corr_indices,                    // int             * gpu_corr_indices,   // packed tile+pair
@@ -1384,7 +1037,7 @@ int main(int argc, char **argv)
 			cudaMemcpyDeviceToHost));
    //    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        num_pairs * TILESX * TILESY*sizeof(int)));
 //    int num_tiles = TILESX * TILESYA;
-//    int num_corr_indices = num_pairs * num_tiles;
+//    int num_corr_indices = tpParams.num_pairs * num_tiles;
 //    int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
    checkCudaErrors(cudaMemcpy(
    		cpu_corr_indices,
@@ -1402,12 +1055,12 @@ int main(int argc, char **argv)
    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
    	int ty = ctt / TILESX;
    	int tx = ctt % TILESX;
-    	//		int src_offs0 = ict * num_pairs * corr_size * corr_size;
+    	//		int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
    	int src_offs0 = ict * corr_size * corr_size;
    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);

    	for (int iy = 0; iy < corr_size; iy++){
-    		int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
+    		int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
    		int dst_offs = dst_offs0 + iy * (TILESX * 16);
    		for (int ix = 0; ix < corr_size; ix++){
    			corr_img[dst_offs++] = cpu_corr[src_offs++];
@@ -1418,7 +1071,7 @@ int main(int argc, char **argv)

 #ifndef NSAVE_CORR
    printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
-    		tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
+    		tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
    writeFloatsToFile(
    		corr_img,                  // float *       data, // allocated array
 			corr_img_size,             // int           size, // length in elements
@@ -1490,24 +1143,24 @@ int main(int argc, char **argv)
 #ifdef CORR_INTER_SELF
    int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
    int num_sel_sensors = 16; // 15; // 16;
-    num_pairs = num_sel_sensors+1;
-    num_corr_indices = num_pairs * num_tiles;
+    int num_pairs_inter = num_sel_sensors+1;
+    num_corr_indices = num_pairs_inter * num_tiles;
    int is_bayer = 0;
    int image_dx = 2;
    int image_dy = 0;

-    float            * gpu_clt_ref_h            [num_cams];
+    float            * gpu_clt_ref_h            [tpParams.num_cams];
    float           ** gpu_clt_ref;    //            [NUM_CAMS];
-    for (int ncam = 0; ncam < num_cams; ncam++) {
-       gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE);
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
+       gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
    }
-    gpu_clt_ref =      copyalloc_pointers_gpu (gpu_clt_ref_h,  num_cams); // NUM_CAMS);
+    gpu_clt_ref =      copyalloc_pointers_gpu (gpu_clt_ref_h,  tpParams.num_cams); // NUM_CAMS);

    // use gpu_images and convert to gpu_clt_ref

 	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-			num_cams,              // int                num_cams,           // actual number of cameras
-			num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			tpParams.num_cams,              // int                num_cams,           // actual number of cameras
+			tpParams.num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
 			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
 			gpu_kernels,           // float           ** gpu_kernels,
 			gpu_images,            // float           ** gpu_images,
@@ -1528,7 +1181,7 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaDeviceSynchronize());

 	// re-read same images. shift them, update gpu_images and convert to gpu_clt;
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
 				tpPaths.image_files[ncam]); // 			   char *  path) // file path
@@ -1548,8 +1201,8 @@ int main(int argc, char **argv)
    }

 	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-			num_cams,              // int                num_cams,           // actual number of cameras
-			num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			tpParams.num_cams,              // int                num_cams,           // actual number of cameras
+			tpParams.num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
 			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
 			gpu_kernels,           // float           ** gpu_kernels,
 			gpu_images,            // float           ** gpu_images,
@@ -1582,14 +1235,14 @@ int main(int argc, char **argv)
    		sdkStartTimer(&timerINTERSELF);
    	}
    	correlate2D_inter<<<1,1>>>( // only results in TD
-    			num_cams,                      // int               num_cams,
+    			tpParams.num_cams,                      // int               num_cams,
 				sel_sensors,                   // int               sel_sensors,
 				gpu_clt,                       // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 				gpu_clt_ref, // *********      // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				num_colors,                    // int               colors,             // number of colors (3/1)
-				color_weights[0], // 0.25,     // float             scale0,             // scale for R
-				color_weights[1], // 0.25,     // float             scale1,             // scale for B
-				color_weights[2], // 0.5,      // float             scale2,             // scale for G
+				tpParams.num_colors,                    // int               colors,             // number of colors (3/1)
+				tpParams.color_weights[0], // 0.25,     // float             scale0,             // scale for R
+				tpParams.color_weights[1], // 0.25,     // float             scale1,             // scale for B
+				tpParams.color_weights[2], // 0.5,      // float             scale2,             // scale for G
 				gpu_ftasks,                    // float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				tp_task_size,                  // int               num_tiles) // number of tiles in task
 				TILESX,                        // int               tilesx,             // number of tile rows
@@ -1676,7 +1329,7 @@ int main(int argc, char **argv)
    }
 #ifndef NSAVE_CORR
    printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
-    		tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
+    		tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
    writeFloatsToFile(
    		corr_img,                  // float *       data, // allocated array
 			corr_img_size,             // int           size, // length in elements
@@ -1757,7 +1410,7 @@ int main(int argc, char **argv)
    free (cpu_corr_indices);
    free (corr_img);

-    for (int ncam = 0; ncam < num_cams; ncam++){
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
    	checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
    }
    checkCudaErrors(cudaFree(gpu_clt_ref));
@@ -1797,10 +1450,10 @@ int main(int argc, char **argv)
    		    		sdkStartTimer(&timerTEXTURE);
    		    	}
    		    	int shared_size = host_get_textures_shared_size( // in bytes
-    		    			num_cams,         // int                num_cams,     // actual number of cameras
-							texture_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+    		    			tpParams.num_cams,         // int                num_cams,     // actual number of cameras
+							tpParams.texture_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 							0);           // int *              offsets);     // in floats
-    				printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
+    				printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, texture_colors);
    				//*pnum_texture_tiles = 0;
    				cpu_pnum_texture_tiles = 0;
    				checkCudaErrors(cudaMemcpy(
@@ -1813,7 +1466,7 @@ int main(int argc, char **argv)

 #ifdef NO_DP
    				 create_nonoverlap_list<<<blocks0,threads0>>>(
-    						 num_cams,                // int                num_cams,
+    						 tpParams.num_cams,                // int                num_cams,
    						 gpu_ftasks,              // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 							 tp_task_size,            // int                num_tiles,           // number of tiles in task
 							 TILESX,                  // int                width,               // number of tiles in a row
@@ -1836,23 +1489,23 @@ int main(int argc, char **argv)
    		     		printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
    		     		printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
    				textures_accumulate <<<grid_texture1,threads_texture1,  shared_size>>>( // 65536>>>( //
-    						num_cams,                        // 	int               num_cams,           // number of cameras used
+    						tpParams.num_cams,                        // 	int               num_cams,           // number of cameras used
 							(int *) 0,                       // int             * woi,                // x, y, width,height
 							gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 							gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles,          // *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
 							0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
 							gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 							gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
-							texture_colors,                  // int               colors,             // number of colors (3/1)
-							(texture_colors == 1),           // int               is_lwir,            // do not perform shot correction
-							generate_RBGA_params[0], // min_shot,      // float             min_shot,           // 10.0
-							generate_RBGA_params[1], // scale_shot,    // float             scale_shot,         // 3.0
-							generate_RBGA_params[2], // diff_sigma,    // float             diff_sigma,         // pixel value/pixel change
-							generate_RBGA_params[3], // diff_threshold,// float             diff_threshold,     // pixel value/pixel change
-							generate_RBGA_params[4], // min_agree,     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+							tpParams.texture_colors,                  // int               colors,             // number of colors (3/1)
+							(tpParams.texture_colors == 1),           // int               is_lwir,            // do not perform shot correction
+							tpParams.generate_RBGA_params[0], // min_shot,      // float             min_shot,           // 10.0
+							tpParams.generate_RBGA_params[1], // scale_shot,    // float             scale_shot,         // 3.0
+							tpParams.generate_RBGA_params[2], // diff_sigma,    // float             diff_sigma,         // pixel value/pixel change
+							tpParams.generate_RBGA_params[3], // diff_threshold,// float             diff_threshold,     // pixel value/pixel change
+							tpParams.generate_RBGA_params[4], // min_agree,     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 							gpu_color_weights,               // float             weights[3],         // scale for R,B,G
 							1,                       // dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-							keep_texture_weights, // 0, // 1                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+							tpParams.keep_texture_weights, // 0, // 1                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
 							// combining both non-overlap and overlap (each calculated if pointer is not null )
 							0,                               // size_t      texture_rbg_stride, // in floats
 							(float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
@@ -1862,9 +1515,9 @@ int main(int argc, char **argv)
 							gpu_diff_rgb_combo, //);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 							TILESX);
 #else // #ifdef NO_DP
-    				//keep_texture_weights is assumed 0 in textures_nonoverlap
+    				//tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
    		    	textures_nonoverlap<<<1,1>>> ( //,65536>>> (
-    		    			num_cams,              // int                num_cams,           // number of cameras used
+    		    			tpParams.num_cams,              // int                num_cams,           // number of cameras used
    						gpu_ftasks,            // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats
    		                tp_task_size,          // int                num_tiles,          // number of tiles in task list
    		    	// declare arrays in device code?
@@ -1872,12 +1525,12 @@ int main(int argc, char **argv)
    						gpu_pnum_texture_tiles, // int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
    				        gpu_clt ,              // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    						gpu_geometry_correction, // struct gc     * gpu_geometry_correction,
-    						texture_colors,        // int               colors,             // number of colors (3/1)
-    						(texture_colors == 1), // int               is_lwir,            // do not perform shot correction
+							tpParams.texture_colors,        // int               colors,             // number of colors (3/1)
+    						(tpParams.texture_colors == 1), // int               is_lwir,            // do not perform shot correction
    						gpu_generate_RBGA_params,
    						gpu_color_weights,     // float             weights[3],         // scale for R
    						1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
-							keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+							tpParams.keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
    		    	// combining both non-overlap and overlap (each calculated if pointer is not null )
    						dstride_textures/sizeof(float), // size_t            texture_stride,     // in floats (now 256*4 = 1024)  // may be 0 if not needed
    						gpu_textures,         // float           * gpu_texture_tiles,
@@ -1950,7 +1603,7 @@ int main(int argc, char **argv)


    		    int ntiles = TILESX * TILESY;
-    		    int nlayers = num_cams * (num_colors + 1);
+    		    int nlayers = tpParams.num_cams * (tpParams.num_colors + 1);
    		    int diff_rgb_combo_size = ntiles * nlayers;
    		    float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size  * sizeof(float));
    			checkCudaErrors(cudaMemcpy(
@@ -2037,7 +1690,7 @@ int main(int argc, char **argv)
    	// FIXME: update to use new correlations and num_cams
 #ifdef NO_DP
    	generateRgbaHost.generate_RBGA_host (
-    			num_cams,              // int                num_cams,           // number of cameras used
+    			tpParams.num_cams,              // int                num_cams,           // number of cameras used
    	// Parameters to generate texture tasks
 				gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
                tp_task_size,          // int                num_tiles,          // number of tiles in task list
@@ -2050,24 +1703,24 @@ int main(int argc, char **argv)
    	// Parameters for the texture generation
 	            gpu_clt ,              // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				gpu_geometry_correction, // struct gc     * gpu_geometry_correction,
-	            texture_colors,        // int               colors,             // number of colors (3/1)
-	            (texture_colors == 1), // int               is_lwir,            // do not perform shot correction
-				generate_RBGA_params, //		float             cpu_params[5],      // mitigating CUDA_ERROR_INVALID_PTX
+				tpParams.texture_colors,        // int               colors,             // number of colors (3/1)
+	            (tpParams.texture_colors == 1), // int               is_lwir,            // do not perform shot correction
+				tpParams.generate_RBGA_params, //		float             cpu_params[5],      // mitigating CUDA_ERROR_INVALID_PTX
 				gpu_color_weights,     // float             weights[3],         // scale for R
 	            1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
-				keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
+				tpParams.keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
 				dstride_textures_rbga/sizeof(float), // 	const size_t      texture_rbga_stride,     // in floats
 				gpu_textures_rbga);     // 	float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
 #else
    	int shared_size = host_get_textures_shared_size( // in bytes
-    	        num_cams,     // int                num_cams,     // actual number of cameras
-				texture_colors, // colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+    			tpParams.num_cams,     // int                num_cams,     // actual number of cameras
+				tpParams.texture_colors, // colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
    	        0);           // int *              offsets);     // in floats
-    	printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
+    	printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);

 		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
    	generate_RBGA<<<1,1>>> (
-    			num_cams,              // int                num_cams,           // number of cameras used
+    			tpParams.num_cams,              // int                num_cams,           // number of cameras used
    	// Parameters to generate texture tasks
 				gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //                gpu_tasks,             // struct tp_task   * gpu_tasks,
@@ -2081,12 +1734,12 @@ int main(int argc, char **argv)
    	// Parameters for the texture generation
 	            gpu_clt ,              // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				gpu_geometry_correction, // struct gc     * gpu_geometry_correction,
-	            texture_colors,        // int               colors,             // number of colors (3/1)
-	            (texture_colors == 1), // int               is_lwir,            // do not perform shot correction
+				tpParams.texture_colors,        // int               colors,             // number of colors (3/1)
+	            (tpParams.texture_colors == 1), // int               is_lwir,            // do not perform shot correction
 				gpu_generate_RBGA_params,
 				gpu_color_weights,     // float             weights[3],         // scale for R
 	            1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
-				keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
+				tpParams.keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
 				dstride_textures_rbga/sizeof(float), // 	const size_t      texture_rbga_stride,     // in floats
 				gpu_textures_rbga,     // 	float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
 				gpu_twh);              // int             * twh)
@@ -2170,7 +1823,7 @@ int main(int argc, char **argv)

    free (host_kern_buf);
    // TODO: move somewhere when all is done
-    for (int ncam = 0; ncam < num_cams; ncam++) {
+    for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
    	checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_images_h[ncam]));

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -173,7 +173,7 @@
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
 #define TILESYA       ((TILESY +3) & (~3))

-
+//#define CORR_SIZE     ((2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1))

 #endif //#ifndef JCUDA

--- a/src/tp_paths.cu
+++ b/src/tp_paths.cu
@@ -4,8 +4,8 @@
 *  Created on: Mar 26, 2025
 *      Author: elphel
 */
-#include <vector>
-#include <string>
+//#include <vector>
+//#include <string>
 #include "tp_paths.h"
 TpPaths::TpPaths(int lwir){
 	m_lwir = lwir;