Commit d8e9a454 authored by Andrey Filippov

debugged initially with jcuda

parent 414f6351
@@ -5,15 +5,21 @@
  * Author: elphel
  */
 #include <stdexcept>
-#include <helper_cuda.h>      // for checkCudaErrors
+//#include <driver_types.h>   // was not needed before, only for indexes - needs __DRIVER_TYPES_H__
+#include <cstdlib>
+#include <cstdio>
 #include <cuda_runtime.h>     // cudaFree
+#include <helper_cuda.h>      // for checkCudaErrors
 #include <helper_functions.h> // timer functions
 //#include "TpParams.h" // TpHostGpu.h has it
 #include "tp_paths.h"
 #include "tp_files.h"
 #include "tp_utils.h"  // for copyalloc_kernel_gpu
-#include "GenerateRgbaHost.h"
+//#include "GenerateRgbaHost.h"
 #include "TpHostGpu.h"
 
 #define MY_EXCEPTION(aMessage) \
@@ -79,6 +85,7 @@ void TpHostGpu::allTests(
 		int         image_dy,
 		const float target_disparity,
 		const float scale,
+		const float fat_zero, // 1000.0
 		int         quad_combine,
 		int         use_dp,
 		int         debug){
@@ -100,10 +107,10 @@ void TpHostGpu::allTests(
 	testConvertDirect        (num_runs); // 608
 	testImcltRbgAll          (num_runs); // 701
-	testCorrelate2DIntra     (num_runs); // 762 - 885
-	testCorrelate2DIntraTD   (num_runs, quad_combine); // 886 - 1123
+	testCorrelate2DIntra     (num_runs, fat_zero); // 762 - 885
+	testCorrelate2DIntraTD   (num_runs, fat_zero, quad_combine); // 886 - 1123
 	setImgBuffersShifted     (is_bayer, image_dx, image_dy); // 1171-1188
-	testCorrelate2DInterSelf (num_runs); // 1136 - 1411
+	testCorrelate2DInterSelf (num_runs, fat_zero); // 1136 - 1411
 	testTextures             (num_runs, use_dp, debug); // 1422-1664
 	testTexturesRGBA         (num_runs, use_dp, debug); // 1669-1810
 	return;
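
The tests now thread a caller-supplied fat_zero through to the correlation kernels, squaring it into an absolute fat_zero2 regularizer for the phase-correlation normalization. A minimal sketch of the idea, assuming the usual regularized form (the actual corr2D_normalize kernel may differ in detail):

	// hypothetical illustration only, not the Elphel kernel itself
	static __device__ __forceinline__ float2 normalize_td_component(float2 s, float fat_zero2){
		float a2 = s.x * s.x + s.y * s.y;     // |s|^2 of one transform-domain component
		float k  = 1.0f / (a2 + fat_zero2);   // fat_zero2 keeps the divisor finite
		return make_float2(s.x * k, s.y * k); // weak signal is suppressed, not amplified
	}

Because the regularizer competes with |s|^2, it has to be an absolute squared amplitude, which is why every caller passes fat_zero * fat_zero.
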
@@ -696,7 +703,7 @@ void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
 			m_gpu_corr_images_h); // float ** gpu_corr_images_h){
 }
 
-void TpHostGpu::testCorrelate2DIntra(int num_runs){
+void TpHostGpu::testCorrelate2DIntra(int num_runs, float fat_zero){
 	int num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
@@ -730,7 +737,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
 			m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
 			m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
-			m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+			fat_zero * fat_zero, // float fat_zero2, // here - absolute
 			m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 			m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
 			m_tpParams.tilesx, // int tilesx, // number of tile rows
@@ -764,7 +771,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			16); //int num_sel_sensors) { // only for interscene
 }
 
-void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123
+void TpHostGpu::testCorrelate2DIntraTD (int num_runs, float fat_zero, int quad_combine){ // 886 - 1123
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -817,7 +824,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 			m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
 			m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
 			m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
-			m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+			fat_zero * fat_zero, // float fat_zero2, // here - absolute
 			m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 			m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
 			m_tpParams.tilesx, // int tilesx, // number of tile rows
@@ -856,7 +863,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 					(float *) 0, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
 					m_gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+					fat_zero * fat_zero, // float fat_zero2, // here - absolute
 					m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 			printf("corr2D_combine pass: %d\n",i);
 		}else { // if (quad_combine) {
@@ -868,7 +875,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 					(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
 					m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+					fat_zero * fat_zero, // float fat_zero2, // here - absolute
 					m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 		} // if (quad_combine) {
@@ -971,7 +978,7 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
 	} // if (quad_combine) {
 }
 
-void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
+void TpHostGpu::testCorrelate2DInterSelf(int num_runs, float fat_zero){ // 889
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 = m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -1087,7 +1094,7 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
 				(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
 				dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
 				m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
-				m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
+				fat_zero * fat_zero, // float fat_zero2, // here - absolute
 				m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
 		getLastCudaError("Kernel failure:corr2D_normalize");
 		checkCudaErrors(cudaDeviceSynchronize());
@@ -1440,6 +1447,8 @@ void TpHostGpu::testTexturesRGBA (
 	printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
 	cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
+	// was not here - next line
+	cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout, cudaSharedmemCarveoutMaxShared);
 	generate_RBGA<<<1,1>>> (
 		m_tpParams.num_cams, // int num_cams, // number of cameras used
 		// Parameters to generate texture tasks
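
The added cudaFuncAttributePreferredSharedMemoryCarveout call is the second half of the standard recipe for kernels that need more dynamic shared memory than the 48 KB default: raise cudaFuncAttributeMaxDynamicSharedMemorySize, then ask for the largest shared-memory carveout instead of L1 cache. A self-contained sketch of the pattern (kernel name hypothetical):

	__global__ void big_smem_kernel(float * out){ // hypothetical kernel
		extern __shared__ float smem[];           // dynamic shared memory
		smem[threadIdx.x] = (float) threadIdx.x;
		out[threadIdx.x]  = smem[threadIdx.x];
	}

	void launch_with_large_smem(float * out, int shared_size){
		checkCudaErrors(cudaFuncSetAttribute(big_smem_kernel,
				cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size)); // opt in to > 48 KB
		checkCudaErrors(cudaFuncSetAttribute(big_smem_kernel,
				cudaFuncAttributePreferredSharedMemoryCarveout,
				cudaSharedmemCarveoutMaxShared)); // prefer shared memory over L1
		big_smem_kernel<<<1, 32, shared_size>>>(out);
	}
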
@@ -1788,6 +1797,8 @@ float * TpHostGpu::getCorrTdImg(
 	return corr_img;
 }
 
+//void TpHostGpu::generate_RBGA_host(
+//	static // https://stackoverflow.com/questions/15725922/static-function-a-storage-class-may-not-be-specified-here
 void TpHostGpu::generate_RBGA_host(
 		int num_cams, // number of cameras used
 		// Parameters to generate texture tasks
@@ -2031,7 +2042,6 @@ void TpHostGpu::generate_RBGA_host(
 			pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
 			ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
 			gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-//			gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 			gpu_geometry_correction, // struct gc * gpu_geometry_correction,
 			colors, // int colors, // number of colors (3/1)
 			is_lwir, // int is_lwir, // do not perform shot correction
......
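
The // static comment retained above generate_RBGA_host records the C++ rule behind the linked Stack Overflow question: static is a storage-class specifier that may appear only on the in-class declaration, and repeating it on the out-of-class definition produces "a storage class may not be specified here". A minimal illustration with a hypothetical class:

	class Widget {                 // hypothetical
	public:
		static void helper(int n); // 'static' is written here only
	};

	void Widget::helper(int n){    // no 'static' here - it would be an error
		(void) n;
	}
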
@@ -117,6 +117,7 @@ public:
 			int image_dy, // 0
 			const float target_disparity, // DBG_DISPARITY == 0.0
 			const float scale, // 0.0
+			const float fat_zero, // 1000.0
 			int quad_combine,
 			int use_dp,
 			int debug);
@@ -138,27 +139,13 @@ public:
 //	void testImclt (int num_runs); // 682 // not implemented
 	void testImcltRbgAll (int num_runs); // 701
-	void testCorrelate2DIntra (int num_runs); // 762 - 885
-	void testCorrelate2DIntraTD (int num_runs, int quad_combine); // 886 - 1123
+	void testCorrelate2DIntra (int num_runs, float fat_zero); // 762 - 885
+	void testCorrelate2DIntraTD (int num_runs, float fat_zero, int quad_combine); // 886 - 1123
 //	void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy); // 1171-1188
-	void testCorrelate2DInterSelf(int num_runs); // 1136 - 1411
+	void testCorrelate2DInterSelf(int num_runs, float fat_zero); // 1136 - 1411
 	void testTextures (int num_runs, int use_dp, int debug); // 1422-1664
 	void testTexturesRGBA (int num_runs, int use_dp, int debug); // 1669-1810
-	static void generate_RBGA_host( // not a member
-private:
-	void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
-	void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
-	// for both intra and inter!
-	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
-	float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
-	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
+	void generate_RBGA_host( // not a member
 			int num_cams, // number of cameras used
 			// Parameters to generate texture tasks
 			float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
@@ -182,6 +169,19 @@ private:
 			const int texture_rbga_stride, // in floats
 			float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
+private:
+	void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
+	void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
+	// for both intra and inter!
+	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
+	float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
+	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
 	void hfree(float *& p); // {if (p) free (p);}
 	void hfree(struct CltExtra *& p);
 	void gfree(float *& p);
@@ -193,9 +193,9 @@ private:
 	void gfree(struct trot_deriv *& p);
 	void gfree(float **& p);
 	void gfree(struct CltExtra **& p);
 };
 
 #endif /* SRC_TPHOSTGPU_H_ */
@@ -49,7 +49,7 @@ public:
 	static constexpr int tp_task_centerxy_offset = TP_TASK_CENTERXY_OFFSET;// 3
 	static constexpr int tp_task_scale_offset = TP_TASK_SCALE_OFFSET;// 5
 	static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
-	static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
+//	static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
 	static constexpr int convert_direct_indexing_threads = CONVERT_DIRECT_INDEXING_THREADS; //
 	static constexpr int convert_direct_indexing_threads_log2 = CONVERT_DIRECT_INDEXING_THREADS_LOG2; //
......
@@ -112,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Make offsets calculate in compile time, to avoid NVRTC (in java): "error: dynamic initialization is not supported for a __constant__ variable"
+__constant__ int angles_offsets [4] {15,0,30,30};
+/*
+__constant__ int angles_offsets [4] {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+__constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
+*/
-__constant__ int angles_offsets [4] = {
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 	{
 		{6,5,12}, // a_t * a_z -> tmp0
......
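
The hardcoded {15,0,30,30} initializer sidesteps NVRTC's "dynamic initialization is not supported for a __constant__ variable" error at the cost of magic numbers. A possible alternative, sketched under the assumption that a host-side setup call is acceptable, keeps offsetof on the host and uploads the values once with cudaMemcpyToSymbol:

	__constant__ int angles_offsets [4]; // no device-side initializer

	void init_angles_offsets(){ // hypothetical host-side setup
		const int host_offsets [4] = {
				(int) (offsetof(corr_vector, azimuth) / sizeof(float)),
				(int) (offsetof(corr_vector, tilt)    / sizeof(float)),
				(int) (offsetof(corr_vector, roll)    / sizeof(float)),
				(int) (offsetof(corr_vector, roll)    / sizeof(float))};
		checkCudaErrors(cudaMemcpyToSymbol(angles_offsets, host_offsets, sizeof(host_offsets)));
	}

This keeps the constants tied to the real corr_vector layout instead of literals, at the cost of one extra host call before the first kernel that reads them.
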
@@ -51,6 +51,11 @@
 	((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+#define offsetof4(st, m) \
+	(((size_t)&(((st *)0)->m))>>2)
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
 
 #define SCENE_UNITS_SCALE 0.001 // meters from mm
......
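
offsetof4 is the byte offset shifted right by two, i.e. the member's index when the structure is viewed as an array of 4-byte floats, matching how corr_vector fields are indexed on the GPU. A small host-side self-check with a made-up layout:

	#include <cstddef>
	#include <cstdio>

	struct demo { float a; float b[14]; float azimuth; }; // hypothetical layout

	#define offsetof4(st, m) (((size_t)&(((st *)0)->m))>>2)

	int main(){
		// byte offset 60 >> 2 == float index 15 for this made-up layout
		printf("azimuth: byte %zu, float index %zu\n",
				offsetof(demo, azimuth), (size_t) offsetof4(demo, azimuth));
		return 0;
	}
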
@@ -43,12 +43,14 @@
 #define CORR_INTER_SELF 1
-#include <stdio.h>
-#include <stdlib.h>
+#include <cstdlib>
+#include <cstdio>
+#include <cuda_runtime.h>     // cudaFree
+#include <helper_cuda.h>      // for checkCudaErrors
+#include <helper_functions.h> // timer functions
 #include <math.h>
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <helper_functions.h>
+//#include <driver_types.h> // was not needed before, only for indexes - needs __DRIVER_TYPES_H__
 // for reading binary files
 #include <fstream>
@@ -59,34 +61,12 @@
 #include "tp_defines.h" // was not here
 #include "dtt8x8.h"
 #include "geometry_correction.h"
-//#include "TileProcessor.cuh"
 #include "TileProcessor.h"
 #include "tp_utils.h"
 #include "tp_files.h"
-//#include "tp_paths.cuh"
 #include "tp_paths.h"
 #include "TpParams.h"
 #include "TpHostGpu.h"
-#include "GenerateRgbaHost.h"
-/*
-#if TEST_LWIR
-	#define IMG_WIDTH 640
-	#define IMG_HEIGHT 512
-	#define KERNELS_HOR 82 // 80+2
-	#define KERNELS_VERT 66 // 64+2
-#else
-	#define IMG_WIDTH 2592
-	#define IMG_HEIGHT 1936
-	#define KERNELS_HOR 164 // 2592 / 16 + 2
-	#define KERNELS_VERT 123 // 1936 / 16 + 2
-#endif
-#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
-#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
-// only used in C++ test
-#define TILESX (IMG_WIDTH / DTT_SIZE)
-#define TILESY (IMG_HEIGHT / DTT_SIZE)
-#define TILESYA ((TILESY +3) & (~3))
-*/
 
 /**
 **************************************************************************
@@ -99,1764 +79,38 @@
 */
 int main(int argc, char **argv)
 {
-	//
-	// Sample initialization
-	//
 	printf("%s Starting...\n\n", argv[0]);
 	printf("sizeof(float*)=%d\n",(int)sizeof(float*));
 	//initialize CUDA
 	findCudaDevice(argc, (const char **)argv);
+	float fat_zero = 1000.0f; // 300.0f; // 30.0;
+	int   is_bayer = 0; // from 1136
 	int image_dx = 2;
 	int image_dy = 0;
-	float fat_zero = 1000.0f; // 300.0f; // 30.0;
 #if TEST_LWIR
 	int use_lwir= 1;
 #else
 	int use_lwir= 0;
 #endif
-	TpParams tpParams(use_lwir);
-	TpPaths tpPaths(use_lwir);
+/*
+	const int angles_offsets [4] {
+			(int) (offsetof4(corr_vector, azimuth)),
+			(int) (offsetof4(corr_vector, tilt)),
+			(int) (offsetof4(corr_vector, roll)),
+			(int) (offsetof4(corr_vector, roll))};
+*/
+	TpParams  tpParams (use_lwir);
+	TpPaths   tpPaths  (use_lwir);
 	TpHostGpu tpHostGpu(tpParams,tpPaths);
+/* */
 	tpHostGpu.allTests(
 			10,       // int num_runs,
-			2,        // int image_dx, // 2
-			0,        // int image_dy, // 0
+			image_dx, // int image_dx, // 2
+			image_dy, // int image_dy, // 0
 			0.0,      // const float target_disparity, // DBG_DISPARITY == 0.0
 			0.0,      // const float scale, // 0.0
+			fat_zero, // const float fat_zero, // 1000.0
 			0,        // int quad_combine,
 			1,        // int use_dp,
 			0);       // int debug);
+	return 0;
+/* */
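
The paired /* */ lines are a comment toggle: editing the first marker down to a bare /* comments out everything through the */ inside the second marker, so the whole test call can be disabled without touching the lines in between:

	/*  <- first marker shortened; the block below is now compiled out
		tpHostGpu.allTests( ... );
		return 0;
	/* */
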
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost(); Remove files, use tpHostGpu::
// return 0;
float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
trot_deriv rot_deriv;
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float * gpu_kernels_h [tpParams.num_cams];
struct CltExtra * gpu_kernel_offsets_h [tpParams.num_cams];
float * gpu_images_h [tpParams.num_cams];
float tile_coords_h [tpParams.num_cams][TILESX * TILESY][2];
float * gpu_clt_h [tpParams.num_cams];
float * gpu_corr_images_h [tpParams.num_cams];
float * gpu_corrs; // correlation tiles (per tile, per pair) in pixel domain
float * gpu_corrs_td; // correlation tiles (per tile, per pair) in transform domain
int * gpu_corr_indices; // shared by gpu_corrs gpu_corrs_td
float * gpu_corrs_combo; // correlation tiles combined (1 per tile), pixel domain
float * gpu_corrs_combo_td; // correlation tiles combined (1 per tile), transform domain
int * gpu_corrs_combo_indices; // shared by gpu_corrs_combo and gpu_corrs_combo_td
float * gpu_textures;
float * gpu_diff_rgb_combo;
float * gpu_textures_rbga;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_twh;
int * gpu_num_texture_tiles;
float * gpu_port_offsets;
float * gpu_color_weights;
float * gpu_generate_RBGA_params;
int num_corrs;
int num_textures;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_corr_images; // [NUM_CAMS];
// GPU pointers to GPU memory
float * gpu_ftasks; // TODO: ***** allocate ! **** DONE
int * gpu_active_tiles;
int * gpu_num_active;
int * gpu_num_corr_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_active, sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_num_corr_tiles, sizeof(int)));
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
// in the future, dstride_corr can reuse that of dstride_corr_td?
size_t dstride_corr_td; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_corr_combo; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_corr_combo_td; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
struct gc fgeometry_correction;
float* correction_vector;
int correction_vector_length;
float * rByRDist;
int rByRDist_length;
struct gc * gpu_geometry_correction;
struct corr_vector * gpu_correction_vector;
float * gpu_rByRDist;
trot_deriv * gpu_rot_deriv;
readFloatsFromFile(
(float *) &fgeometry_correction, // float * data, // allocated array
tpPaths.geometry_correction_file); // char * path) // file path
rByRDist = readAllFloatsFromFile(
tpPaths.rByRDist_file, // const char * path,
&rByRDist_length); // int * len_in_floats)
correction_vector = readAllFloatsFromFile(
tpPaths.correction_vector_file, // const char * path,
&correction_vector_length); // int * len_in_floats)
gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &fgeometry_correction,
sizeof(fgeometry_correction)/sizeof(float));
gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
correction_vector,
correction_vector_length);
gpu_rByRDist = copyalloc_kernel_gpu(
rByRDist,
rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
/// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.kernel_file[ncam]); // char * path) // file path
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, tpParams.kern_size);
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.kernel_offs_file[ncam]); // char * path) // file path
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
host_kern_buf,
tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE)) ;
// allocate result images (3x height to accommodate 3 colors
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
IMG_WIDTH + DTT_SIZE, // int width,
// 3*(IMG_HEIGHT + DTT_SIZE)); // int height);
tpParams.num_colors*(IMG_HEIGHT + DTT_SIZE)); // int height);
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_length, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_length, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width,
TILESX * TILESY); // int height);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
//#define DBG_TILE (174*324 +118)
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
(float *) &tile_coords_h[ncam],
tpPaths.ports_offs_xy_file[ncam]); // char * path) // file path
}
// tasks for all tiles
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
*(tp + TP_TASK_DISPARITY_OFFSET) = task_target_disparity;
// tp += 2; // skip centerX, centerY
*(tp + TP_TASK_SCALE_OFFSET) = 0; // 0.5f; // ,0; // scale, 0 - old way, just set
tp+= TP_TASK_XY_OFFSET;
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
*(tp++) = tile_coords_h[ncam][nt][0];
*(tp++) = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
float *tp = ftask_data + tpParams.task_size * nt;
int cm = (*(int *) tp) & (TASK_TEXTURE_BITS | (1 << TASK_TEXT_EN)); // non-zero any of 8 lower task bits or bit 8 (TASK_TEXT_EN)
if (cm){
texture_indices[num_textures++] = (nt << TEXT_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // setting 0x80 in texture indices
}
}
}
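	// Packed texture-index format used above: the linear tile number nt sits
	// above TEXT_NTILE_SHIFT and LIST_TEXTURE_BIT (the 0x80 mentioned in the
	// comment) marks the entry as selected; a hypothetical consumer decodes:
	//   int nt = texture_indices[i] >> TEXT_NTILE_SHIFT;
	//   int on = (texture_indices[i] >> LIST_TEXTURE_BIT) & 1;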
// num_textures now has the total number of textures
// copy corr_indices to gpu
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_twh, 2 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) tpParams.port_offsets, tpParams.num_cams * 2); // num_ports * 2);
gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) tpParams.color_weights, sizeof(tpParams.color_weights));
gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) tpParams.generate_RBGA_params, sizeof(tpParams.generate_RBGA_params));
/// int tile_texture_size = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (NUM_CAMS + tpParams.texture_colors + 1): 0)) *256;
// in Java always allocated as for keep_texture_weights = 1;
int tile_texture_layers = (tpParams.texture_colors + 1 + (tpParams.keep_texture_weights? (tpParams.num_cams + tpParams.texture_colors + 1): 0));
int tile_texture_size = tile_texture_layers *256;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
int rgba_height = (TILESY+1) * DTT_SIZE;
int rbga_slices = tpParams.texture_colors + 1; // 4/1
if (tpParams.keep_texture_weights & 2){
rbga_slices += tpParams.texture_colors * tpParams.num_cams;
}
gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height);
/// checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * NUM_CAMS * (NUM_COLORS + 1) * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_diff_rgb_combo, TILESX * TILESY * tpParams.num_cams * (tpParams.num_colors + 1) * sizeof(float)));
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, tpParams.num_cams); // NUM_CAMS);
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, tpParams.num_cams); // NUM_CAMS);
gpu_images = copyalloc_pointers_gpu (gpu_images_h, tpParams.num_cams); // NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, tpParams.num_cams); // NUM_CAMS);
gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, tpParams.num_cams); // NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
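	// Note: i0 == -1 gives every timed loop below one warm-up pass; the
	// sdkResetTimer()/sdkStartTimer() pair only fires at i == 0, so lazy CUDA
	// initialization and first-launch overhead stay out of the averages.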
int corr_size = 2 * CORR_OUT_RAD + 1;
int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
int num_corr_indices = tpParams.num_pairs * num_tiles;
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
float * cpu_corr_td;
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
dim3 threads_rot(3,3,3);
/// dim3 grid_rot (NUM_CAMS, 1, 1);
dim3 grid_rot (tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
tpParams.num_cams, // int num_cams,
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
checkCudaErrors(cudaMemcpy(
&rot_deriv,
gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
#endif // TEST_ROT_MATRICES
#define TEST_REVERSE_DISTORTIONS
#ifdef TEST_REVERSE_DISTORTIONS
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
// dim3 grid_rd (tpParams.num_cams, 1, 1);
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(RBYRDIST_LEN * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
gpu_rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < RBYRDIST_LEN; i++){
float err = abs(rByRDist_gen[i] - rByRDist[i]);
if (err > max_err){
max_err = err;
}
#ifdef VERBOSE
/// printf ("%5d: %8.6f %8.6f %f %f\n", i, rByRDist[i], rByRDist_gen[i] , err, max_err);
#endif // #ifdef VERBOSE
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
#if 0
// temporarily restore
checkCudaErrors(cudaMemcpy(
gpu_rByRDist,
rByRDist,
RBYRDIST_LEN * sizeof(float),
cudaMemcpyHostToDevice));
#endif // #if 1
#endif // TEST_REVERSE_DISTORTIONS
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 threads_geom(tpParams.num_cams,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
/*
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
&task_data1,
gpu_tasks,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost));
*/
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
ftask_data1,
gpu_ftasks,
tp_task_size * tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
#ifdef DBG_TILE
printf("old_task txy = 0x%x\n", *(int *) (ftask_data + tpParams.task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + tpParams.task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0),
(*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)) -
(*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1),
(*(ftask_data + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)) -
(*(ftask_data1 + tpParams.task_size * DBG_TILE + TP_TASK_XY_OFFSET + 2*ncam + 1)));
}
#endif //#ifdef DBG_TILE
#endif // TEST_GEOM_CORR
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
0, // KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
gpu_num_active,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing CLT data to %s\n", tpPaths.ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
tpPaths.ports_clt_file[ncam]); // const char * path) // file path
}
#endif
#ifdef TEST_IMCLT
// test_imclt does not exist
{
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
dim3 threads_imclt_all(1, 1, 1);
dim3 grid_imclt_all(1, 1, 1);
printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
printf("grid_imclt_all= (%d, %d, %d)\n",grid_imclt_all.x, grid_imclt_all.y, grid_imclt_all.z);
imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
tpParams.num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
gpu_corr_images, // float ** gpu_corr_images, // [num_cams][WIDTH, 3 * HEIGHT]
1, // int apply_lpf,
tpParams.num_colors, // int colors, // defines lpf filter
TILESX, // int woi_twidth,
TILESY, // int woi_theight,
dstride_rslt/sizeof(float)); // const size_t dstride); // in floats (pixels)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
// TODO: *** Stop here for initial testing ***
sdkStopTimer(&timerIMCLT);
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
sdkDeleteTimer(&timerIMCLT);
printf("Average imclt_rbg_all run time =%f ms\n", avgTimeIMCLT);
int rslt_img_size = tpParams.num_colors * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
// for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
// 3* (IMG_HEIGHT + DTT_SIZE),
tpParams.num_colors* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
printf("Writing RBG data to %s\n", tpPaths.result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
tpPaths.result_rbg_file[ncam]); // const char * path) // file path
}
free(cpu_corr_image);
#ifndef NOCORR
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
correlate2D<<<1,1>>>(
tpParams.num_cams, // int num_cams,
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero * fat_zero, // float fat_zero2, // here - absolute
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
// printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORR, num_corrs);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
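	// Layout produced above: each (2*CORR_OUT_RAD+1) x (2*CORR_OUT_RAD+1)
	// (15x15) correlation lands in its own 16x16 cell of a (TILESX*16) x
	// (TILESY*16) mosaic, one slice per pair (cpair), with unused margins
	// left as the NAN fill from the initialization loop.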
// num_pairs
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
/*
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
tpPaths.result_corr_file); // const char * path) // file path
*/
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_corr_file); // const char * path) // file path
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
//#define QUAD_COMBINE
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORRTD = 0;
sdkCreateTimer(&timerCORRTD);
int num_corr_combo;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORRTD);
sdkStartTimer(&timerCORRTD);
}
// FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization
tpParams.num_cams, // int num_cams,
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
fat_zero*fat_zero, // float fat_zero2, // here - absolute (squared)
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tile rows
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
0, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
#ifdef QUAD_COMBINE
num_corr_combo = num_corrs/tpParams.num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
tpParams.num_pairs, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations
gpu_corrs_td, // float * gpu_corrs, // input correlation tiles
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile)
getLastCudaError("Kernel failure:corr2D_combine");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_combine pass: %d\n",i);
corr2D_normalize<<<1,1>>>(
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#else
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
#endif
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORRTD, num_corrs);
#ifdef QUAD_COMBINE
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
gpu_corrs_combo,
dstride_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", tpPaths.result_corr_quad_file);
writeFloatsToFile(
cpu_corr_combo, // float * data, // allocated array
rslt_corr_size_combo, // int size, // length in elements
tpPaths.result_corr_quad_file); // const char * path) // file path
#endif
free(cpu_corr_combo);
#else // QUAD_COMBINE
// Reading / formatting / saving correlate2D(TD) + corr2D_normalize
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
// printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * tpParams.corr_length; // corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // pre-filled with NAN below
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
tpParams.corr_length * sizeof(float),
gpu_corrs,
dstride_corr,
tpParams.corr_length * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
// int num_tiles = TILESX * TILESYA;
// int num_corr_indices = tpParams.num_pairs * num_tiles;
// int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
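// gpu_corr_indices packs the linear tile number in the high bits and the pair
// number in the low CORR_NTILE_SHIFT bits. The loop below scatters each
// corr_size x corr_size correlation into a 16x16 cell of a per-pair montage
// (TILESX*16 pixels wide), so the NAN-filled border remains as a visible gap.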
for (int ict = 0; ict < num_corr_indices; ict++){
// int ct = cpu_corr_indices[ict];
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * tpParams.corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * tpParams.corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
// num_pairs
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_corr_td_norm_file, (TILESX*16),(TILESYA*16), tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_corr_td_norm_file); // const char * path) // file path
#endif
#if 1 // export TD intra
int intra_corr_size_td = num_corrs * DTT_SIZE2*DTT_SIZE2;
cpu_corr_td = (float *)malloc(intra_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
gpu_corrs_td,
dstride_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
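// Each TD tile is stored as four consecutive DTT_SIZE x DTT_SIZE quadrants
// (presumably the four DTT components); the loop rearranges them into a 2x2
// mosaic so one whole TD tile fills each 16x16 montage cell.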
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < DTT_SIZE; iy++){
int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
for (int ix = 0; ix < DTT_SIZE; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
#ifndef NSAVE_CORR
printf("Writing intrascene phase correlation TD data");
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
"clt/aux_intrascene-TD.raw"); // const char * path) // file path
#endif
free (cpu_corr_td);
#endif // if 1
// reuse image, export TD data
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
#endif // QUAD_COMBINE
#endif // ifndef NOCORR_TD
// Testing "interframe" correlation with itself, assuming direct convert already ran
#ifdef CORR_INTER_SELF
int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_sensors = 16; // 15; // 16;
int num_pairs_inter = num_sel_sensors+1;
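// One extra "pair" slot is reserved: the combined all-sensor correlation is
// tagged with pair code 0xff by the kernel and remapped to index
// num_sel_sensors when the montage is assembled below.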
num_corr_indices = num_pairs_inter * num_tiles;
// int is_bayer = 0;
// int image_dx = 2;
// int image_dy = 0;
float * gpu_clt_ref_h [tpParams.num_cams];
float ** gpu_clt_ref; // [NUM_CAMS];
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
}
gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, tpParams.num_cams); // NUM_CAMS);
// use gpu_images and convert to gpu_clt_ref
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
// Re-read the same images, shift them, update gpu_images and convert to gpu_clt
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
tpPaths.image_files[ncam]); // char * path) // file path
shift_image (
host_kern_buf, // float * image,
IMG_WIDTH, // int width,
IMG_HEIGHT, // int height,
is_bayer, // int bayer,
image_dx, // int dx,
image_dy); // int dy);
update_image_gpu(
host_kern_buf, // float * image_host,
gpu_images_h[ncam], // float * image_gpu,
dstride, // size_t dstride, // in floats !
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated list of tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
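// Setup for the inter-scene self-test is now complete: gpu_clt_ref holds the
// CLT of the original images and gpu_clt the CLT of the (image_dx, image_dy)
// shifted copies, so the correlation below should peak at the known shift.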
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
// int num_corr_combo_inter;
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerINTERSELF);
sdkStartTimer(&timerINTERSELF);
}
correlate2D_inter<<<1,1>>>( // only results in TD
tpParams.num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles) // number of tiles in task
TILESX, // int tilesx, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D_inter");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D_inter-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
fat_zero * fat_zero, // float fat_zero2, // here - absolute
CORR_OUT_RAD); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerINTERSELF);
float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
sdkDeleteTimer(&timerINTERSELF);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs);
rslt_corr_size = num_corrs * corr_size * corr_size;
corr_img_size = num_corr_indices * 16*16; // pre-filled with NAN below
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
// int num_pairs = 120;
// int sel_sensors = 0xffff;
// int num_sel_sensors = 16;
// int corr_size = 2 * CORR_OUT_RAD + 1; // 15
// int num_tiles = tp_task_size; // TILESX * TILESYA; //Was this on 01/22/2022
// int num_corr_indices = num_pairs * num_tiles;
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * corr_size * corr_size;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
#ifndef NSAVE_CORR
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_inter_td_norm_file); // const char * path) // file path
#endif
#if 1
int rslt_corr_size_td = num_corrs * DTT_SIZE2*DTT_SIZE2;
cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
gpu_corrs_td,
dstride_corr_td,
(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < DTT_SIZE; iy++){
int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
for (int ix = 0; ix < DTT_SIZE; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
#ifndef NSAVE_CORR
printf("Writing interscene phase correlation TD data");
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
"clt/aux_interscene-TD.raw"); // const char * path) // file path
#endif
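// Build a per-(pair, tile) debug map: each cell stores the sequential index
// ict of the correlation tile that landed there (NAN where none was
// generated), which makes the tile ordering of correlate2D_inter visible.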
int corr_index_img_length = TILESX*TILESY*(num_sel_sensors+1) ;
float *corr_index_img = (float *)malloc(corr_index_img_length * sizeof(float));
for (int i = 0; i < corr_index_img_length; i++){
corr_index_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / TILESX;
int tx = ctt % TILESX;
corr_index_img[cpair*TILESX*TILESY + TILESX*ty + tx] = ict; // cpu_corr_indices[ict];
}
printf("Writing interscene indices\n");
writeFloatsToFile(
corr_index_img, // float * data, // allocated array
corr_index_img_length, // int size, // length in elements
"clt/aux_inter-indices.raw"); // const char * path) // file path
free (corr_index_img);
free (cpu_corr_td);
#endif
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
for (int ncam = 0; ncam < tpParams.num_cams; ncam++){
checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
}
checkCudaErrors(cudaFree(gpu_clt_ref));
#endif // #ifdef CORR_INTER_SELF
// -----------------
#ifndef NOTEXTURES
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
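// Texture test overview: with NO_DP the host builds the non-overlap tile list
// via create_nonoverlap_list and launches textures_accumulate itself;
// otherwise textures_nonoverlap performs the indexing and accumulation
// internally (presumably via dynamic parallelism, hence the DP name).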
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
int shared_size = host_get_textures_shared_size( // in bytes
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
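// textures_accumulate needs more dynamic shared memory than the default
// 48 KB per block, so the limit is raised explicitly below (supported on
// compute capability 7.0 and later); shared_size was computed above for the
// actual num_cams / colors combination.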
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
#ifdef NO_DP
create_nonoverlap_list<<<blocks0,threads0>>>(
tpParams.num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task
TILESX, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
tpParams.num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX);
#else // #ifdef NO_DP
//tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
tpParams.num_cams, // int num_cams, // number of cameras used
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order,
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
TILESX);
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
#ifdef NO_DP
#else
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
printf("tile_texture_layers = %d\n", tile_texture_layers);
#endif
int rslt_texture_size = num_textures * tile_texture_size;
checkCudaErrors(cudaMemcpy(
(float * ) texture_indices,
gpu_texture_indices,
cpu_pnum_texture_tiles * sizeof(float), // texture_indices holds ints; sizeof(int) == sizeof(float)
cudaMemcpyDeviceToHost));
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
int num_nonoverlap_pixels = tile_texture_layers * TILESY*16 * TILESX*16;
float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float));
for (int i = 0; i < num_nonoverlap_pixels; i++){
non_overlap_layers[i] = NAN;
}
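// Scatter the packed per-tile textures into full-frame layers: every texture
// tile contributes tile_texture_layers 16x16 patches at its (tileX, tileY)
// grid position; pixels of unused tiles stay NAN.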
for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
int ntile = texture_indices[itile] >> TEXT_NTILE_SHIFT;
int tileX = ntile % TILESX;
int tileY = ntile / TILESX;
for (int ilayer = 0; ilayer < tile_texture_layers; ilayer++){
int src_index0 = itile * tile_texture_size + 256 * ilayer;
int dst_index0 = ilayer * (TILESX * TILESY * 256) + (tileY * 16) * (16 * TILESX) + (tileX * 16);
for (int iy = 0; iy < 16; iy++){
int src_index1 = src_index0 + 16 * iy;
int dst_index1 = dst_index0 + iy * (16 * TILESX);
for (int ix = 0; ix < 16; ix++){
non_overlap_layers[dst_index1 + ix] = cpu_textures[src_index1 + ix];
}
}
}
}
int ntiles = TILESX * TILESY;
int nlayers = tpParams.num_cams * (tpParams.num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float));
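// gpu_diff_rgb_combo comes back tile-major ([tile][layer]); transpose to
// layer-major so each layer is a contiguous TILESX x TILESY low-res image.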
for (int nl = 0; nl < nlayers; nl++){
for (int ntile = 0; ntile < ntiles; ntile++){
cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
}
}
#ifndef NSAVE_TEXTURES
#ifdef NO_DP
printf("Writing phase texture data to %s\n", tpPaths.result_textures_file);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
tpPaths.result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", tpPaths.result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
tpPaths.result_diff_rgb_combo_file); // const char * path) // file path
#else
printf("Writing phase texture data to %s\n", tpPaths.result_textures_file_dp);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
tpPaths.result_textures_file_dp); // const char * path) // file path
printf("Writing low-res data to %s\n", tpPaths.result_diff_rgb_combo_file_dp);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
tpPaths.result_diff_rgb_combo_file_dp); // const char * path) // file path
#endif
#ifdef DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
#endif // DEBUG10
#endif //#ifdef DBG_TILE
#endif // #ifndef NSAVE_TEXTURES
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
#endif //NOTEXTURES
#ifndef NOTEXTURE_RGBAXXX
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
// FIXME: update to use new correlations and num_cams
#ifdef NO_DP
generateRgbaHost.generate_RBGA_host (
tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
tp_task_size, // int num_tiles, // number of tiles in task list
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
tpParams.generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
#else
int shared_size = host_get_textures_shared_size( // in bytes
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> (
tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
tpParams.texture_colors, // int colors, // number of colors (3/1)
(tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
gpu_generate_RBGA_params,
gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_twh); // int * twh)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#endif
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(int), // cpu_woi is int[4]; int and float have the same size here
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// Temporarily use a larger array (4 pixels on each side); switch to cudaMemcpy2DFromArray() later
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
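// Note: the 2D copy below uses rgba_width / rgba_height (set earlier in this
// function, outside this fragment); it assumes they equal the WOI-derived
// rgba_woi_width / rgba_woi_height used to size cpu_textures_rgba above.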
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Import raw, real, little-endian, 18 images 648x520\n");
#ifdef NO_DP
printf("Writing RBGA texture slices to %s\n", tpPaths.result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
tpPaths.result_textures_rgba_file); // const char * path) // file path
#else
printf("Writing RBGA texture slices to %s\n", tpPaths.result_textures_rgba_file_dp);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
tpPaths.result_textures_rgba_file_dp); // const char * path) // file path
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURE_RGBAXXX
#ifdef SAVE_CLT
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < tpParams.num_cams; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
}
// checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_ftasks));
checkCudaErrors(cudaFree(gpu_active_tiles));
checkCudaErrors(cudaFree(gpu_num_active));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corrs_td));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_corrs_combo));
checkCudaErrors(cudaFree(gpu_corrs_combo_td));
checkCudaErrors(cudaFree(gpu_corrs_combo_indices));
checkCudaErrors(cudaFree(gpu_num_corr_tiles));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_color_weights));
checkCudaErrors(cudaFree(gpu_generate_RBGA_params));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_diff_rgb_combo));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_twh));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
checkCudaErrors(cudaFree(gpu_geometry_correction));
checkCudaErrors(cudaFree(gpu_correction_vector));
checkCudaErrors(cudaFree(gpu_rByRDist));
checkCudaErrors(cudaFree(gpu_rot_deriv));
free (rByRDist);
free (correction_vector);
free (ftask_data);
free (ftask_data1);
// delete generateRgbaHost;
exit(0);
}