Commit 13ca3625 authored by Andrey Filippov

implemented testTextures, testTexturesRGBA

parent 3610b7a6
@@ -13,6 +13,7 @@
#include "tp_paths.h"
#include "tp_files.h"
#include "tp_utils.h" // for copyalloc_kernel_gpu
#include "GenerateRgbaHost.h"
#include "TpHostGpu.h"
@@ -673,152 +674,6 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
16); //int num_sel_sensors) { // only for interscene
}
void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
@@ -1032,7 +887,580 @@ void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886
} // if (quad_combine) {
}
void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_ftasks) {
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_corrs) {
throw std::runtime_error("Error: m_gpu_corrs is not initialized ");
}
int sel_sensors = 0xffff; // 0x7fff; // 0xffff;
int num_sel_sensors = 16; // 15; // 16;
int num_pairs_inter = num_sel_sensors+1;
int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles;
int is_bayer = 0;
int image_dx = 2;
int image_dy = 0;
float * gpu_clt_ref_h [m_tpParams.num_cams];
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
gpu_clt_ref_h[ncam] = alloc_kernel_gpu(m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
}
float ** gpu_clt_ref = copyalloc_pointers_gpu (gpu_clt_ref_h, m_tpParams.num_cams); // NUM_CAMS);
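// (size note, assuming the 648x520 test setup referenced below: tilesx = 80,
// tilesy = 64, so each per-camera reference CLT buffer above holds
// 80 * 64 * num_colors * 4 * 8 * 8 floats, i.e. 5 MiB per color)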
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
// use gpu_images and convert to gpu_clt_ref (was line 1152):
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_clt_ref, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // int woi_width,
m_tpParams.img_height, // int woi_height,
m_tpParams.kernels_hor, // int kernels_hor,
m_tpParams.kernels_vert, //, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors (cudaDeviceSynchronize());
// re-read the same images, shift them, update gpu_images, and convert to gpu_clt;
setImgBuffersShifted(
is_bayer, // int is_bayer,
image_dx, // int image_dx,
image_dy); // int image_dy)
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_gpu_clt, //****** // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // int woi_width,
m_tpParams.img_height, // int woi_height,
m_tpParams.kernels_hor, // int kernels_hor,
m_tpParams.kernels_vert, //, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // int tilesx)
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
StopWatchInterface *timerINTERSELF = 0;
sdkCreateTimer(&timerINTERSELF);
int num_corrs{}; // will get data from the gpu memory
for (int i = i0; i < numIterations; i++) {
if (i == 0){
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerINTERSELF);
sdkStartTimer(&timerINTERSELF);
}
correlate2D_inter<<<1,1>>>( // only results in TD
m_tpParams.num_cams, // int num_cams, // actual number of cameras
sel_sensors, // int sel_sensors,
m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref, // ********* // float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
m_tpParams.num_colors, // int colors, // number of colors (3/1)
m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
m_tpParams.tilesx, // int tilesx, // number of tiles in a row
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D_inter");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D_inter-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
m_gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerINTERSELF);
float avgTimeINTERSELF = (float)sdkGetTimerValue(&timerINTERSELF) / (float)numIterations;
sdkDeleteTimer(&timerINTERSELF);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeINTERSELF, num_corrs);
saveInterCorrFile(
m_tpPaths.result_interscene_td, // const char * path, // "clt/aux_interscene-TD.raw" m_tpPaths.result_interscene_td
"interscene phase correlation", // const char * prompt, // "interscene phase correlation"
num_corrs, // int num_corrs,
num_corr_indices, // int num_corr_indices,
m_gpu_corrs_td, // float * gpu_corrs_td,
m_gpu_corr_indices, // int * gpu_corr_indices,
num_sel_sensors); // int num_sel_sensors);
saveInterCorrIndicesFile(
m_tpPaths.result_interscene_indices, // const char * path, // "clt/aux_inter-indices.raw" m_tpPaths.result_interscene_indices
"interscene indices", // const char * prompt, // "interscene indices"
num_corr_indices, // int num_corr_indices,
m_gpu_corr_indices, // int * gpu_corr_indices,
num_sel_sensors); // int num_sel_sensors)
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
gfree(gpu_clt_ref_h[ncam]);
}
gfree(gpu_clt_ref);
}
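// A minimal host-side sketch (not part of this commit) collecting the buffer
// arithmetic used by testCorrelate2DInterSelf() above. The "+ 1" in
// num_pairs_inter is assumed here to be the extra per-tile combined pair that
// accompanies the num_sel_sensors sensor-to-reference pairs; corr_out_rad = 7
// yields the 15x15 pixel-domain tile mentioned in the corr2D_normalize() comments.
static inline int interCorrIndices(int num_sel_sensors, int num_tiles){
	return (num_sel_sensors + 1) * num_tiles; // matches num_corr_indices above
}
static inline int corrPdTileSize(int corr_out_rad){
	int side = 2 * corr_out_rad + 1;          // 15 for corr_out_rad == 7
	return side * side;                       // 225 floats per pixel-domain tile
}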
void TpHostGpu::testTextures (
int num_runs,
int use_dp,
int debug){ // DEBUG10 // 1422-1664
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_texture_indices){
throw std::runtime_error("Error: m_gpu_texture_indices is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_textures){
throw std::runtime_error("Error: m_gpu_textures is not initialized ");
}
if (!m_gpu_diff_rgb_combo){
throw std::runtime_error("Error: m_gpu_diff_rgb_combo is not initialized ");
}
if (!m_gpu_generate_RBGA_params){
throw std::runtime_error("Error: m_gpu_generate_RBGA_params is not initialized ");
}
dim3 threads0(m_tpParams.convert_direct_indexing_threads, 1, 1);
dim3 blocks0 ((m_tpParams.tp_tasks_size + m_tpParams.convert_direct_indexing_threads -1) >> m_tpParams.convert_direct_indexing_threads_log2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
int * gpu_pnum_texture_tiles;
checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
int shared_size = host_get_textures_shared_size( // in bytes
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
gpu_pnum_texture_tiles,
&cpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyHostToDevice));
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
if (!use_dp) {
create_nonoverlap_list<<<blocks0,threads0>>>(
m_tpParams.num_cams, // int num_cams,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task
m_tpParams.tilesx, // int width, // number of tiles in a row
m_gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
textures_accumulate <<<grid_texture1,threads_texture1, shared_size>>>( // 65536>>>( //
m_tpParams.num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
m_gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles, // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_tpParams.generate_RBGA_params[0], // min_shot, // float min_shot, // 10.0
m_tpParams.generate_RBGA_params[1], // scale_shot, // float scale_shot, // 3.0
m_tpParams.generate_RBGA_params[2], // diff_sigma, // float diff_sigma, // pixel value/pixel change
m_tpParams.generate_RBGA_params[3], // diff_threshold,// float diff_threshold, // pixel value/pixel change
m_tpParams.generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
m_gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // 0, // 1 // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
m_gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
m_tpParams.tilesx);
} else { // if (!use_dp)
//tpParams.keep_texture_weights is assumed 0 in textures_nonoverlap
textures_nonoverlap<<<1,1>>> ( //,65536>>> (
m_tpParams.num_cams, // int num_cams, // number of cameras used
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_pnum_texture_tiles, // int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
m_gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_gpu_generate_RBGA_params,
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
dstride_textures/sizeof(float), // size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
m_gpu_textures, // float * gpu_texture_tiles,
linescan_order, // int linescan_order,
m_gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
m_tpParams.tilesx);
} // if (!use_dp) else
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
if (use_dp) {
checkCudaErrors(cudaMemcpy(
&cpu_pnum_texture_tiles,
gpu_pnum_texture_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("cpu_pnum_texture_tiles = %d\n", cpu_pnum_texture_tiles);
printf("tile_texture_layers = %d\n", m_tile_texture_layers);
} // if (use_dp) {
int rslt_texture_size = m_num_textures * m_tile_texture_size;
checkCudaErrors(cudaMemcpy(
(float * ) m_texture_indices,
m_gpu_texture_indices,
cpu_pnum_texture_tiles * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
m_tile_texture_size * sizeof(float),
m_gpu_textures,
dstride_textures,
m_tile_texture_size * sizeof(float),
m_num_textures,
cudaMemcpyDeviceToHost));
// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
int num_nonoverlap_pixels = m_tile_texture_layers * m_tpParams.tilesy * 16 * m_tpParams.tilesx * 16;
float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float));
for (int i = 0; i < num_nonoverlap_pixels; i++){
non_overlap_layers[i] = NAN;
}
for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
int ntile = m_texture_indices[itile] >> TEXT_NTILE_SHIFT;
int tileX = ntile % m_tpParams.tilesx;
int tileY = ntile / m_tpParams.tilesx;
for (int ilayer = 0; ilayer < m_tile_texture_layers; ilayer++){
int src_index0 = itile * m_tile_texture_size + 256 * ilayer;
int dst_index0 = ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256) + (tileY * 16) * (16 * m_tpParams.tilesx) + (tileX * 16);
for (int iy = 0; iy < 16; iy++){
[[maybe_unused]] int src_index1 = src_index0 + 16 * iy;
[[maybe_unused]] int dst_index1 = dst_index0 + iy * (16 * m_tpParams.tilesx);
for (int ix = 0; ix < 16; ix++){
int src_index= itile * m_tile_texture_size + 256 * ilayer + 16 * iy + ix;
int dst_index = ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256) + (tileY * 16 + iy) * (16 * m_tpParams.tilesx) + (tileX * 16) + ix;
non_overlap_layers[dst_index] = cpu_textures[src_index];
}
}
}
}
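// (illustrative only: the nested loops above implement this destination-index
// mapping; dst_pixel is a hypothetical helper, not used by the test)
auto dst_pixel = [&](int ilayer, int ty, int tx, int iy, int ix) -> int {
	return ilayer * (m_tpParams.tilesx * m_tpParams.tilesy * 256)
			+ (ty * 16 + iy) * (16 * m_tpParams.tilesx) + (tx * 16 + ix);
};
(void) dst_pixel;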
int ntiles = m_tpParams.tilesx * m_tpParams.tilesy;
int nlayers = m_tpParams.num_cams * (m_tpParams.num_colors + 1);
int diff_rgb_combo_size = ntiles * nlayers;
float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
cpu_diff_rgb_combo,
m_gpu_diff_rgb_combo,
diff_rgb_combo_size * sizeof(float),
cudaMemcpyDeviceToHost));
float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size * sizeof(float));
for (int nl = 0; nl < nlayers; nl++){
for (int ntile = 0; ntile < ntiles; ntile++){
cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
}
}
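// (the loop above transposes cpu_diff_rgb_combo from the GPU's per-tile
// interleaved [ntile][nlayer] order into nlayers contiguous planes of ntiles
// values each, so every layer can be viewed as one tilesx x tilesy low-res image)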
if (!use_dp) {
printf("Writing phase texture data to %s\n", m_tpPaths.result_textures_file);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
m_tpPaths.result_textures_file); // const char * path) // file path
printf("Writing low-res data to %s\n", m_tpPaths.result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
m_tpPaths.result_diff_rgb_combo_file); // const char * path) // file path
} else { // if (!use_dp) {
printf("Writing phase texture data to %s\n", m_tpPaths.result_textures_file_dp);
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
m_tpPaths.result_textures_file_dp); // const char * path) // file path
printf("Writing low-res data to %s\n", m_tpPaths.result_diff_rgb_combo_file_dp);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
m_tpPaths.result_diff_rgb_combo_file_dp); // const char * path) // file path
} // if (!use_dp) { else
if ( m_tpParams.debug_tile) {
if (debug) {
int texture_offset = m_tpParams.dbg_tile * m_tile_texture_size;
int chn = 0;
for (int i = 0; i < m_tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
}
} // if ( m_tpParams.debug_tile) {
free(non_overlap_layers);
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
}
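// Hedged sketch (not in this commit): the cudaFuncSetAttribute() calls above are
// the CC 7.x+ opt-in for dynamic shared memory beyond the default 48 KB. A
// checked variant could look like this; set_dynamic_shmem is a hypothetical
// name, and device 0 is assumed:
static cudaError_t set_dynamic_shmem(const void * kernel, int shared_size){
	cudaDeviceProp prop;
	cudaError_t err = cudaGetDeviceProperties(&prop, 0);
	if (err != cudaSuccess) return err;
	if (shared_size > (int) prop.sharedMemPerBlockOptin){
		return cudaErrorInvalidValue; // request exceeds what the device can opt in to
	}
	return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size);
}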
void TpHostGpu::testTexturesRGBA (
int num_runs,
int use_dp,
int debug){ // DEBUG11 // 1669-1810
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_texture_indices){
throw std::runtime_error("Error: m_gpu_texture_indices is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_num_texture_tiles){
throw std::runtime_error("Error: m_gpu_num_texture_tiles is not initialized ");
}
if (!m_gpu_textures_rbga){
throw std::runtime_error("Error: m_gpu_textures_rbga is not initialized ");
}
if (!m_gpu_generate_RBGA_params){
throw std::runtime_error("Error: m_gpu_generate_RBGA_params is not initialized ");
}
if (!m_gpu_twh){
throw std::runtime_error("Error: m_gpu_twh is not initialized ");
}
if (!m_gpu_woi){
throw std::runtime_error("Error: m_gpu_woi is not initialized ");
}
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
// FIXME: update to use new correlations and num_cams
//#ifdef NO_DP
if (!use_dp) {
generate_RBGA_host (
m_tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// Does not require gpu_texture_indices to be initialized - just allocated; will generate.
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
m_gpu_woi, // int * woi, // x,y,width,height of the woi
m_tpParams.tilesx, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
m_tpParams.tilesy, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_tpParams.generate_RBGA_params, // float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
m_gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
} else { // if (!use_dp) {
//#else
int shared_size = host_get_textures_shared_size( // in bytes
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.texture_colors, // colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
generate_RBGA<<<1,1>>> (
m_tpParams.num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
// Does not require gpu_texture_indices to be initialized - just allocated; will generate.
m_gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
m_gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
m_gpu_woi, // int * woi, // x,y,width,height of the woi
m_tpParams.tilesx, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
m_tpParams.tilesy, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
m_gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_tpParams.texture_colors, // int colors, // number of colors (3/1)
(m_tpParams.texture_colors == 1), // int is_lwir, // do not perform shot correction
m_gpu_generate_RBGA_params,
m_gpu_color_weights, // float weights[3], // scale for R
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
m_tpParams.keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
m_gpu_textures_rbga, // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
m_gpu_twh); // int * twh)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
} // if (use_dp) // else
//#endif
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
int cpu_woi[4];
checkCudaErrors(cudaMemcpy(
cpu_woi,
m_gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use a larger array (4 pixels on each side); switch to cudaMemcpy2DFromArray() later
int rgba_woi_width = (cpu_woi[2] + 1) * m_tpParams.dtt_size;
int rgba_woi_height = (cpu_woi[3] + 1) * m_tpParams.dtt_size;
int rgba_width = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
int rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
int rbga_slices = m_tpParams.texture_colors + 1; // 4/1
if (m_tpParams.keep_texture_weights & 2){
rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
}
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
m_gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
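// (cudaMemcpy2D above copies rgba_height * rbga_slices rows of rgba_width
// floats each; dstride_textures_rbga is the device pitch in bytes and
// rgba_width * sizeof(float) is the host pitch)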
printf("Import raw, real, little-endian, 18 images 648x520\n");
if (!use_dp) {
printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
m_tpPaths.result_textures_rgba_file); // const char * path) // file path
} else { // if (use_dp) {
printf("Writing RBGA texture slices to %s\n", m_tpPaths.result_textures_rgba_file_dp);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
m_tpPaths.result_textures_rgba_file_dp); // const char * path) // file path
} // if (use_dp) { // else
if ( m_tpParams.debug_tile) {
if (debug) {
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
}
}
free(cpu_textures_rgba);
}
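// A small sketch (not part of the commit) of the RBGA buffer geometry used
// above. Texture tiles are 16x16 and overlap by 8 pixels, so the assembled
// image carries one extra DTT_SIZE (8-pixel) margin per dimension; for the
// 648x520 test images that gives (80 + 1) * 8 = 648 and (64 + 1) * 8 = 520.
static inline int rgbaImageSide(int tiles, int dtt_size){
	return (tiles + 1) * dtt_size;            // matches rgba_width / rgba_height above
}
static inline int rgbaSliceCount(int colors, int num_cams, int keep_weights){
	int slices = colors + 1;                  // color planes plus alpha
	if (keep_weights & 2){
		slices += colors * num_cams;          // extra per-camera planes (when keep_weights & 2)
	}
	return slices;
}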
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
@@ -1285,6 +1713,278 @@ float * TpHostGpu::getCorrTdImg(
return corr_img;
}
void TpHostGpu::generate_RBGA_host(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * gpu_woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
const float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
const float weights[3], // scale for R,B,G should be host_array, not gpu
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
const int texture_rbga_stride, // in floats
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int cpu_woi[4];
int cpu_num_texture_tiles[8];
float min_shot = cpu_params[0]; // 10.0
float scale_shot = cpu_params[1]; // 3.0
float diff_sigma = cpu_params[2]; // pixel value/pixel change
float diff_threshold = cpu_params[3]; // pixel value/pixel change
float min_agree = cpu_params[4]; // minimal number of channels to agree on a point (real number to work with fuzzy averages)
int tilesya = ((height + 3) & (~3)); //#define TILESYA ((TILESY + 3) & (~3))
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
clear_texture_list<<<blocks0,threads0>>>(
gpu_texture_indices,
width,
height);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + (1 << THREADS_DYNAMIC_BITS) - 1) >> THREADS_DYNAMIC_BITS;
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
checkCudaErrors(cudaDeviceSynchronize());
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
cpu_woi[0] = width;
cpu_woi[1] = height;
cpu_woi[2] = 0;
cpu_woi[3] = 0;
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
/*
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
int * gpu_woi = (int *) copyalloc_kernel_gpu(
(float * ) woi,
4); // number of elements
*/
// TODO: create gpu_woi to pass (copy from woi)
// set lower 4 bits in each gpu_ftasks task
mark_texture_neighbor_tiles <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_woi); // min_x, min_y, max_x, max_y
checkCudaErrors(cudaDeviceSynchronize());
/*
checkCudaErrors(cudaMemcpy( //
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
*/
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
for (int i = 0; i <8; i++){
cpu_num_texture_tiles[i] = 0;
}
/*
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
*/
// copy zeroed num_texture_tiles
// int * gpu_num_texture_tiles = (int *) copyalloc_kernel_gpu(
// (float * ) num_texture_tiles,
// 8); // number of elements
checkCudaErrors(cudaMemcpy(
gpu_num_texture_tiles,
cpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyHostToDevice));
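// (equivalent device-side clear, assuming gpu_num_texture_tiles is a plain
// cudaMalloc'ed int[8] buffer - avoids the host round-trip:
// checkCudaErrors(cudaMemset(gpu_num_texture_tiles, 0, 8 * sizeof(int))); )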
gen_texture_list <<<blocks,threads>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // int height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // number of texture tiles to process
gpu_woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
// copy gpu_woi back to host woi
checkCudaErrors(cudaMemcpy(
(float * ) cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
// *(cpu_woi + 2) += 1 - *(cpu_woi + 0); // width (was min and max)
// *(cpu_woi + 3) += 1 - *(cpu_woi + 1); // height (was min and max)
cpu_woi[2] += 1 - cpu_woi[0]; // width (was min and max)
cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
// copy host-modified data back to GPU
checkCudaErrors(cudaMemcpy(
gpu_woi,
cpu_woi,
4 * sizeof(float),
cudaMemcpyHostToDevice));
// copy gpu_num_texture_tiles back to host num_texture_tiles
checkCudaErrors(cudaMemcpy(
(float * ) cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float),
cudaMemcpyDeviceToHost));
// Zero output textures. Trim
// texture_rbga_stride
// int texture_width = (*(cpu_woi + 2) + 1) * DTT_SIZE;
// int texture_tiles_height = (*(cpu_woi + 3) + 1) * DTT_SIZE;
int texture_width = (cpu_woi[2] + 1) * DTT_SIZE;
int texture_tiles_height = (cpu_woi[3] + 1) * DTT_SIZE;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
#ifdef DEBUG8A
int cpu_texture_indices [TILESX*TILESYA];
checkCudaErrors(cudaMemcpy(
(float * ) cpu_texture_indices,
gpu_texture_indices,
TILESX*TILESYA * sizeof(float),
cudaMemcpyDeviceToHost));
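// (the constants below assume TILESX == 80, matching the 648x520 test images:
// each entry packs (tile << 8) | bits, so (indx >> 8) / 80 and (indx >> 8) % 80
// recover tileY and tileX, and indx & 0xff holds the neighbor/list bits)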
for (int i = 0; i < 256; i++){
int indx = cpu_texture_indices[i];
printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
}
#endif // #ifdef DEBUG8A
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles);
// Run 8 times - first 4 passes over inner tiles at the four 1-tile offsets (w/o verifying margins), then 4 passes with verification,
// ignoring the 4-pixel oversize (border 16x16 tiles overhang by 4 pixels)
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
/* before CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
*/
// for CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
#ifdef DEBUG8A
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
(void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n2. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_weights, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
texture_rbga_stride, // size_t texture_rbg_stride, // in floats
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
(float *) 0, // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width);
checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
/* */
}
// checkCudaErrors(cudaFree(gpu_woi));
// checkCudaErrors(cudaFree(gpu_num_texture_tiles));
// __syncthreads();
}
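// Sketch (not part of the commit) of the counter layout the 8-pass loop above
// assumes: gpu_num_texture_tiles holds 8 ints indexed as
// ((pass & 3) << 1) + border_tile, i.e. four one-tile offsets, each with an
// inner (0) and a border (1) count.
static inline int textureCounterIndex(int pass){
	int border_tile = pass >> 2;              // passes 4..7 handle border tiles
	return ((pass & 3) << 1) + border_tile;   // matches the ntt / pntt selection above
}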
void TpHostGpu::hfree(float *& p) {if (p) free(p); p = nullptr;}
void TpHostGpu::hfree(struct CltExtra *& p) {if (p) free(p); p = nullptr;}
void TpHostGpu::gfree(float *& p) {if (p) checkCudaErrors(cudaFree(p)); p = nullptr;}
@@ -130,11 +130,14 @@ public:
void testImcltRbgAll (int num_runs); // 701
void testCorrelate2DIntra (int num_runs); // 762 - 885
void testCorrelate2DIntraTD (int num_runs, int quad_combine); // 886 - 1123
void testCorrelate2DInterSelf(int num_runs); // 1136 - 1411
void testTextures (int num_runs, int use_dp, int debug); // 1422-1664
void testTexturesRGBA (int num_runs, int use_dp, int debug); // 1669-1810
private:
void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
void saveRgb(const char ** paths, const char * prompt, float ** gpu_corr_images_h);
@@ -144,9 +147,32 @@ public:
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
private:
float * getCorrImg (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
void generate_RBGA_host(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * gpu_woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
const float cpu_params[5], // mitigating CUDA_ERROR_INVALID_PTX
const float weights[3], // scale for R,B,G should be host_array, not gpu
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
const int texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
void hfree(float *& p); // {if (p) free(p); p = nullptr;}
void hfree(struct CltExtra *& p);
void gfree(float *& p);
@@ -51,6 +51,9 @@ public:
static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
static constexpr int convert_direct_indexing_threads = CONVERT_DIRECT_INDEXING_THREADS; //
static constexpr int convert_direct_indexing_threads_log2 = CONVERT_DIRECT_INDEXING_THREADS_LOG2; //
#ifdef DBG_TILE
static constexpr int debug_tile{1};
static constexpr int dbg_tile{DBG_TILE};
@@ -33,7 +33,7 @@
// all of the next 5 were disabled
//#define NOCORR
//#define NOCORR_TD
#define NOTEXTURES //
//#define NOTEXTURES //
#define NOTEXTURE_RGBA //
//#define NOTEXTURE_RGBAXXX //
@@ -1421,8 +1421,8 @@ int main(int argc, char **argv)
// -----------------
#ifndef NOTEXTURES
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
@@ -1446,7 +1446,7 @@ int main(int argc, char **argv)
tpParams.num_cams, // int num_cams, // actual number of cameras
tpParams.texture_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, texture_colors);
printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, tpParams.num_cams, tpParams.texture_colors);
//*pnum_texture_tiles = 0;
cpu_pnum_texture_tiles = 0;
checkCudaErrors(cudaMemcpy(
@@ -1653,9 +1653,9 @@ int main(int argc, char **argv)
printf(" ");
}
}
#endif // DEBUG9
#endif // DEBUG10
#endif //#ifdef DBG_TILE
#endif
#endif // #ifndef NSAVE_TEXTURES
free(cpu_textures);
free (cpu_diff_rgb_combo);
free (cpu_diff_rgb_combo_out);
@@ -1664,6 +1664,8 @@ int main(int argc, char **argv)
#endif //NOTEXTURES
#ifndef NOTEXTURE_RGBAXXX
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
@@ -1805,7 +1807,7 @@ int main(int argc, char **argv)
#endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES
#endif // ifndef NOTEXTURE_RGBAXXX