GPU code debugging, fixed right/bottom on dewarped images, tested

textures with LWIR

GPU code debugging, fixed right/bottom on dewarped images, tested
textures with LWIR
46612695 · Andrey Filippov · 78a62e6f · 46612695 · 46612695 · 46612695
Commit 46612695 authored Dec 11, 2021 by Andrey Filippov
6 changed files
--- a/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
+++ b/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
@@ -127,7 +127,7 @@ public class CLTParameters {
 	public double     min_shot =          10.0;  // Do not adjust for shot noise if lower than
 	public double     scale_shot =        3.0;   // scale when dividing by sqrt
-	public double     diff_sigma =        5.0;   // RMS difference from average to reduce weights (~ 1.0 - 1/255 full scale image)
+	public double     diff_sigma =        5.0;   // RMS difference from average to reduce weights (~ 1.0 - 1/255 full scale image) (1.5 for RGB, 10 for LWIR?) 
 	public double     diff_threshold =    1.5;   // RMS difference from average to discard channel (~ 1.0 - 1/255 full scale image)
 	public boolean    diff_gauss =        true;  // when averaging images, use Gaussian around average as weight (false - sharp all/nothing)
 	public double     min_agree =         3.0;   // minimal number of channels to agree on a point (real number to work with fuzzy averages)

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -88,7 +88,17 @@ public class GPUTileProcessor {
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets";   // calculate pixel offsets and disparity distortions
 	static String GPU_CALCULATE_TILES_OFFSETS =    "calculate_tiles_offsets";   // calculate pixel offsets and disparity distortions
 	static String GPU_CALC_REVERSE_DISTORTION =    "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction
+	// Kernels to use w/o Dynamic Parallelism    
+	static String GPU_CLEAR_TEXTURE_LIST_NAME =    "clear_texture_list";
+	static String GPU_MARK_TEXTURE_LIST_NAME =     "mark_texture_tiles";
+	static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
+	static String GPU_GEN_TEXTURE_LIST_NAME =      "gen_texture_list";
+	static String GPU_CLEAR_TEXTURE_RBGA_NAME =    "clear_texture_rbga";
+	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";
+	static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
 //  pass some defines to gpu source code with #ifdef JCUDA
 	public static int DTT_SIZE_LOG2 =             3;
 	public static int DTT_SIZE =                  (1 << DTT_SIZE_LOG2);
@@ -153,9 +163,18 @@ public class GPUTileProcessor {
    CUfunction GPU_TEXTURES_kernel =                null; // "textures_nonoverlap"
    CUfunction GPU_RBGA_kernel =                    null; // "generate_RBGA"
    CUfunction GPU_ROT_DERIV_kernel =               null; // "calc_rot_deriv"
-//    private CUfunction GPU_SET_TILES_OFFSETS_kernel =       null; // "get_tiles_offsets"
    CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null; // "calculate_tiles_offsets"
    CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null; // "calcReverseDistortionTable"
+// Kernels to use w/o Dynamic Parallelism    
+    CUfunction GPU_CLEAR_TEXTURE_LIST_kernel =      null; // "clear_texture_list"
+    CUfunction GPU_MARK_TEXTURE_LIST_kernel =       null; // "mark_texture_tiles"
+    CUfunction GPU_MARK_TEXTURE_NEIGHBOR_kernel =   null; // "mark_texture_neighbor_tiles"
+    CUfunction GPU_GEN_TEXTURE_LIST_kernel =        null; // "gen_texture_list"
+    CUfunction GPU_CLEAR_TEXTURE_RBGA_kernel =      null; // "clear_texture_rbga"
+    CUfunction GPU_TEXTURES_ACCUMULATE_kernel =     null; // "textures_accumulate"
+    CUfunction GPU_CREATE_NONOVERLAP_LIST_kernel =  null; // "create_nonoverlap_list"
    CUmodule    module; // to access constants memory
    //    private
@@ -295,9 +314,16 @@ public class GPUTileProcessor {
        		GPU_TEXTURES_NAME,
        		GPU_RBGA_NAME,
        		GPU_ROT_DERIV,
-//        		GPU_SET_TILES_OFFSETS,
        		GPU_CALCULATE_TILES_OFFSETS,
-        		GPU_CALC_REVERSE_DISTORTION
+        		GPU_CALC_REVERSE_DISTORTION,
+        		// Kernels to use w/o Dynamic Parallelism    
+        		GPU_CLEAR_TEXTURE_LIST_NAME,
+        		GPU_MARK_TEXTURE_LIST_NAME,
+        		GPU_MARK_TEXTURE_NEIGHBOR_NAME,
+        		GPU_GEN_TEXTURE_LIST_NAME,
+        		GPU_CLEAR_TEXTURE_RBGA_NAME,
+        		GPU_TEXTURES_ACCUMULATE_NAME,
+        		GPU_CREATE_NONOVERLAP_LIST_NAME
        };
        CUfunction[] functions = createFunctions(kernelSources,
        		                                 func_names,
@@ -311,9 +337,17 @@ public class GPUTileProcessor {
        GPU_TEXTURES_kernel=                 functions[5];
        GPU_RBGA_kernel=                     functions[6];
        GPU_ROT_DERIV_kernel =               functions[7];
-//        GPU_SET_TILES_OFFSETS_kernel =       functions[8];
        GPU_CALCULATE_TILES_OFFSETS_kernel = functions[8];
        GPU_CALC_REVERSE_DISTORTION_kernel = functions[9];
+     // Kernels to use w/o Dynamic Parallelism    
+        GPU_CLEAR_TEXTURE_LIST_kernel =      functions[10];
+        GPU_MARK_TEXTURE_LIST_kernel =       functions[11];
+        GPU_MARK_TEXTURE_NEIGHBOR_kernel =   functions[12];
+        GPU_GEN_TEXTURE_LIST_kernel =        functions[13];
+        GPU_CLEAR_TEXTURE_RBGA_kernel =      functions[14];
+        GPU_TEXTURES_ACCUMULATE_kernel =     functions[15];
+        GPU_CREATE_NONOVERLAP_LIST_kernel =  functions[16];
        System.out.println("GPU kernel functions initialized");
        System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
@@ -324,10 +358,16 @@ public class GPUTileProcessor {
        System.out.println(GPU_TEXTURES_kernel.toString());
        System.out.println(GPU_RBGA_kernel.toString());
        System.out.println(GPU_ROT_DERIV_kernel.toString());
-//        System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
        System.out.println(GPU_CALCULATE_TILES_OFFSETS_kernel.toString());
        System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());
+        // Kernels to use w/o Dynamic Parallelism    
+        System.out.println(GPU_CLEAR_TEXTURE_LIST_kernel.toString());
+        System.out.println(GPU_MARK_TEXTURE_LIST_kernel.toString());
+        System.out.println(GPU_MARK_TEXTURE_NEIGHBOR_kernel.toString());
+        System.out.println(GPU_GEN_TEXTURE_LIST_kernel.toString());
+        System.out.println(GPU_CLEAR_TEXTURE_RBGA_kernel.toString());
+        System.out.println(GPU_TEXTURES_ACCUMULATE_kernel.toString());
+        System.out.println(GPU_CREATE_NONOVERLAP_LIST_kernel.toString());
        // GPU data structures are now initialized through GpuQuad instances
    }
@@ -475,6 +515,7 @@ public class GPUTileProcessor {
    	for (int i = 0; i < kernelNames.length; i++) {
    		// Find the function in the source by name, get its pointer
    		functions[i] = new CUfunction();
+    		System.out.println("Looking for GPU kernel ["+i+"]: "+kernelNames[i]);
    		cuModuleGetFunction(functions[i] , module, kernelNames[i]);
    	}
    	return functions;

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -84,6 +84,7 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_color_weights;
 	private CUdeviceptr gpu_generate_RBGA_params;
 	private CUdeviceptr gpu_woi;
+	private CUdeviceptr gpu_num_texture_tiles;
 	private CUdeviceptr gpu_textures_rgba;
 	private CUdeviceptr gpu_correction_vector;
 	private CUdeviceptr gpu_rot_deriv;
@@ -108,6 +109,41 @@ public class GpuQuad{ // quad camera description
 	private boolean geometry_correction_set = false;
 	private boolean geometry_correction_vector_set = false;
 	public  int gpu_debug_level = 1;
+	int host_get_textures_shared_size( // in bytes
+			//__device__ int get_textures_shared_size( // in bytes
+			int                num_cams,     // actual number of cameras
+			int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			int []             offsets){     // in floats
+		int DTT_SIZE1 =       GPUTileProcessor.DTT_SIZE + 1;
+		int DTT_SIZE2 =       2 * GPUTileProcessor.DTT_SIZE;
+		int DTT_SIZE21 =      DTT_SIZE2 + 1;
+		int offs = 0;
+		if (offsets != null) offsets[0] = offs;
+		offs += num_cams * num_colors * 2 * GPUTileProcessor.DTT_SIZE * DTT_SIZE21; //float mclt_tiles         [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
+		if (offsets != null) offsets[1] = offs;
+		offs += num_cams * num_colors * 4 * GPUTileProcessor.DTT_SIZE * DTT_SIZE1;  // float clt_tiles         [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
+		if (offsets != null) offsets[2] = offs;
+		int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;                // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
+		int rgbaw_size =    (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21;     // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
+		offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
+		if (offsets != null) offsets[3] = offs;
+		offs += num_cams * 2;                                      // float port_offsets      [NUM_CAMS][2];
+		if (offsets != null) offsets[4] = offs;
+		offs += num_colors * num_cams;                             // float ports_rgb_shared  [NUM_COLORS][NUM_CAMS];
+		if (offsets != null) offsets[5] = offs;
+		offs += num_cams;                                          // float max_diff_shared   [NUM_CAMS];
+		if (offsets != null) offsets[6] = offs;
+		offs += num_cams * GPUTileProcessor.TEXTURE_THREADS_PER_TILE;               // float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
+		if (offsets != null) offsets[7] = offs;
+		offs += num_colors * num_cams *  GPUTileProcessor.TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
+		if (offsets != null) offsets[8] = offs;
+		return Sizeof.INT * offs; // shared_floats;
+	}
 	// should only be updated with the same cameras instance
 	public void updateQuadCLT(final QuadCLT quadCLT) {
 		this.quadCLT =      quadCLT;
@@ -183,6 +219,8 @@ public class GpuQuad{ // quad camera description
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers
 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
 		gpu_correction_vector=    new CUdeviceptr();
@@ -285,6 +323,7 @@ public class GpuQuad{ // quad camera description
 		cuMemAlloc(gpu_woi,                               4 * Sizeof.FLOAT);
+		cuMemAlloc(gpu_num_texture_tiles,                 8 * Sizeof.FLOAT);
 		cuMemAlloc(gpu_num_texture_ovlp,                  8 * Sizeof.FLOAT);
 		cuMemAlloc(gpu_texture_indices_len,               1 * Sizeof.FLOAT);
@@ -1575,6 +1614,25 @@ public class GpuQuad{ // quad camera description
 		cuCtxSynchronize();
 	}
+	public void execRBGA(
+			double [] color_weights,
+			boolean   is_lwir,
+			double    min_shot,           // 10.0
+			double    scale_shot,         // 3.0
+			double    diff_sigma,         // pixel value/pixel change
+			double    diff_threshold,     // pixel value/pixel change
+			double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			boolean   dust_remove) {
+		execRBGA_noDP(
+				color_weights,  // double [] color_weights,
+				is_lwir,        // boolean   is_lwir,
+				min_shot,       // double    min_shot,           // 10.0
+				scale_shot,     // double    scale_shot,         // 3.0
+				diff_sigma,     // double    diff_sigma,         // pixel value/pixel change
+				diff_threshold, // double    diff_threshold,     // pixel value/pixel change
+				min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+				dust_remove);   // boolean   dust_remove);
+	}
 	/**
 	 * Generate combined (overlapping) texture
 	 * @param color_weights - [3] (RGB) or [1] (mono) color weights for matching
@@ -1586,7 +1644,7 @@ public class GpuQuad{ // quad camera description
 	 * @param min_agree  minimal number of channels to agree on a point (real number to work with fuzzy averages
 	 * @param dust_remove do not reduce average weight when only one image differs much from the average
 	 */
-	public void execRBGA(
+	public void execRBGA_DP(
 			double [] color_weights,
 			boolean   is_lwir,
 			double    min_shot,           // 10.0
@@ -1600,7 +1658,8 @@ public class GpuQuad{ // quad camera description
 			IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
 			return;
 		}
-		int num_colors = color_weights.length;
+//		int num_colors = color_weights.length;
+		int num_colors = is_lwir? 1 : color_weights.length;
 		if (num_colors > 3) num_colors = 3;
 		float [] fcolor_weights = new float[3];
 		fcolor_weights[0] = (float) color_weights[0];
@@ -1657,6 +1716,352 @@ public class GpuQuad{ // quad camera description
 				kernelParameters, null);   // Kernel- and extra parameters
 		cuCtxSynchronize();
 	}
+	/**
+	 * Generate combined (overlapping) texture
+	 * Version w/o Dynamic Parallelism to pass shared memory size. May be removed if maximal SM issue will be resolved
+	 * @param color_weights - [3] (RGB) or [1] (mono) color weights for matching
+	 * @param is_lwir ignore shot noise normalization
+	 * @param min_shot - minimal value to use sqrt() for shot noise normalization (default - 10)
+	 * @param scale_shot scale shot noise (3.0)
+	 * @param diff_sigma pixel value/pixel change (1.5) - normalize channel values difference by the offsets of the cameras 
+	 * @param diff_threshold - never used?
+	 * @param min_agree  minimal number of channels to agree on a point (real number to work with fuzzy averages
+	 * @param dust_remove do not reduce average weight when only one image differs much from the average
+	 */
+	public void execRBGA_noDP(
+			double [] color_weights,
+			boolean   is_lwir,
+			double    min_shot,           // 10.0
+			double    scale_shot,         // 3.0
+			double    diff_sigma,         // pixel value/pixel change
+			double    diff_threshold,     // pixel value/pixel change
+			double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			boolean   dust_remove) {
+		execCalcReverseDistortions(); // will check if it is needed first
+		if (this.gpuTileProcessor.GPU_RBGA_kernel == null) {
+			IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
+			return;
+		}
+		int num_colors = is_lwir? 1 : color_weights.length;
+		if (num_colors > 3) num_colors = 3;
+		float [] fcolor_weights = new float[3];
+		fcolor_weights[0] = (float) color_weights[0];
+		fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
+		fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
+		double sw = 0.0; for (int i = 0; i < fcolor_weights.length; i++) sw+= fcolor_weights[i];
+		for (int i = 0; i < fcolor_weights.length; i++) fcolor_weights[i]/=sw;
+		cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);
+		float [] generate_RBGA_params = {
+				(float)             min_shot,           // 10.0
+				(float)             scale_shot,         // 3.0
+				(float)             diff_sigma,         // pixel value/pixel change
+				(float)             diff_threshold,     // pixel value/pixel change
+				(float)             min_agree          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		};
+		cuMemcpyHtoD(gpu_generate_RBGA_params, Pointer.to(generate_RBGA_params),  generate_RBGA_params.length * Sizeof.FLOAT);
+		int iis_lwir =      (is_lwir)? 1:0;
+		int idust_remove =  (dust_remove)? 1 : 0;
+		int width =  img_width / GPUTileProcessor.DTT_SIZE;
+		int height = img_height / GPUTileProcessor.DTT_SIZE;
+		// Testing parameters
+		TpTask [] test_tasks0 = new TpTask[width * height];
+		float [] test_ftasks0 = new float [test_tasks0.length * getTaskSize()];
+		cuMemcpyDtoH(Pointer.to(test_ftasks0),            gpu_ftasks,  test_ftasks0.length * Sizeof.FLOAT);
+		for (int i = 0; i < test_tasks0.length; i++) {
+			test_tasks0[i] = new TpTask(num_cams, test_ftasks0, i, false);	
+		}
+		//		woi.x =      Float.floatToIntBits(fwoi[0]) * GPUTileProcessor.DTT_SIZE;
+		//		woi.y =      Float.floatToIntBits(fwoi[1]) * GPUTileProcessor.DTT_SIZE;
+		//		woi.width =  Float.floatToIntBits(fwoi[2]) * GPUTileProcessor.DTT_SIZE;
+		//		woi.height = Float.floatToIntBits(fwoi[3]) * GPUTileProcessor.DTT_SIZE;
+		int blocks_x = (width + ((1 << GPUTileProcessor.THREADS_DYNAMIC_BITS) - 1)) >> GPUTileProcessor.THREADS_DYNAMIC_BITS;
+		int []  blocks0 =  {blocks_x, height, 1};
+		int []  threads0 = {(1 << GPUTileProcessor.THREADS_DYNAMIC_BITS), 1, 1};
+		/*
+	    clear_texture_list<<<blocks0,threads0>>>(
+	    		gpu_texture_indices,
+				width,
+				height);
+		 */
+		Pointer kp_clear_texture_list = Pointer.to(
+				Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices,
+				Pointer.to(new int[] {width}),
+				Pointer.to(new int[] {height}));
+		cuCtxSynchronize();
+		// Call the kernel function
+		cuLaunchKernel(this.gpuTileProcessor.GPU_CLEAR_TEXTURE_LIST_kernel,
+				blocks0[0],  blocks0[1],  blocks0[2], // Grid dimension
+				threads0[0], threads0[1], threads0[2],// Block dimension
+				0, null,                              // Shared memory size and stream (shared - only dynamic, static is in code)
+				kp_clear_texture_list, null);         // Kernel- and extra parameters
+		cuCtxSynchronize();
+		int blocks_t =    (num_task_tiles + ((1 << GPUTileProcessor.THREADS_DYNAMIC_BITS)) -1) >> GPUTileProcessor.THREADS_DYNAMIC_BITS;//
+		int []  blocks =  {blocks_t, 1, 1};
+		int []  threads = {(1 << GPUTileProcessor.THREADS_DYNAMIC_BITS), 1, 1};
+		// mark used tiles in gpu_texture_indices memory
+		/*
+	    mark_texture_tiles <<<blocks,threads>>>(
+	    		num_cams,           // int                num_cams,
+				gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+				num_tiles,          // number of tiles in task list
+				width,              // number of tiles in a row
+				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+		 */
+		Pointer kp_mark_texture_tiles = Pointer.to(
+				Pointer.to(new int[] {num_cams}),         // int                num_cams,
+				Pointer.to(gpu_ftasks),                   // float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+				Pointer.to(new int[] { num_task_tiles }), // int                num_tiles,          // number of tiles in task list
+				Pointer.to(new int[] {width}),
+				Pointer.to(gpu_texture_indices_ovlp));    // gpu_texture_indices,
+		cuLaunchKernel(this.gpuTileProcessor.GPU_MARK_TEXTURE_LIST_kernel,
+				blocks[0],  blocks[1],  blocks[2],        // Grid dimension
+				threads[0], threads[1], threads[2],       // Block dimension
+				0, null,                                  // Shared memory size and stream (shared - only dynamic, static is in code)
+				kp_mark_texture_tiles, null);             // Kernel- and extra parameters
+		cuCtxSynchronize();
+		int [] cpu_woi =               new int[4];
+		int [] cpu_num_texture_tiles = new int[8];
+		//		cuMemcpyDtoH(Pointer.to(cpu_woi), gpu_woi,  cpu_woi.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
+		cpu_woi[0] = width;
+		cpu_woi[1] = height;
+		cpu_woi[2] = 0;
+		cpu_woi[3] = 0;
+		cuMemcpyHtoD(gpu_woi, Pointer.to(cpu_woi),  cpu_woi.length * Sizeof.INT);
+//		cuMemcpyDtoH(Pointer.to(cpu_woi), gpu_woi,  cpu_woi.length * Sizeof.INT); //just for testing		
+		/*
+    // TODO: create gpu_woi to pass	(copy from woi)
+    // set lower 4 bits in each gpu_ftasks task
+    mark_texture_neighbor_tiles <<<blocks,threads>>>(
+    		num_cams,           // int                num_cams,
+			gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			num_tiles,           // number of tiles in task list
+			width,               // number of tiles in a row
+			height,              // number of tiles rows
+			gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+			gpu_woi);                // min_x, min_y, max_x, max_y
+	checkCudaErrors(cudaDeviceSynchronize());
+		 */
+		Pointer kp_mark_texture_neighbor_tiles = Pointer.to(
+				Pointer.to(new int[] {num_cams}),         // int      num_cams,
+				Pointer.to(gpu_ftasks),                   // float  * gpu_ftasks,          // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+				Pointer.to(new int[] { num_task_tiles }), // int      num_tiles,           // number of tiles in task list
+				Pointer.to(new int[] {width}),            // int      width,               // number of tiles in a row
+				Pointer.to(new int[] {height}),           // int      height,              // number of tiles rows
+				Pointer.to(gpu_texture_indices_ovlp),     // int    * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+				Pointer.to(gpu_woi));                     // int    * woi,                 // min_x, min_y, max_x, max_y
+		cuLaunchKernel(this.gpuTileProcessor.GPU_MARK_TEXTURE_NEIGHBOR_kernel,
+				blocks[0],  blocks[1],  blocks[2],        // Grid dimension
+				threads[0], threads[1], threads[2],       // Block dimension
+				0, null,                                  // Shared memory size and stream (shared - only dynamic, static is in code)
+				kp_mark_texture_neighbor_tiles, null);    // Kernel- and extra parameters
+		cuCtxSynchronize();
+		//		int [] cpu_num_texture_tiles = new int[8];
+		Arrays.fill(cpu_num_texture_tiles, 0); 
+		cuMemcpyHtoD(gpu_num_texture_tiles, Pointer.to(cpu_num_texture_tiles),  cpu_num_texture_tiles.length * Sizeof.INT);
+		/*
+    gen_texture_list <<<blocks,threads>>>(
+    		num_cams,            // int                num_cams,
+			gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			num_tiles,           // number of tiles in task list
+			width,               // number of tiles in a row
+			height,              // int                height,               // number of tiles rows
+			gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+			gpu_num_texture_tiles,   // number of texture tiles to process
+			gpu_woi);                // x,y, here woi[2] = max_X, woi[3] - max-Y
+	checkCudaErrors(cudaDeviceSynchronize()); // not needed yet, just for testing
+		 */
+//		cuMemcpyHtoD(gpu_woi, Pointer.to(cpu_woi),  cpu_woi.length * Sizeof.INT); // just for testing
+// Testing parameters
+		TpTask [] test_tasks = new TpTask[width * height];
+		float [] test_ftasks = new float [test_tasks.length * getTaskSize()];
+		cuMemcpyDtoH(Pointer.to(test_ftasks),            gpu_ftasks,  test_ftasks.length * Sizeof.FLOAT);
+		for (int i = 0; i < test_tasks.length; i++) {
+			test_tasks[i] = new TpTask(num_cams, test_ftasks, i, false);	
+		}
+		cuMemcpyDtoH(Pointer.to(cpu_num_texture_tiles), gpu_num_texture_tiles,  cpu_num_texture_tiles.length * Sizeof.INT);		
+		cuMemcpyDtoH(Pointer.to(cpu_woi),               gpu_woi,  cpu_woi.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		int tilesX = width;
+		int tilesY = height;
+		int tilesYa = (tilesY + 3) & ~3;
+		int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
+		cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		cuMemcpyHtoD(gpu_texture_indices_ovlp,                           Pointer.to(cpu_texture_indices_ovlp),  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		Pointer kp_gen_texture_list = Pointer.to(
+				Pointer.to(new int[] {num_cams}),         // int      num_cams,
+				Pointer.to(gpu_ftasks),                   // float  * gpu_ftasks,          // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+				Pointer.to(new int[] { num_task_tiles }), // int      num_tiles,           // number of tiles in task list
+				Pointer.to(new int[] {width}),            // int      width,               // number of tiles in a row
+				Pointer.to(new int[] {height}),           // int      height,              // number of tiles rows
+				Pointer.to(gpu_texture_indices_ovlp),     // int    * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+				Pointer.to(gpu_num_texture_tiles),        // int    * gpu_num_texture_tiles,   // number of texture tiles to process
+				Pointer.to(gpu_woi));                     // int    * woi,                 // min_x, min_y, max_x, max_y
+		cuLaunchKernel(this.gpuTileProcessor.GPU_GEN_TEXTURE_LIST_kernel,
+				blocks[0],  blocks[1],  blocks[2],        // Grid dimension
+				threads[0], threads[1], threads[2],       // Block dimension
+				0, null,                                  // Shared memory size and stream (shared - only dynamic, static is in code)
+				kp_gen_texture_list, null);    // Kernel- and extra parameters
+		cuCtxSynchronize();
+		// copy gpu_woi back to host woi
+//		float [] fcpu_woi = new float [4];
+//		cuMemcpyDtoH(Pointer.to(fcpu_woi), gpu_woi,  4 * Sizeof.FLOAT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
+		cuMemcpyDtoH(Pointer.to(cpu_woi), gpu_woi,  cpu_woi.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		cpu_woi[2] += 1 - cpu_woi[0]; // width  (was min and max)
+		cpu_woi[3] += 1 - cpu_woi[1]; // height (was min and max)
+		// copy host-modified data back to GPU
+		cuMemcpyHtoD(gpu_woi, Pointer.to(cpu_woi),  cpu_woi.length * Sizeof.INT);
+		// copy gpu_num_texture_tiles back to host cpu_num_texture_tiles for processing by the CPU code
+		cuMemcpyDtoH(Pointer.to(cpu_num_texture_tiles), gpu_num_texture_tiles,  cpu_num_texture_tiles.length * Sizeof.INT);		
+		// Testing parameters
+		//		int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
+		cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		// Zero output textures. Trim
+		// texture_rbga_stride
+		int texture_width =        (cpu_woi[2] + 1) * GPUTileProcessor.DTT_SIZE;
+		int texture_tiles_height = (cpu_woi[3] + 1) * GPUTileProcessor.DTT_SIZE;
+		int texture_slices =       num_colors + 1;
+		int blocks_x2 = ((texture_width + 
+				((1 << (GPUTileProcessor.THREADS_DYNAMIC_BITS + GPUTileProcessor.DTT_SIZE_LOG2 )) - 1)) >>
+				(GPUTileProcessor.THREADS_DYNAMIC_BITS + GPUTileProcessor.DTT_SIZE_LOG2));
+		int [] blocks2  = {blocks_x2, texture_tiles_height * texture_slices, 1}; // each thread - 8 vertical
+		int [] threads2 = {(1 << GPUTileProcessor.THREADS_DYNAMIC_BITS), 1, 1};
+		/*
+		 clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
+				 texture_width,
+				 texture_tiles_height * texture_slices, // int               texture_slice_height,
+				 texture_rbga_stride,                   // const size_t      texture_rbga_stride,     // in floats 8*stride
+				 gpu_texture_tiles) ;                   // float           * gpu_texture_tiles);
+		 */
+		Pointer kp_clear_texture_rbga = Pointer.to(
+				Pointer.to(new int[] {texture_width}),           // int      texture_width,
+				Pointer.to(new int[] {texture_tiles_height * texture_slices}),// int               texture_slice_height,
+				Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats 8*stride
+				Pointer.to(gpu_textures_rgba));                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		cuLaunchKernel(this.gpuTileProcessor.GPU_CLEAR_TEXTURE_RBGA_kernel,
+				blocks2[0],  blocks2[1],  blocks2[2],   // Grid dimension
+				threads2[0], threads2[1], threads2[2],  // Block dimension
+				0, null,                                // Shared memory size and stream (shared - only dynamic, static is in code)
+				kp_clear_texture_rbga, null);           // Kernel- and extra parameters
+		cuCtxSynchronize();
+		boolean DEBUG8A = true;
+		// Testing parameters
+		//		int [] cpu_texture_indices_ovlp = new int [tilesX * tilesYa];
+		cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+//		cuMemcpyHtoD(gpu_texture_indices_ovlp,                           Pointer.to(cpu_texture_indices_ovlp),  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed		
+		// Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
+		// oversize (border 16x 16 tiles overhang by 4 pixels)
+		int tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
+		for (int pass = 0; pass < 8; pass++){
+			int num_cams_per_thread = GPUTileProcessor.NUM_THREADS / GPUTileProcessor.TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+			int[] threads_texture = {GPUTileProcessor.TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1}; // TEXTURE_TILES_PER_BLOCK, 1);
+			int border_tile =  (pass >> 2);
+			int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile];
+			int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1
+			int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
+			if (border_tile != 0){
+				ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+			}
+			int shared_size = host_get_textures_shared_size( // in bytes
+					num_cams,     // int                num_cams,     // actual number of cameras
+					num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+					null);           // int *              offsets);     // in floats
+			if (DEBUG8A) {
+				System.out.println(String.format("generate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d",
+						pass, border_tile,ti_offset, ntt));
+				//			System.out.println(String.format("generate_RBGA() gpu_texture_indices= %p, gpu_texture_indices + ti_offset= %p\n",
+				//					(void *) gpu_texture_indices, (void *) (gpu_texture_indices + ti_offset));
+				System.out.println(String.format("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
+						grid_texture[0], grid_texture[1], grid_texture[2]));
+				System.out.println(String.format("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
+						threads_texture[0], threads_texture[1], threads_texture[2]));
+				System.out.println ("pass="+pass);
+				for (int i = 0; i < 256; i++){
+					int indx0 = ti_offset + i;
+					if (indx0 < cpu_texture_indices_ovlp.length) {
+						int indx = cpu_texture_indices_ovlp[indx0];
+						System.out.println(String.format("%05x | %02d: %04x %03d %03d %x",indx0, i,indx>>8, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff));
+					}
+				}
+				System.out.println ("\n\n");
+			}
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+			Pointer kp_textures_accumulate = Pointer.to(
+					Pointer.to(new int[] {num_cams}),                // int         num_cams,
+					Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
+					Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+					Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+					Pointer.to(new int[] {ti_offset}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+					Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
+					Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
+					Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
+					Pointer.to(new int[]   {iis_lwir}),              // int         is_lwir,        // do not perform shot correction
+					Pointer.to(new float[] {(float) min_shot}),      // float       min_shot,       // 10.0
+					Pointer.to(new float[] {(float) scale_shot}),    // float       scale_shot,     // 3.0
+					Pointer.to(new float[] {(float) diff_sigma}),    // float       diff_sigma,     // pixel value/pixel change
+					Pointer.to(new float[] {(float) diff_threshold}),// float       diff_threshold, // pixel value/pixel change
+					Pointer.to(new float[] {(float) min_agree}),     // float       min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+					Pointer.to(gpu_color_weights),                   // float       weights[3],     // scale for R,B,G (or {1.0,0.0,0.0}
+					Pointer.to(new int[]   {idust_remove}),          // int         dust_remove,        // Do not reduce average weight when only one image differes much from the average
+					Pointer.to(new int[]   {0}),                     // int         keep_weights,       // return channel weights after A in RGBA
+					 // combining both non-overlap and overlap (each calculated if pointer is not null )
+					Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
+					Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+					Pointer.to(new int[]   {0}),                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+					Pointer.to(new int[]   {0}), // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+					Pointer.to(new int[]   {0}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+					Pointer.to(new int[]   {0}), //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+					Pointer.to(new int[]   {width}));
+			cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel,
+					grid_texture[0],    grid_texture[1],    grid_texture[2],   // Grid dimension
+					threads_texture[0], threads_texture[1], threads_texture[2],  // Block dimension
+					shared_size, null,                      // Shared memory size and stream (shared - only dynamic, static is in code)
+					kp_textures_accumulate, null);          // Kernel- and extra parameters
+			cuCtxSynchronize();
+		}
+	}
 	/**
 	 * Generate non-overlapping (16x16) texture tiles
 	 * @param color_weights - [3] (RGB) or [1] (mono) color weights for matching
@@ -2565,7 +2970,7 @@ public class GpuQuad{ // quad camera description
 		CUdeviceptr constantMemoryPointer = new CUdeviceptr();
 		long constantMemorySizeArray[] = { 0 };
-		cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray,  this.gpuTileProcessor.module, "lpf_data");
+		cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray,  this.gpuTileProcessor.module, "lpf_data"); //CUDA_ERROR_ILLEGAL_ADDRESS after re-run
 		int constantMemorySize = (int)constantMemorySizeArray[0];
 		if (debug) System.out.println("constantMemoryPointer: " + constantMemoryPointer);
 		if (debug) System.out.println("constantMemorySize: " + constantMemorySize);

--- a/src/main/java/com/elphel/imagej/gpu/TpTask.java
+++ b/src/main/java/com/elphel/imagej/gpu/TpTask.java
@@ -45,6 +45,7 @@ public class TpTask {
 	 */
 	public TpTask(int num_sensors, float [] flt, int task_indx, boolean use_aux)
 	{
+		this.num_sensors = num_sensors; // will not be encoded
 		int indx = task_indx * getSize(num_sensors);
 		task =    Float.floatToIntBits(flt[indx++]); // 0
 		int txy = Float.floatToIntBits(flt[indx++]); // 1

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -2364,7 +2364,55 @@ public class QuadCLT extends QuadCLTCPU {
 					debugLevel);
 		}
+// try textures here
+		boolean show_textures_rgba = clt_parameters.show_rgba_color;
+		if (show_textures_rgba) {
+			float [][] texture_img = new float [isMonochrome()?2:4][];
+			double [] col_weights = new double[3];
+			if (isMonochrome()) {
+				col_weights[0] = 1.0;
+				col_weights[1] = 0.0;
+				col_weights[2] = 0.0;// green color/mono
+			} else {
+				col_weights[2] = 1.0/(1.0 +  clt_parameters.corr_red + clt_parameters.corr_blue);    // green color
+				col_weights[0] = clt_parameters.corr_red *  col_weights[2];
+				col_weights[1] = clt_parameters.corr_blue * col_weights[2];
+			}
+			Rectangle woi = new Rectangle(); // will be filled out to match actual available image
+			gpuQuad.execRBGA(
+					col_weights,                   // double [] color_weights,
+					isLwir(),                      // boolean   is_lwir,
+					clt_parameters.min_shot,       // double    min_shot,           // 10.0
+					clt_parameters.scale_shot,     // double    scale_shot,         // 3.0
+					clt_parameters.diff_sigma,     // double    diff_sigma,         // pixel value/pixel change Used much larger sigma = 10.0 instead of 1.5
+					clt_parameters.diff_threshold, // double    diff_threshold,     // pixel value/pixel change
+					clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+					clt_parameters.dust_remove);   // boolean   dust_remove,
+			float [][] rbga = gpuQuad.getRBGA(
+					(isMonochrome() ? 1 : 3), // int     num_colors,
+					woi);
+			for (int ncol = 0; ncol < texture_img.length; ncol++) if (ncol < rbga.length) {
+				texture_img[ncol] = rbga[ncol];
+			}
+			// first try just multi-layer, then with palette
+			(new ShowDoubleFloatArrays()).showArrays( // show slices RBGA (colors - 256, A - 1.0)
+					rbga,
+					woi.width,
+					woi.height,
+					true,
+					getImageName()+"-RGBA-STACK-D"+clt_parameters.disparity+
+					":"+clt_parameters.gpu_woi_tx+":"+clt_parameters.gpu_woi_ty+
+					":"+clt_parameters.gpu_woi_twidth+":"+clt_parameters.gpu_woi_theight+
+					":"+(clt_parameters.gpu_woi_round?"C":"R")
+					//,new String[] {"R","B","G","A"}
+					);
+		}
 /**
       if (colorProcParameters.isLwir() && colorProcParameters.lwir_autorange) {
            double rel_low =  colorProcParameters.lwir_low;

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -978,33 +978,30 @@ __device__ void imclt_plane( // not implemented, not used
 		float           * gpu_rbg,            // WIDTH, HEIGHT
 		const size_t      dstride);            // in floats (pixels)
-__global__ void clear_texture_list(
+extern "C" __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILES-X, use for faster processing of LWIR images
 		int                height); // <= TILES-Y, use for faster processing of LWIR images
-__global__ void mark_texture_tiles(
+extern "C" __global__ void mark_texture_tiles(
 		int                num_cams,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-///		struct tp_task   * gpu_tasks,
 		int                num_tiles,           // number of tiles in task list
 		int                width,               // number of tiles in a row
 		int              * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
-__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 		int                num_cams,
 		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-///		struct tp_task   * gpu_tasks,
 		int                num_tiles,           // number of tiles in task list
 		int                width,               // number of tiles in a row
 		int                height,              // number of tiles rows
 		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
 		int              * woi);                  // x,y,width,height of the woi
-__global__ void gen_texture_list(
+extern "C" __global__ void gen_texture_list(
 		int                num_cams,
 		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-///		struct tp_task   * gpu_tasks,
 		int                num_tiles,            // number of tiles in task list
 		int                width,                // number of tiles in a row
 		int                height,               // number of tiles rows
@@ -1012,7 +1009,7 @@ __global__ void gen_texture_list(
 		int              * num_texture_tiles,    // number of texture tiles to process
 		int              * woi);                 // min_x, min_y, max_x, max_y input
-__global__ void clear_texture_rbga(
+extern "C" __global__ void clear_texture_rbga(
 		int               texture_width,
 		int               texture_slice_height,
 		const size_t      texture_rbga_stride,     // in floats 8*stride
@@ -1044,10 +1041,9 @@ __global__ void index_correlate(
 		int *              gpu_corr_indices,  // array of correlation tasks
 		int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
-__global__ void create_nonoverlap_list(
+extern "C" __global__ void create_nonoverlap_list(
 		int                num_cams,
 		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task   * gpu_tasks,
 		int                num_tiles,           // number of tiles in task
 		int                width,               // number of tiles in a row
 		int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
@@ -1108,11 +1104,12 @@ extern "C" __global__ void corr2D_combine_inner(
 		const size_t      corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
 		float           * gpu_corrs_combo);    // combined correlation output (one per tile)
-extern "C" __global__ void textures_accumulate(
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int               num_cams,           // number of cameras used
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
@@ -1131,7 +1128,8 @@ extern "C" __global__ void textures_accumulate(
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
-		float           * gpu_diff_rgb_combo, //'); // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 		int               tilesx);
 __device__ int get_textures_shared_size( // in bytes
@@ -2065,7 +2063,8 @@ extern "C" __global__ void generate_RBGA(
 			    		woi,                             // int             * woi,                // x, y, width,height
 						gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
-						gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+						ti_offset,                       //                   gpu_texture_indices_offset,// add to gpu_texture_indices
+						gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 						gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 						colors,                          // int               colors,             // number of colors (3/1)
 						is_lwir,                         // int               is_lwir,            // do not perform shot correction
@@ -2082,6 +2081,7 @@ extern "C" __global__ void generate_RBGA(
 						gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
 			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
 						gpu_texture_tiles, //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+						1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 						(float *)0, //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 						width);
 				cudaDeviceSynchronize(); // not needed yet, just for testing
@@ -2277,8 +2277,7 @@ __global__ void mark_texture_tiles(
 * the result textures to fade along the border.
 *
 * @param num_cams             number of cameras
- * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+ * @param gpu_ftasks           flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
-// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles int gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param height               number of tiles rows
@@ -2289,7 +2288,6 @@ __global__ void mark_texture_tiles(
 __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 		int                num_cams,
 		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-///		struct tp_task   * gpu_tasks,
 		int                num_tiles,           // number of tiles in task list
 		int                width,               // number of tiles in a row
 		int                height,              // number of tiles rows
@@ -2301,12 +2299,11 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	if (task_num >= num_tiles) {
 		return; // nothing to do
 	}
-///	int task = gpu_tasks[task_num].task;
 	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-///	int cxy = gpu_tasks[task_num].txy;
 	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	int x = (cxy & 0xffff);
@@ -2320,6 +2317,7 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	if ((x < (width - 1))  && *(gpu_texture_indices + (x + 1) + y *      width)) d |= (1 << TASK_TEXTURE_E_BIT);
 	if ((y < (height - 1)) && *(gpu_texture_indices +  x +     (y + 1) * width)) d |= (1 << TASK_TEXTURE_S_BIT);
 	if ((x > 0)            && *(gpu_texture_indices + (x - 1) + y *      width)) d |= (1 << TASK_TEXTURE_W_BIT);
+	// Set task texture bits in global gpu_ftasks array (lower 4 bits)
 ///	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
 	*(int *) (gpu_ftasks +  get_task_size(num_cams) * task_num) = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
 }
@@ -2340,12 +2338,11 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 */
 __global__ void gen_texture_list(
 		int                num_cams,
-		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		float            * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-///		struct tp_task   * gpu_tasks,
 		int                num_tiles,            // number of tiles in task list
 		int                width,                // number of tiles in a row
 		int                height,               // number of tiles rows
-		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
 		int              * num_texture_tiles,    // number of texture tiles to process
 		int              * woi)                  // min_x, min_y, max_x, max_y input
@@ -2409,7 +2406,10 @@ __global__ void gen_texture_list(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG12
-	*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
+//	*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
+	// keep only 8 LSBs of task, use higher 24 for task number
+	*(gpu_texture_indices + buf_offset) = (task & ((1 << CORR_NTILE_SHIFT) -1)) | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
+	//CORR_NTILE_SHIFT
 }
 //inline __device__ int get_task_size(int num_cams){
 //	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
@@ -2455,13 +2455,13 @@ __global__ void index_direct(
 * (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
 *
 * @param num_cams         number of cameras <= NUM_CAMS
- * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+ * @param gpu_ftasks           flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 * @param num_tiles            number of tiles int gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param nonoverlap_list      integer array to place the generated list
 * @param pnonoverlap_length   single-element integer array return generated list length
 */
-__global__ void create_nonoverlap_list(
+extern "C" __global__ void create_nonoverlap_list(
 		int                num_cams,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
@@ -2847,30 +2847,38 @@ extern "C" __global__ void textures_nonoverlap(
 				 num_cams,     // int                num_cams,     // actual number of cameras
 				 colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				 0);           // int *              offsets);     // in floats
-		 textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
+#ifdef DEBUG7A
-				 num_cams,                        // 	int               num_cams,           // number of cameras used
+		printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
-				 (int *) 0,                       // int             * woi,                // x, y, width,height
-				 gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+		__syncthreads();
-				 *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
-				 gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+#endif
-				 gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
+		textures_accumulate <<<grid_texture,threads_texture,  shared_size>>>( // 65536>>>( //
-				 colors,                          // int               colors,             // number of colors (3/1)
+				num_cams,                        // 	int               num_cams,           // number of cameras used
-				 is_lwir,                         // int               is_lwir,            // do not perform shot correction
+				(int *) 0,                       // int             * woi,                // x, y, width,height
-				 min_shot,                        // float             min_shot,           // 10.0
+				gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				 scale_shot,                      // float             scale_shot,         // 3.0
+				*pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
-				 diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
+				0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
-				 diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
+				gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-				 min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+				gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
-				 weights,                         // float             weights[3],         // scale for R,B,G
+				colors,                          // int               colors,             // number of colors (3/1)
-				 dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+				is_lwir,                         // int               is_lwir,            // do not perform shot correction
-				 0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+				min_shot,                        // float             min_shot,           // 10.0
-				 // combining both non-overlap and overlap (each calculated if pointer is not null )
+				scale_shot,                      // float             scale_shot,         // 3.0
-				 0,                               // size_t      texture_rbg_stride, // in floats
+				diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
-				 (float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+				diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
-				 texture_stride,                  // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+				min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-				 gpu_texture_tiles,               //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+				weights,                         // float             weights[3],         // scale for R,B,G
-				 gpu_diff_rgb_combo, //);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+				dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-				 num_tilesx);
+				0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+				// combining both non-overlap and overlap (each calculated if pointer is not null )
+				0,                               // size_t      texture_rbg_stride, // in floats
+				(float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+				texture_stride,                  // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+				gpu_texture_tiles,               //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+				1, // 	int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+				gpu_diff_rgb_combo, //);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+				num_tilesx);
 	 }
 }
@@ -2887,6 +2895,7 @@ extern "C" __global__ void textures_nonoverlap(
 * @param woi                  WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used
 * @param gpu_clt              array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 * @param num_texture_tiles    number of texture tiles to process
+ * @param gpu_texture_indices_offset add to gpu_texture_indices
 * @param gpu_texture_indices  array - 1 integer per tile to process
 * @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight
 * @param colors               number of colors used:  3 for RGB or 1 for monochrome
@@ -2903,15 +2912,18 @@ extern "C" __global__ void textures_nonoverlap(
 * @param gpu_texture_rbg      output array (number of colors +1 + ?) * woi.height * output stride(first woi.width valid) float values (or 0)
 * @param texture_stride       output stride for non-overlapping texture tile output in floats (or 0 to skip)
 * @param gpu_texture_tiles    output of the non-overlapping tiles (or 0 to skip)
+ * @param linescan_order       if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 * @param gpu_diff_rgb_combo   low-resolution output, with per-camera mismatch an each color average. Will not be calculated if null
+ * @param tilesx               number of tiles in a row. If negative then output gpu_diff_rgb_combo in linescan order,
+ *                             if positive - in gpu_texture_indices order
 */
 extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int               num_cams,           // number of cameras used
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -2928,11 +2940,18 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 		int               tilesx)
 {
 	// will process exactly 4 cameras at a time in one block,
 	// so imclt is executed sequentially for each group of 4 cameras
+///	if ((threadIdx.x == 0) && (threadIdx.y == 0)){
+///		printf("DONE\n");
+///	}
+///	__syncthreads();
+///	return;
 	int offsets [9];
 	int shared_size = get_textures_shared_size( // in bytes
 			 num_cams,     // int                num_cams,     // actual number of cameras
@@ -2945,54 +2964,56 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		return; // nothing to do
 	}
 	// get number of tile
-	int tile_code = gpu_texture_indices[tile_indx];
+	int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP
 	if ((tile_code & (1 << CORR_TEXTURE_BIT)) == 0){
 		return; // nothing to do
 	}
 	int tile_num = tile_code >> CORR_NTILE_SHIFT;
-#ifdef DEBUG22
+#ifdef DEBUG7A
-		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+	__syncthreads();// __syncwarp();
-			printf("\n1. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
+	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		printf("textures_accumulate: diff_sigma =     %f\n",    diff_sigma);
+		printf("textures_accumulate: diff_threshold = %f\n",diff_threshold);
+		printf("textures_accumulate: min_agree =      %f\n",     min_agree);
+		printf("textures_accumulate: weights[0] =     %f\n",weights[0]);
+		printf("textures_accumulate: weights[1] =     %f\n",weights[1]);
+		printf("textures_accumulate: weights[2] =     %f\n",weights[2]);
+		printf("textures_accumulate: dust_remove =    %d\n",dust_remove);
+		printf("textures_accumulate: keep_weights =   %d\n",keep_weights);
+	}
+#endif //DEBUG7A
+#ifdef DEBUG7A // 22
+	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		for (int i = 0; i <9; i++){
+			printf(" offsets[%d] = 0x%x\n",i,offsets[i]);
+		}
+	}
+	__syncthreads();
+#endif	// #ifdef DEBUG22
+#ifdef DEBUG7AXX // 22
+		if ((tile_num == DBG_TILE)) { //  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n1. tile_indx=%d, tile_num=%d threadIdx.x = %d threadIdx.y =%d\n",tile_indx,tile_num,threadIdx.x,threadIdx.y);
 		}
 		__syncthreads();
 #endif	// #ifdef DEBUG22
 	extern __shared__ float all_shared[];
-	float * mclt_tiles =       &all_shared[0] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
+	float * mclt_tiles =       &all_shared[offsets[0]] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-	float * clt_tiles  =       &all_shared[1] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9  = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
+	float * clt_tiles  =       &all_shared[offsets[1]] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9  = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
-	float * mclt_debayer =     &all_shared[1] ; // [num_cams][colors][MCLT_UNION_LEN]; //  16 * 1 * 16 * 18  = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
+	float * mclt_debayer =     &all_shared[offsets[1]] ; // [num_cams][colors][MCLT_UNION_LEN]; //  16 * 1 * 16 * 18  = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
-	float * mclt_tmps =        &all_shared[2] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
+	float * mclt_tmps =        &all_shared[offsets[2]] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-	float * rgbaw =            &all_shared[2] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
+	float * rgbaw =            &all_shared[offsets[2]] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
-	float * port_offsets =     &all_shared[3] ; // [num_cams][2];          // 16 * 2 = 0x20 | 4*2 = 0x8
+	float * port_offsets =     &all_shared[offsets[3]] ; // [num_cams][2];          // 16 * 2 = 0x20 | 4*2 = 0x8
-	float * ports_rgb_shared = &all_shared[4] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
+	float * ports_rgb_shared = &all_shared[offsets[4]] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
-	float * max_diff_shared =  &all_shared[5] ; // [num_cams];             // 16 = 0x10     | 4 = 0x4     | return to system memory (optionally pass null to skip calculation)
+	float * max_diff_shared =  &all_shared[offsets[5]] ; // [num_cams];             // 16 = 0x10     | 4 = 0x4     | return to system memory (optionally pass null to skip calculation)
-	float * max_diff_tmp =     &all_shared[6] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
+	float * max_diff_tmp =     &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
-	float * ports_rgb_tmp =    &all_shared[7] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8  = 0x60 |  [4*3][8]
+	float * ports_rgb_tmp =    &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8  = 0x60 |  [4*3][8]
-//	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-//	__shared__ union {
-//		float clt_tiles  [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9  = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
-//		float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; //  16 * 1 * 16 * 18  = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
-//	} shr;
-//	__shared__ union {
-//		float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-//		float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
-//		 // (1 + 1 + 16 + 1 + 1)*16*17 = 0x1540 | (3 + 1 + 4 + 3 + 1)*16*17 = 0xcc0
-//		// add more
-//	} shr1;
-//	__shared__ float port_offsets      [NUM_CAMS][2];          // 16 * 2 = 0x20 | 4*2 = 0x8
-//	__shared__ float ports_rgb_shared  [NUM_COLORS][NUM_CAMS]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
-//	__shared__ float max_diff_shared   [NUM_CAMS];             // 16 = 0x10     | 4 = 0x4     | return to system memory (optionally pass null to skip calculation)
-//	__shared__ float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
-//	__shared__ float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8  = 0x60 |  [4*3][8]
 #ifdef DBG_TILE
-#ifdef DEBUG7
+#ifdef DEBUG7AXX
-	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+	if (tile_num == DBG_TILE){ // }  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 		printf("\ntextures_gen tile = %d\n",tile_num);
 		//    	debug_print_clt1(clt_tile1, color,  0xf); //
 		//        printf("\textures_gen tile = %d, pair=%d, color = %d   CAMERA22\n",tile_num, corr_pair,color);
@@ -3009,22 +3030,16 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		if (threadIdx.x < 2){ // not more than 16 sensors, not less than
 			port_offsets[camera_num * 2 + threadIdx.x] =   gpu_geometry_correction->rXY[camera_num][threadIdx.x];
 		}
+		__syncthreads();// __syncwarp(); // is it needed?
 		for (int color = 0; color < colors; color++){
-			//        int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
+			// clt_tiles is union with mclt_debayer, so has to have same step
-			//		float * clt_tile = ((float *) shr.clt_tiles[camera_num][color]); // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
+			float * clt_tile =  clt_tiles + (camera_num * colors + color) * MCLT_UNION_LEN;
-			//		float * clt_tilei = clt_tile + threadIdx.x;
-			//		float * gpu_tile = ((float *) gpu_clt[camera_num]) +  (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
-			//		float * mclt_tile = (float *) mclt_tiles [camera_num][color];
-			//		float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
-			//		float * mclt_tmp =  (float *) shr1.mclt_tmp[camera_num][color];
-			int cam_col = (camera_num * colors + color);
-			float * clt_tile =  clt_tiles + cam_col * 2 * DTT_SIZE * DTT_SIZE21; // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
 			float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here
 			float * gpu_tile = ((float *) gpu_clt[camera_num]) +  (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
-			float * mclt_tile = mclt_tiles +    (camera_num * colors+ color) * 2 * DTT_SIZE * DTT_SIZE21;
+			float * mclt_tile = mclt_tiles +    (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;
-			float * mclt_dst =  mclt_debayer +  (camera_num* colors + color) * MCLT_UNION_LEN; // 16 * 18
+			float * mclt_dst =  mclt_debayer +  (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18
-			float * mclt_tmp =  mclt_tmps +     (camera_num* colors + color) * DTT_SIZE2 * DTT_SIZE21;
+			float * mclt_tmp =  mclt_tmps +     (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21; // 16*17
 			// no camera_num below
 #pragma unroll
 			for (int q = 0; q < 4; q++) {
@@ -3039,7 +3054,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				}
 			}
 			__syncthreads();
-#ifdef DEBUG7
+#ifdef DEBUG7AXXX
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 				printf("\ntextures_gen LPF for color = %d\n",color);
 				debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
@@ -3050,7 +3065,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			__syncthreads();// __syncwarp();
 #endif
-#ifdef DBG_TILE		// perform idct
+#ifdef DBG_TILEXXX		// perform idct
 			imclt8threads(
 					0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
 					clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
@@ -3064,17 +3079,19 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 					0);
 #endif
 			__syncthreads();// __syncwarp();
-#ifdef DEBUG7
+#ifdef DEBUG7A
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-				printf("\ntextures_gen mclt color = %d\n",color);
+				for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){
-				debug_print_mclt(
+					printf("\n3104 textures_gen mclt camera = % d,  color = %d\n",ncam, color);
-						mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+					debug_print_mclt(
-						color);
+							mclt_tiles + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							color);
+				}
 			}
 			__syncthreads();// __syncwarp();
 #endif
 			if (colors > 1) {
-#ifdef DBG_TILE
+#ifdef DBG_TILE_XXX
 				debayer_shot(
 						(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
 						min_shot,    // float     min_shot,   // 10.0
@@ -3098,35 +3115,78 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				// copy? - no, just remember to use mclt_tile, not mclt_dst
 				// will have to copy mclt_tiles -> mclt_dst as they have different gaps
 				// untested copy for mono mode
-#pragma unroll
+#ifdef DEBUG7AXXX
+				if (tile_num == DBG_TILE) {
+//					for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
+					int n = 0;
+						printf("textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, threadIdx.x= %d, n=%d, msp=0x%x, dst=0x%x\n",
+								camera_num_offs,threadIdx.y, threadIdx.x, n,
+								(int) (mclt_tile + threadIdx.x + n), (int)(mclt_dst +  threadIdx.x + n));
+//					}
+				}
+				__syncthreads();// __syncwarp();
+#endif
+#ifdef DEBUG7AXX // Good here
+				if (tile_num == DBG_TILE) {
+					for (int ccam = 0; ccam < num_cams; ccam++) {
+						if ((threadIdx.x == 0) && (camera_num == ccam)){
+							printf("\n3155 textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
+							debug_print_mclt( // broken for camera 1
+									mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+									-1);
+						}
+						__syncthreads();// __syncwarp();
+					}
+					printf("3162 camera_num_offs= %d threadIdx.y= %d, color = %d mclt_tile=0x%x, mclt_dst=0x%x\n",
+							camera_num_offs,threadIdx.y, color, (int) mclt_tile, (int) mclt_dst);
+				}
+				__syncthreads();// __syncwarp();
+#endif
+//#ifdef DEBUGXXXX // no copy at all
+				//#pragma unroll
 				for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
 					float * msp = mclt_tile + threadIdx.x + n;
 					float * dst = mclt_dst +  threadIdx.x + n;
-#pragma unroll
+					//#pragma unroll
 					for (int row = 0; row < DTT_SIZE2; row++){
 						*dst = *msp;
 						msp += DTT_SIZE21;
 						dst += DTT_SIZE21;
 					}
 				}
+//#endif
 				__syncthreads();
-			}
+			} //if (colors > 1)  else
-#ifdef DEBUG77
-			//		float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+#ifdef DEBUG7AXX // still good here
+			if (tile_num == DBG_TILE) {
+				for (int ccam = 0; ccam < num_cams; ccam++) {
+					if ((threadIdx.x == 0) && ((camera_num & 0x3) == (ccam & 0x3))){
+						printf("\n 3185 mclt_tile : textures_gen mclt_tile camera_num_offs= %d camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam,threadIdx.y, color);
+						debug_print_mclt( // broken for camera 1
+//								mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+								mclt_tiles +  (ccam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21,
+								-1);
-			for (int ccam = 0; ccam < num_cams; ccam++) {
+						printf("\n 3190 mclt_dst: textures_gen AFTER DEBAER camera_num_offs= %d  camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam, threadIdx.y, color);
-				if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+						debug_print_mclt(
-					printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
+//								mclt_dst, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-					debug_print_mclt(
+								mclt_debayer +(ccam * colors + color) * MCLT_UNION_LEN, // 16 * 18
-							mclt_dst, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+								-1);
-							-1);
+						/*
 					printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
 					debug_print_mclt(
-							(float *) shr.mclt_debayer[ccam][0], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							mclt_debayer + (ccam * colors * MCLT_UNION_LEN), //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 							-1);
+						 */
+					}
+					__syncthreads();// __syncwarp();
 				}
-				__syncthreads();// __syncwarp();
 			}
 			__syncthreads();// __syncwarp();
 #endif
@@ -3134,40 +3194,45 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		__syncthreads(); // __syncwarp();
 		///	return;
-#ifdef DEBUG77
+		//	__shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
+	} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
+#ifdef DEBUG7A
+	//#ifdef DEBUG22
+	for (int ccam = 0; ccam < num_cams; ccam++) {
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			for (int ccam = 0; ccam < num_cams; ccam++) {
+			for (int nncol = 0; nncol < colors; nncol++){
-				//		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+				printf("\n3227: mclt_tiles +  (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21 cam= %d, color = %d\n",ccam, nncol);
-				for (int nncol = 0; nncol < colors; nncol++){
+				//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
-					printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
+				debug_print_mclt(
-					//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+						mclt_tiles +  (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21,
-					debug_print_mclt(
+						-1);
-							(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-							-1);
-				}
 			}
 		}
 		__syncthreads();// __syncwarp();
+	}
+	__syncthreads();// __syncwarp();
 #endif
-#ifdef DEBUG77
+#ifdef DEBUG7A
-		//#ifdef DEBUG22
+	//#ifdef DEBUG22
-		for (int ccam = 0; ccam < num_cams; ccam++) {
+	for (int ccam = 0; ccam < num_cams; ccam++) {
-			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-				for (int nncol = 0; nncol < colors; nncol++){
+			for (int nncol = 0; nncol < colors; nncol++){
-					printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
+				printf("\n 3244 mclt_dst: textures_gen AFTER DEBAER camera number= %d threadIdx.y= %d, color = %d\n", ccam, threadIdx.y, nncol);
-					//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+				debug_print_mclt(
-					debug_print_mclt(
+						mclt_debayer +(ccam * colors + nncol) * MCLT_UNION_LEN, // 16 * 18
-							(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+						-1);
-							-1);
-				}
 			}
-			__syncthreads();// __syncwarp();
 		}
 		__syncthreads();// __syncwarp();
+	}
+	__syncthreads();// __syncwarp();
 #endif
-		//	__shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
-	} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
 #ifdef DBG_TILE
 		int debug = (tile_num == DBG_TILE);
 #else
@@ -3237,7 +3302,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			}
 		}
-#ifdef DEBUG7
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
 		}
@@ -3252,7 +3317,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 	}
 	if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
-#ifdef DEBUG12
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 //			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
 //					tile_num, (int) tile_code, border_tile);
@@ -3261,13 +3326,14 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			for (int ncol = 0; ncol <= colors; ncol++) {
 				printf("\ntile[%d]\n",ncol);
 				debug_print_mclt(
-						(float *) (shr1.rgbaw[ncol]),
+//						(float *) (shr1.rgbaw[ncol]),
+						rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
 						-1);
 			}
 		}
 		__syncthreads();// __syncwarp();
 #endif // DEBUG12
-		int alpha_mode = alphaIndex[tile_code];
+		int alpha_mode = alphaIndex[tile_code]; // only 4 lowest bits
 		if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
 			for (int pass = 0; pass < 8; pass ++) {
 				int row = pass * 2 + (threadIdx.y >> 1);
@@ -3296,7 +3362,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
 ///		int height = *(woi + 3) << DTT_SIZE_LOG2;
-#ifdef DEBUG12
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
 					tileX, tileY, tile_x0, tile_y0, slice_stride);
@@ -3304,7 +3370,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			for (int ncol = 0; ncol <= colors; ncol++) {
 				printf("\ntile[%d]\n",ncol);
 				debug_print_mclt(
-						(float *) (shr1.rgbaw[ncol]),
+//						(float *) (shr1.rgbaw[ncol]),
+						rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
 						-1);
 			}
 		}
@@ -3319,7 +3386,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			int gi = g_row * texture_rbg_stride  + g_col; // offset to the top left corner
 			float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
 			float * rgba_i = rgbaw + i;
-#ifdef DEBUG12
+#ifdef DEBUG7A
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 				printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
 						pass, row, col, g_row, g_col, i, gi);
@@ -3348,70 +3415,71 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 	} // 	if (gpu_texture_rbg) { // generate RGBA
 	if (calc_extra){ // gpu_diff_rgb_combo
 		__syncthreads(); // needed?
-#ifdef DEBUG22
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\n3. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
-			printf("max_diff: %f, %f, %f, %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf    ("max_diff: ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
-			printf("R:        %f, %f, %f, %f\n",ports_rgb_shared[0][0],ports_rgb_shared[0][1],ports_rgb_shared[0][2],ports_rgb_shared[0][3]);
+			for (int ccol = 0; ccol < colors; ccol++){
-			printf("B:        %f, %f, %f, %f\n",ports_rgb_shared[1][0],ports_rgb_shared[1][1],ports_rgb_shared[1][2],ports_rgb_shared[1][3]);
+				printf("color%d:   ",ccol);for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ccol * num_cams +ccam]);} printf("\n");
-			printf("G:        %f, %f, %f, %f\n",ports_rgb_shared[2][0],ports_rgb_shared[2][1],ports_rgb_shared[2][2],ports_rgb_shared[2][3]);
+			}
-			printf("\n 3. max_diff\n");
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
 			}
 			for (int ncol = 0; ncol < colors; ncol++){
-				printf("\n%d:total %f %f %f %f\n",
+				printf("\n%d:total ",ncol);
-						ncol,
+				for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams +ccam]);} printf("\n");
-						ports_rgb_shared[ncol][0],
-						ports_rgb_shared[ncol][1],
-						ports_rgb_shared[ncol][2],
-						ports_rgb_shared[ncol][3]);
 				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-					printf("tmp[%d] %f %f %f %f\n",
+					printf("tmp[%d] ",i);
-							i,
+					for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
-							ports_rgb_tmp[ncol][0][i],
-							ports_rgb_tmp[ncol][1][i],
-							ports_rgb_tmp[ncol][2][i],
-							ports_rgb_tmp[ncol][3][i]);
 				}
 			}
+		}
+		__syncthreads();
+//DBG_TILE
+#endif// #ifdef DEBUG7A
+#ifdef DEBUG7A
+		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n4. tile_indx=%d, tile_num=%d, DBG_TILE = %d\n",tile_indx,tile_num, DBG_TILE);
 		}
 		__syncthreads();
 //DBG_TILE
-#endif// #ifdef DEBUG22
+#endif// #ifdef DEBUG7A
+		int tile_offset = (linescan_order ? tile_num : tile_indx) * num_cams* (colors + 1);
 		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
 			int camera_num = threadIdx.y + camera_num_offs;
-//			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
-			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;
+// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position
+//			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num  * num_cams* (colors + 1) + camera_num;//
+			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_offset + camera_num;//
 			if (threadIdx.x == 0){
 				*pdiff_rgb_combo = max_diff_shared[camera_num];
 			}
 			if (threadIdx.x < colors){
-//				*(pdiff_rgb_combo + (threadIdx.x + 1) * NUM_CAMS) = ports_rgb_shared[threadIdx.x][camera_num];// [color][camera]
 				*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]
 			}
 		}
-	}
+	} // if (calc_extra){ // gpu_diff_rgb_combo
 } // textures_accumulate()
 __device__ int get_textures_shared_size( // in bytes
+//__device__ int get_textures_shared_size( // in bytes
 	    int                num_cams,     // actual number of cameras
 	    int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 		int *              offsets){     // in floats
-	int shared_floats = 0;
+//	int shared_floats = 0;
 	int offs = 0;
 //	int texture_threads_per_tile = TEXTURE_THREADS/num_cams;
 	if (offsets) offsets[0] = offs;
-	offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
+	offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles         [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
 	if (offsets) offsets[1] = offs;
-	offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;  // float clt_tiles  [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
+	offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;  // float clt_tiles         [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
 	if (offsets) offsets[2] = offs;
-	offs += num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;    //float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
+	int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;                // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
+	int rgbaw_size =    (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21;     // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
+	offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
 	if (offsets) offsets[3] = offs;
 	offs += num_cams * 2;                                      // float port_offsets      [NUM_CAMS][2];
 	if (offsets) offsets[4] = offs;
@@ -3421,9 +3489,9 @@ __device__ int get_textures_shared_size( // in bytes
 	if (offsets) offsets[6] = offs;
 	offs += num_cams * TEXTURE_THREADS_PER_TILE;               // float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
 	if (offsets) offsets[7] = offs;
-	offs += offs += num_colors * num_cams *  TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
+	offs += num_colors * num_cams *  TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
 	if (offsets) offsets[8] = offs;
-	return sizeof(float) * shared_floats;
+	return sizeof(float) * offs; // shared_floats;
 }
@@ -3450,11 +3518,25 @@ __global__ void imclt_rbg_all(
 		int                woi_theight,
 		const size_t       dstride)            // in floats (pixels)
 {
 //	int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]);
+	dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1);
+	dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1);
+	dim3 grid_erase8x8_bottom_row(woi_twidth + 1, 1, 1);
 	dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
 	if (threadIdx.x == 0) { // anyway 1,1,1
 		for (int ncam = 0; ncam < num_cams; ncam++) { // was NUM_CAMS
 			for (int color = 0; color < colors; color++) {
+				// clear right and bottom 8-pixel column and row
+				float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE);
+				erase8x8<<<grid_erase8x8_right_col,threads_erase8x8>>>(
+						right_col,             // float           * gpu_top_left,
+						dstride);              // const size_t      dstride);
+				float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color +  dstride * (woi_theight * DTT_SIZE);
+				erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8>>>(
+						bottom_row,             // float           * gpu_top_left,
+						dstride);              // const size_t      dstride);
 				for (int v_offs = 0; v_offs < 2; v_offs++){
 					for (int h_offs = 0; h_offs < 2; h_offs++){
 						int tilesy_half = (woi_theight + (v_offs ^ 1)) >> 1;
@@ -3481,7 +3563,25 @@ __global__ void imclt_rbg_all(
 	}
 }
+/**
+ * Clear 8x8 tiles, used to erase right and bottom 8-pixel wide column/row before imclt_rbg
+ * @param gpu_top_left - pointer to the top-left corner of the firsr tile to erase
+ * @param dstride - offset for 1 pixel step down
+ * block.x - horizontal tile  offset
+ * block.y - vertical tile offset
+ * 0<=thread.x < 8 - horizontal pixel offset
+ * 0<=thread.y < 4 - vertical pixel offset
+ */
+extern "C"
+__global__ void erase8x8(
+		float           * gpu_top_left,
+		const size_t      dstride)
+{
+	float * pixel = gpu_top_left + (((blockIdx.y * DTT_SIZE) + threadIdx.y) * dstride) + ((blockIdx.x * DTT_SIZE) + threadIdx.x);
+	* pixel = 0.0f;
+	pixel += dstride * blockDim.y; // add 4 pixel rows (assuming blockDim.x==4)
+	* pixel = 0.0f;
+}
 /**
 * Helper kernel for imclt_rbg_all(), generate per-camera -per color image from the in-memory frequency domain representation.
@@ -3510,7 +3610,7 @@ __global__ void imclt_rbg(
 		const size_t      dstride)            // in floats (pixels)
 {
 	float *color_plane = gpu_rbg + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color;
-	int pass =           (v_offset << 1) + h_offset;     		// 0..3 to correctly acummulate 16x16 tiles stride 8
+	int pass =           (v_offset << 1) + h_offset;     		// 0..3 to correctly accumulate 16x16 tiles stride 8
 	int tile_in_block = threadIdx.y;
 	int tile_num = blockIdx.x * IMCLT_TILES_PER_BLOCK + tile_in_block;
 	int tilesx_half = (woi_twidth + (h_offset ^ 1)) >> 1;
@@ -4771,10 +4871,10 @@ __device__ void debayer( // 8 threads
 * @param calc_extra           calculate ports_rgb, max_diff. If not null - will ignore rbg_tile, so this mode
 *                             should not be combined with texture generation. It is intended to generate a
 *                             lo-res (1/8) images for macro correlation
- * @param ports_rgb_shared     shared memory data to be used to return lo-res images tile average color [NUM_COLORS][NUM_CAMS]
+ * @param ports_rgb_shared     shared memory data to be used to return lo-res images tile average color [NUM_COLORS * NUM_CAMS]
- * @param max_diff_shared      shared memory data to be used to return lo-res images tile mismatch form average [NUM_CAMS]
+ * @param max_diff_shared      shared memory data to be used to return lo-res images tile mismatch from average [NUM_CAMS]
- * @param max_diff_tmp         shared memory to be used here for temporary storage [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
+ * @param max_diff_tmp         shared memory to be used here for temporary storage [NUM_CAMS * TEXTURE_THREADS_PER_TILE]
- * @param ports_rgb_tmp        shared memory to be used here for temporary storage [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], [4*3][8]
+ * @param ports_rgb_tmp        shared memory to be used here for temporary storage [NUM_COLORS *NUM_CAMS * TEXTURE_THREADS_PER_TILE], [4*3][8]
 * @param port_offsets         [port]{x_off, y_off} - just to scale pixel value differences (quad - {{-0.5, -0.5},{0.5,-0.5},{-0.5,0.5},{0.5,0.5}}
 * @param diff_sigma           pixel value/pixel change (1.5)
 * @param diff_threshold       pixel value/pixel change (10)
@@ -4843,9 +4943,18 @@ __device__ void tile_combine_rgba(
 	}
 	int colors_offset = colors * MCLT_UNION_LEN; // padded in union !
-#ifdef DEBUG8
+#ifdef DEBUG7A
 	__syncthreads();// __syncwarp();
 	if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		printf("diff_sigma = %f\n",    diff_sigma);
+		printf("diff_threshold = %f\n",diff_threshold);
+		printf("min_agree = %f\n",     min_agree);
+		printf("chn_weights[0] = %f\n",chn_weights[0]);
+		printf("chn_weights[1] = %f\n",chn_weights[1]);
+		printf("chn_weights[2] = %f\n",chn_weights[2]);
+		printf("dust_remove =    %d\n",dust_remove);
+		printf("keep_weights =   %d\n",keep_weights);
 		printf("\ntile_combine_rgba ksigma = %f\n",ksigma);
 		for (int i = 0; i < indx; i++) {
 			printf("%02d: %d :%d %f\n",i,pair_ports[i][0], pair_ports[i][1], pair_dist2r[i]);
@@ -4941,10 +5050,10 @@ __device__ void tile_combine_rgba(
 		float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQ[col_sym];
 		float wnd2_inv = 1.0/wnd2;
-#pragma unroll
+//#pragma unroll
 		for (int ipair = 0; ipair < (num_cams*(num_cams-1)/2); ipair++){
 			float d = 0;
-#pragma unroll // non-constant
+//#pragma unroll // non-constant
 			for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
 				//					double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i];
 				float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
@@ -4993,7 +5102,7 @@ __device__ void tile_combine_rgba(
 		float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)));
 		float w2 = 1.0 - w1;
 		float * rgba_i = rgba + i;
-#pragma unroll  // non-constant
+//#pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
 			float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
 			* (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))=
@@ -5157,7 +5266,7 @@ __device__ void tile_combine_rgba(
 //		}
 		float a = 0;
-#pragma unroll
+//#pragma unroll
 		for (int cam = 0; cam < num_cams; cam++) {
 			a +=  *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam);
 		}
@@ -5165,7 +5274,7 @@ __device__ void tile_combine_rgba(
 	}// for (int pass = 0; pass < 8; pass ++)
 	__syncthreads();
-#ifdef DEBUG8
+#ifdef DEBUG7A // 8
 	if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 		printf("\ntile_combine_rgba() final\n");
 		for (int ncol = 0; ncol < colors; ncol++) {
@@ -5196,46 +5305,81 @@ __device__ void tile_combine_rgba(
 	if (calc_extra){
-		int cam = threadIdx.y;
+		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
-		int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
+			int cam = camera_num_offs + threadIdx.y;
-//		max_diff_tmp[cam][threadIdx.x] = 0.0;
+			int indx0 = cam * TEXTURE_THREADS_PER_TILE;
-		max_diff_tmp[indx] = 0.0;
+			int indx =  indx0 + threadIdx.x;
+			//		max_diff_tmp[cam][threadIdx.x] = 0.0;
+			max_diff_tmp[indx] = 0.0;
 #pragma unroll
-		for (int pass = 0; pass < 32; pass++){
+			for (int pass = 0; pass < 32; pass++){
-			int row = (pass >> 1);
+				int row = (pass >> 1);
-			int col = ((pass & 1) << 3) + threadIdx.x;
+				int col = ((pass & 1) << 3) + threadIdx.x;
-			int i = row * DTT_SIZE21 + col;
+				int i = row * DTT_SIZE21 + col;
-			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+				float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
-			float d2 = 0.0;
+				float d2 = 0.0;
-#pragma unroll  // non-constant
+//#pragma unroll  // non-constant
-			for (int ncol = 0; ncol < colors; ncol++){
+				for (int ncol = 0; ncol < colors; ncol++){
-				float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
+					float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
-				d2 += *(chn_weights + ncol) * dc * dc;
+					d2 += *(chn_weights + ncol) * dc * dc;
+				}
+				//max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
+				max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
 			}
-//			max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
+			__syncthreads();
-			max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
+			if (threadIdx.x == 0){ // combine results
-		}
+				float mx = 0.0;
-		__syncthreads();
-		if (threadIdx.x == 0){ // combine results
-			float mx = 0.0;
 #pragma unroll
-			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//				mx = fmaxf(mx, max_diff_tmp[cam][i]);
+					//				mx = fmaxf(mx, max_diff_tmp[cam][i]);
-				mx = fmaxf(mx, max_diff_tmp[indx]);
+					mx = fmaxf(mx, max_diff_tmp[indx0 + i]);
+				}
+				max_diff_shared[cam] = sqrtf(mx);
 			}
-			max_diff_shared[cam] = sqrtf(mx);
-		}
 #ifdef DEBUG22
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+				printf("\n 1. max_diff\n");
+				printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					//				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+					printf("tmp[%d] %f %f %f %f\n",i,
+							max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
+				}
+				for (int ncol = 0; ncol < colors; ncol++){
+					printf("\n average for color %d\n",ncol);
+					debug_print_mclt(
+							rgba + (DTT_SIZE2*DTT_SIZE21) * ncol,
+							-1);
+					for (int ncam = 0; ncam < num_cams;ncam ++){
+						printf("\n mclt for color %d, camera %d\n",ncol,ncam);
+						debug_print_mclt(
+								mclt_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
+								-1);
+#if 0
+						printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
+						if (rgb_tile) {
+							debug_print_mclt(
+									rbg_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
+									-1);
+						}
+#endif
+					}
+				}
+			}
+			__syncthreads();// __syncwarp();
+#endif // #ifdef DEBUG22
+		} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
+#ifdef DEBUG7A
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\n 1. max_diff\n");
+			printf("\n X2. max_diff\n");
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("total ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
-				printf("tmp[%d] %f %f %f %f\n",i,
-						max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
 			}
 			for (int ncol = 0; ncol < colors; ncol++){
 				printf("\n average for color %d\n",ncol);
@@ -5247,88 +5391,97 @@ __device__ void tile_combine_rgba(
 					debug_print_mclt(
 							mclt_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
 							-1);
-#if 0
-					printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
-					if (rgb_tile) {
-						debug_print_mclt(
-								rbg_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
-								-1);
-					}
-#endif
 				}
 			}
 		}
 		__syncthreads();// __syncwarp();
+#endif // #ifdef DEBUG7A
-#endif // #ifdef DEBUG22
 	}
 	if (calc_extra) {
 		int incr = num_cams * TEXTURE_THREADS_PER_TILE;
-		int cam =  threadIdx.y;
+		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
-		int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
+			int cam = camera_num_offs + threadIdx.y;
-		int indx1 = indx;
+			//		int cam =  threadIdx.y;  // BUG!
-		for (int ncol = 0; ncol < colors; ncol++){
+			int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
-//			ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
+			int indx1 = indx;
-			ports_rgb_tmp[indx1 += incr] = 0.0;
+			for (int ncol = 0; ncol < colors; ncol++){
-		}
+				//			ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
+				ports_rgb_tmp[indx1] = 0.0; // no difference in wrong zeros when removed
+				indx1 += incr;
+			}
+#ifdef DEBUG7AXX
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+				printf("\nAA: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
+				__syncthreads();// __syncwarp();
+			}
+#endif // #ifdef DEBUG7A
 #pragma unroll
-		for (int pass = 0; pass < 32; pass++){
+			for (int pass = 0; pass < 32; pass++){
-			int row = (pass >> 1);
+				int row = (pass >> 1);
-			int col = ((pass & 1) << 3) + threadIdx.x;
+				int col = ((pass & 1) << 3) + threadIdx.x;
-			int i = row * DTT_SIZE21 + col;
+				int i = row * DTT_SIZE21 + col;
-			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+				float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
-			indx1 = indx;
+				indx1 = indx;
-			for (int ncol = 0; ncol < colors; ncol++){
+				for (int ncol = 0; ncol < colors; ncol++){
-//				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+					//				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
-				ports_rgb_tmp[indx1 += incr] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+//					ports_rgb_tmp[indx1 += incr] += 1.0; /// *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+					ports_rgb_tmp[indx1] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+					indx1 += incr;
+				}
 			}
-		}
+			__syncthreads();
-		__syncthreads();
+#ifdef DEBUG7AXX
-		if (threadIdx.x == 0){ // combine results
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			for (int ncol = 0; ncol < colors; ncol++){
+				printf("\nBB: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
-				int indx2 = ncol * num_cams + cam;
+				__syncthreads();// __syncwarp();
-//				ports_rgb_shared[ncol][cam] = 0;
+			}
-				ports_rgb_shared[indx] = 0;
+#endif // #ifdef DEBUG7A
-				int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
+			if (threadIdx.x == 0){ // combine results
+				for (int ncol = 0; ncol < colors; ncol++){
+					int indx2 = ncol * num_cams + cam;
+					//				ports_rgb_shared[ncol][cam] = 0;
+					ports_rgb_shared[indx2] = 0;
+					int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
 #pragma unroll
-				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//					ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
+						//					ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
-					ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
+						ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
+					}
+					ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 				}
-				ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 			}
-		}
+		} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {
-#ifdef DEBUG22
+		__syncthreads();
+#ifdef DEBUG7A
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\n 2. max_diff\n");
+			printf("\n 2. max_diff, ports_rgb_shared, DBG_TILE = %d\n",DBG_TILE);
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			//				printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("max_diff_shared ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-				printf("tmp[%d] %f %f %f %f\n",i,
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
-						max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
 			}
 			for (int ncol = 0; ncol < colors; ncol++){
-				printf("\n%d:total %f %f %f %f\n", // only first 4 cameras
+				printf("\n%d:ports_rgb_shared ",ncol);
-						ncol,
+				for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams + ccam]);} printf("\n");
-						ports_rgb_shared[ncol * num_cams + 0],
-						ports_rgb_shared[ncol * num_cams + 1],
-						ports_rgb_shared[ncol * num_cams + 2],
-						ports_rgb_shared[ncol * num_cams + 3]);
 				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-					printf("tmp[%d] %f %f %f %f\n",
+					printf("ports_rgb_tmp[%d] ",i);
-							i,
+					for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
-							ports_rgb_tmp[(ncol * num_cams + 0) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 1) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 2) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 3) * TEXTURE_THREADS_PER_TILE + i]);
 				}
 			}
 		}
 		__syncthreads();// __syncwarp();
-#endif // #ifdef DEBUG22
+#endif // #ifdef DEBUG7A
 	}
 }