updated to new GPU kernels

975dadb4 · Andrey Filippov · 907cda8d · 975dadb4 · 975dadb4
Commit 975dadb4 authored Apr 04, 2020 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 834 additions and 26 deletions

GPUTileProcessor.java src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java +98 -8

TileProcessor.cuh src/main/resources/kernels/TileProcessor.cuh +736 -18

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -91,7 +91,10 @@ public class GPUTileProcessor {
 	static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
 	static String GPU_IMCLT_RBG_NAME =             "imclt_rbg"; // name in C code
 	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
-	static String GPU_TEXTURES_NAME =              "textures_gen"; // name in C code
+//	static String GPU_TEXTURES_NAME =              "textures_gen"; // name in C code
+	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
+
+
 //  pass some defines to gpu source code with #ifdef JCUDA
 	public static int DTT_SIZE =                  8;
 	static int        THREADSX =                  DTT_SIZE;
@@ -120,13 +123,20 @@ public class GPUTileProcessor {
 	public static int CORR_PAIRS_MASK =        0x3f;  // lower bits used to address correlation pair for the selected tile
 	public static int CORR_TEXTURE_BIT =          7;  // bit 7 used to request texture for the tile
 	public static int TASK_CORR_BITS =            4;  // start of pair mask
-	public static int TASK_TEXTURE_BIT =          3;  // bit to request texture calculation int task field of struct tp_task
+	public static int TASK_TEXTURE_N_BIT =        0; // Texture with North neighbor
+	public static int TASK_TEXTURE_E_BIT =        1; // Texture with East  neighbor
+	public static int TASK_TEXTURE_S_BIT =        2; // Texture with South neighbor
+	public static int TASK_TEXTURE_W_BIT =        3; // Texture with West  neighbor
+//	public static int TASK_TEXTURE_BIT =          3;  // bit to request texture calculation int task field of struct tp_task
 	public static int LIST_TEXTURE_BIT =          7;  // bit to request texture calculation
 	public static int CORR_OUT_RAD =              4;  // output radius of the correlations (implemented)
 	public static double FAT_ZERO_WEIGHT =        0.0001; // add to port weights to avoid nan

 	public static int THREADS_DYNAMIC_BITS =      5; // treads in block for CDP creation of the texture list

+	public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT));
+
+
    int DTTTEST_BLOCK_WIDTH =        32; // may be read from the source code
    int DTTTEST_BLOCK_HEIGHT =       16; // may be read from the source code

@@ -330,7 +340,10 @@ public class GPUTileProcessor {
        				"#define CORR_PAIRS_MASK " +          CORR_PAIRS_MASK+"\n"+
        				"#define CORR_TEXTURE_BIT " +         CORR_TEXTURE_BIT+"\n"+
        				"#define TASK_CORR_BITS " +           TASK_CORR_BITS+"\n"+
-        				"#define TASK_TEXTURE_BIT " +         TASK_TEXTURE_BIT+"\n"+
+        				"#define TASK_TEXTURE_N_BIT " +       TASK_TEXTURE_N_BIT+"\n"+
+        				"#define TASK_TEXTURE_E_BIT " +       TASK_TEXTURE_E_BIT+"\n"+
+        				"#define TASK_TEXTURE_S_BIT " +       TASK_TEXTURE_S_BIT+"\n"+
+        				"#define TASK_TEXTURE_W_BIT " +       TASK_TEXTURE_W_BIT+"\n"+
        				"#define LIST_TEXTURE_BIT " +         LIST_TEXTURE_BIT+"\n"+
        				"#define CORR_OUT_RAD " +             CORR_OUT_RAD+"\n" +
        				"#define FAT_ZERO_WEIGHT " +          FAT_ZERO_WEIGHT+"\n"+
@@ -439,7 +452,11 @@ public class GPUTileProcessor {
    	// Set corrs array
 ///    	cuMemAlloc(gpu_corrs,       tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
    	cuMemAlloc(gpu_corr_indices,   tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
-    	cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
+
+    	//#define TILESYA       ((TILESY +3) & (~3))
+    	int tilesYa = (tilesY + 3) & ~3;
+//    	cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
+    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.POINTER);


@@ -711,7 +728,7 @@ public class GPUTileProcessor {
    	int tilesX = IMG_WIDTH / DTT_SIZE;
    	int num_textures = 0;
    	for (TpTask tt: tp_tasks) {
-    		if ((tt.task & TASK_TEXTURE_BIT) !=0) {
+    		if ((tt.task & TASK_TEXTURE_BITS) !=0) {
    			num_textures++;
    		}
    	}
@@ -720,7 +737,7 @@ public class GPUTileProcessor {
    	num_textures = 0;
    	int b = (1 << LIST_TEXTURE_BIT);
    	for (TpTask tt: tp_tasks) {
-    		if ((tt.task & TASK_TEXTURE_BIT) !=0) {
+    		if ((tt.task & TASK_TEXTURE_BITS) !=0) {
    			int tile = (tt.ty * tilesX +tt.tx);
    			iarr[num_textures++] = (tile << CORR_NTILE_SHIFT) | b;
    		}
@@ -901,7 +918,7 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

-    public void execTextures(
+    public void execTexturesOld(
    		double [][] port_offsets,
    		double [] color_weights,
    		boolean   is_lwir,
@@ -966,6 +983,75 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

+    public void execTextures(
+    		double [][] port_offsets,
+    		double [] color_weights,
+    		boolean   is_lwir,
+    		double    min_shot,           // 10.0
+    		double    scale_shot,         // 3.0
+    		double    diff_sigma,         // pixel value/pixel change
+    		double    diff_threshold,     // pixel value/pixel change
+    		double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    		boolean   dust_remove,
+    		boolean   keep_weights) {
+    	if (GPU_TEXTURES_kernel == null)
+    	{
+    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
+    		return;
+    	}
+    	float [] fport_offsets = new float[port_offsets.length * 2];
+    	for (int cam = 0; cam < port_offsets.length; cam++) {
+    		fport_offsets[2*cam + 0] = (float) port_offsets[cam][0];
+    		fport_offsets[2*cam + 1] = (float) port_offsets[cam][1];
+    	}
+        cuMemcpyHtoD(gpu_port_offsets, Pointer.to(fport_offsets),  fport_offsets.length * Sizeof.FLOAT);
+
+    	int num_colors = color_weights.length;
+    	if (num_colors > 3) num_colors = 3;
+    	float weighht0 = (float) color_weights[0];
+    	float weighht1 = (num_colors >1)?((float) color_weights[1]):0.0f;
+    	float weighht2 = (num_colors >2)?((float) color_weights[2]):0.0f;
+    	int iis_lwir =      (is_lwir)? 1:0;
+    	int idust_remove =  (dust_remove)? 1 : 0;
+    	int ikeep_weights = (keep_weights)? 1 : 0;
+
+		int [] GridFullWarps =    {(num_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1};
+    	int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
+
+    	Pointer kernelParameters = Pointer.to(
+    			Pointer.to(new int[] {0}), // 0,          // int               border_tile,        // if 1 - watch for border
+    			Pointer.to(gpu_texture_indices), //  int             * woi, - not used
+    			Pointer.to(gpu_clt),
+    			Pointer.to(new int[] { num_texture_tiles }),
+    			Pointer.to(gpu_texture_indices),
+    			Pointer.to(gpu_port_offsets),
+    			Pointer.to(new int[] { num_colors }),
+    			Pointer.to(new int[] { iis_lwir }),
+    			Pointer.to(new float[] {(float) min_shot }),
+    			Pointer.to(new float[] {(float) scale_shot }),
+    			Pointer.to(new float[] {(float) diff_sigma }),
+    			Pointer.to(new float[] {(float) diff_threshold }),
+    			Pointer.to(new float[] {(float) min_agree }),
+    			Pointer.to(new float[] {weighht0 }),
+    			Pointer.to(new float[] {weighht1 }),
+    			Pointer.to(new float[] {weighht2 }),
+    			Pointer.to(new int[] { idust_remove }),
+    			Pointer.to(new int[] { ikeep_weights }),
+    			Pointer.to(new int[] {0}),//  0, // const size_t      texture_rbg_stride, // in floats - DISABLE GENERATION!
+    			Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
+    			Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+    			Pointer.to(gpu_textures)
+    			);
+    	cuCtxSynchronize();
+    	// Call the kernel function
+    	cuLaunchKernel(GPU_TEXTURES_kernel,
+    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize();
+    }
+


    public float [][] getCorr2D(int corr_rad){
@@ -1173,7 +1259,11 @@ public class GPUTileProcessor {
    	CUlinkState state = new CUlinkState();
    	cuLinkCreate(jitOptions, state);
    	cuLinkAddFile(state, CU_JIT_INPUT_LIBRARY, LIBRARY_PATH, jitOptions);
-    	cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxData), ptxData.length, "input.ptx", jitOptions);
+
+    	System.out.println("ptxData.length="+ptxData.length);
+//    	System.out.println( ptx[0]);
+
+    	cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxData), ptxData.length, "input.ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
    	long size[] = { 0 };
    	Pointer image = new Pointer();
    	cuLinkComplete(state, image, size);

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -61,6 +61,10 @@
 #define CORR_PAIRS_MASK        0x3f// lower bits used to address correlation pair for the selected tile
 #define CORR_TEXTURE_BIT          7 // bit 7 used to request texture for the tile
 #define TASK_CORR_BITS            4
+#define TASK_TEXTURE_N_BIT        0 // Texture with North neighbor
+#define TASK_TEXTURE_E_BIT        1 // Texture with East  neighbor
+#define TASK_TEXTURE_S_BIT        2 // Texture with South neighbor
+#define TASK_TEXTURE_W_BIT        3 // Texture with West  neighbor
 #define TASK_TEXTURE_BIT          3 // bit to request texture calculation int task field of struct tp_task
 #define LIST_TEXTURE_BIT          7 // bit to request texture calculation
 #define CORR_OUT_RAD              4
@@ -69,6 +73,7 @@
 #define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list

 #undef HAS_PRINTF
+//#define HAS_PRINTF
 //7
 //#define DEBUG1 1
 //#define DEBUG2 1
@@ -81,8 +86,16 @@
 #define DEBUG8 1
 #define DEBUG9 1
 */
+#define DEBUG10 1
+
+//#define USE_textures_gen
+
+#endif //#ifndef JCUDA
+
+#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
+
+

-#endif
 //#define IMCLT14
 //#define NOICLT 1
 //#define TEST_IMCLT
@@ -131,6 +144,8 @@
 #define KERNELS_STEP  (1 << KERNELS_LSTEP)
 #define TILESX        (IMG_WIDTH / DTT_SIZE)
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
+// Make TILESYA >= TILESX and a multiple of 4
+#define TILESYA       ((TILESY +3) & (~3))
 // increase row length by 1 so vertical passes will use different ports
 #define DTT_SIZE1        (DTT_SIZE + 1)
 #define DTT_SIZE2        (2 * DTT_SIZE)
@@ -162,9 +177,10 @@
 //#define TASK_SIZE      12
 struct tp_task {
 	int   task;
-	int   txy;
-//	short ty;
-//	short tx;
+	union {
+		int      txy;
+		unsigned short sxy[2];
+	};
 	float xy[NUM_CAMS][2];
 };
 struct CltExtra{
@@ -1101,11 +1117,60 @@ __device__ void imclt_plane( // not implemented, not used
 		float           * gpu_rbg,            // WIDTH, HEIGHT
 		const size_t      dstride);            // in floats (pixels)

+extern "C"
 __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
 		int                height); // <= TILESY, use for faster processing of LWIR images

+extern "C"
+__global__ void mark_texture_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,            // number of tiles in task list
+		int              * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+extern "C"
+__global__ void mark_texture_neighbor_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * woi);                // x,y,width,height of the woi
+extern "C"
+__global__ void gen_texture_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * num_texture_tiles,  // number of texture tiles to process
+		int              * woi);                // x,y,width,height of the woi
+
+extern "C" __global__ void clear_texture_rbga(
+		int               texture_width,
+		int               texture_slice_height,
+		const size_t      texture_rbga_stride,     // in floats 8*stride
+		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C" __global__ void textures_accumulate(
+		int               border_tile,        // if 1 - watch for border
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             weight0,            // scale for R
+		float             weight1,            // scale for B
+		float             weight2,            // scale for G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles

 extern "C"
 __global__ void correlate2D(
@@ -1377,6 +1442,175 @@ Java code:
 }
 #define USE_CDP
 #ifdef USE_CDP
+extern "C"
+__global__ void generate_RBGA(
+// Parameters to generate texture tasks
+			struct tp_task   * gpu_tasks,
+			int                num_tiles,          // number of tiles in task list
+// declare arrays in device code?
+			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+			int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
+			int              * woi,                // x,y,width,height of the woi
+			int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+			int                height, // <= TILESY, use for faster processing of LWIR images
+// Parameters for the texture generation
+			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+			int               colors,             // number of colors (3/1)
+			int               is_lwir,            // do not perform shot correction
+			float             min_shot,           // 10.0
+			float             scale_shot,         // 3.0
+			float             diff_sigma,         // pixel value/pixel change
+			float             diff_threshold,     // pixel value/pixel change
+			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			float             weight0,            // scale for R
+			float             weight1,            // scale for B
+			float             weight2,            // scale for G
+			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+			int               keep_weights,       // return channel weights after A in RGBA (was removed)
+			const size_t      texture_rbga_stride,     // in floats
+			float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+{
+// TODO use atomic_add to increment	num_texture_tiles
+// TODO calculate woi
+    dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
+    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
+    dim3 blocks0 (blocks_x, height, 1);
+
+	if (threadIdx.x == 0) {
+		clear_texture_list<<<blocks0,threads0>>>(
+				gpu_texture_indices,
+				width,
+				height);
+		cudaDeviceSynchronize(); // not needed yet, just for testing
+		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
+	    dim3 blocks(blocks_t, 1, 1);
+	    // mark used tiles in gpu_texture_indices memory
+		mark_texture_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+		cudaDeviceSynchronize();
+	    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
+		*(woi + 0) = TILESX;
+		*(woi + 1) = TILESY;
+		*(woi + 2) = 0; // maximal x
+		*(woi + 3) = 0; // maximal y
+		mark_texture_neighbor_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,           // number of tiles in task list
+				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+				woi);               // min_x, min_y, max_x, max_y
+		cudaDeviceSynchronize();
+	    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
+		*(num_texture_tiles+0) = 0;
+		*(num_texture_tiles+1) = 0;
+		*(num_texture_tiles+2) = 0;
+		*(num_texture_tiles+3) = 0;
+		*(num_texture_tiles+4) = 0;
+		*(num_texture_tiles+5) = 0;
+		*(num_texture_tiles+6) = 0;
+		*(num_texture_tiles+7) = 0;
+
+		gen_texture_list <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+				num_texture_tiles,  // number of texture tiles to process
+				woi);               // x,y, here woi[2] = max_X, woi[3] - max-Y
+
+		cudaDeviceSynchronize(); // not needed yet, just for testing
+		*(woi + 2) += 1 - *(woi + 0); // width
+		*(woi + 3) += 1 - *(woi + 1); // height
+	}
+	 __syncthreads();
+// Zero output textures. Trim
+// texture_rbga_stride
+	 int texture_width =        *(woi + 2) * DTT_SIZE;
+	 int texture_tiles_height = *(woi + 3) * DTT_SIZE;
+	 int texture_height =       texture_tiles_height * DTT_SIZE;
+	 int texture_slices =       colors + 1;
+
+	 if (threadIdx.x == 0) {
+		    dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
+		    int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
+		    dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
+		    clear_texture_rbga<<<blocks2,threads2>>>(
+		    		texture_width,
+					texture_tiles_height * texture_slices, // int               texture_slice_height,
+					texture_rbga_stride,                   // const size_t      texture_rbga_stride,     // in floats 8*stride
+					gpu_texture_tiles) ;                   // float           * gpu_texture_tiles);
+// Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
+// oversize (border 16x116 tiles overhang by 4 pixels)
+			cudaDeviceSynchronize(); // not needed yet, just for testing
+			for (int pass = 0; pass < 8; pass++){
+			    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
+			    int border_tile =  pass >> 2;
+			    size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
+			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+			    int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2));  // 1/4
+			    if (border_tile){
+			    	ti_offset += TILESX * (TILESYA >> 2) - ntt;
+			    }
+			    /* */
+			    textures_accumulate<<<grid_texture,threads_texture>>>(
+			    		border_tile,                     // int               border_tile,        // if 1 - watch for border
+			    		woi,                             // int             * woi,                // x, y, width,height
+						gpu_clt,                         // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
+						gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+						gpu_port_offsets,                // float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+						colors,                          // int               colors,             // number of colors (3/1)
+						is_lwir,                         // int               is_lwir,            // do not perform shot correction
+						min_shot,                        // float             min_shot,           // 10.0
+						scale_shot,                      // float             scale_shot,         // 3.0
+						diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
+						diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
+						min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+						weight0,                         // float             weight0,            // scale for R
+						weight1,                         // float             weight1,            // scale for B
+						weight2,                         // float             weight2,            // scale for G
+						dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+			    		0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+			    // combining both non-overlap and overlap (each calculated if pointer is not null )
+						texture_rbga_stride,             // size_t      texture_rbg_stride, // in floats
+						gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+						gpu_texture_tiles); // (float *) 0 );                   // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+
+				/* */
+
+			}
+
+	 }
+	 __syncthreads();
+
+}
+
+// blockDim.x * gridDim.x >= width
+extern "C" __global__ void clear_texture_rbga(
+		int               texture_width,
+		int               texture_slice_height,
+		const size_t      texture_rbga_stride,     // in floats 8*stride
+		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+{
+	int col = blockDim.x * blockIdx.x + threadIdx.x;
+	if (col > texture_width) {
+		return;
+	}
+	int row = (blockIdx.y << 3); // includes slices
+	float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
+#pragma unroll
+	for (int n = 0; n < DTT_SIZE; n++) {
+		*(pix) = 0.0;
+		pix += texture_rbga_stride;
+	}
+}
+
+
+
+
 /**
 * prepare list of texture tiles, woi, and calculate orthogonal neighbors for tiles (in 4 bits of the task field
 * use 4x8=32 threads,
@@ -1386,41 +1620,187 @@ __global__ void prepare_texture_list(
 			struct tp_task   * gpu_tasks,
 			int                num_tiles,          // number of tiles in task list
 			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-			int              * num_texture_tiles,  // number of texture tiles to process
+			// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
+			// border - down from the sam3\e 1/4 of the buffer
+			int              * num_texture_tiles,  // number of texture tiles to process  (4 separate elements for accumulation)
 			int              * woi,                // x,y,width,height of the woi
-			int                width,  // <= TILESX, use for faster processing of LWIR images
+			int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
 			int                height) // <= TILESY, use for faster processing of LWIR images
 {
+// TODO use atomic_add to increment	num_texture_tiles
+// TODO calculate woi
 //	int task_num = blockIdx.x;
 //	int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
-    dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
-    int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
-    dim3 blocks (blocks_x, height, 1);
+    dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
+//    int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
+    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
+
+    dim3 blocks0 (blocks_x, height, 1);

 	if (threadIdx.x == 0) {
-		clear_texture_list<<<blocks,threads>>>(
+		clear_texture_list<<<blocks0,threads0>>>(
 				gpu_texture_indices,
 				width,
 				height);
 		cudaDeviceSynchronize(); // not needed yet, just for testing
+		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
+	    dim3 blocks(blocks_t, 1, 1);
+	    // mark used tiles in gpu_texture_indices memory
+		mark_texture_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+		cudaDeviceSynchronize();
+	    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
+		*(woi + 0) = TILESX;
+		*(woi + 1) = TILESY;
+		*(woi + 2) = 0; // maximal x
+		*(woi + 3) = 0; // maximal y
+		mark_texture_neighbor_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,           // number of tiles in task list
+				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+				woi);               // min_x, min_y, max_x, max_y
+		cudaDeviceSynchronize();
+	    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
+		*(num_texture_tiles+0) = 0;
+		*(num_texture_tiles+1) = 0;
+		*(num_texture_tiles+2) = 0;
+		*(num_texture_tiles+3) = 0;
+		*(num_texture_tiles+4) = 0;
+		*(num_texture_tiles+5) = 0;
+		*(num_texture_tiles+6) = 0;
+		*(num_texture_tiles+7) = 0;
+
+		gen_texture_list <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+				num_texture_tiles,  // number of texture tiles to process
+				woi);               // x,y, here woi[2] = max_X, woi[3] - max-Y
+
+		cudaDeviceSynchronize(); // not needed yet, just for testing
+		*(woi + 2) += 1 - *(woi + 0); // width
+		*(woi + 3) += 1 - *(woi + 1); // height
 	}
 	 __syncthreads();
 }

 // blockDim.x * gridDim.x >= width
-
+extern "C"
 __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
 		int                height) // <= TILESY, use for faster processing of LWIR images
 {
-	int col = threadIdx.x + blockDim.x * blockIdx.x;
+	int col = blockDim.x * blockIdx.x + threadIdx.x;
 	int row = blockIdx.y;
 	if (col > width) {
 		return;
 	}
-	*(gpu_texture_indices + col + row * TILESX) = 0.0;
+	*(gpu_texture_indices + col + row * TILESX) = 0;
+}
+
+// treads (*,1,1), blocks = (*,1,1)
+extern "C"
+__global__ void mark_texture_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices) // packed tile + bits (now only (1 << 7)
+{
+	int task_num = blockDim.x * blockIdx.x + threadIdx.x;
+	if (task_num >= num_tiles) {
+		return; // nothing to do
+	}
+	int task = gpu_tasks[task_num].task;
+	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
+		return; // NOP tile
+	}
+	int cxy = gpu_tasks[task_num].txy;
+	*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * TILESX) = 1;
+}
+
+
+// treads (*,1,1), blocks = (*,1,1)
+extern "C"
+__global__ void mark_texture_neighbor_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * woi)                  // x,y,width,height of the woi
+
+{
+	int task_num = blockDim.x * blockIdx.x + threadIdx.x;
+	if (task_num >= num_tiles) {
+		return; // nothing to do
+	}
+//	struct tp_task  * gpu_task = &gpu_tasks[task_num];
+//	int task = gpu_task->task;
+	int task = gpu_tasks[task_num].task;
+	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
+		return; // NOP tile
+	}
+//	int cxy = gpu_task->txy;
+	int cxy = gpu_tasks[task_num].txy;
+	int x = (cxy & 0xffff);
+	int y = (cxy >> 16);
+	atomicMin(woi+0, x);
+	atomicMin(woi+1, y);
+	atomicMax(woi+2, x);
+	atomicMax(woi+3, y);
+	int d = 0;
+	if ((y > 0)            && *(gpu_texture_indices +  x +     (y - 1) * TILESX)) d |= (1 << TASK_TEXTURE_N_BIT);
+	if ((x < (TILESX - 1)) && *(gpu_texture_indices + (x + 1) + y *      TILESX)) d |= (1 << TASK_TEXTURE_E_BIT);
+	if ((y < (TILESY - 1)) && *(gpu_texture_indices +  x +     (y + 1) * TILESX)) d |= (1 << TASK_TEXTURE_S_BIT);
+	if ((x > 0)            && *(gpu_texture_indices + (x - 1) + y *      TILESX)) d |= (1 << TASK_TEXTURE_W_BIT);
+	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
 }
+
+extern "C"
+__global__ void gen_texture_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,            // number of tiles in task list
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * num_texture_tiles,    // number of texture tiles to process
+		int              * woi)                  // min_x, min_y, max_x, max_y input
+
+{
+	int task_num = blockDim.x * blockIdx.x + threadIdx.x;
+	if (task_num >= num_tiles) {
+		return; // nothing to do
+	}
+
+	int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
+	if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
+		return; // NOP tile
+	}
+	int cxy = gpu_tasks[task_num].txy;
+	int x = (cxy & 0xffff);
+	int y = (cxy >> 16);
+//	int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
+	// don't care if calculate extra pixels that still fit into memory
+	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
+	if (x & 1) {
+		gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
+		num_texture_tiles += 2; // int *
+	}
+	if (y & 1) {
+		gpu_texture_indices += TILESX * (TILESYA >> 1);
+		num_texture_tiles += 4; // int *
+	}
+	if (is_border){
+		gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
+		num_texture_tiles += 1; // int *
+	}
+	// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
+	int buf_offset = atomicAdd(num_texture_tiles, 1);
+	if (is_border){
+		buf_offset = -buf_offset;
+	}
+	*(gpu_texture_indices + buf_offset) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
+}
+
 #endif //#ifdef USE_CDP


@@ -1504,6 +1884,9 @@ __global__ void convert_correct_tiles(
    	}
    }
 }
+
+//#undef USE_textures_gen
+#ifdef USE_textures_gen
 extern "C"
 __global__ void textures_gen(
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -1768,10 +2151,347 @@ __global__ void textures_gen(
    }
     __syncthreads();// __syncwarp();
 #endif
+}
+#endif // ifdef USE_textures_gen
+extern "C"
+__global__ void textures_accumulate(
+		int               border_tile,        // if 1 - watch for border
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             weight0,            // scale for R
+		float             weight1,            // scale for B
+		float             weight2,            // scale for G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+
+{
+	float weights[3] = {weight0, weight1, weight2};
+	// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
+	int camera_num = threadIdx.y;
+	int tile_indx = blockIdx.x; //  * TEXTURE_TILES_PER_BLOCK + tile_in_block;
+	if (tile_indx >= num_texture_tiles){
+		return; // nothing to do
+	}
+	// get number of tile
+	int tile_code = gpu_texture_indices[tile_indx];
+	if ((tile_code & (1 << CORR_TEXTURE_BIT)) == 0){
+		return; // nothing to do
+	}
+	int tile_num = tile_code >> CORR_NTILE_SHIFT;
+	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
+	__shared__ union {
+		float clt_tiles  [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // NUM_CAMS == 4
+		float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; // to align with clt_tiles
+
+	} shr;
+	__shared__ union {
+		float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
+		float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
+		// add more
+	} shr1;
+	//	__shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
+	//	__shared__ float color_avg   [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
+
+	__shared__ float port_offsets[NUM_CAMS][2];
+	__shared__ float ports_rgb   [NUM_CAMS][NUM_COLORS]; // return to system memory (optionally pass null to skip calculation)
+	__shared__ float max_diff [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
+	if (threadIdx.x < 2){
+		port_offsets[camera_num][threadIdx.x] = * (gpu_port_offsets + 2 * camera_num + threadIdx.x);
+	}
+
+
+#ifdef DBG_TILE
+#ifdef DEBUG7
+	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		printf("\ntextures_gen tile = %d\n",tile_num);
+		//    	debug_print_clt1(clt_tile1, color,  0xf); //
+		//        printf("\textures_gen tile = %d, pair=%d, color = %d   CAMERA22\n",tile_num, corr_pair,color);
+		//    	debug_print_clt1(clt_tile2, color,  0xf); //
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+	// serially for each color, parallel for each camera
+	// copy clt (frequency domain data)
+	for (int color = 0; color < colors; color++){
+		//        int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
+		float * clt_tile = ((float *) shr.clt_tiles[camera_num][color]); // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
+		float * clt_tilei = clt_tile + threadIdx.x;
+		float * gpu_tile = ((float *) gpu_clt[camera_num]) +  (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
+		float * mclt_tile = (float *) mclt_tiles [camera_num][color];
+		float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+		float * mclt_tmp =  (float *) shr1.mclt_tmp[camera_num][color];
+		//		float scale = 0.25;
+
+#pragma unroll
+		for (int q = 0; q < 4; q++) {
+			float *lpf = lpf_data[(colors > 1)? color : 3] + threadIdx.x; // lpf_data[3] - mono
+#pragma unroll
+			for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
+				//				*clt_tilei = *gpu_tile * (*lpf) * scale;
+				*clt_tilei = *gpu_tile * (*lpf);
+				clt_tilei +=  DTT_SIZE1;
+				gpu_tile +=   DTT_SIZE;
+				lpf +=        DTT_SIZE;
+			}
+		}
+		__syncthreads();
+#ifdef DEBUG7
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_gen LPF for color = %d\n",color);
+			debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
+
+			printf("\ntextures_gen tile = %d, color = %d \n",tile_num, color);
+			debug_print_clt_scaled(clt_tile, color,  0xf, 0.25); //
+		}
+		__syncthreads();// __syncwarp();
+#endif
+		// perform idct
+		imclt8threads(
+				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
+				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+				mclt_tile,  // float * mclt_tile )
+				((tile_num == DBG_TILE)  && (threadIdx.x == 0)));
+		__syncthreads();// __syncwarp();
+
+#ifdef DEBUG7
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_gen mclt color = %d\n",color);
+			debug_print_mclt(
+					mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+					color);
+		}
+		__syncthreads();// __syncwarp();
+#endif
+		if (colors > 1) {
+			debayer_shot(
+					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
+					min_shot,    // float     min_shot,   // 10.0
+					scale_shot,  // float     scale_shot, // 3.0 (0.0 for mono)
+					mclt_tile,   // float   * mclt_src,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_tmp,    // float   * mclt_tmp,
+					((tile_num == DBG_TILE)  && (threadIdx.x == 0))); // int debug);
+			__syncthreads();// __syncwarp();
+		} else {
+			// copy? - no, just remember to use mclt_tile, not mclt_dst
+			// will have to copy mclt_tiles -> mclt_dst as they have different gaps
+			// untested copy for mono mode
+#pragma unroll
+			for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
+				float * msp = mclt_tile + threadIdx.x + n;
+				float * dst = mclt_dst +  threadIdx.x + n;
+#pragma unroll
+				for (int row = 0; row < DTT_SIZE2; row++){
+					*dst = *msp;
+					msp += DTT_SIZE21;
+					dst += DTT_SIZE21;
+				}
+			}
+			__syncthreads();
+		}
+#ifdef DEBUG77
+		//		float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+
+		for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
+			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+				printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
+				debug_print_mclt(
+						mclt_dst, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+						-1);
+				printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
+				debug_print_mclt(
+						(float *) shr.mclt_debayer[ccam][0], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+						-1);
+
+			}
+			__syncthreads();// __syncwarp();
+		}
+		__syncthreads();// __syncwarp();
+#endif
+	} // for (int color = 0; color < colors; color++)
+
+	__syncthreads(); // __syncwarp();
+///	return;
+#ifdef DEBUG77
+	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
+			//		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+			for (int nncol = 0; nncol < colors; nncol++){
+				printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
+				//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+				debug_print_mclt(
+						(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+						-1);
+			}
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif
+
+#ifdef DEBUG77
+	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+			for (int nncol = 0; nncol < colors; nncol++){
+				printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
+				//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+				debug_print_mclt(
+						(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+						-1);
+			}
+		}
+		__syncthreads();// __syncwarp();
+	}
+	__syncthreads();// __syncwarp();
+#endif
+//	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
+	tile_combine_rgba(
+			colors,                    // int     colors,        // number of colors
+			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
+			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
+			(float *) shr1.rgbaw,      // float * rgba,          // result
+			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
+			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
+			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
+			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
+			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
+			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
+			(tile_num == DBG_TILE) );  //int     debug );
+
+// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
+// float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
+//	size_t texture_tile_offset = + tile_indx * texture_stride;
+// multiply all 4(2) slices by a window (if not all directions)
+	if (gpu_texture_tiles && (texture_stride != 0)){ // generate non-ovelapping tiles
+		float * gpu_texture_tile = gpu_texture_tiles + tile_indx * texture_stride;
+
+		for (int pass = 0; pass < 8; pass ++) {
+			int row = pass * 2 + (threadIdx.y >> 1);
+			int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
+			int i  = row * DTT_SIZE21 + col;
+			int gi = row * DTT_SIZE2  + col;
+			float * gpu_texture_tile_gi = gpu_texture_tile + gi;
+			float * rgba_i = ((float *) shr1.rgbaw) + i;
+			// always copy 3 (1) colors + alpha
+			if (colors == 3){
+				if (keep_weights) {
+#pragma unroll
+					for (int ncol = 0; ncol < NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1 ; ncol++) { // 12
+						*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				} else {
+#pragma unroll
+					for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
+						*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				}
+			} else { // assuming colors = 1
+				if (keep_weights) {
+#pragma unroll
+					for (int ncol = 0; ncol < 1 + 1 + NUM_CAMS + 1 + 1 ; ncol++) { // 8
+						*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				} else {
+#pragma unroll
+					for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
+						*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				}
+			}
+
+		}
+#ifdef DEBUG7
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
+		}
+		__syncthreads();// __syncwarp();
+#endif
+	} // if (gpu_texture_tiles){ // generate non-ovelapping tiles
+
+	tile_code &= TASK_TEXTURE_BITS;
+	if (!tile_code){
+		return; // should not happen
+	}
+
+	if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
+		if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
+			for (int pass = 0; pass < 8; pass ++) {
+				int row = pass * 2 + (threadIdx.y >> 1);
+				int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
+				int i  = row * DTT_SIZE21 + col;
+				int gi = row * DTT_SIZE2  + col;
+				float * rgba_i = ((float *) shr1.rgbaw) + i;
+				// always copy 3 (1) colors + alpha
+				if (colors == 3){
+#pragma unroll
+					for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
+						*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[tile_code][gi]; // reduce [tile_code] by LUT
+					}
+				} else { // assuming colors = 1
+#pragma unroll
+					for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
+						*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[tile_code][gi]; // reduce [tile_code] by LUT
+					}
+				}
+			}
+		}
+		int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
+		int tileY = tile_num / TILESX; // slow, but 1 per tile
+		int tileX = tile_num - tileY * TILESX;
+		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
+		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
+
+		for (int pass = 0; pass < 8; pass ++) {
+			int row = pass * 2 + (threadIdx.y >> 1);          // row    inside a tile (0..15)
+			int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
+			int g_row = row + tile_y0;
+			int g_col = col + tile_x0;
+			int i  = row * DTT_SIZE21 + col;
+			int gi = g_row * texture_rbg_stride  + g_col; // offset to the top left corner
+			float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
+			float * rgba_i = ((float *) shr1.rgbaw) + i;
+			if (!border_tile ||
+					((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
+				// always copy 3 (1) colors + alpha
+				if (colors == 3){
+#pragma unroll
+					for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
+						*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				} else { // assuming colors = 1
+#pragma unroll
+					for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
+						*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+					}
+				}
+			}
+		}
+	} // 	if (gpu_texture_rbg) { // generate RGBA
+} // textures_accumulate()
+
+
+



-}


 extern "C"
@@ -2801,9 +3521,7 @@ __device__ void convertCorrectTile(
 //#endif
 }

-//#ifndef NOICLT1
-
-
+#ifdef NOICLT1
 extern "C"
 __global__ void test_imclt(
 		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -2859,7 +3577,7 @@ __global__ void test_imclt(
    	__syncthreads();// __syncwarp();
    }
 }
-
+#endif // NOICLT1

 //
 // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,