debugging code to generate data for macroblocks correlation

24112d36 · Andrey Filippov · 4203dbbb · 24112d36 · 24112d36 · 24112d36
Commit 24112d36 authored Apr 18, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -102,7 +102,8 @@ public class GPUTileProcessor {
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct"; // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
-	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
+//	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
+	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
 	static String GPU_RBGA_NAME =                  "generate_RBGA"; // name in C code
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv"; // calculate rotation matrices and derivatives
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
@@ -191,14 +192,16 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_4_images =            new CUdeviceptr();
    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
    private CUdeviceptr gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+    private CUdeviceptr gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+    private CUdeviceptr gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-    private CUdeviceptr gpu_diff_rgb_combo =      new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_CAMS* (NUM_COLORS + 1) * Sizeof.FLOAT
+    private CUdeviceptr gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+    private CUdeviceptr gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int

 //    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT
    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT

    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
-    private CUdeviceptr gpu_num_texture_tiles =   new CUdeviceptr(); //  8 ints
    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT

    private CUdeviceptr gpu_correction_vector=    new CUdeviceptr();
@@ -562,8 +565,8 @@ public class GPUTileProcessor {

    	//#define TILESYA       ((TILESY +3) & (~3))
    	int tilesYa = (tilesY + 3) & ~3;
-//    	cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
-    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_texture_indices,     tilesX * tilesYa * Sizeof.FLOAT); // for non-overlap tiles
+    	cuMemAlloc(gpu_texture_indices_ovlp,tilesX * tilesYa * Sizeof.FLOAT); // for overlapped tiles

    	cuMemAlloc(gpu_diff_rgb_combo, tilesX * tilesYa * NUM_CAMS* (NUM_COLORS + 1) *  Sizeof.FLOAT);

@@ -572,7 +575,10 @@ public class GPUTileProcessor {


    	cuMemAlloc(gpu_woi,                               4 * Sizeof.FLOAT);
-    	cuMemAlloc(gpu_num_texture_tiles,                 8 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_num_texture_ovlp,                  8 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_texture_indices_len,               1 * Sizeof.FLOAT);
+
+
    	cuMemAlloc(gpu_active_tiles,        tilesX * tilesY * Sizeof.FLOAT);
    	cuMemAlloc(gpu_num_active_tiles,                  1 * Sizeof.FLOAT);

@@ -661,6 +667,24 @@ public class GPUTileProcessor {
        cuMemcpyHtoD(gpu_texture_indices, Pointer.to(ftexture_indices),  num_texture_tiles * Sizeof.FLOAT);
    }

+    public int [] getTextureIndices()
+    {
+    	float [] ftexture_indices_len = new float[1];
+    	cuMemcpyDtoH(Pointer.to(ftexture_indices_len), gpu_texture_indices_len,  1 * Sizeof.FLOAT);
+    	int num_tiles =      Float.floatToIntBits(ftexture_indices_len[0]);
+    	float [] ftexture_indices = new float [num_tiles];
+    	cuMemcpyDtoH(Pointer.to(ftexture_indices), gpu_texture_indices,  num_tiles * Sizeof.FLOAT);
+    	int [] texture_indices = new int [num_tiles];
+    	for (int i = 0; i < num_tiles; i++) {
+    		texture_indices[i] = Float.floatToIntBits(ftexture_indices[i]);
+    	}
+    	return texture_indices;
+    }
+
+
+//texture_indices
+
+

    public void setConvolutionKernel(
    		float [] kernel,  // [tileY][tileX][color][..]
@@ -1255,8 +1279,8 @@ public class GPUTileProcessor {
                Pointer.to(gpu_tasks),                           // struct tp_task   * gpu_tasks,
                Pointer.to(new int[] { num_task_tiles }),        // int                num_tiles,          // number of tiles in task list
            	// declare arrays in device code?
-                Pointer.to(gpu_texture_indices),                 // int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-    			Pointer.to(gpu_num_texture_tiles),               // int              * num_texture_tiles,  // number of texture tiles to process (8 elements)
+                Pointer.to(gpu_texture_indices_ovlp),                 // int              * gpu_texture_indices_ovlp,// packed tile + bits (now only (1 << 7)
+    			Pointer.to(gpu_num_texture_ovlp),               // int              * num_texture_tiles,  // number of texture tiles to process (8 elements)
    			Pointer.to(gpu_woi),                             // int              * woi,                // x,y,width,height of the woi
    			// set smaller for LWIR - it is used to reduce work aread
    			Pointer.to(new int[] {IMG_WIDTH / DTT_SIZE}),    // int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
@@ -1276,7 +1300,8 @@ public class GPUTileProcessor {
 	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
 	            Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
 	            Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
-	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+//	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+	            Pointer.to(new int[]   {0})); // gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]

    	cuCtxSynchronize();
    	// Call the kernel function
@@ -1288,7 +1313,7 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

-    public void execTextures(
+    public void execTextures_old( // old
    		double [] color_weights,
    		boolean   is_lwir,
    		double    min_shot,           // 10.0
@@ -1350,6 +1375,70 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

+    public void execTextures(
+    		double [] color_weights,
+    		boolean   is_lwir,
+    		double    min_shot,           // 10.0
+    		double    scale_shot,         // 3.0
+    		double    diff_sigma,         // pixel value/pixel change
+    		double    diff_threshold,     // pixel value/pixel change
+    		double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    		boolean   dust_remove) {
+    	if (GPU_TEXTURES_kernel == null)
+    	{
+    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
+    		return;
+    	}
+
+    	int num_colors = color_weights.length;
+    	if (num_colors > 3) num_colors = 3;
+    	float [] fcolor_weights = new float[3];
+    	fcolor_weights[0] = (float) color_weights[0];
+    	fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
+    	fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
+        cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);
+
+    	int iis_lwir =      (is_lwir)? 1:0;
+    	int idust_remove =  (dust_remove)? 1 : 0;
+
+		int [] GridFullWarps =    {1, 1, 1};
+    	int [] ThreadsFullWarps = {1, 1, 1};
+
+    	Pointer kernelParameters = Pointer.to(
+                Pointer.to(gpu_tasks),                           // struct tp_task   * gpu_tasks,
+                Pointer.to(new int[] { num_task_tiles }),        // int                num_tiles,          // number of tiles in task list
+            	// declare arrays in device code?
+                Pointer.to(gpu_texture_indices),                 // int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+    			Pointer.to(gpu_texture_indices_len),
+    	    	// Parameters for the texture generation
+    			Pointer.to(gpu_clt),                             // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+        		Pointer.to(gpu_geometry_correction),             //	struct gc          * gpu_geometry_correction,
+    			Pointer.to(new int[] { num_colors }),
+    			Pointer.to(new int[] { iis_lwir }),
+    			Pointer.to(new float[] {(float) min_shot }),
+    			Pointer.to(new float[] {(float) scale_shot }),
+    			Pointer.to(new float[] {(float) diff_sigma }),
+    			Pointer.to(new float[] {(float) diff_threshold }),
+    			Pointer.to(new float[] {(float) min_agree }),
+	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
+    			Pointer.to(new int[] { idust_remove }),
+//    			Pointer.to(new int[] { 0}), // texture_stride }),        // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+//    			Pointer.to(new int[] {0}),  // gpu_textures),
+    			Pointer.to(new int[] {texture_stride}),        // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+    			Pointer.to(gpu_textures),
+	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+    	cuCtxSynchronize();
+    	// Call the kernel function
+    	cuLaunchKernel(GPU_TEXTURES_kernel,
+    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize();
+    }
+
+
+

    public float [][] getCorr2D(int corr_rad){
        int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
@@ -1389,6 +1478,41 @@ public class GPUTileProcessor {

    }

+//    read extra data for macro generation: 4 DIFFs, 4 of R,  4 of B, 4 of G
+    public float [][] getExtra(){
+		int [] texture_indices = getTextureIndices();
+/*
+    	float [] fnum_tiles = new float[1];
+    	cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_num_texture_ovlp,  1 * Sizeof.FLOAT);
+    	int num_tiles =      Float.floatToIntBits(fnum_tiles[0]);
+    	float [] ftiles = new float[num_tiles];
+    	cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_texture_indices_ovlp,  num_tiles * Sizeof.FLOAT);
+    	int [] tiles = new int[num_tiles];
+    	for (int i = 0; i < num_tiles; i++) {
+    		tiles[i] = Float.floatToIntBits(ftiles[i]);
+    	}
+*/
+    	int num_tile_extra = NUM_CAMS*(NUM_COLORS+1);
+    	float [] diff_rgb_combo = new float[texture_indices.length * num_tile_extra];
+    	cuMemcpyDtoH(Pointer.to(diff_rgb_combo), gpu_diff_rgb_combo,  diff_rgb_combo.length * Sizeof.FLOAT);
+    	int tilesX =  IMG_WIDTH / DTT_SIZE;
+    	int tilesY =  IMG_HEIGHT / DTT_SIZE;
+    	float [][] extra = new float[num_tile_extra][tilesX*tilesY];
+    	for (int i = 0; i < texture_indices.length; i++) {
+    		if (((texture_indices[i] >> CORR_TEXTURE_BIT) & 1) != 0) {
+    			int ntile = texture_indices[i] >>  CORR_NTILE_SHIFT;
+        if (ntile == 22507) {
+        	System.out.println("i="+i+", ntile="+ntile);
+        }
+		    	for (int l = 0; l < num_tile_extra; l++) {
+		    		extra[l][ntile] = diff_rgb_combo[i * num_tile_extra + l];
+		    	}
+    		}
+    	}
+    	return extra;
+    }
+
+
    /**
     * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
     * device array has 4 pixels margins on each side, skip them here
@@ -1428,6 +1552,7 @@ public class GPUTileProcessor {
    }

    public float [] getFlatTextures(
+    		int     num_tiles,
    		int     num_colors,
    		boolean keep_weights){

@@ -1447,13 +1572,14 @@ public class GPUTileProcessor {
        copyD2H.dstPitch =        texture_tile_size * Sizeof.FLOAT;

        copyD2H.WidthInBytes =    texture_tile_size * Sizeof.FLOAT;
-        copyD2H.Height =          num_texture_tiles;
+        copyD2H.Height =          num_tiles; // num_texture_tiles;

        cuMemcpy2D(copyD2H); // run copy
        return cpu_textures;
    }

    public float [][][] getTextures( // todo - get rid of copying by multiple CUDA_MEMCPY2D?
+    		int     num_tiles,
    		int     num_colors,
    		boolean keep_weights){

@@ -1462,6 +1588,7 @@ public class GPUTileProcessor {
    	int texture_tile_size =  texture_slices * texture_slice_size;
 //    	int texture_size =       texture_tile_size * num_texture_tiles;
        float [] cpu_textures = getFlatTextures(
+        		num_tiles,
        		num_colors,
        		keep_weights);


--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2083,10 +2083,12 @@ public class TwoQuadCLT {
 		// corr_indices array of integers to be passed to GPU
 //		gPUTileProcessor.setCorrIndices(corr_indices);

+/*
 		int [] texture_indices = gPUTileProcessor.getTextureTasks(
 				tp_tasks);
 		gPUTileProcessor.setTextureIndices(
 				texture_indices);
+*/
 		gPUTileProcessor.setGeometryCorrection(
 				quadCLT_main.getGeometryCorrection(),
 				false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
@@ -2146,12 +2148,11 @@ public class TwoQuadCLT {
 				clt_parameters.diff_sigma,     // double    diff_sigma,         // pixel value/pixel change
 				clt_parameters.diff_threshold, // double    diff_threshold,     // pixel value/pixel change
 				clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-				clt_parameters.dust_remove,    // boolean   dust_remove,
-				clt_parameters.keep_weights);  // boolean   keep_weights);
+				clt_parameters.dust_remove);     // boolean   dust_remove,
 		long endTextures = System.nanoTime();
-
 // run texturesRBGA
 		long startTexturesRBGA = System.nanoTime();   // System.nanoTime();
+
 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
 				col_weights,                   // double [] color_weights,
 				quadCLT_main.isLwir(),         // boolean   is_lwir,
@@ -2193,7 +2194,26 @@ public class TwoQuadCLT {

 		int out_width =  GPUTileProcessor.IMG_WIDTH +  GPUTileProcessor.DTT_SIZE;
 		int out_height = GPUTileProcessor.IMG_HEIGHT + GPUTileProcessor.DTT_SIZE;
-
+		int tilesX =  GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
+		int tilesY =  GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
+		// show extra
+		/* */
+		String [] extra_group_titles = {"DIFF","Red","Blue","Green"};
+		String [] extra_titles = new String [extra_group_titles.length*GPUTileProcessor.NUM_CAMS];
+		for (int g = 0; g < extra_group_titles.length;g++) {
+			for (int ncam=0; ncam < GPUTileProcessor.NUM_CAMS;ncam++) {
+				extra_titles[g * GPUTileProcessor.NUM_CAMS+ncam]= extra_group_titles[g]+"-"+ncam;
+			}
+		}
+		float [][] extra = gPUTileProcessor.getExtra();
+		(new ShowDoubleFloatArrays()).showArrays(
+				extra,
+				tilesX,
+				tilesY,
+				true,
+				name+"-EXTRA-D"+clt_parameters.disparity,
+				extra_titles);
+		/* */
 		ImagePlus [] imps_RGB = new ImagePlus[iclt_fimg.length];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
 			String title=name+"-"+String.format("%02d", ncam);
@@ -2215,8 +2235,6 @@ public class TwoQuadCLT {

 		}
 		//show_corr
-		int tilesX =  GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
-		int tilesY =  GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
 		int [] wh = new int[2];
 		if (clt_parameters.show_corr) {
 			int [] corr_indices = gPUTileProcessor.getCorrIndices();
@@ -2234,7 +2252,7 @@ public class TwoQuadCLT {
 					wh[0],
 					wh[1],
 					true,
-					"CORR2D",
+					name+"-CORR2D-D"+clt_parameters.disparity,
 					GPUTileProcessor.getCorrTitles());
 		}
 // convert to overlapping and show
@@ -2369,14 +2387,16 @@ public class TwoQuadCLT {
 		}

 		// convert textures to RGBA in Java
-		if (clt_parameters.show_rgba_color && (debugLevel > 0)) { // disabling
+		if (clt_parameters.show_rgba_color && (debugLevel > 100)) { // disabling
 			int numcol = quadCLT_main.isMonochrome()?1:3;
 			int ports = imp_quad_main.length;
+			int [] texture_indices = gPUTileProcessor.getTextureIndices();
 			int          num_src_slices = numcol + 1 + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
 //			float [][][] ftextures = gPUTileProcessor.getTextures(
 //		    		(is_mono?1:3), // int     num_colors,
 //		    		clt_parameters.keep_weights); // boolean keep_weights);
 			float [] flat_textures =  gPUTileProcessor.getFlatTextures(
+					texture_indices.length,
 		    		(is_mono?1:3), // int     num_colors,
 		    		clt_parameters.keep_weights); // boolean keep_weights);
 	    	int texture_slice_size = (2 * GPUTileProcessor.DTT_SIZE)* (2 * GPUTileProcessor.DTT_SIZE);

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -797,6 +797,7 @@ __device__ void debayer_shot(
 		float   * mclt_dst,  // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
 		float   * mclt_tmp,
 		int       debug);
+/*
 __device__ void tile_combine_rgba(
 		int     colors,        // number of colors
 		float * mclt_tile,     // debayer
@@ -811,9 +812,29 @@ __device__ void tile_combine_rgba(
 		float * chn_weights,     // color channel weights, sum == 1.0
 		int     dust_remove,     // Do not reduce average weight when only one image differes much from the average
 		int     keep_weights,   // return channel weights after A in RGBA - ALWAYS
-		int     debug
-		);
-
+		int     debug);
+*/
+__device__ void tile_combine_rgba(
+		int     colors,        // number of colors
+		float * mclt_tile,     // debayer // has gaps to align with union !
+		float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
+		float * rgba,          // result
+		int     calc_extra,    // 1 - calcualate ports_rgb, max_diff
+		float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+		float max_diff_shared  [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+		float max_diff_tmp     [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
+		float ports_rgb_tmp    [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
+		float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
+		//		int           port_mask,      // which port to use, 0xf - all 4 (will modify as local variable)
+		float   diff_sigma,     // pixel value/pixel change
+		float   diff_threshold, // pixel value/pixel change
+		// next not used
+		//		boolean       diff_gauss,     // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
+		float   min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float * chn_weights,     // color channel weights, sum == 1.0
+		int     dust_remove,     // Do not reduce average weight when only one image differes much from the average
+		int     keep_weights,   // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
+		int     debug);

 __device__ void imclt_plane( // not implemented, not used
 		int               color,
@@ -865,7 +886,11 @@ __global__ void index_correlate(
 		int                num_tiles,         // number of tiles in task
 		int *              gpu_corr_indices,  // array of correlation tasks
 		int *              pnum_corr_tiles);  // pointer to the length of correlation tasks array
-
+__global__ void create_nonoverlap_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task
+		int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
+		int *              pnonoverlap_length);  //  indices to gpu_tasks  // should be initialized to zero
 //extern "C"
 __global__ void convert_correct_tiles(
 		float           ** gpu_kernel_offsets, // [NUM_CAMS],
@@ -896,6 +921,30 @@ extern "C" __global__ void correlate2D_inner(
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data

+extern "C" __global__ void textures_accumulate(
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             weights[3],         // scale for R,B,G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+
 // ====== end of local declarations ====

 extern "C" __global__ void correlate2D(
@@ -1291,7 +1340,7 @@ __global__ void generate_RBGA(
 			    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 			    int border_tile =  pass >> 2;
 			    int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
-			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
 			    int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2));  // 1/4
 			    if (border_tile){
 			    	ti_offset += TILESX * (TILESYA >> 2) - ntt;
@@ -1332,6 +1381,7 @@ __global__ void generate_RBGA(
 			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
 						gpu_texture_tiles, //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 						gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+//						gpu_diff_rgb_combo + ti_offset * NUM_CAMS*(colors+1)); // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]

 				cudaDeviceSynchronize(); // not needed yet, just for testing
 				/* */
@@ -1600,6 +1650,25 @@ __global__ void index_direct(
 		active_tiles[atomicAdd(pnum_active_tiles, 1)] = num_tile;
 	}
 }
+__global__ void create_nonoverlap_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task
+		int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
+		int *              pnonoverlap_length)  //  indices to gpu_tasks  // should be initialized to zero
+{
+	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	if (num_tile >= num_tiles){
+		return;
+	}
+	if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
+		return; // nothing to do
+	}
+	int cxy = gpu_tasks[num_tile].txy;
+	int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
+	if (gpu_tasks[num_tile].task != 0) {
+		nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
+	}
+}

 __global__ void index_correlate(
 		struct tp_task   * gpu_tasks,
@@ -1768,9 +1837,73 @@ __global__ void convert_correct_tiles(
    }
 }

+extern "C" __global__ void textures_nonoverlap(
+		struct tp_task  * gpu_tasks,
+		int               num_tiles,          // number of tiles in task list
+// declare arrays in device code?
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             weights[3],         // scale for R,B,G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+//		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+{
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+
+	 if (threadIdx.x == 0) { // only 1 thread, 1 block
+		 *pnum_texture_tiles = 0;
+		 create_nonoverlap_list<<<blocks0,threads0>>>(
+				 gpu_tasks,           // struct tp_task   * gpu_tasks,
+				 num_tiles,           // int                num_tiles,           // number of tiles in task
+				 gpu_texture_indices, // int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
+				 pnum_texture_tiles);  // int *              pnonoverlap_length)  //  indices to gpu_tasks  // should be initialized to zero
+		 cudaDeviceSynchronize();
+		 dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
+		 dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+
+		 textures_accumulate <<<grid_texture,threads_texture>>>(
+				 (int *) 0,                       // int             * woi,                // x, y, width,height
+				 gpu_clt,                         // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				 *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+				 gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+				 gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
+				 colors,                          // int               colors,             // number of colors (3/1)
+				 is_lwir,                         // int               is_lwir,            // do not perform shot correction
+				 min_shot,                        // float             min_shot,           // 10.0
+				 scale_shot,                      // float             scale_shot,         // 3.0
+				 diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
+				 diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
+				 min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+				 weights,                         // float             weights[3],         // scale for R,B,G
+				 dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+				 0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+				 // combining both non-overlap and overlap (each calculated if pointer is not null )
+				 0,                               // size_t      texture_rbg_stride, // in floats
+				 (float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+				 texture_stride,                  // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+				 gpu_texture_tiles,               //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+				 gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+	 }
+}
+
+
 //#undef USE_textures_gen
 extern "C"
-__global__ void textures_accumulate(
+__global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
@@ -1808,6 +1941,14 @@ __global__ void textures_accumulate(
 		return; // nothing to do
 	}
 	int tile_num = tile_code >> CORR_NTILE_SHIFT;
+#ifdef DEBUG22
+		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n1. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
+
+		}
+		__syncthreads();
+#endif	// #ifdef DEBUG22
+
 	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
 	__shared__ union {
 		float clt_tiles  [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // NUM_CAMS == 4
@@ -1819,14 +1960,16 @@ __global__ void textures_accumulate(
 		float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 		// add more
 	} shr1;
-	//	__shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
-	//	__shared__ float color_avg   [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];

-	__shared__ float port_offsets[NUM_CAMS][2];
-	__shared__ float ports_rgb   [NUM_CAMS][NUM_COLORS]; // return to system memory (optionally pass null to skip calculation)
-	__shared__ float max_diff [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
+	__shared__ float port_offsets      [NUM_CAMS][2];
+	__shared__ float ports_rgb_shared  [NUM_COLORS][NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
+	__shared__ float max_diff_shared   [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
+	__shared__ float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4][8]
+	__shared__ float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
+
+
+
 	if (threadIdx.x < 2){
-//		port_offsets[camera_num][threadIdx.x] = * (gpu_port_offsets + 2 * camera_num + threadIdx.x);
 		port_offsets[camera_num][threadIdx.x] = gpu_geometry_correction->rXY[camera_num][threadIdx.x];
 	}

@@ -1979,6 +2122,7 @@ __global__ void textures_accumulate(
 #endif

 #ifdef DEBUG77
+//#ifdef DEBUG22
 	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
 			for (int nncol = 0; nncol < colors; nncol++){
@@ -1995,29 +2139,26 @@ __global__ void textures_accumulate(
 #endif
 //	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
 #ifdef DBG_TILE
-	tile_combine_rgba(
-			colors,                    // int     colors,        // number of colors
-			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
-			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
-			(float *) shr1.rgbaw,      // float * rgba,          // result
-			(float * ) ports_rgb,      // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
-			(float * ) max_diff,       // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
-			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
-			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
-			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
-			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
-			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
-			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
-			(tile_num == DBG_TILE) );  //int     debug );
+	int debug = (tile_num == DBG_TILE);
 #else
+	int debug = 0;
+#endif
+	int calc_extra = (gpu_diff_rgb_combo != 0);
 	tile_combine_rgba(
 			colors,                    // int     colors,        // number of colors
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
 			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
-			(float *) shr1.rgbaw,      // float * rgba,          // result
-			(float * ) ports_rgb,      // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
-			(float * ) max_diff,       // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float *) shr1.rgbaw,      // float * rgba,
+			// result
+			calc_extra, // int     calc_extra,    // 1 - calcualate ports_rgb, max_diff
+
+			ports_rgb_shared, // float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+			max_diff_shared,  // float max_diff_shared  [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+			max_diff_tmp,     //  float max_diff_tmp     [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
+			ports_rgb_tmp,    // float ports_rgb_tmp    [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
+
+
+
 			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
 			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
 			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
@@ -2025,8 +2166,12 @@ __global__ void textures_accumulate(
 			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
 			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
 			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
-			0);  //int     debug );
-#endif
+			debug );  // int     debug );
+
+	__syncthreads(); // _syncthreads();1
+
+
+
 // return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
 // float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 //	size_t texture_tile_offset = + tile_indx * texture_stride;
@@ -2077,6 +2222,7 @@ __global__ void textures_accumulate(
 #endif
 	} // if (gpu_texture_tiles){ // generate non-ovelapping tiles

+
 	tile_code &= TASK_TEXTURE_BITS;
 	if (!tile_code){
 		return; // should not happen
@@ -2177,6 +2323,51 @@ __global__ void textures_accumulate(
 ///			}
 		}
 	} // 	if (gpu_texture_rbg) { // generate RGBA
+	if (calc_extra){ // gpu_diff_rgb_combo
+		__syncthreads(); // needed?
+#ifdef DEBUG22
+		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n3. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
+			printf("max_diff: %f, %f, %f, %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("R:        %f, %f, %f, %f\n",ports_rgb_shared[0][0],ports_rgb_shared[0][1],ports_rgb_shared[0][2],ports_rgb_shared[0][3]);
+			printf("B:        %f, %f, %f, %f\n",ports_rgb_shared[1][0],ports_rgb_shared[1][1],ports_rgb_shared[1][2],ports_rgb_shared[1][3]);
+			printf("G:        %f, %f, %f, %f\n",ports_rgb_shared[2][0],ports_rgb_shared[2][1],ports_rgb_shared[2][2],ports_rgb_shared[2][3]);
+
+			printf("\n 3. max_diff\n");
+			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+			}
+			for (int ncol = 0; ncol < colors; ncol++){
+				printf("\n%d:total %f %f %f %f\n",
+						ncol,
+						ports_rgb_shared[ncol][0],
+						ports_rgb_shared[ncol][1],
+						ports_rgb_shared[ncol][2],
+						ports_rgb_shared[ncol][3]);
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					printf("tmp[%d] %f %f %f %f\n",
+							i,
+							ports_rgb_tmp[ncol][0][i],
+							ports_rgb_tmp[ncol][1][i],
+							ports_rgb_tmp[ncol][2][i],
+							ports_rgb_tmp[ncol][3][i]);
+				}
+			}
+
+
+		}
+		__syncthreads();
+//DBG_TILE
+#endif// #ifdef DEBUG22
+		float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
+		if (threadIdx.x == 0){
+			*pdiff_rgb_combo = max_diff_shared[camera_num];
+		}
+		if (threadIdx.x < colors){
+			*(pdiff_rgb_combo + (threadIdx.x + 1) * NUM_CAMS) = ports_rgb_shared[threadIdx.x][camera_num];// [color][camera]
+		}
+	}
 } // textures_accumulate()


@@ -3321,8 +3512,11 @@ __device__ void tile_combine_rgba(
 		float * mclt_tile,     // debayer // has gaps to align with union !
 		float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
 		float * rgba,          // result
-		float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
-		float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+		int     calc_extra,    // 1 - calcualate ports_rgb, max_diff
+		float ports_rgb_shared [NUM_COLORS][NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+		float max_diff_shared  [NUM_CAMS], // return to system memory (optionally pass null to skip calculation)
+		float max_diff_tmp     [NUM_CAMS][TEXTURE_THREADS_PER_TILE],
+		float ports_rgb_tmp    [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], // [4*3][8]
 		float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
 		//		int           port_mask,      // which port to use, 0xf - all 4 (will modify as local variable)
 		float   diff_sigma,     // pixel value/pixel change
@@ -3333,8 +3527,7 @@ __device__ void tile_combine_rgba(
 		float * chn_weights,     // color channel weights, sum == 1.0
 		int     dust_remove,     // Do not reduce average weight when only one image differes much from the average
 		int     keep_weights,   // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
-		int     debug
-)
+		int     debug)
 {
 	float * alpha =        rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
 	float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
@@ -3588,11 +3781,7 @@ __device__ void tile_combine_rgba(
 			// TODO: Should it use pair_dist2r ? no as it is relative?
 			//				port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);

-#ifdef FASTMATH
-			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
-#else
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
-#endif

 		}
 		// and now make a new average with those weights
@@ -3656,11 +3845,6 @@ __device__ void tile_combine_rgba(
 		float wnd2_inv = 1.0/wnd2;

 #endif // #ifdef DEBUG9
-
-
-
-
-
 		///
 		if (rbg_tile) {
 			float k = 0.0;
@@ -3731,9 +3915,7 @@ __device__ void tile_combine_rgba(
 #endif


-
-	if (max_diff){
-		__shared__ float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // [4][8]
+	if (calc_extra){
 		int cam = threadIdx.y;
 		max_diff_tmp[cam][threadIdx.x] = 0.0;
 #pragma unroll
@@ -3743,15 +3925,21 @@ __device__ void tile_combine_rgba(
 			int i = row * DTT_SIZE21 + col;
 			int row_sym = row ^ ((row & 8)? 0xf : 0);
 			int col_sym = col ^ ((col & 8)? 0xf : 0);
-			float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
+
+			// Was it a bug?
+//			float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
+			float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQ[col_sym];
 			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+//			float * mclt_cam_i = rbg_tile +   colors_offset * cam + i;
+			//
 			float d2 = 0.0;
 #pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
-				float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
+//				float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
+				float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
 				d2 += *(chn_weights + ncol) * dc * dc;
 			}
-			d2 *= wnd2;
+//			d2 *= wnd2;
 			max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
 		}
 		__syncthreads();
@@ -3761,20 +3949,25 @@ __device__ void tile_combine_rgba(
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
 				mx = fmaxf(mx, max_diff_tmp[cam][i]);
 			}
-#ifdef FASTMATH
-			max_diff[cam] = __fsqrt_rn(mx);
-#else
-			max_diff[cam] = sqrtf(mx);
-#endif
+			max_diff_shared[cam] = sqrtf(mx);
+		}
+		__syncthreads(); //?
+#ifdef DEBUG22
+		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n 1. max_diff\n");
+			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
 			}
 		}
-
-	if (ports_rgb) {
-		__shared__ float ports_rgb_tmp [NUM_CAMS][NUM_COLORS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
+		__syncthreads();// __syncwarp();
+#endif // #ifdef DEBUG22
+	}
+	if (calc_extra) {
 		int cam = threadIdx.y;
 #pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++){
-			ports_rgb_tmp[cam][ncol][threadIdx.x] = 0.0;
+			ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
 		}

 #pragma unroll
@@ -3782,26 +3975,59 @@ __device__ void tile_combine_rgba(
 			int row = (pass >> 1);
 			int col = ((pass & 1) << 3) + threadIdx.x;
 			int i = row * DTT_SIZE21 + col;
-//			int row_sym = row ^ ((row & 8)? 0xf : 0);
 			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+			//			float * mclt_cam_i = rbg_tile +  colors_offset * cam + i;
+			//
 #pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
-				ports_rgb_tmp[cam][ncol][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
+//				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
+				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
 			}
 		}
 		__syncthreads();
 		if (threadIdx.x == 0){ // combine results
 #pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
-				ports_rgb[ncol * NUM_CAMS + cam] = 0;
+				//				ports_rgb[ncol * NUM_CAMS + cam] = 0;
+				ports_rgb_shared[ncol][cam] = 0;
 #pragma unroll
 				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-					int indx = ncol * NUM_CAMS + cam;
-					ports_rgb[indx] += ports_rgb_tmp[cam][ncol][i];
+					//					int indx = ncol * NUM_CAMS + cam;
+					//					ports_rgb[indx] += ports_rgb_tmp[cam][ncol][i];
+					ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
+				}
+				//				ports_rgb[indx] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
+				ports_rgb_shared[ncol][cam] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 			}
-				ports_rgb[indx] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 		}
+		__syncthreads(); //?
+#ifdef DEBUG22
+		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n 2. max_diff\n");
+			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
 			}
+			for (int ncol = 0; ncol < colors; ncol++){
+				printf("\n%d:total %f %f %f %f\n",
+						ncol,
+						ports_rgb_shared[ncol][0],
+						ports_rgb_shared[ncol][1],
+						ports_rgb_shared[ncol][2],
+						ports_rgb_shared[ncol][3]);
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					printf("tmp[%d] %f %f %f %f\n",
+							i,
+							ports_rgb_tmp[ncol][0][i],
+							ports_rgb_tmp[ncol][1][i],
+							ports_rgb_tmp[ncol][2][i],
+							ports_rgb_tmp[ncol][3][i]);
+				}
+			}
+		}
+		__syncthreads();// __syncwarp();
+#endif // #ifdef DEBUG22
+
 	}
 }


--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -75,11 +75,13 @@ extern "C" __global__ void correlate2D(
 		float           * gpu_corrs);          // correlation output data


-extern "C" __global__ void textures_accumulate(
-		int             * woi,                // x, y, width,height
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		size_t            num_texture_tiles,  // number of texture tiles to process
+extern "C" __global__ void textures_nonoverlap(
+		struct tp_task  * gpu_tasks,
+		int               num_tiles,          // number of tiles in task list
+// declare arrays in device code?
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
@@ -91,14 +93,11 @@ extern "C" __global__ void textures_accumulate(
 		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+//		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
 // combining both non-overlap and overlap (each calculated if pointer is not null )
-		size_t            texture_rbg_stride, // in floats
-		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
-		size_t            texture_stride,     // in floats (now 256*4 = 1024)
-		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
-		float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
-
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)  // may be 0 if not needed
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
+		float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed

 extern "C"
 __global__ void imclt_rbg_all(