Merge branch 'foliage-2504' into foliage-gpu

0c53ff72 · Andrey Filippov · 6381665c · b4d8c441 · 0c53ff72 · 0c53ff72
Commit 0c53ff72 authored Apr 16, 2025 by Andrey Filippov
7 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -58,7 +58,8 @@
                <groupId>org.jcuda</groupId>
                <artifactId>jcuda</artifactId>
                <!-- <version>10.1.0</version> -->
-                <version>11.2.0</version>
+                <!--<version>11.2.0</version> -->
+                <version>12.6.0</version>
            </dependency>
            <!--
            	As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
@@ -113,6 +114,15 @@
 				<artifactId>loci_tools</artifactId>
 				<version>6.1.0</version>
 			</dependency>
+			<!-- https://mvnrepository.com/artifact/ome/pom-bio-formats -->
+			<!-- Was source in attic for development -->
+			<dependency>
+			    <groupId>ome</groupId>
+			    <artifactId>pom-bio-formats</artifactId>
+			    <version>6.13.0</version>
+			    <type>pom</type>
+			</dependency>
 <!-- 			
 			<dependency>
 			  <groupId>com.drewnoakes</groupId>

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
 ** GPU acceleration for the Tile Processor
 **
 **
-** Copyright (C) 2018 Elphel, Inc.
+** Copyright (C) 2018-2025 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
 import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
-	public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
+	public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	// Can be downloaded and twice extracted from
 	// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
 	// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
 	// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
 	static String GPU_RESOURCE_DIR =              "kernels";
-	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
 	// "*" - generated defines, first index - separately compiled unit
-	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct";      // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D";         // name in C code
@@ -89,7 +91,7 @@ public class GPUTileProcessor {
 	static String GPU_CORR2D_COMBINE_NAME =        "corr2D_combine";      // name in C code
 	static String GPU_CORR2D_NORMALIZE_NAME =      "corr2D_normalize";    // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
-	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code
+	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code //// *** Modified 2025 *** ////
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv";      // calculate rotation matrices and derivatives
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets";   // calculate pixel offsets and disparity distortions
 	static String GPU_CALCULATE_TILES_OFFSETS =    "calculate_tiles_offsets";   // calculate pixel offsets and disparity distortions
@@ -100,7 +102,7 @@ public class GPUTileProcessor {
 	static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
 	static String GPU_GEN_TEXTURE_LIST_NAME =      "gen_texture_list";
 	static String GPU_CLEAR_TEXTURE_RBGA_NAME =    "clear_texture_rbga";
-	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";
+	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";  //// *** Modified 2025 *** ////
 	static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
 	static String GPU_ERASE_CLT_TILES_NAME =       "erase_clt_tiles";
@@ -298,7 +300,7 @@ public class GPUTileProcessor {
        ClassLoader classLoader = getClass().getClassLoader();
        String [] kernelSources = new String[GPU_SRC_FILES.length];
-        boolean show_source = false; // true;
+        boolean show_source = true; // false; // true;
        for (int cunit = 0; cunit < kernelSources.length; cunit++) {
        	kernelSources[cunit] = ""; // use StringBuffer?
            for (String src_file:GPU_SRC_FILES[cunit]) {
@@ -370,7 +372,7 @@ public class GPUTileProcessor {
        GPU_CORR2D_COMBINE_kernel =          functions[4];
        GPU_CORR2D_NORMALIZE_kernel =        functions[5];
        GPU_TEXTURES_kernel=                 functions[6];
-        GPU_RBGA_kernel=                     functions[7];
+        GPU_RBGA_kernel=                     functions[7];  //// *** Modified 2025 *** ////
        GPU_ROT_DERIV_kernel =               functions[8];
        GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
        GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
@@ -380,7 +382,7 @@ public class GPUTileProcessor {
        GPU_MARK_TEXTURE_NEIGHBOR_kernel =   functions[13];
        GPU_GEN_TEXTURE_LIST_kernel =        functions[14];
        GPU_CLEAR_TEXTURE_RBGA_kernel =      functions[15];
-        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16];
+        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16]; //// *** Modified 2025 *** ////
        GPU_CREATE_NONOVERLAP_LIST_kernel =  functions[17];
        GPU_ERASE_CLT_TILES_kernel =         functions[18];
@@ -504,7 +506,7 @@ public class GPUTileProcessor {
    		// Use the NVRTC to create a program by compiling the source code
    		nvrtcProgram program = new nvrtcProgram();
    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
-    		String options[] = {"--gpu-architecture=compute_"+capability};
+    		String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
    		try {
    			nvrtcCompileProgram(program, options.length, options);

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
 package com.elphel.imagej.gpu;
 import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
+import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
+import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
 import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuMemAlloc;
@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_color_weights;
 	private CUdeviceptr gpu_generate_RBGA_params;
 	private CUdeviceptr gpu_woi;
+	private CUdeviceptr gpu_twh;
 	private CUdeviceptr gpu_num_texture_tiles;
 	private CUdeviceptr gpu_textures_rgba;
 	private CUdeviceptr gpu_correction_vector;
@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int
 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers
 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int
 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers
 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description
 		// uses dynamic parallelization, top kernel is a single-thread one
 		int [] GridFullWarps =    {1, 1, 1};
 		int [] ThreadsFullWarps = {1, 1, 1};
+		int shared_size = host_get_textures_shared_size( // in bytes
+				num_cams,     // int                num_cams,     // actual number of cameras
+				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+				null);           // int *              offsets);     // in floats
+//	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
 		Pointer kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int               num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description
 				Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
 				Pointer.to(new int[]   {keep_weights}),          // int               keep_weights,       // return channel weights after A in RGBA
 				Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
-				Pointer.to(gpu_textures_rgba));                  // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				Pointer.to(gpu_twh));                              // int             * twh);  allocate int[2]  for width, height in DP
 		cuCtxSynchronize();
 		// Call the kernel function
@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
 		if (DEBUG8A) {
 			cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
 		}
+		int [] cpu_pnum_texture_tiles = {0}; //// debugging
 		// Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
 		// oversize (border 16x 16 tiles overhang by 4 pixels)
 		int tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
@@ -2886,10 +2900,18 @@ public class GpuQuad{ // quad camera description
 			int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile];
 			if (ntt > 0) {
 				int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1
+				/* Pre CDP2
 				int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
 				if (border_tile != 0){
 					ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+				}
+				*/
+				// for CDP2
+			    int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
+			    if (border_tile != 0){
+//			    	ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+			    	ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2); ntt is no longer subtracted here
+			    	ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
 			    }
 				int shared_size = host_get_textures_shared_size( // in bytes
 						num_cams,     // int                num_cams,     // actual number of cameras
@@ -2917,14 +2939,25 @@ public class GpuQuad{ // quad camera description
 					}
 					System.out.println ("\n\n");
 				}
+				// debugging, copying single int back and forth 
+				cpu_pnum_texture_tiles[0] = ntt;
+				cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles),  1 * Sizeof.INT);
-				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//		cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles),  1 * Sizeof.INT);
-				Pointer kp_textures_accumulate = Pointer.to(
+//				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+				Pointer kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
 						Pointer.to(new int[] {num_cams}),                // int         num_cams,
 						Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
 						Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
-						Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+//						Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
-						Pointer.to(new int[] {ti_offset}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+//						Pointer.to(gpu_num_texture_tiles[((pass & 3) << 1) + border_tile)]),               // int       * num_texture_tiles,// number of texture tiles to process
+////						Pointer.to(gpu_num_texture_tiles).withByteOffset(
+////								(((pass & 3) << 1) + border_tile)*Sizeof.INT),               // int       * num_texture_tiles,// number of texture tiles to process
+						Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
+						Pointer.to(new int[] {ti_offset}),               // int         gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate)
 						Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
 						Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
 						Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
@@ -3066,7 +3099,14 @@ public class GpuQuad{ // quad camera description
 		int [] ThreadsFullWarps = {1, 1, 1};
 		//        	CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		int shared_size = host_get_textures_shared_size( // in bytes
+				num_cams,     // int                num_cams,     // actual number of cameras
+				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+				null);           // int *              offsets);     // in floats
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
 		Pointer kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int                num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,
@@ -3169,12 +3209,17 @@ public class GpuQuad{ // quad camera description
 				num_cams,     // int                num_cams,     // actual number of cameras
 				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				null);           // int *              offsets);     // in floats
-		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
 		Pointer kp_textures_accumulate = Pointer.to(
 				Pointer.to(new int[] {num_cams}),                // int         num_cams,
 				Pointer.to(new int[]   {0}),  // Pointer.to(gpu_woi),  // int       * woi,            // min_x, min_y, max_x, max_y
 				Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
-				Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+//				Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+//				Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+//				Pointer.to(gpu_num_texture_tiles),               // int       * num_texture_tiles,// number of texture tiles to process
+				Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
 				Pointer.to(new int[] {0}),                     // size_t      num_texture_tiles,// number of texture tiles to process
 				Pointer.to(gpu_texture_indices),          // int   * nonoverlap_list,    // pointer to the calculated number of non-zero tiles
 				Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -36,12 +36,17 @@
 * \brief Top level of the Tile Processor for frequency domain
 */
 // Avoiding includes in jcuda, all source files will be merged
-#pragma once
+//#pragma once
 #ifndef JCUDA
 #include "tp_defines.h"
 #include "dtt8x8.h"
 #include "geometry_correction.h"
 #include "TileProcessor.h"
+	#include <cuda_runtime.h>
+//	#include <helper_cuda.h>
+//	#include <helper_functions.h>
 #endif // #ifndef JCUDA
 // CUDA fast math is slower!
@@ -55,67 +60,16 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
 //#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
-#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
+/*
-		| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
-//#define IMCLT14
-//#define NOICLT 1
-//#define TEST_IMCLT
-//#define SAVE_CLT
-// Not enough shared memory to have more threads per block,even just for the result clt tiles
-// What to do:
-// 1) make single image aberration correction: 1/4 of the result tiles
-// With 4 cameras = calculate correlations (9x9), reusing kernel or just clt ones after color reducing, then output them to device memory
-//Average run time =1308.124146 ms
-//#define TILES_PER_BLOCK    2
-//Average run time =12502.638672 - with 2 tiles/block it is longer!
-///12129.268555 ms
-//Average run time =4704.506348 ms (syncwarp)
-//Average run time =4705.612305 ms (syncthreads)
-//Average run time =1051.411255 ms
-//Average run time =861.866577 ms
-//Average run time =850.871277 ms had bugs
-//Average run time =857.947632 ms fixed bugs
-// Something broke, even w/o LPF: Average run time =1093.115112 ms
-// without clt copying to device memory - Average run time =965.342407 ms - still worse
-//Average run time =965.880554 ms
-// combined tx and ty into a single int : Average run time =871.017944 ms
-//Average run time =873.386597 ms (reduced number of registers)
-//__umul24 : Average run time =879.125122 ms
-// without __umul24 - back to Average run time =871.315552 ms
-// Added copying clt to device memory - Average run time =942.071960 ms
-// Removed rest of NOICLT : Average run time =943.456177 ms
-// Added lpf: Average run time =1046.101318 ms (0.1 sec, 10%) - can be combined with the PSF kernel
-//#define USE_UMUL24
-////#define TILES_PER_BLOCK    4
-//Average run time =5155.922852 ms
-//Average run time =1166.388306 ms
-//Average run time =988.750977 ms
-//#define TILES_PER_BLOCK    8
-//Average run time =9656.743164 ms
-// Average run time =9422.057617 ms (reducing divergence)
-//#define TILES_PER_BLOCK    1
-//#define THREADS_PER_TILE   8
-//#define IMCLT_THREADS_PER_TILE 16
-//#define IMCLT_TILES_PER_BLOCK   4
-/// #define KERNELS_STEP  (1 << KERNELS_LSTEP)
-//#define TILES-X        (IMG-WIDTH / DTT_SIZE)
-//#define TILES-Y        (IMG-HEIGHT / DTT_SIZE)
 #define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
 #define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+*/
 // Make TILES-YA >= TILES-X and a multiple of 4
 //#define TILES-YA       ((TILES-Y +3) & (~3))
 // increase row length by 1 so vertical passes will use different ports
-#define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
+//#define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
+/*
 struct CltExtra{
 	float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
 	float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
@@ -126,7 +80,7 @@ struct CltExtra{
 	float dyc_dx;
 	float dyc_dy;
 };
+*/
 /*
 Python code to generate constant coefficients:
 def setup_hwindow(n=8, l=4):
@@ -724,7 +678,7 @@ __device__ void tile_combine_rgba(
 		// next not used
 		//		boolean       diff_gauss,     // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
 		float   min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		float * chn_weights,     // color channel weights, sum == 1.0
+		const float * chn_weights,     // color channel weights, sum == 1.0
 		int     dust_remove,     // Do not reduce average weight when only one image differs much from the average
 		int     keep_weights,   // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
 		int     debug);
@@ -735,43 +689,17 @@ __device__ void imclt_plane( // not implemented, not used
 		float           * gpu_rbg,            // WIDTH, HEIGHT
 		const size_t      dstride);            // in floats (pixels)
-extern "C" __global__ void clear_texture_list(
+extern "C" __global__ void update_woi(
-		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		int                texture_slices,
-		int                width,  // <= TILES-X, use for faster processing of LWIR images
+		int              * woi,                  // min_x, min_y, max_x, max_y input, not modified, max_x - not used
-		int                height); // <= TILES-Y, use for faster processing of LWIR images
+		int              * twh);                  // 2-element in device global memory
-extern "C" __global__ void mark_texture_tiles(
+extern "C" __global__ void clear_texture_rbga2( // version for CDP2
-		int                num_cams,
+		int *             twh, //  texture_width, // aligned to DTT_SIZE
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-		int                num_tiles,           // number of tiles in task list
-		int                width,               // number of tiles in a row
-		int              * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
-extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
-		int                num_cams,
-		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-		int                num_tiles,           // number of tiles in task list
-		int                width,               // number of tiles in a row
-		int                height,              // number of tiles rows
-		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-		int              * woi);                  // x,y,width,height of the woi
-extern "C" __global__ void gen_texture_list(
-		int                num_cams,
-		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-		int                num_tiles,            // number of tiles in task list
-		int                width,                // number of tiles in a row
-		int                height,               // number of tiles rows
-		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
-		int              * num_texture_tiles,    // number of texture tiles to process
-		int              * woi);                 // min_x, min_y, max_x, max_y input
-extern "C" __global__ void clear_texture_rbga(
-		int               texture_width,
-		int               texture_slice_height,
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 //inline __device__ int get_task_size(int num_cams);
 inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
 inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams);
@@ -807,14 +735,6 @@ __global__ void index_inter_correlate(
 		int *              gpu_corr_indices,  // array of correlation tasks
 		int *              pnum_corr_tiles);  // pointer to the length of correlation tasks array
-extern "C" __global__ void create_nonoverlap_list(
-		int                num_cams,
-		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-		int                num_tiles,           // number of tiles in task
-		int                width,               // number of tiles in a row
-		int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
-		int *              pnonoverlap_length); //  indices to gpu_tasks  // should be initialized to zero
 __global__ void convert_correct_tiles(
 	    int                num_cams,           // actual number of cameras
 	    int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
@@ -824,7 +744,7 @@ __global__ void convert_correct_tiles(
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
-		int                num_active_tiles,   // number of tiles in task
+		int              * num_active_tiles,   // number of tiles in task
 		float           ** gpu_clt,            // [num_cams][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
 		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
@@ -836,7 +756,8 @@ __global__ void convert_correct_tiles(
 extern "C" __global__ void combine_inter(     // combine per-senor interscene correlations
 		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
-		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+//		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * num_corr_tiles,     // pointer to number of correlation tiles to process (here it includes sum)
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t            corr_stride,        // in floats
 		float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
@@ -850,7 +771,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
 		float             scale0,             // scale for R
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
-		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+///		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * num_corr_tiles,     // pointer to number of correlation tiles to process (here it includes sum)
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t            corr_stride,        // in floats
 		float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain
@@ -863,7 +785,8 @@ extern "C" __global__ void correlate2D_inner(
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero2,          // here - absolute
-		int            num_corr_tiles,     // number of correlation tiles to process
+//		int            num_corr_tiles,     // number of correlation tiles to process
+		int             * num_corr_tiles,     // pointer to a number of correlation tiles to process
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t      corr_stride,        // in floats
 		int               corr_radius0,        // radius of the output correlation (7 for 15x15)
@@ -891,33 +814,6 @@ extern "C" __global__ void corr2D_combine_inner(
 		const size_t      corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
 		float           * gpu_corrs_combo);    // combined correlation output (one per tile)
-extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
-		int               num_cams,           // number of cameras used
-		int             * woi,                // x, y, width,height
-		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-		size_t            num_texture_tiles,  // number of texture tiles to process
-		int               gpu_texture_indices_offset,// add to gpu_texture_indices
-		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		// TODO: use geometry_correction rXY !
-		struct gc       * gpu_geometry_correction,
-		int               colors,             // number of colors (3/1)
-		int               is_lwir,            // do not perform shot correction
-		float             min_shot,           // 10.0
-		float             scale_shot,         // 3.0
-		float             diff_sigma,         // pixel value/pixel change
-		float             diff_threshold,     // pixel value/pixel change
-		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		float             weights[3],         // scale for R,B,G
-		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
-		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
-// combining both non-overlap and overlap (each calculated if pointer is not null )
-		size_t            texture_rbg_stride, // in floats
-		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
-		size_t            texture_stride,     // in floats (now 256*4 = 1024)
-		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
-		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
-		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
-		int               tilesx);
 __device__ int get_textures_shared_size( // in bytes
 	    int                num_cams,     // actual number of cameras
@@ -972,11 +868,16 @@ extern "C" __global__ void correlate2D(
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs)          // correlation output data
 {
-	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 int num_pairs = num_cams * (num_cams-1) / 2;
-	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); // static
+	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); // static
+	 dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+	 dim3 grid_corr((num_tiles * num_pairs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 	 if (threadIdx.x == 0) { // only 1 thread, 1 block
-		 *pnum_corr_tiles = 0;
+		 *pnum_corr_tiles = 0; // global, allocated by host
-		 index_correlate<<<blocks0,threads0>>>(
+		 index_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				 num_cams,            // int               num_cams,
 				 sel_pairs0,          // int               sel_pairs0,
 				 sel_pairs1,          // int               sel_pairs1,
@@ -988,10 +889,10 @@ extern "C" __global__ void correlate2D(
 				 tilesx,              // int                width,                // number of tiles in a row
 				 gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 				 pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
-		 cudaDeviceSynchronize();
+///		 cudaDeviceSynchronize();
-		 dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+//		 dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-		 dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+///		 dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-		 correlate2D_inner<<<grid_corr,threads_corr>>>(
+		 correlate2D_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>(
 				 num_cams,           // int               num_cams,
 				 gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 				 colors,             // int               colors,             // number of colors (3/1)
@@ -999,7 +900,8 @@ extern "C" __global__ void correlate2D(
 				 scale1,             // float             scale1,             // scale for B
 				 scale2,             // float             scale2,             // scale for G
 				 fat_zero2,          // float             fat_zero2,           // here - absolute
-				 *pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
+//				 *pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
+				 pnum_corr_tiles,    // size_t            num_corr_tiles,     // pointer to a number of correlation tiles to process
 				 gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
 				 corr_stride,        // const size_t      corr_stride,        // in floats
 				 corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
@@ -1045,13 +947,19 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		float           * gpu_corrs)          // correlation output data
 {
 	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
-	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+//	dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+	dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+	dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
 	if (threadIdx.x == 0) { // only 1 thread, 1 block
 		int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
 		if (num_sel_sensors > 0){
 // try with null tp_tasks to use same sequence from GPU memory
 			*pnum_corr_tiles = 0;
-			index_inter_correlate<<<blocks0,threads0>>>(
+			index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 					num_cams,            // int               num_cams,
 					sel_sensors,         // int               sel_sensors,
 					gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
@@ -1059,13 +967,12 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					tilesx,              // int                width,                // number of tiles in a row
 					gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 					pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
-			cudaDeviceSynchronize();
+///			cudaDeviceSynchronize();
-			int num_corr_tiles_with_sum = (*pnum_corr_tiles);
+///			__device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
-			int num_corr_tiles_wo_sum =   num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
+///			int num_corr_tiles_wo_sum =   num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
-			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+///			dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-			dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+///			dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+			correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
-			correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
 					num_cams,                // int          num_cams,
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
 					gpu_clt,                 // float     ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -1074,14 +981,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 					scale0,                  // float        scale0,             // scale for R
 					scale1,                  // float        scale1,             // scale for B
 					scale2,                  // float        scale2,             // scale for G
-					num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum for compatibility with intra format)
+					pnum_corr_tiles, // num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum for compatibility with intra format)
 					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile + sensor (0xff - sum)
 					corr_stride,             // size_t       corr_stride,        // in floats
 					gpu_corrs);              // float      * gpu_corrs)          // correlation output data (either pixel domain or transform domain
-			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+///			dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-			combine_inter<<<grid_combine,threads_corr>>>(     // combine per-senor interscene correlations
+			combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>(     // combine per-senor interscene correlations
 					num_sel_sensors,         // int          num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
-					num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+					pnum_corr_tiles, //num_corr_tiles_with_sum, // int          num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
 					gpu_corr_indices,        // int        * gpu_corr_indices,   // packed tile+pair NOT USED
 					corr_stride,             // size_t       corr_stride,        // in floats
 					gpu_corrs);              // float      * gpu_corrs);          // correlation output data (either pixel domain or transform domain
@@ -1110,7 +1017,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 */
 extern "C" __global__ void combine_inter(     // combine per-senor interscene correlations
 		int               num_sel_sensors,    // number of sensors to correlate (not counting sum of all)
-		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+//		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * num_corr_tiles,     // pointer to number of correlation tiles to process (here it includes sum)
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t            corr_stride,        // in floats
 		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
@@ -1118,7 +1026,7 @@ extern "C" __global__ void combine_inter(     // combine per-senor interscene co
 	int corr_in_block = threadIdx.y;
 	int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
 	int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
-	if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
+	if (corr_offset >= (*num_corr_tiles - num_sel_sensors)) { // was 	if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
 		return;
 	}
 //    __syncthreads();// __syncwarp();
@@ -1184,7 +1092,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
 		float             scale0,             // scale for R
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
-		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+///		int               num_corr_tiles,     // number of correlation tiles to process (here it includes sum)
+		int             * num_corr_tiles,     // pointer to number of correlation tiles to process (here it includes sum)
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t            corr_stride,        // in floats
 		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
@@ -1194,7 +1103,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
 	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
 	int tile_index =  corr_num / num_sel_sensors;
 	int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
-	if (corr_offset >= num_corr_tiles){
+//	if (corr_offset >= num_corr_tiles){
+	if (corr_offset >= *num_corr_tiles){
 		return; // nothing to do
 	}
@@ -1304,7 +1214,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
 * @param scale1           scale blue (if colors = 3) component before mixing
 * @param scale2           scale green (if colors = 3) component before mixing
 * @param fat_zero2        add this value squared to the sum of squared components before normalization
- * @param num_corr_tiles   number of correlation tiles to process
+ * @param num_corr_tiles   pointer to the number of correlation tiles to process
 * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
 * @param corr_stride      stride (in floats) for correlation outputs.
 * @param corr_radius      radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
@@ -1318,7 +1228,8 @@ extern "C" __global__ void correlate2D_inner(
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero2,          // here - absolute
-		int            num_corr_tiles,     // number of correlation tiles to process
+///		int            num_corr_tiles,     // number of correlation tiles to process
+		int             * num_corr_tiles,     // pointer to a number of correlation tiles to process
 		int             * gpu_corr_indices,   // packed tile+pair
 		size_t         corr_stride,        // in floats
 		int               corr_radius0,        // radius of the output correlation (7 for 15x15)
@@ -1331,7 +1242,7 @@ extern "C" __global__ void correlate2D_inner(
 	float scales[3] = {scale0, scale1, scale2};
 	int corr_in_block = threadIdx.y;
 	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
-	if (corr_num >= num_corr_tiles){
+	if (corr_num >= *num_corr_tiles){
 		return; // nothing to do
 	}
 	int pair_list_start = pairs_offsets[num_cams];
@@ -2027,7 +1938,8 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
-		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh)
 {
 	float             min_shot = params[0];           // 10.0
 	float             scale_shot = params[1];         // 3.0
@@ -2038,30 +1950,56 @@ extern "C" __global__ void generate_RBGA(
 	dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
    dim3 blocks0 (blocks_x, height, 1);
+	dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+	int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
+    dim3 blocks(blocks_t, 1, 1);
+	*(woi + 0) = width;  // TILES-X;
+	*(woi + 1) = height; // TILES-Y;
+	*(woi + 2) = 0; // maximal x
+	*(woi + 3) = 0; // maximal y
+	*(num_texture_tiles+0) = 0;
+	*(num_texture_tiles+1) = 0;
+	*(num_texture_tiles+2) = 0;
+	*(num_texture_tiles+3) = 0;
+	*(num_texture_tiles+4) = 0;
+	*(num_texture_tiles+5) = 0;
+	*(num_texture_tiles+6) = 0;
+	*(num_texture_tiles+7) = 0;
+	int texture_slices =       colors + 1;
+	 if (keep_weights & 2){
+		 texture_slices += colors * num_cams;
+	 }
+	int shared_size = get_textures_shared_size( // in bytes
+		    num_cams,     // int                num_cams,     // actual number of cameras
+			colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			0);           // int *              offsets);     // in floats
 	if (threadIdx.x == 0) {
-		clear_texture_list<<<blocks0,threads0>>>(
+		clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				gpu_texture_indices,
 				width,
 				height);
-		cudaDeviceSynchronize(); // not needed yet, just for testing
+///		cudaDeviceSynchronize(); // not needed yet, just for testing
-		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+///		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
-		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
+///		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
-	    dim3 blocks(blocks_t, 1, 1);
+///	    dim3 blocks(blocks_t, 1, 1);
 	    // mark used tiles in gpu_texture_indices memory
-		mark_texture_tiles <<<blocks,threads>>>(
+		mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,           // int                num_cams,
 				gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				num_tiles,          // number of tiles in task list
 				width,              // number of tiles in a row
 				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
-		cudaDeviceSynchronize();
+///		cudaDeviceSynchronize();
 	    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
-		*(woi + 0) = width;  // TILES-X;
+///		*(woi + 0) = width;  // TILES-X;
-		*(woi + 1) = height; // TILES-Y;
+///		*(woi + 1) = height; // TILES-Y;
-		*(woi + 2) = 0; // maximal x
+///		*(woi + 2) = 0; // maximal x
-		*(woi + 3) = 0; // maximal y
+///		*(woi + 3) = 0; // maximal y
-		mark_texture_neighbor_tiles <<<blocks,threads>>>(
+		mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,           // int                num_cams,
 				gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				num_tiles,           // number of tiles in task list
@@ -2070,69 +2008,88 @@ extern "C" __global__ void generate_RBGA(
 				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
 				woi);                // min_x, min_y, max_x, max_y
-		cudaDeviceSynchronize();
+///		cudaDeviceSynchronize();
 	    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
-		*(num_texture_tiles+0) = 0;
+///		*(num_texture_tiles+0) = 0;
-		*(num_texture_tiles+1) = 0;
+///		*(num_texture_tiles+1) = 0;
-		*(num_texture_tiles+2) = 0;
+///		*(num_texture_tiles+2) = 0;
-		*(num_texture_tiles+3) = 0;
+///		*(num_texture_tiles+3) = 0;
-		*(num_texture_tiles+4) = 0;
+///		*(num_texture_tiles+4) = 0;
-		*(num_texture_tiles+5) = 0;
+///		*(num_texture_tiles+5) = 0;
-		*(num_texture_tiles+6) = 0;
+///		*(num_texture_tiles+6) = 0;
-		*(num_texture_tiles+7) = 0;
+///		*(num_texture_tiles+7) = 0;
-		gen_texture_list <<<blocks,threads>>>(
+		gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,            // int                num_cams,
 				gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				num_tiles,           // number of tiles in task list
 				width,               // number of tiles in a row
 				height,              // int                height,               // number of tiles rows
 				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-				num_texture_tiles,   // number of texture tiles to process
+				num_texture_tiles,   // pointer to a number of texture tiles to process
-				woi);                // x,y, here woi[2] = max_X, woi[3] - max-Y
+				woi);                // x,y, here woi[2] = max_X, woi[3] - max-Y input only, not modified, woi[2] not used
-		cudaDeviceSynchronize(); // not needed yet, just for testing
+		/*
+// TODO: Add small kernel to only modify *(woi + 2), *(woi + 3) and generate  texture_width, texture_tiles_height ?
+///		cudaDeviceSynchronize(); // not needed yet, just for testing
 		*(woi + 2) += 1 - *(woi + 0); // width
 		*(woi + 3) += 1 - *(woi + 1); // height
 	}
-	 __syncthreads();
+	 __syncthreads(); // ?
 // Zero output textures. Trim
 // texture_rbga_stride
 	 int texture_width =        (*(woi + 2) + 1)* DTT_SIZE;
 	 int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE;
-	 int texture_slices =       colors + 1;
+///	 int texture_slices =       colors + 1;
-	 if (keep_weights & 2){
+///	 if (keep_weights & 2){
-		 texture_slices += colors * num_cams;
+///		 texture_slices += colors * num_cams;
-	 }
+///	 }
 	 if (threadIdx.x == 0) {
+        */
+///		__device__ int twh[2];
+		update_woi<<<1,1, 0, cudaStreamTailLaunch>>>(
+				texture_slices, // int                texture_slices,
+				woi,            // int              *                  // min_x, min_y, max_x, max_y input, not modified, max_x - not used
+				twh);           // int              * twh)  // 2-element in device global memory
+		// next kernels will see woi as {x,y,width,height}
 		    dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
-		    int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
-		    dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
+//		    int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
-		    clear_texture_rbga<<<blocks2,threads2>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
+//		    dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
+		    int blocks_x = ((width+1) * DTT_SIZE + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
+		    dim3 blocks2 (blocks_x, (height+1) * DTT_SIZE * texture_slices, 1); // each thread - 8 vertical
+		    /*
+		    clear_texture_rbga<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
 		    		texture_width,
 					texture_tiles_height * texture_slices, // int               texture_slice_height,
 					texture_rbga_stride,                   // const size_t      texture_rbga_stride,     // in floats 8*stride
 					gpu_texture_tiles) ;                   // float           * gpu_texture_tiles);
+			*/
+		    clear_texture_rbga2<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
+		    		twh,                 // int * twh, //  {texture_width, texture_hight*color_slices)// aligned to DTT_SIZE
+					texture_rbga_stride, // const size_t      texture_rbga_stride,     // in floats 8*stride
+					gpu_texture_tiles) ; // float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
 // Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
 // oversize (border 16x116 tiles overhang by 4 pixels)
-			cudaDeviceSynchronize(); // not needed yet, just for testing
+///			cudaDeviceSynchronize(); // not needed yet, just for testing
 			for (int pass = 0; pass < 8; pass++){
-//			    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-//			    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-//			    dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 				int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 				dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-		//		 dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-		//	     dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 			    int border_tile =  pass >> 2;
-			    int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
+//			    int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
-			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
+//			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
+			    int * pntt = num_texture_tiles + ((pass & 3) << 1) + border_tile; // pointer to global memory
+			    dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
 			    int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
 			    if (border_tile){
-			    	ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+//			    	ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+			    	ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
+			    	ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
 			    }
 #ifdef DEBUG12
 				printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
@@ -2146,15 +2103,16 @@ extern "C" __global__ void generate_RBGA(
 				printf("\n");
 #endif
 			    /* */
-				int shared_size = get_textures_shared_size( // in bytes
+///				int shared_size = get_textures_shared_size( // in bytes
-					    num_cams,     // int                num_cams,     // actual number of cameras
+///					    num_cams,     // int                num_cams,     // actual number of cameras
-						colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+///						colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
-						0);           // int *              offsets);     // in floats
+///						0);           // int *              offsets);     // in floats
-			    textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
+			    textures_accumulate <<<grid_texture,threads_texture, shared_size, cudaStreamTailLaunch>>>(
 			    		num_cams,                        // int               num_cams,           // number of cameras used
 			    		woi,                             // int             * woi,                // x, y, width,height
 						gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
+//						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
+						pntt,                            // size_t          * pnum_texture_tiles,  // pointer to a number of texture tiles to process
 						ti_offset,                       //                   gpu_texture_indices_offset,// add to gpu_texture_indices
 						gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 						gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
@@ -2176,14 +2134,28 @@ extern "C" __global__ void generate_RBGA(
 						1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 						(float *)0, //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 						width);
-				cudaDeviceSynchronize(); // not needed yet, just for testing
-				/* */
 			}
 	 }
 	 __syncthreads();
 }
+/**
+ * Helper kernel for CDP2, launched by generate_RBGA() into the tail-launch stream.
+ * Converts woi in place from {min_x, min_y, max_x, max_y} to {min_x, min_y, width, height}
+ * and stores the texture dimensions derived from it in twh, so the kernels that follow
+ * in the stream can read them from device memory.
+ * @param texture_slices  multiplier applied to the height stored in twh[1]
+ * @param woi             4-element array; on input {min_x, min_y, max_x, max_y},
+ *                        on output woi[2] and woi[3] are replaced by width and height
+ * @param twh             2-element output array:
+ *                        twh[0] = (woi[2] + 1) * DTT_SIZE,
+ *                        twh[1] = (woi[3] + 1) * DTT_SIZE * texture_slices (using the updated woi)
+ */
+__global__ void update_woi(
+		int                texture_slices,
+		int              * woi,
+		int              * twh)
+{
+	// launched as <<<1,1>>> from generate_RBGA(), so normally the only thread gets through
+	if (threadIdx.x != 0) {
+		return;
+	}
+	woi[2] += 1 - woi[0]; // max_x -> width
+	woi[3] += 1 - woi[1]; // max_y -> height
+	twh[0] = (woi[2] + 1) * DTT_SIZE;
+	twh[1] = (woi[3] + 1) * DTT_SIZE * texture_slices;
+}
 /**
 * Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate)
@@ -2203,7 +2175,7 @@ __global__ void clear_texture_rbga(
 	if (col > texture_width) {
 		return;
 	}
-	int row = blockIdx.y;; // includes slices
+	int row = blockIdx.y; // includes slices
 	float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
 #pragma unroll
 	for (int n = 0; n < DTT_SIZE; n++) {
@@ -2211,6 +2183,36 @@ __global__ void clear_texture_rbga(
 	}
 }
+/**
+ * Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate).
+ * Same purpose as clear_texture_rbga(), but the texture dimensions are read from device memory (twh)
+ * instead of being passed by value. Threads that fall outside of the texture return without writing,
+ * each remaining thread zeroes DTT_SIZE consecutive floats of one output row.
+ * @param  twh                      {texture width in pixels, aligned to DTT_SIZE,
+ *                                  full number of output rows: texture height in pixels, multiplied by number of color slices}
+ * @param  texture_rbga_stride      texture line  stride in floats
+ * @param  gpu_texture_tiles        pointer to the texture output
+ */
+// blockIdx.y selects the output row; ((blockDim.x * gridDim.x) << DTT_SIZE_LOG2) >= width
+__global__ void clear_texture_rbga2(
+		int *             twh, //  {texture_width, texture_height*color_slices} // aligned to DTT_SIZE
+		const size_t      texture_rbga_stride,     // in floats
+		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+{
+	int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
+	if (col >= twh[0]) { // col == twh[0] is already past the last texture column
+		return;
+	}
+	int row = blockIdx.y; // includes slices
+	if (row >= twh[1]) { // valid rows are 0 .. twh[1] - 1
+		return;
+	}
+	float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
+#pragma unroll
+	for (int n = 0; n < DTT_SIZE; n++) {
+		*(pix++) = 0.0f;
+	}
+}
+// not used - both in C++ and Java
 /**
 * Helper kernel for generate_RBGA() -  prepare list of texture tiles, woi, and calculate orthogonal
 * neighbors for tiles (in 4 bits of the task field. Use 4x8=32 threads,
@@ -2246,29 +2248,41 @@ __global__ void prepare_texture_list(
    dim3 blocks0 (blocks_x, height, 1);
 	if (threadIdx.x == 0) {
-		clear_texture_list<<<blocks0,threads0>>>(
+		*(woi + 0) = width; // TILES-X;
+		*(woi + 1) = height; // TILES-Y;
+		*(woi + 2) = 0; // maximal x
+		*(woi + 3) = 0; // maximal y
+		*(num_texture_tiles+0) = 0;
+		*(num_texture_tiles+1) = 0;
+		*(num_texture_tiles+2) = 0;
+		*(num_texture_tiles+3) = 0;
+		*(num_texture_tiles+4) = 0;
+		*(num_texture_tiles+5) = 0;
+		*(num_texture_tiles+6) = 0;
+		*(num_texture_tiles+7) = 0;
+		clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				gpu_texture_indices,
 				width,
 				height);
-		cudaDeviceSynchronize(); // not needed yet, just for testing
+///		cudaDeviceSynchronize(); // not needed yet, just for testing
 		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
 		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
 	    dim3 blocks(blocks_t, 1, 1);
 	    // mark used tiles in gpu_texture_indices memory
-		mark_texture_tiles <<<blocks,threads>>>(
+		mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,
 				gpu_ftasks,
 //				gpu_tasks,
 				num_tiles,          // number of tiles in task list
 				width,
 				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
-		cudaDeviceSynchronize();
+///		cudaDeviceSynchronize();
 	    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
-		*(woi + 0) = width; // TILES-X;
+///		*(woi + 0) = width; // TILES-X;
-		*(woi + 1) = height; // TILES-Y;
+///		*(woi + 1) = height; // TILES-Y;
-		*(woi + 2) = 0; // maximal x
+///		*(woi + 2) = 0; // maximal x
-		*(woi + 3) = 0; // maximal y
+///		*(woi + 3) = 0; // maximal y
-		mark_texture_neighbor_tiles <<<blocks,threads>>>(
+		mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,
 				gpu_ftasks,
 //				gpu_tasks,
@@ -2277,18 +2291,18 @@ __global__ void prepare_texture_list(
 				height,              // number of tiles rows
 				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
 				woi);                // min_x, min_y, max_x, max_y
-		cudaDeviceSynchronize();
+///		cudaDeviceSynchronize();
 	    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
-		*(num_texture_tiles+0) = 0;
+///		*(num_texture_tiles+0) = 0;
-		*(num_texture_tiles+1) = 0;
+///		*(num_texture_tiles+1) = 0;
-		*(num_texture_tiles+2) = 0;
+///		*(num_texture_tiles+2) = 0;
-		*(num_texture_tiles+3) = 0;
+///		*(num_texture_tiles+3) = 0;
-		*(num_texture_tiles+4) = 0;
+///		*(num_texture_tiles+4) = 0;
-		*(num_texture_tiles+5) = 0;
+///		*(num_texture_tiles+5) = 0;
-		*(num_texture_tiles+6) = 0;
+///		*(num_texture_tiles+6) = 0;
-		*(num_texture_tiles+7) = 0;
+///		*(num_texture_tiles+7) = 0;
-		gen_texture_list <<<blocks,threads>>>(
+		gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
 				num_cams,
 				gpu_ftasks,
 //				gpu_tasks,
@@ -2299,7 +2313,7 @@ __global__ void prepare_texture_list(
 				num_texture_tiles,   // number of texture tiles to process
 				woi);                // x,y, here woi[2] = max_X, woi[3] - max-Y
-		cudaDeviceSynchronize(); // not needed yet, just for testing
+///		cudaDeviceSynchronize(); // not needed yet, just for testing
 		*(woi + 2) += 1 - *(woi + 0); // width
 		*(woi + 3) += 1 - *(woi + 1); // height
 	}
@@ -2377,7 +2391,7 @@ __global__ void mark_texture_tiles(
 * @param width                number of tiles in a row
 * @param height               number of tiles rows
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
- * @param woi                  4-element int array ( x,y,width,height of the woi, in tiles)
+ * @param woi                  4-element int array ( x,y,width,height of the woi, in tiles) -> minx, min_y, max_x, max_y
 */
 // treads (*,1,1), blocks = (*,1,1)
 __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
@@ -2445,8 +2459,8 @@ __global__ void gen_texture_list(
 		int                width,                // number of tiles in a row
 		int                height,               // number of tiles rows
 		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
-		int              * num_texture_tiles,    // number of texture tiles to process
+		int              * num_texture_tiles,    // pointer to a number of texture tiles to process
-		int              * woi)                  // min_x, min_y, max_x, max_y input
+		int              * woi)                  // min_x, min_y, max_x, max_y input, not modified, max_x - not used
 {
 	int               tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
@@ -2659,7 +2673,7 @@ __global__ void index_correlate(
 /**
 * Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
 * For interscene correlation. One correlation output for each selected sensor
- * plus a sum of them all. So for all 16 sensors selected ooutput will have 17
+ * plus a sum of them all. So for all 16 sensors selected output will have 17
 * 2D correlations (with sum being the last one)
 * All pairs for the same tile will always be in the same order: increasing sensor numbers
 * with sum being the last. Sum will be marked by 0xff in the LSB.
@@ -2709,7 +2723,6 @@ __global__ void index_inter_correlate(
 	}
 }
 /**
 * Direct MCLT transform and aberration correction with space-variant deconvolution
 * kernels. Results are used to output aberration-corrected images, textures and
@@ -2755,19 +2768,20 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
 	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
 	 if (threadIdx.x == 0) { // always 1
-		 *pnum_active_tiles = 0;
+		 //__device__
+		 *pnum_active_tiles = 0; // already _device_
 		 int task_size = get_task_size(num_cams);
-		 index_direct<<<blocks0,threads0>>>(
+		 index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
 				 task_size,          // int                task_size,        // flattened task size in 4-byte floats
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				 num_tiles,           //int                num_tiles,          // number of tiles in task
 				 gpu_active_tiles,    //int *              active_tiles,       // pointer to the calculated number of non-zero tiles
 				 pnum_active_tiles);  //int *              pnum_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
-		 cudaDeviceSynchronize();
+///		 cudaDeviceSynchronize();
-		 // now call actual convert_correct_tiles
+///		 dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
 		 dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
-		 dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
+		 dim3 grid_tp((num_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1); // use static dimensions - maximal number of tiles
-		 convert_correct_tiles<<<grid_tp,threads_tp>>>(
+		 convert_correct_tiles<<<grid_tp, threads_tp, 0, cudaStreamTailLaunch>>>(
 				 num_cams,           // int                num_cams,           // actual number of cameras
 				 num_colors,         // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				 ((kernels_hor>0)?gpu_kernel_offsets:0), // float           ** gpu_kernel_offsets, // [num_cams],
@@ -2775,16 +2789,15 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 				 gpu_images,         // float           ** gpu_images,         // [num_cams],
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				 gpu_active_tiles,   // int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
-				 *pnum_active_tiles, // int                num_active_tiles,   // number of tiles in task
+				 pnum_active_tiles,  // int              * pnum_active_tiles,   // number of tiles in task
 				 gpu_clt,            // float           ** gpu_clt,            // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
 				 dstride,            // size_t             dstride,            // in floats (pixels)
 				 lpf_mask,           // int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
-				 woi_width,          // int                woi_width,          // varaible to swict between EO and LWIR
+				 woi_width,          // int                woi_width,          // variable to switch between EO and LWIR
-				 woi_height,         // int                woi_height,         // varaible to swict between EO and LWIR
+				 woi_height,         // int                woi_height,         // variable to switch between EO and LWIR
-				 kernels_hor,        // int                kernels_hor,        // varaible to swict between EO and LWIR
+				 kernels_hor,        // int                kernels_hor,        // variable to switch between EO and LWIR
 				 kernels_vert, // );      // int                kernels_vert);      // varaible to swict between EO and LWIR
 				 tilesx); // int                tilesx)
 	 }
 }
@@ -2867,7 +2880,7 @@ __global__ void convert_correct_tiles(
 			float           ** gpu_images,         // [num_cams],
 			float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
-			int                num_active_tiles,   // number of tiles in task
+			int              * num_active_tiles,   // number of tiles in task
 			float           ** gpu_clt,            // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
 			size_t             dstride,            // in floats (pixels)
 			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
@@ -2877,11 +2890,12 @@ __global__ void convert_correct_tiles(
 			int                kernels_vert,
 			int                tilesx)
 {
+//	printf("\n1.7. convert_correct_tiles():3055\n");
 //    int tilesx = TILES-X;
 ///	dim3 t = threadIdx;
 	int tile_in_block = threadIdx.y;
 	int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
-	if (task_indx >=  num_active_tiles){
+	if (task_indx >=  *num_active_tiles){
 		return; // nothing to do
 	}
 	int task_num = gpu_active_tiles[task_indx];
@@ -2997,6 +3011,7 @@ __global__ void convert_correct_tiles(
    		__syncthreads();
    	}
    }
+//    return;
 }
 /**
@@ -3069,7 +3084,7 @@ extern "C" __global__ void textures_nonoverlap(
 	 if (threadIdx.x == 0) { // only 1 thread, 1 block
 		 *pnum_texture_tiles = 0;
-		 create_nonoverlap_list<<<blocks0,threads0>>>(
+		 create_nonoverlap_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
 				 num_cams,            // int                num_cams,
 				 gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //				 gpu_tasks,           // struct tp_task   * gpu_tasks,
@@ -3077,13 +3092,14 @@ extern "C" __global__ void textures_nonoverlap(
 				 num_tilesx,          // int                width,               // number of tiles in a row
 				 gpu_texture_indices, // int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
 				 pnum_texture_tiles); // int *              pnonoverlap_length)  //  indices to gpu_tasks  // should be initialized to zero
-		 cudaDeviceSynchronize();
+///		 cudaDeviceSynchronize();
 		 int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 //		 dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 		 dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //	     dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-		 dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+///		 dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+		 dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // fixed-size grid
 		 int shared_size = get_textures_shared_size( // in bytes
 				 num_cams,     // int                num_cams,     // actual number of cameras
 				 colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
@@ -3094,11 +3110,12 @@ extern "C" __global__ void textures_nonoverlap(
 		__syncthreads();
 #endif
-		textures_accumulate <<<grid_texture,threads_texture,  shared_size>>>( // 65536>>>( //
+		textures_accumulate <<<grid_texture,threads_texture,  shared_size, cudaStreamTailLaunch>>>( // 65536>>>( //
 				num_cams,                        // 	int               num_cams,           // number of cameras used
 				(int *) 0,                       // int             * woi,                // x, y, width,height
 				gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				*pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+//				*pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+				pnum_texture_tiles,              // int             * pnum_texture_tiles,  // pointer to a number of texture tiles to process
 				0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
 				gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 				gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
@@ -3135,8 +3152,8 @@ extern "C" __global__ void textures_nonoverlap(
 * @param num_cams             Number of cameras used
 * @param woi                  WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used
 * @param gpu_clt              array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
- * @param num_texture_tiles    number of texture tiles to process
+ * @param pnum_texture_tiles   pointer to a number of texture tiles to process
- * @param gpu_texture_indices_offset add to gpu_texture_indices
+ * @param gpu_texture_indices_offset add to gpu_texture_indices; if negative, the effective offset is -(gpu_texture_indices_offset + *pnum_texture_tiles)
 * @param gpu_texture_indices  array - 1 integer per tile to process
 * @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight
 * @param colors               number of colors used:  3 for RGB or 1 for monochrome
@@ -3162,8 +3179,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int               num_cams,           // number of cameras used
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-		size_t            num_texture_tiles,  // number of texture tiles to process
+///		size_t            num_texture_tiles,  // number of texture tiles to process
-		int               gpu_texture_indices_offset,// add to gpu_texture_indices
+		int             * pnum_texture_tiles,  // pointer to a number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
@@ -3173,7 +3191,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		float             diff_sigma,         // pixel value/pixel change
 		float             diff_threshold,     // pixel value/pixel change
 		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		float             weights[3],         // scale for R,B,G
+		const float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? Now +2 - output raw channels
 // combining both non-overlap and overlap (each calculated if pointer is not null )
@@ -3201,11 +3219,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 //	int camera_num = threadIdx.y;
 	int tile_indx = blockIdx.x; //  * TEXTURE_TILES_PER_BLOCK + tile_in_block;
-	if (tile_indx >= num_texture_tiles){
+	if (tile_indx >= * pnum_texture_tiles){
 		return; // nothing to do
 	}
 	// get number of tile
-	int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP
+//	int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP (before CDP2)
+	int tile_offs = (gpu_texture_indices_offset >=0) ? gpu_texture_indices_offset : -(gpu_texture_indices_offset + *pnum_texture_tiles);
+	int tile_code = gpu_texture_indices[tile_indx + tile_offs]; // Added for Java, no DP
 	if ((tile_code & (1 << LIST_TEXTURE_BIT)) == 0){
 		return; // nothing to do
 	}
@@ -3900,7 +3920,7 @@ __global__ void imclt_rbg_all(
 		int                woi_theight,
 		const size_t       dstride)            // in floats (pixels)
 {
+// all OK - do not depend on previous kernels
 //	int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]);
 	dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1);
 	dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1);
@@ -3911,11 +3931,11 @@ __global__ void imclt_rbg_all(
 			for (int color = 0; color < colors; color++) {
 				// clear right and bottom 8-pixel column and row
 				float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE);
-				erase8x8<<<grid_erase8x8_right_col,threads_erase8x8>>>(
+				erase8x8<<<grid_erase8x8_right_col,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
 						right_col,             // float           * gpu_top_left,
 						dstride);              // const size_t      dstride);
 				float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color +  dstride * (woi_theight * DTT_SIZE);
-				erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8>>>(
+				erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
 						bottom_row,             // float           * gpu_top_left,
 						dstride);              // const size_t      dstride);
@@ -3926,7 +3946,7 @@ __global__ void imclt_rbg_all(
 						int tiles_in_pass = tilesy_half * tilesx_half;
 						dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
 						//    				printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
-						imclt_rbg<<<grid_imclt,threads_imclt>>>(
+						imclt_rbg<<<grid_imclt,threads_imclt, 0, cudaStreamTailLaunch>>>(
 								gpu_clt[ncam],         // float           * gpu_clt,     // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 								gpu_corr_images[ncam], // float           * gpu_rbg,     // WIDTH, 3 * HEIGHT
 								1,                     // int               apply_lpf,
@@ -3937,7 +3957,7 @@ __global__ void imclt_rbg_all(
 								woi_twidth,            // int               woi_twidth,  // will increase by DTT_SIZE (todo - cut away?)
 								woi_theight,           // int               woi_theight, // will increase by DTT_SIZE (todo - cut away?)
 								dstride);              // const size_t      dstride);    // in floats (pixels)
-						cudaDeviceSynchronize();
+///						cudaDeviceSynchronize();
 					}
 				}
 			}
@@ -5343,7 +5363,7 @@ __device__ void tile_combine_rgba(
 		// next not used
 		//		boolean       diff_gauss,     // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
 		float   min_agree,     // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		float * chn_weights,   // color channel weights, sum == 1.0
+		const float * chn_weights,   // color channel weights, sum == 1.0
 		int     dust_remove,   // Do not reduce average weight when only one image differs much from the average
 		int     keep_weights,  // return channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
 		int     debug)

--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -37,10 +37,34 @@
 */
 #pragma once
+#ifndef TILE_PROCESSOR_H_
+#define TILE_PROCESSOR_H_
 #ifndef NUM_CAMS
 #include "tp_defines.h"
 #endif
+#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
+		| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
+#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+#define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
+// Extra per-kernel data: displacement the kernel data is relative to, actual kernel center
+// and increments of the displacement per pixel shift from the kernel center location
+struct CltExtra{
+	float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
+	float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
+	float center_x; // actual center X (use to find derivatives)
+	float center_y; // actual center Y (use to find derivatives)
+	float dxc_dx;   // add this to data_x per each pixel X-shift relative to the kernel center location
+	float dxc_dy;   // same per each Y-shift pixel
+	float dyc_dx;   // NOTE(review): presumably the same for data_y per each pixel X-shift - confirm
+	float dyc_dy;   // NOTE(review): presumably the same for data_y per each pixel Y-shift - confirm
+};
 extern "C" __global__ void convert_direct( // called with a single block, single thread
 		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh);
+// Declaration of textures_accumulate(); the number of texture tiles is passed as a pointer, not by value
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
+		int               num_cams,           // number of cameras used
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+///		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * pnum_texture_tiles,  // pointer to a number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices; if negative, the effective offset is -(gpu_texture_indices_offset + *pnum_texture_tiles)
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		const float       weights[3],         // scale for R,B,G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+		int               tilesx);
+extern "C" __global__ void clear_texture_list(
+		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		int                width,  // <= TILES-X, use for faster processing of LWIR images
+		int                height); // <= TILES-Y, use for faster processing of LWIR images
+extern "C" __global__ void clear_texture_rbga(
+		int               texture_width,
+		int               texture_slice_height,
+		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+// Declaration of the helper kernel launched by textures_nonoverlap(), which zeroes *pnonoverlap_length before the launch
+extern "C" __global__ void create_nonoverlap_list(
+		int                num_cams,
+		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task
+		int                width,               // number of tiles in a row
+		int *              nonoverlap_list,     // output list (NOTE(review): presumably indices to gpu_tasks - confirm)
+		int *              pnonoverlap_length); // pointer to the calculated number of non-zero tiles // should be initialized to zero
+extern "C" __global__ void mark_texture_tiles(
+		int                num_cams,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int              * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
+// Declaration of the helper kernel launched from prepare_texture_list()
+extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int                height,              // number of tiles rows
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * woi);                  // min_x, min_y, max_x, max_y (in tiles)
+// Declaration of the helper kernel launched from prepare_texture_list()
+extern "C" __global__ void gen_texture_list(
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,            // number of tiles in task list
+		int                width,                // number of tiles in a row
+		int                height,               // number of tiles rows
+		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
+		int              * num_texture_tiles,    // pointer to a number of texture tiles to process
+		int              * woi);                 // min_x, min_y, max_x, max_y input
+#endif
--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -40,6 +40,14 @@
 	#include "tp_defines.h"
 	#include "dtt8x8.h"
 	#include "geometry_correction.h"
+//	#include "TileProcessor.h"
+	#include <cuda_runtime.h>
+//	#include <helper_cuda.h>
+//	#include <helper_functions.h>
 #endif // #ifndef JCUDA
 #ifndef get_task_size
@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Make offsets calculate in compile time, to avoid NVRTC(in java): " error: dynamic initialization is not supported for a __constant__ variable"
+// NOTE(review): hard-coded replacement for the offsetof()-based initializers kept in the comments below
+// (order: azimuth, tilt, roll, roll; in float units) - presumably has to be updated whenever the corr_vector layout changes, confirm
+__constant__ int angles_offsets [4] {15,0,30,30};
+/*
+__constant__ int angles_offsets [4]  {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+ __constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
-__constant__ int angles_offsets [4] = {
+ */
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 		{
 				{6,5,12}, // a_t * a_z -> tmp0
@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets(
 				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 	}
-//	__syncthreads();// __syncwarp();
-//	cudaDeviceSynchronize();
-//	cudaDeviceSynchronize();
 }

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -51,6 +51,11 @@
    ((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+// Offset of member m in struct st expressed in 4-byte units (byte offset shifted right by 2)
+#define offsetof4(st, m) \
+    (((size_t)&(((st *)0)->m))>>2)
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
 #define SCENE_UNITS_SCALE  0.001 // meters from mm