Unified GPU for DP2/no DP2 (12.6.0/11.2.0)

0123f06e · Andrey Filippov · 8755d17e · 0123f06e · 0123f06e
Commit 0123f06e authored Jul 21, 2025 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 197 additions and 57 deletions

GPUTileProcessor.java src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java +30 -13

GpuQuad.java src/main/java/com/elphel/imagej/gpu/GpuQuad.java +167 -44

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
 ** GPU acceleration for the Tile Processor
 **
 **
-** Copyright (C) 2018 Elphel, Inc.
+** Copyright (C) 2018-2025 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
@@ -48,7 +48,7 @@ import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
 import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
 import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
 import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog;
-import static jcuda.nvrtc.JNvrtc.nvrtcVersion;
+//import static jcuda.nvrtc.JNvrtc.nvrtcVersion;
 import static jcuda.nvrtc.JNvrtc.nvrtcGetNumSupportedArchs;
 import static jcuda.nvrtc.JNvrtc.nvrtcGetSupportedArchs;

@@ -62,6 +62,7 @@ import com.elphel.imagej.tileprocessor.Correlation2d;

 import ij.IJ;
 import ij.text.TextWindow;
+import jcuda.JCudaVersion;
 import jcuda.Pointer;
 import jcuda.driver.CUcontext;
 import jcuda.driver.CUdevice;
@@ -75,16 +76,20 @@ import jcuda.nvrtc.JNvrtc;
 import jcuda.nvrtc.nvrtcProgram;

 public class GPUTileProcessor {
+	public static String  CUDA_VERSION = JCudaVersion.get(); 
+	public static boolean USE_CUDA12 = CUDA_VERSION.startsWith("12.");
 	public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	// Can be downloaded and twice extracted from
 	// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
 	// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
 	// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
-	static String GPU_RESOURCE_DIR =              "kernels";
+	static String GPU_RESOURCE_TOP_DIR =              "kernels";
 	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
 	// "*" - generated defines, first index - separately compiled unit
-	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+	static String [][] GPU_SRC_FILES = USE_CUDA12?
+			(new String[][] {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}}):
+			(new String[][] {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}});
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct";      // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D";         // name in C code
@@ -270,7 +275,6 @@ public class GPUTileProcessor {
    public GPUTileProcessor(
    		String cuda_project_directory) throws IOException
    {
-
    	// From code by Marco Hutter - http://www.jcuda.org
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
@@ -310,7 +314,7 @@ public class GPUTileProcessor {
            	}else {
                	File file = null;
                	if ((cuda_project_directory == null) || cuda_project_directory.isEmpty()) {
-                		file = new File(classLoader.getResource(GPU_RESOURCE_DIR+"/"+src_file).getFile());
+                		file = new File(classLoader.getResource(GPU_RESOURCE_TOP_DIR+"/"+CUDA_VERSION+"/"+src_file).getFile());
                		System.out.println("Loading resource "+file);
                	} else {
                		File src_dir = new File(cuda_project_directory, "src");
@@ -507,22 +511,35 @@ public class GPUTileProcessor {
    		// Use the NVRTC to create a program by compiling the source code
    		nvrtcProgram program = new nvrtcProgram();
    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
-    		String options[] = {"--gpu-architecture=compute_"+capability};
-    		int [][] nvrtc_version = new int[2][];
+//    		String options[] = {"--gpu-architecture=compute_"+capability};
+//    		int [][] nvrtc_version = new int[2][];
    		int nvrtc_rslt = -1;
-    		/*
-    		nvrtc_rslt= nvrtcVersion(nvrtc_version[0],nvrtc_version[0]);
-    		System.out.println("nvrtcVersion="+nvrtc_version[0][0]+"."+nvrtc_version[1][0]+" (returned "+nvrtc_rslt+").");
-    		*/
    		int [] nvrtc_num_arch = new int[1]; 
    		nvrtc_rslt= nvrtcGetNumSupportedArchs(nvrtc_num_arch);
    		System.out.println("nvrtc_num_arch="+nvrtc_num_arch[0]+" (returned "+nvrtc_rslt+").");
    		int [] nvrtc_archs = new int[nvrtc_num_arch[0]];
    		nvrtc_rslt= nvrtcGetSupportedArchs(nvrtc_archs);
+    		int max_arch = 0;
+    		for (int sa: nvrtc_archs) {
+    			max_arch = Math.max(max_arch, sa);
+    		}
    		for (int sa: nvrtc_archs) {
    			System.out.println("Supported arch "+sa);
    		}
-    		System.out.println();
+			System.out.println("Max supported arch is "+max_arch+", gpu capability = "+capability);
+			if (capability > max_arch) {
+				capability = max_arch;
+				System.out.println("Reduced capability to match NVRTC compiler to "+capability);
+			}
+			String options[] = new String[USE_CUDA12?2:1];
+			options[0] = "--gpu-architecture=compute_"+capability;
+			if (options.length > 1) {
+				options[1] = "--extensible-whole-program";
+			}
+    		System.out.println("Running NVRTC with the following options:");
+    		for (String s:options) {
+    			System.out.println(s);
+    		}			
    		try {
    			nvrtcCompileProgram(program, options.length, options);
    			OK = true;

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
 package com.elphel.imagej.gpu;

 import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
+import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
+import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
 import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuMemAlloc;
@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_color_weights;
 	private CUdeviceptr gpu_generate_RBGA_params;
 	private CUdeviceptr gpu_woi;
+	private CUdeviceptr gpu_twh;
 	private CUdeviceptr gpu_num_texture_tiles;
 	private CUdeviceptr gpu_textures_rgba;
 	private CUdeviceptr gpu_correction_vector;
@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT/allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int

 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT

 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers

 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT //  allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int

 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT

 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers

 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -2691,9 +2698,41 @@ public class GpuQuad{ // quad camera description
 		// uses dynamic parallelization, top kernel is a single-thread one
 		int [] GridFullWarps =    {1, 1, 1};
 		int [] ThreadsFullWarps = {1, 1, 1};
-
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
-		Pointer kernelParameters = Pointer.to(
+		Pointer kernelParameters;
+		if (GPUTileProcessor.USE_CUDA12) {
+			ThreadsFullWarps = new int[] {1, 1, 1};
+			int shared_size = host_get_textures_shared_size( // in bytes
+					num_cams,     // int                num_cams,     // actual number of cameras
+					num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+					null);           // int *              offsets);     // in floats
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+			kernelParameters = Pointer.to(
+					Pointer.to(new int[] { num_cams}),               // int               num_cams,
+					Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+					Pointer.to(new int[] { num_task_tiles }),        // int               num_tiles,          // number of tiles in task list
+					// declare arrays in device code?
+					Pointer.to(gpu_texture_indices_ovlp),            // int             * gpu_texture_indices_ovlp,// packed tile + bits (now only (1 << 7)
+					Pointer.to(gpu_num_texture_ovlp),                // int             * num_texture_tiles,  // number of texture tiles to process (8 elements)
+					Pointer.to(gpu_woi),                             // int             * woi,                // x,y,width,height of the woi
+					// set smaller for LWIR - it is used to reduce work area
+					Pointer.to(new int[] {img_width / GPUTileProcessor.DTT_SIZE}),     // int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+					Pointer.to(new int[] {img_height / GPUTileProcessor.DTT_SIZE}),    // int                height); // <= TILESY, use for faster processing of LWIR images
+					// Parameters for the texture generation
+					Pointer.to(gpu_clt),                             // float          ** gpu_clt,            // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+					Pointer.to(gpu_geometry_correction),             //	struct gc          * gpu_geometry_correction,
+					Pointer.to(new int[]   {num_colors}),            // int               colors,             // number of colors (3/1)
+					Pointer.to(new int[]   {iis_lwir}),              // int               is_lwir,            // do not perform shot correction
+					Pointer.to(gpu_generate_RBGA_params),            // float             generate_RBGA_params[5],
+					Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
+					Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
+					Pointer.to(new int[]   {keep_weights}),          // int               keep_weights,       // return channel weights after A in RGBA
+					Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
+					Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+					Pointer.to(gpu_twh));                              // int             * twh);  allocate int[2]  for width, height in DP
+		} else {
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int               num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				Pointer.to(new int[] { num_task_tiles }),        // int               num_tiles,          // number of tiles in task list
@@ -2715,6 +2754,7 @@ public class GpuQuad{ // quad camera description
 				Pointer.to(new int[]   {keep_weights}),          // int               keep_weights,       // return channel weights after A in RGBA
 				Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
 				Pointer.to(gpu_textures_rgba));                  // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		}

 		cuCtxSynchronize();
 		// Call the kernel function
@@ -2994,7 +3034,7 @@ public class GpuQuad{ // quad camera description
 		if (DEBUG8A) {
 			cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
 		}
-		
+		int [] cpu_pnum_texture_tiles = {0}; //// debugging CDP2
 		// Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
 		// oversize (border 16x 16 tiles overhang by 4 pixels)
 		int tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
@@ -3005,12 +3045,20 @@ public class GpuQuad{ // quad camera description

 			int border_tile =  (pass >> 2);
 			int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile];
-			if (ntt > 0) {
+			if (ntt > 0) { 
 				int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1
-
-				int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
-				if (border_tile != 0){
-					ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+				int ti_offset; 
+				if (GPUTileProcessor.USE_CUDA12) { // for CDP2
+					ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
+					if (border_tile != 0){
+						ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2); unlike pre-CDP2, ntt is not subtracted here
+						ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
+					}
+				} else { //  Pre CDP2
+					ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
+					if (border_tile != 0){
+						ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+					}
 				}
 				int shared_size = host_get_textures_shared_size( // in bytes
 						num_cams,     // int                num_cams,     // actual number of cameras
@@ -3038,35 +3086,68 @@ public class GpuQuad{ // quad camera description
 					}
 					System.out.println ("\n\n");
 				}
-
-				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
-				Pointer kp_textures_accumulate = Pointer.to(
-						Pointer.to(new int[] {num_cams}),                // int         num_cams,
-						Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
-						Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
-						Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
-						Pointer.to(new int[] {ti_offset}),                     // size_t      num_texture_tiles,// number of texture tiles to process
-						Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
-						Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
-						Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
-						Pointer.to(new int[]   {iis_lwir}),              // int         is_lwir,        // do not perform shot correction
-						Pointer.to(new float[] {(float) min_shot}),      // float       min_shot,       // 10.0
-						Pointer.to(new float[] {(float) scale_shot}),    // float       scale_shot,     // 3.0
-						Pointer.to(new float[] {(float) diff_sigma}),    // float       diff_sigma,     // pixel value/pixel change
-						Pointer.to(new float[] {(float) diff_threshold}),// float       diff_threshold, // pixel value/pixel change
-						Pointer.to(new float[] {(float) min_agree}),     // float       min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-						Pointer.to(gpu_color_weights),                   // float       weights[3],     // scale for R,B,G (or {1.0,0.0,0.0}
-						Pointer.to(new int[]   {idust_remove}),          // int         dust_remove,        // Do not reduce average weight when only one image differes much from the average
-						Pointer.to(new int[]   {keep_weights}),           // int         keep_weights,       // return channel weights after A in RGBA
-						// combining both non-overlap and overlap (each calculated if pointer is not null )
-						Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
-						Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
-						Pointer.to(new int[]   {0}),                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
-						Pointer.to(new int[]   {0}), // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-						Pointer.to(new int[]   {0}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
-						Pointer.to(new int[]   {0}), //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
-						Pointer.to(new int[]   {width}));
-
+				Pointer kp_textures_accumulate;
+				if (GPUTileProcessor.USE_CUDA12) { // for CDP2
+					// debugging, copying single int back and forth 
+					cpu_pnum_texture_tiles[0] = ntt;
+					cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles),  1 * Sizeof.INT);
+					cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+					cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+					kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
+							Pointer.to(new int[] {num_cams}),                // int         num_cams,
+							Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
+							Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+							Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
+							Pointer.to(new int[] {ti_offset}),               // int         gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
+							Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
+							Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
+							Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
+							Pointer.to(new int[]   {iis_lwir}),              // int         is_lwir,        // do not perform shot correction
+							Pointer.to(new float[] {(float) min_shot}),      // float       min_shot,       // 10.0
+							Pointer.to(new float[] {(float) scale_shot}),    // float       scale_shot,     // 3.0
+							Pointer.to(new float[] {(float) diff_sigma}),    // float       diff_sigma,     // pixel value/pixel change
+							Pointer.to(new float[] {(float) diff_threshold}),// float       diff_threshold, // pixel value/pixel change
+							Pointer.to(new float[] {(float) min_agree}),     // float       min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+							Pointer.to(gpu_color_weights),                   // float       weights[3],     // scale for R,B,G (or {1.0,0.0,0.0}
+							Pointer.to(new int[]   {idust_remove}),          // int         dust_remove,        // Do not reduce average weight when only one image differes much from the average
+							Pointer.to(new int[]   {keep_weights}),           // int         keep_weights,       // return channel weights after A in RGBA
+							// combining both non-overlap and overlap (each calculated if pointer is not null )
+							Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
+							Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+							Pointer.to(new int[]   {0}),                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+							Pointer.to(new int[]   {0}), // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+							Pointer.to(new int[]   {0}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+							Pointer.to(new int[]   {0}), //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+							Pointer.to(new int[]   {width}));
+				} else { // pre  CDP2
+					cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+					kp_textures_accumulate = Pointer.to(
+							Pointer.to(new int[] {num_cams}),                // int         num_cams,
+							Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
+							Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+							Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+							Pointer.to(new int[] {ti_offset}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+							Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
+							Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
+							Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
+							Pointer.to(new int[]   {iis_lwir}),              // int         is_lwir,        // do not perform shot correction
+							Pointer.to(new float[] {(float) min_shot}),      // float       min_shot,       // 10.0
+							Pointer.to(new float[] {(float) scale_shot}),    // float       scale_shot,     // 3.0
+							Pointer.to(new float[] {(float) diff_sigma}),    // float       diff_sigma,     // pixel value/pixel change
+							Pointer.to(new float[] {(float) diff_threshold}),// float       diff_threshold, // pixel value/pixel change
+							Pointer.to(new float[] {(float) min_agree}),     // float       min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+							Pointer.to(gpu_color_weights),                   // float       weights[3],     // scale for R,B,G (or {1.0,0.0,0.0}
+							Pointer.to(new int[]   {idust_remove}),          // int         dust_remove,        // Do not reduce average weight when only one image differes much from the average
+							Pointer.to(new int[]   {keep_weights}),           // int         keep_weights,       // return channel weights after A in RGBA
+							// combining both non-overlap and overlap (each calculated if pointer is not null )
+							Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
+							Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+							Pointer.to(new int[]   {0}),                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+							Pointer.to(new int[]   {0}), // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+							Pointer.to(new int[]   {0}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+							Pointer.to(new int[]   {0}), //);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+							Pointer.to(new int[]   {width}));
+				}
 				cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, //  jcuda.CudaException: CUDA_ERROR_INVALID_VALUE
 						grid_texture[0],    grid_texture[1],    grid_texture[2],   // Grid dimension
 						threads_texture[0], threads_texture[1], threads_texture[2],  // Block dimension
@@ -3185,9 +3266,16 @@ public class GpuQuad{ // quad camera description
 //		int keep_weights = 0;     // 2 bits now, move to parameters
 		int [] GridFullWarps =    {1, 1, 1};
 		int [] ThreadsFullWarps = {1, 1, 1};
-
-		//        	CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		if (GPUTileProcessor.USE_CUDA12) { // for CDP2
+			int shared_size = host_get_textures_shared_size( // in bytes
+					num_cams,     // int                num_cams,     // actual number of cameras
+					num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+					null);           // int *              offsets);     // in floats
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+		} else {
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		}
 		Pointer kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int                num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,
@@ -3290,8 +3378,39 @@ public class GpuQuad{ // quad camera description
 				num_cams,     // int                num_cams,     // actual number of cameras
 				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				null);           // int *              offsets);     // in floats
+		Pointer kp_textures_accumulate;
+		if (GPUTileProcessor.USE_CUDA12) { // for CDP2
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+			cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+			kp_textures_accumulate = Pointer.to(
+					Pointer.to(new int[] {num_cams}),                // int         num_cams,
+					Pointer.to(new int[]   {0}),  // Pointer.to(gpu_woi),  // int       * woi,            // min_x, min_y, max_x, max_y
+					Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+					Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
+					Pointer.to(new int[] {0}),                     // size_t      num_texture_tiles,// number of texture tiles to process
+					Pointer.to(gpu_texture_indices),          // int   * nonoverlap_list,    // pointer to the calculated number of non-zero tiles
+					Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
+					Pointer.to(new int[]   {num_colors}),            // int         colors,         // number of colors (3/1)
+					Pointer.to(new int[]   {iis_lwir}),              // int         is_lwir,        // do not perform shot correction
+					Pointer.to(new float[] {(float) min_shot}),      // float       min_shot,       // 10.0
+					Pointer.to(new float[] {(float) scale_shot}),    // float       scale_shot,     // 3.0
+					Pointer.to(new float[] {(float) diff_sigma}),    // float       diff_sigma,     // pixel value/pixel change
+					Pointer.to(new float[] {(float) diff_threshold}),// float       diff_threshold, // pixel value/pixel change
+					Pointer.to(new float[] {(float) min_agree}),     // float       min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+					Pointer.to(gpu_color_weights),                   // float       weights[3],     // scale for R,B,G (or {1.0,0.0,0.0}
+					Pointer.to(new int[]   {idust_remove}),          // int         dust_remove,        // Do not reduce average weight when only one image differes much from the average
+					Pointer.to(new int[]   {keep_texture_weights}),  // int         keep_weights,       // return channel weights after A in RGBA
+					 // combining both non-overlap and overlap (each calculated if pointer is not null )
+					Pointer.to(new int[]   {0}),                     // const size_t      texture_rbga_stride,     // in floats
+					Pointer.to(new int[]   {0}),                     // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+					Pointer.to(new int[] {calc_textures? texture_stride : 0}), // size_t   texture_stride,     // in floats (now 256*4 = 1024)
+					Pointer.to(gpu_textures), // gpu_texture_tiles, // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+					Pointer.to(new int[]   {ilinescan_order}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+					calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
+					Pointer.to(new int[] { tilesX }));
+		} else {
 		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
-		Pointer kp_textures_accumulate = Pointer.to(
+		kp_textures_accumulate = Pointer.to(
 				Pointer.to(new int[] {num_cams}),                // int         num_cams,
 				Pointer.to(new int[]   {0}),  // Pointer.to(gpu_woi),  // int       * woi,            // min_x, min_y, max_x, max_y
 				Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
@@ -3317,6 +3436,7 @@ public class GpuQuad{ // quad camera description
 				Pointer.to(new int[]   {ilinescan_order}), // 1, // int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 				calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
 				Pointer.to(new int[] { tilesX }));
+		}
 		cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel,
 				grid_texture[0],    grid_texture[1],    grid_texture[2],   // Grid dimension
 				threads_texture[0], threads_texture[1], threads_texture[2],  // Block dimension
@@ -4315,6 +4435,9 @@ public class GpuQuad{ // quad camera description
 			final boolean []          valid_tiles,            
 			final int                 threadsMax)  // maximal number of threads to launch
 	{
+//		int num_pairs = Correlation2d.getNumPairs(num_cams);
+		//change to fixed 511?
+//		final int task_code = ((1 << num_pairs)-1) << GPUTileProcessor.TASK_CORR_BITS; //  correlation only
 		final int task_code = (1 << GPUTileProcessor.TASK_CORR_EN) |  (1 << GPUTileProcessor.TASK_INTER_EN);
 		final double min_px = margin; 
 		final double max_px = geometryCorrection.getSensorWH()[0] - 1 - margin; // sensor width here, not window width