updated to jcuda 12.6

2516bcc1 · Andrey Filippov · 122a22bb · 2516bcc1 · 2516bcc1 · 2516bcc1
Commit 2516bcc1 authored Apr 15, 2025 by Andrey Filippov
Showing with 60 additions and 25 deletions

pom.xml pom.xml +2 -1

GPUTileProcessor.java src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java +12 -10

GpuQuad.java src/main/java/com/elphel/imagej/gpu/GpuQuad.java +46 -14

No files found.
--- a/pom.xml
+++ b/pom.xml
@@ -58,7 +58,8 @@
                <groupId>org.jcuda</groupId>
                <artifactId>jcuda</artifactId>
                <!-- <version>10.1.0</version> -->
-                <version>11.2.0</version>
+                <!--<version>11.2.0</version> -->
+                <version>12.6.0</version>
            </dependency>
            <!--
            	As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
 ** GPU acceleration for the Tile Processor
 **
 **
-** Copyright (C) 2018 Elphel, Inc.
+** Copyright (C) 2018-2025 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
 import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
-	public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
+	public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	// Can be downloaded and twice extracted from
 	// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
 	// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
 	// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
 	static String GPU_RESOURCE_DIR =              "kernels";
-	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
 	// "*" - generated defines, first index - separately compiled unit
-	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct";      // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D";         // name in C code
@@ -89,7 +91,7 @@ public class GPUTileProcessor {
 	static String GPU_CORR2D_COMBINE_NAME =        "corr2D_combine";      // name in C code
 	static String GPU_CORR2D_NORMALIZE_NAME =      "corr2D_normalize";    // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
-	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code
+	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code //// *** Modified 2025 *** ////
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv";      // calculate rotation matrices and derivatives
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets";   // calculate pixel offsets and disparity distortions
 	static String GPU_CALCULATE_TILES_OFFSETS =    "calculate_tiles_offsets";   // calculate pixel offsets and disparity distortions
@@ -100,7 +102,7 @@ public class GPUTileProcessor {
 	static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
 	static String GPU_GEN_TEXTURE_LIST_NAME =      "gen_texture_list";
 	static String GPU_CLEAR_TEXTURE_RBGA_NAME =    "clear_texture_rbga";
-	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";
+	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";  //// *** Modified 2025 *** ////
 	static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
 	static String GPU_ERASE_CLT_TILES_NAME =       "erase_clt_tiles";
@@ -298,7 +300,7 @@ public class GPUTileProcessor {
        ClassLoader classLoader = getClass().getClassLoader();
        String [] kernelSources = new String[GPU_SRC_FILES.length];
-        boolean show_source = false; // true;
+        boolean show_source = true; // false; // true;
        for (int cunit = 0; cunit < kernelSources.length; cunit++) {
        	kernelSources[cunit] = ""; // use StringBuffer?
            for (String src_file:GPU_SRC_FILES[cunit]) {
@@ -370,7 +372,7 @@ public class GPUTileProcessor {
        GPU_CORR2D_COMBINE_kernel =          functions[4];
        GPU_CORR2D_NORMALIZE_kernel =        functions[5];
        GPU_TEXTURES_kernel=                 functions[6];
-        GPU_RBGA_kernel=                     functions[7];
+        GPU_RBGA_kernel=                     functions[7];  //// *** Modified 2025 *** ////
        GPU_ROT_DERIV_kernel =               functions[8];
        GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
        GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
@@ -380,7 +382,7 @@ public class GPUTileProcessor {
        GPU_MARK_TEXTURE_NEIGHBOR_kernel =   functions[13];
        GPU_GEN_TEXTURE_LIST_kernel =        functions[14];
        GPU_CLEAR_TEXTURE_RBGA_kernel =      functions[15];
-        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16];
+        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16]; //// *** Modified 2025 *** ////
        GPU_CREATE_NONOVERLAP_LIST_kernel =  functions[17];
        GPU_ERASE_CLT_TILES_kernel =         functions[18];
@@ -504,7 +506,7 @@ public class GPUTileProcessor {
    		// Use the NVRTC to create a program by compiling the source code
    		nvrtcProgram program = new nvrtcProgram();
    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
-    		String options[] = {"--gpu-architecture=compute_"+capability};
+    		String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
    		try {
    			nvrtcCompileProgram(program, options.length, options);

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
 package com.elphel.imagej.gpu;
 import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
+import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
+import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
 import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuMemAlloc;
@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_color_weights;
 	private CUdeviceptr gpu_generate_RBGA_params;
 	private CUdeviceptr gpu_woi;
+	private CUdeviceptr gpu_twh;
 	private CUdeviceptr gpu_num_texture_tiles;
 	private CUdeviceptr gpu_textures_rgba;
 	private CUdeviceptr gpu_correction_vector;
@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int
 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers
 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
 		gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
 		gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
-		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+		gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.INT
 		gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int
 		gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
 		gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
 		gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+		gpu_twh =                 new CUdeviceptr(); //  2 integers (width, height) - just allocated space to be used by DP
+		cuMemAlloc (gpu_twh,        2 * Sizeof.INT);
 		gpu_num_texture_tiles =   new CUdeviceptr(); //  8 integers
 		gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description
 		// uses dynamic parallelization, top kernel is a single-thread one
 		int [] GridFullWarps =    {1, 1, 1};
 		int [] ThreadsFullWarps = {1, 1, 1};
+		int shared_size = host_get_textures_shared_size( // in bytes
+				num_cams,     // int                num_cams,     // actual number of cameras
+				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+				null);           // int *              offsets);     // in floats
+//	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
 		Pointer kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int               num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description
 				Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 				Pointer.to(new int[]   {keep_weights}),          // int               keep_weights,       // return channel weights after A in RGBA
 				Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
-				Pointer.to(gpu_textures_rgba));                  // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				Pointer.to(gpu_twh));                              // int             * twh);  allocate int[2]  for width, height in DP
 		cuCtxSynchronize();
 		// Call the kernel function
@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
 		if (DEBUG8A) {
 			cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),               gpu_texture_indices_ovlp,  cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
 		}
+		int [] cpu_pnum_texture_tiles = {0}; //// debugging
 		// Run 8 times - first 4 1-tile offsets  inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
 		// oversize (border 16x 16 tiles overhang by 4 pixels)
 		int tilesya =  ((height +3) & (~3)); //#define TILES-YA       ((TILES-Y +3) & (~3))
@@ -2925,16 +2939,24 @@ public class GpuQuad{ // quad camera description
 					}
 					System.out.println ("\n\n");
 				}
+				// debugging, copying single int back and forth 
+				cpu_pnum_texture_tiles[0] = ntt;
+				cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles),  1 * Sizeof.INT);
-				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//		cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles),  1 * Sizeof.INT);
-				Pointer kp_textures_accumulate = Pointer.to(
+//				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+				cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
+				Pointer kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
 						Pointer.to(new int[] {num_cams}),                // int         num_cams,
 						Pointer.to(gpu_woi),                             // int       * woi,            // min_x, min_y, max_x, max_y
 						Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
 //						Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
 //						Pointer.to(gpu_num_texture_tiles[((pass & 3) << 1) + border_tile)]),               // int       * num_texture_tiles,// number of texture tiles to process
-						Pointer.to(gpu_num_texture_tiles).withByteOffset(
+////						Pointer.to(gpu_num_texture_tiles).withByteOffset(
-								(((pass & 3) << 1) + border_tile)*Sizeof.INT),               // int       * num_texture_tiles,// number of texture tiles to process
+////								(((pass & 3) << 1) + border_tile)*Sizeof.INT),               // int       * num_texture_tiles,// number of texture tiles to process
+						Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
 						Pointer.to(new int[] {ti_offset}),               // int         gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate)
 						Pointer.to(gpu_texture_indices_ovlp),            //  gpu_texture_indices_offset,// add to gpu_texture_indices
 						Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,
@@ -3077,7 +3099,14 @@ public class GpuQuad{ // quad camera description
 		int [] ThreadsFullWarps = {1, 1, 1};
 		//        	CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
-	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//	    cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		int shared_size = host_get_textures_shared_size( // in bytes
+				num_cams,     // int                num_cams,     // actual number of cameras
+				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+				null);           // int *              offsets);     // in floats
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
 		Pointer kernelParameters = Pointer.to(
 				Pointer.to(new int[] { num_cams}),               // int                num_cams,
 				Pointer.to(gpu_ftasks),                          // float            * gpu_ftasks,
@@ -3180,14 +3209,17 @@ public class GpuQuad{ // quad camera description
 				num_cams,     // int                num_cams,     // actual number of cameras
 				num_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				null);           // int *              offsets);     // in floats
-		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+//		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+		cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
 		Pointer kp_textures_accumulate = Pointer.to(
 				Pointer.to(new int[] {num_cams}),                // int         num_cams,
 				Pointer.to(new int[]   {0}),  // Pointer.to(gpu_woi),  // int       * woi,            // min_x, min_y, max_x, max_y
 				Pointer.to(gpu_clt),                             // float    ** gpu_clt,        // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
 //				Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}),                     // size_t      num_texture_tiles,// number of texture tiles to process
 //				Pointer.to(new int[] {ntt}),                     // size_t      num_texture_tiles,// number of texture tiles to process
-				Pointer.to(gpu_num_texture_tiles),               // int       * num_texture_tiles,// number of texture tiles to process
+//				Pointer.to(gpu_num_texture_tiles),               // int       * num_texture_tiles,// number of texture tiles to process
+				Pointer.to(gpu_texture_indices_len),             // int       * num_texture_tiles,// number of texture tiles to process
 				Pointer.to(new int[] {0}),                     // int         gpu_texture_indices_offset, // add to gpu_texture_indices (zero offset here)
 				Pointer.to(gpu_texture_indices),          // int   * nonoverlap_list,    // pointer to the calculated number of non-zero tiles
 				Pointer.to(gpu_geometry_correction),             // struct gc * gpu_geometry_correction,