Merge branch 'foliage-2504' into foliage-gpu

0c53ff72 · Andrey Filippov · 6381665c · b4d8c441 · 0c53ff72 · 0c53ff72
Commit 0c53ff72 authored Apr 16, 2025 by Andrey Filippov
7 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -58,7 +58,8 @@
                <groupId>org.jcuda</groupId>
                <artifactId>jcuda</artifactId>
                <!-- <version>10.1.0</version> -->
-                <version>11.2.0</version>
+                <!--<version>11.2.0</version> -->
+                <version>12.6.0</version>
            </dependency>
            <!--
            	As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
@@ -113,6 +114,15 @@
 				<artifactId>loci_tools</artifactId>
 				<version>6.1.0</version>
 			</dependency>
+			<!-- https://mvnrepository.com/artifact/ome/pom-bio-formats -->
+			<!-- Was source in attic for development -->
+			<dependency>
+			    <groupId>ome</groupId>
+			    <artifactId>pom-bio-formats</artifactId>
+			    <version>6.13.0</version>
+			    <type>pom</type>
+			</dependency>
 <!-- 			
 			<dependency>
 			  <groupId>com.drewnoakes</groupId>

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
 ** GPU acceleration for the Tile Processor
 **
 **
-** Copyright (C) 2018 Elphel, Inc.
+** Copyright (C) 2018-2025 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
 import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
-	public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
+	public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)  
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	// Can be downloaded and twice extracted from
 	// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
 	// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
 	// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
 	static String GPU_RESOURCE_DIR =              "kernels";
-	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
+//	static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
 	// "*" - generated defines, first index - separately compiled unit
-	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
+	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct";      // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D";         // name in C code
@@ -89,7 +91,7 @@ public class GPUTileProcessor {
 	static String GPU_CORR2D_COMBINE_NAME =        "corr2D_combine";      // name in C code
 	static String GPU_CORR2D_NORMALIZE_NAME =      "corr2D_normalize";    // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
-	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code
+	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code //// *** Modified 2025 *** ////
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv";      // calculate rotation matrices and derivatives
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets";   // calculate pixel offsets and disparity distortions
 	static String GPU_CALCULATE_TILES_OFFSETS =    "calculate_tiles_offsets";   // calculate pixel offsets and disparity distortions
@@ -100,7 +102,7 @@ public class GPUTileProcessor {
 	static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
 	static String GPU_GEN_TEXTURE_LIST_NAME =      "gen_texture_list";
 	static String GPU_CLEAR_TEXTURE_RBGA_NAME =    "clear_texture_rbga";
-	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";
+	static String GPU_TEXTURES_ACCUMULATE_NAME =   "textures_accumulate";  //// *** Modified 2025 *** ////
 	static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
 	static String GPU_ERASE_CLT_TILES_NAME =       "erase_clt_tiles";
@@ -298,7 +300,7 @@ public class GPUTileProcessor {
        ClassLoader classLoader = getClass().getClassLoader();
        String [] kernelSources = new String[GPU_SRC_FILES.length];
-        boolean show_source = false; // true;
+        boolean show_source = true; // false; // true;
        for (int cunit = 0; cunit < kernelSources.length; cunit++) {
        	kernelSources[cunit] = ""; // use StringBuffer?
            for (String src_file:GPU_SRC_FILES[cunit]) {
@@ -370,7 +372,7 @@ public class GPUTileProcessor {
        GPU_CORR2D_COMBINE_kernel =          functions[4];
        GPU_CORR2D_NORMALIZE_kernel =        functions[5];
        GPU_TEXTURES_kernel=                 functions[6];
-        GPU_RBGA_kernel=                     functions[7];
+        GPU_RBGA_kernel=                     functions[7];  //// *** Modified 2025 *** ////
        GPU_ROT_DERIV_kernel =               functions[8];
        GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
        GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
@@ -380,7 +382,7 @@ public class GPUTileProcessor {
        GPU_MARK_TEXTURE_NEIGHBOR_kernel =   functions[13];
        GPU_GEN_TEXTURE_LIST_kernel =        functions[14];
        GPU_CLEAR_TEXTURE_RBGA_kernel =      functions[15];
-        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16];
+        GPU_TEXTURES_ACCUMULATE_kernel =     functions[16]; //// *** Modified 2025 *** ////
        GPU_CREATE_NONOVERLAP_LIST_kernel =  functions[17];
        GPU_ERASE_CLT_TILES_kernel =         functions[18];
@@ -504,7 +506,7 @@ public class GPUTileProcessor {
    		// Use the NVRTC to create a program by compiling the source code
    		nvrtcProgram program = new nvrtcProgram();
    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
-    		String options[] = {"--gpu-architecture=compute_"+capability};
+    		String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
    		try {
    			nvrtcCompileProgram(program, options.length, options);

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -37,10 +37,34 @@
 */
 #pragma once
+#ifndef TILE_PROCESSOR_H_
+#define TILE_PROCESSOR_H_
 #ifndef NUM_CAMS
 #include "tp_defines.h"
 #endif
+#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
+		| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
+#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+#define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
+struct CltExtra{ // kernel offset record: 8 floats
+	float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
+	float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
+	float center_x; // actual center X (use to find derivatives)
+	float center_y; // actual center Y (use to find derivatives)
+	float dxc_dx;   // add this to data_x per each pixel X-shift relative to the kernel center location
+	float dxc_dy;   // same per each Y-shift pixel
+	float dyc_dx;   // presumably the data_y counterpart of dxc_dx (per X-shift pixel) - TODO confirm
+	float dyc_dy;   // presumably the data_y counterpart of dxc_dy (per Y-shift pixel) - TODO confirm
+};
 extern "C" __global__ void convert_direct( // called with a single block, single thread
 		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
@@ -102,7 +126,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
 		size_t            corr_stride,        // in floats
-		float           * gpu_corrs);         // correlation output data
+		float           * gpu_corrs);          // correlation output data
 extern "C" __global__ void corr2D_normalize(
@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh);
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) - presumably block and grid dimensions, TODO confirm
+		int               num_cams,           // number of cameras used
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+///		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * pnum_texture_tiles,  // pointer to a number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
+		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		const float       weights[3],         // scale for R,B,G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each is calculated if its pointer is not null)
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+		float           * gpu_diff_rgb_combo, // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+		int               tilesx);
+extern "C" __global__ void clear_texture_list(
+		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
+		int                width,  // <= TILES-X, use for faster processing of LWIR images
+		int                height); // <= TILES-Y, use for faster processing of LWIR images
+extern "C" __global__ void clear_texture_rbga(
+		int               texture_width,
+		int               texture_slice_height,
+		const size_t      texture_rbga_stride,     // in floats 8*stride - NOTE(review): the meaning of "8*stride" is not evident from this declaration, confirm
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C" __global__ void create_nonoverlap_list(
+		int                num_cams,
+		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task
+		int                width,               // number of tiles in a row
+		int *              nonoverlap_list,     // NOTE(review): presumably the list of indices to gpu_tasks (the original comments on this and the next parameter looked swapped) - confirm
+		int *              pnonoverlap_length); // NOTE(review): presumably pointer to the calculated number of non-zero tiles, should be initialized to zero - confirm
+extern "C" __global__ void mark_texture_tiles(
+		int                num_cams,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int              * gpu_texture_indices);// packed tile + bits (now only (1 << 7))
+extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int                height,              // number of tile rows
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7))
+		int              * woi);                  // x,y,width,height of the woi
+extern "C" __global__ void gen_texture_list(
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,            // number of tiles in task list
+		int                width,                // number of tiles in a row
+		int                height,               // number of tile rows
+		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7))
+		int              * num_texture_tiles,    // pointer to the number of texture tiles to process
+		int              * woi);                 // min_x, min_y, max_x, max_y input - NOTE(review): other declarations describe woi as x,y,width,height; confirm which applies here
+#endif
--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -40,6 +40,14 @@
 	#include "tp_defines.h"
 	#include "dtt8x8.h"
 	#include "geometry_correction.h"
+//	#include "TileProcessor.h"
+	#include <cuda_runtime.h>
+//	#include <helper_cuda.h>
+//	#include <helper_functions.h>
 #endif // #ifndef JCUDA
 #ifndef get_task_size
@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Calculate these offsets at compile time. They are hard-coded for now to avoid the NVRTC (in Java) error: "dynamic initialization is not supported for a __constant__ variable"
+__constant__ int angles_offsets [4] {15,0,30,30}; // order per the disabled initializers below: azimuth, tilt, roll, roll of corr_vector; NOTE(review): values must match the corr_vector layout (offsets in floats) - confirm
+/*
+__constant__ int angles_offsets [4]  {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+ __constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
-__constant__ int angles_offsets [4] = {
+ */
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 		{
 				{6,5,12}, // a_t * a_z -> tmp0
@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets(
 				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 	}
-//	__syncthreads();// __syncwarp();
-//	cudaDeviceSynchronize();
-//	cudaDeviceSynchronize();
 }

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -51,6 +51,11 @@
    ((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+#define offsetof4(st, m) \
+    (((size_t)&(((st *)0)->m))>>2)
+// offsetof4: byte offset of member m in struct st, shifted right by 2 (i.e. expressed in units of 4 bytes)
+#endif
 #define SCENE_UNITS_SCALE  0.001 // meters from mm