updated to jcuda 12.6, provided kernels

b4d8c441 · Andrey Filippov · 2516bcc1 · b4d8c441 · b4d8c441 · b4d8c441
Commit b4d8c441 authored Apr 15, 2025 by Andrey Filippov
4 changed files
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -37,10 +37,34 @@
 */
 #pragma once
+#ifndef TILE_PROCESSOR_H_
+#define TILE_PROCESSOR_H_
 #ifndef NUM_CAMS
 #include "tp_defines.h"
 #endif
+#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
+		| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
+#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+#define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
+struct CltExtra{
+	float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
+	float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
+	float center_x; // actual center X (use to find derivatives)
+	float center_y; // actual center X (use to find derivatives)
+	float dxc_dx;   // add this to data_x per each pixel X-shift relative to the kernel center location
+	float dxc_dy;   // same per each Y-shift pixel
+	float dyc_dx;
+	float dyc_dy;
+};
 extern "C" __global__ void convert_direct( // called with a single block, single thread
 		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
@@ -102,7 +126,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
 		size_t            corr_stride,        // in floats
-		float           * gpu_corrs);         // correlation output data
+		float           * gpu_corrs);          // correlation output data
 extern "C" __global__ void corr2D_normalize(
@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh);
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
+		int               num_cams,           // number of cameras used
+		int             * woi,                // x, y, width,height
+		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+///		size_t            num_texture_tiles,  // number of texture tiles to process
+		int             * pnum_texture_tiles,  // pointer to a number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
+		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
+		int               colors,             // number of colors (3/1)
+		int               is_lwir,            // do not perform shot correction
+		float             min_shot,           // 10.0
+		float             scale_shot,         // 3.0
+		float             diff_sigma,         // pixel value/pixel change
+		float             diff_threshold,     // pixel value/pixel change
+		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		const float       weights[3],         // scale for R,B,G
+		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+// combining both non-overlap and overlap (each calculated if pointer is not null )
+		size_t            texture_rbg_stride, // in floats
+		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
+		size_t            texture_stride,     // in floats (now 256*4 = 1024)
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+		int               tilesx);
+extern "C" __global__ void clear_texture_list(
+		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		int                width,  // <= TILES-X, use for faster processing of LWIR images
+		int                height); // <= TILES-Y, use for faster processing of LWIR images
+extern "C" __global__ void clear_texture_rbga(
+		int               texture_width,
+		int               texture_slice_height,
+		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C" __global__ void create_nonoverlap_list(
+		int                num_cams,
+		float            * gpu_ftasks ,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task
+		int                width,               // number of tiles in a row
+		int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
+		int *              pnonoverlap_length); //  indices to gpu_tasks  // should be initialized to zero
+extern "C" __global__ void mark_texture_tiles(
+		int                num_cams,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int              * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
+extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,           // number of tiles in task list
+		int                width,               // number of tiles in a row
+		int                height,              // number of tiles rows
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * woi);                  // x,y,width,height of the woi
+extern "C" __global__ void gen_texture_list(
+		int                num_cams,
+		float            * gpu_ftasks,          // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                num_tiles,            // number of tiles in task list
+		int                width,                // number of tiles in a row
+		int                height,               // number of tiles rows
+		int              * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
+		int              * num_texture_tiles,    // number of texture tiles to process
+		int              * woi);                 // min_x, min_y, max_x, max_y input
+#endif
--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -40,6 +40,14 @@
 	#include "tp_defines.h"
 	#include "dtt8x8.h"
 	#include "geometry_correction.h"
+//	#include "TileProcessor.h"
+	#include <cuda_runtime.h>
+//	#include <helper_cuda.h>
+//	#include <helper_functions.h>
 #endif // #ifndef JCUDA
 #ifndef get_task_size
@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Make offsets calculate in compile time, to avoid NVRTC(in java): " error: dynamic initialization is not supported for a __constant__ variable"
+__constant__ int angles_offsets [4] {15,0,30,30};
+/*
+__constant__ int angles_offsets [4]  {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+ __constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
-__constant__ int angles_offsets [4] = {
+ */
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 		{
 				{6,5,12}, // a_t * a_z -> tmp0
@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets(
 				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 	}
-//	__syncthreads();// __syncwarp();
-//	cudaDeviceSynchronize();
-//	cudaDeviceSynchronize();
 }

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -51,6 +51,11 @@
    ((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+#define offsetof4(st, m) \
+    (((size_t)&(((st *)0)->m))>>2)
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
 #define SCENE_UNITS_SCALE  0.001 // meters from mm