separate kernels for cuda versions

54f1979a · Andrey Filippov · 3c6a1850 · 54f1979a · 54f1979a · 54f1979a
Commit 54f1979a authored Jul 21, 2025 by Andrey Filippov
12 changed files
--- a/src/main/resources/kernels/11.2.0/TileProcessor.cuh
+++ b/src/main/resources/kernels/11.2.0/TileProcessor.cuh
--- a/src/main/resources/kernels/11.2.0/TileProcessor.h
+++ b/src/main/resources/kernels/11.2.0/TileProcessor.h
--- a/src/main/resources/kernels/dtt8x8.cu
+++ b/src/main/resources/kernels/dtt8x8.cu
--- a/src/main/resources/kernels/dtt8x8.h
+++ b/src/main/resources/kernels/dtt8x8.h
--- a/src/main/resources/kernels/11.2.0/geometry_correction.cu
+++ b/src/main/resources/kernels/11.2.0/geometry_correction.cu
--- a/src/main/resources/kernels/11.2.0/geometry_correction.h
+++ b/src/main/resources/kernels/11.2.0/geometry_correction.h
+/**
+ **
+ ** geometry_correction.h
+ **
+ ** Copyright (C) 2020 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  geometry_correction.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+/**
+**************************************************************************
+* \file geometry_correction.h
+* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
+*/
+#pragma once
+#ifndef NUM_CAMS
+#include "tp_defines.h"
+#endif
+#define NVRTC_BUG 1
+#ifndef M_PI
+#define M_PI  3.14159265358979323846 /* pi */
+#endif
+#ifndef offsetof
+#define offsetof(st, m) \
+    ((size_t)&(((st *)0)->m))
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
+#define SCENE_UNITS_SCALE  0.001 // meters from mm
+#define MIN_DISPARITY      0.01  // minimal disparity to try to convert to world coordinates
+struct tp_task {
+	int   task;
+	union {
+		int      txy;
+		unsigned short sxy[2];
+	};
+	float target_disparity;
+	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
+	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
+	// scale == 0 - old way, just set. Scale !=0 - accumulate. Or make > 0 - set too? only negative - subtract?
+	float scale;                // multiply during direct conversion before accumulating in TD - used for motion blur correction
+	float xy       [NUM_CAMS][2];
+	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
+};
+#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#define TP_TASK_TASK_OFFSET      0
+#define TP_TASK_TXY_OFFSET       1
+#define TP_TASK_DISPARITY_OFFSET 2
+#define TP_TASK_CENTERXY_OFFSET  3
+#define TP_TASK_SCALE_OFFSET     5
+#define TP_TASK_XY_OFFSET        6
+struct corr_vector{
+	float tilt    [NUM_CAMS-1]; // 0..2
+	float azimuth [NUM_CAMS-1]; // 3..5
+	float roll    [NUM_CAMS];   // 6..9
+	float zoom    [NUM_CAMS-1]; // 10..12
+	// for ERS correction:
+	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
+	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
+};
+#ifdef NVRTC_BUG
+struct trot_deriv{
+	float rots    [NUM_CAMS][3][3];
+	float d_daz   [NUM_CAMS][3][3];
+	float d_tilt  [NUM_CAMS][3][3];
+	float d_roll  [NUM_CAMS][3][3];
+	float d_zoom  [NUM_CAMS][3][3];
+};
+#else
+union trot_deriv{
+	struct {
+		float rots    [NUM_CAMS][3][3];
+		float d_daz   [NUM_CAMS][3][3];
+		float d_tilt  [NUM_CAMS][3][3];
+		float d_roll  [NUM_CAMS][3][3];
+		float d_zoom  [NUM_CAMS][3][3];
+	};
+	float matrices [5][NUM_CAMS][3][3];
+};
+#endif
+struct gc {
+	float pixelCorrectionWidth; //  =2592;   // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
+	float pixelCorrectionHeight; // =1936;
+	float line_time;        // duration of one scan line readout (for ERS)
+	float focalLength;      // =FOCAL_LENGTH;
+	float pixelSize;        // =  PIXEL_SIZE; //um
+	float distortionRadius; // =  DISTORTION_RADIUS; // mm - half width of the sensor
+#ifndef	NVRTC_BUG
+	union {
+		struct {
+#endif
+			float distortionC;      // r^2
+			float distortionB;      // r^3
+			float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
+			float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
+			float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
+			float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
+			float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
+#ifndef	NVRTC_BUG
+		};
+		float rad_coeff [7];
+	};
+#endif
+	// parameters, common for all sensors
+	float    elevation;     // degrees, up - positive;
+	float    heading;       // degrees, CW (from top) - positive
+	float forward    [NUM_CAMS];
+	float right      [NUM_CAMS];
+	float height     [NUM_CAMS];
+	float roll       [NUM_CAMS];    // degrees, CW (to target) - positive
+	float pXY0       [NUM_CAMS][2];
+	float common_right;    // mm right, camera center
+	float common_forward;  // mm forward (to target), camera center
+	float common_height;   // mm up, camera center
+	float common_roll;     // degrees CW (to target) camera as a whole
+//	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
+//	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
+	float rXY        [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
+//	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
+// only used for the multi-quad systems
+	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
+	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
+	float woi_tops   [NUM_CAMS]; // used to calculate scanline timing
+};
+#define RAD_COEFF_LEN 7
+extern "C" __global__ void get_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
+		int                  num_cams,
+//		struct tp_task     * gpu_tasks,
+		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv);
+extern "C" __global__ void calculate_tiles_offsets(
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
+		int                  num_cams,
+		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv);
+// uses NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
+		struct corr_vector * gpu_correction_vector,
+		trot_deriv   * gpu_rot_deriv);
+#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
+// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist);
--- a/src/main/resources/kernels/TileProcessor.cu
+++ b/src/main/resources/kernels/TileProcessor.cu
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
--- a/src/main/resources/kernels/12.6.0/dtt8x8.cu
+++ b/src/main/resources/kernels/12.6.0/dtt8x8.cu
--- a/src/main/resources/kernels/12.6.0/dtt8x8.h
+++ b/src/main/resources/kernels/12.6.0/dtt8x8.h
+/**
+ **
+ ** dtt8x8.h
+ **
+ ** Copyright (C) 2018 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  dtt8x8.cuh is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+/**
+**************************************************************************
+* \file dtt8x8.h
+* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
+*        in GPU
+* This file contains building blocks for the 16x16 stride 8 COmplex Lapped Transform (CLT)
+* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
+* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
+* here it is used to convert convolution kernels and LPF to the frequency domain - done in
+* software.
+*
+* This file is cpompatible with both runtime and driver API, runtime is used for development
+* with Nvidia Nsight, driver API when calling these kernels from Java
+*/
+#ifndef JCUDA
+#define DTT_SIZE_LOG2                 3
+#endif
+#pragma once
+#define DTT_SIZE                     (1 << DTT_SIZE_LOG2)
+#define DTT_SIZE1        (DTT_SIZE + 1)
+#define DTT_SIZE2        (2 * DTT_SIZE)
+#define DTT_SIZE21       (DTT_SIZE2 + 1)
+#define DTT_SIZE4        (4 * DTT_SIZE)
+#define DTT_SIZE2M1      (DTT_SIZE2 - 1)
+#define BAYER_RED   0
+#define BAYER_BLUE  1
+#define BAYER_GREEN 2
+// assuming GR/BG as now
+#define BAYER_RED_ROW 0
+#define BAYER_RED_COL 1
+#define DTTTEST_BLOCK_WIDTH          32
+#define DTTTEST_BLOCK_HEIGHT         16
+#define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1)
+//extern __constant__ float idct_signs[4][4][4];
+//extern __constant__ int imclt_indx9[16];
+//extern __constant__ float HWINDOW2[];
+// kernels (not used so far)
+#if 0
+extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
+#endif// #if 0
+//=========================== 2D functions ===============
+extern __device__ void corrUnfoldTile(
+		int corr_radius,
+		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+		float* rslt);  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+extern __device__ void dttii_2d(
+		float * clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]
+extern __device__ void dttiv_color_2d(
+		float * clt_tile,
+		int color);
+extern __device__ void dttiv_mono_2d(
+		float * clt_tile);
+extern __device__ void imclt(
+		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+		float * mclt_tile );
+extern __device__ void imclt8threads(
+		int     do_acc,     // 1 - add to previous value, 0 - overwrite
+		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+		float * mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+		int     debug);
--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h