Commit 0c53ff72 authored by Andrey Filippov's avatar Andrey Filippov

Merge branch 'foliage-2504' into foliage-gpu

parents 6381665c b4d8c441
...@@ -58,7 +58,8 @@ ...@@ -58,7 +58,8 @@
<groupId>org.jcuda</groupId> <groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId> <artifactId>jcuda</artifactId>
<!-- <version>10.1.0</version> --> <!-- <version>10.1.0</version> -->
<version>11.2.0</version> <!--<version>11.2.0</version> -->
<version>12.6.0</version>
</dependency> </dependency>
<!-- <!--
As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2) As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
...@@ -113,6 +114,15 @@ ...@@ -113,6 +114,15 @@
<artifactId>loci_tools</artifactId> <artifactId>loci_tools</artifactId>
<version>6.1.0</version> <version>6.1.0</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/ome/pom-bio-formats -->
<!-- Was source in attic for development -->
<dependency>
<groupId>ome</groupId>
<artifactId>pom-bio-formats</artifactId>
<version>6.13.0</version>
<type>pom</type>
</dependency>
<!-- <!--
<dependency> <dependency>
<groupId>com.drewnoakes</groupId> <groupId>com.drewnoakes</groupId>
......
...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu; ...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
** GPU acceleration for the Tile Processor ** GPU acceleration for the Tile Processor
** **
** **
** Copyright (C) 2018 Elphel, Inc. ** Copyright (C) 2018-2025 Elphel, Inc.
** **
** -----------------------------------------------------------------------------** ** -----------------------------------------------------------------------------**
** **
...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc; ...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram; import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor { public class GPUTileProcessor {
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented) public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
// Can be downloaded and twice extracted from // Can be downloaded and twice extracted from
// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb // https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside // First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/ // Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
static String GPU_RESOURCE_DIR = "kernels"; static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
// static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
// "*" - generated defines, first index - separately compiled unit // "*" - generated defines, first index - separately compiled unit
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all"; static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
...@@ -89,7 +91,7 @@ public class GPUTileProcessor { ...@@ -89,7 +91,7 @@ public class GPUTileProcessor {
static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code
static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code
static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code //// *** Modified 2025 *** ////
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions
...@@ -100,7 +102,7 @@ public class GPUTileProcessor { ...@@ -100,7 +102,7 @@ public class GPUTileProcessor {
static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles"; static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list"; static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list";
static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga"; static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga";
static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; //// *** Modified 2025 *** ////
static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list"; static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles"; static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles";
...@@ -298,7 +300,7 @@ public class GPUTileProcessor { ...@@ -298,7 +300,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader(); ClassLoader classLoader = getClass().getClassLoader();
String [] kernelSources = new String[GPU_SRC_FILES.length]; String [] kernelSources = new String[GPU_SRC_FILES.length];
boolean show_source = false; // true; boolean show_source = true; // false; // true;
for (int cunit = 0; cunit < kernelSources.length; cunit++) { for (int cunit = 0; cunit < kernelSources.length; cunit++) {
kernelSources[cunit] = ""; // use StringBuffer? kernelSources[cunit] = ""; // use StringBuffer?
for (String src_file:GPU_SRC_FILES[cunit]) { for (String src_file:GPU_SRC_FILES[cunit]) {
...@@ -370,7 +372,7 @@ public class GPUTileProcessor { ...@@ -370,7 +372,7 @@ public class GPUTileProcessor {
GPU_CORR2D_COMBINE_kernel = functions[4]; GPU_CORR2D_COMBINE_kernel = functions[4];
GPU_CORR2D_NORMALIZE_kernel = functions[5]; GPU_CORR2D_NORMALIZE_kernel = functions[5];
GPU_TEXTURES_kernel= functions[6]; GPU_TEXTURES_kernel= functions[6];
GPU_RBGA_kernel= functions[7]; GPU_RBGA_kernel= functions[7]; //// *** Modified 2025 *** ////
GPU_ROT_DERIV_kernel = functions[8]; GPU_ROT_DERIV_kernel = functions[8];
GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9]; GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
GPU_CALC_REVERSE_DISTORTION_kernel = functions[10]; GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
...@@ -380,7 +382,7 @@ public class GPUTileProcessor { ...@@ -380,7 +382,7 @@ public class GPUTileProcessor {
GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13]; GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13];
GPU_GEN_TEXTURE_LIST_kernel = functions[14]; GPU_GEN_TEXTURE_LIST_kernel = functions[14];
GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15]; GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15];
GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; //// *** Modified 2025 *** ////
GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17]; GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17];
GPU_ERASE_CLT_TILES_kernel = functions[18]; GPU_ERASE_CLT_TILES_kernel = functions[18];
...@@ -504,7 +506,7 @@ public class GPUTileProcessor { ...@@ -504,7 +506,7 @@ public class GPUTileProcessor {
// Use the NVRTC to create a program by compiling the source code // Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram(); nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram( program, sourceCode, null, 0, null, null); nvrtcCreateProgram( program, sourceCode, null, 0, null, null);
String options[] = {"--gpu-architecture=compute_"+capability}; String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
try { try {
nvrtcCompileProgram(program, options.length, options); nvrtcCompileProgram(program, options.length, options);
......
...@@ -37,10 +37,34 @@ ...@@ -37,10 +37,34 @@
*/ */
#pragma once #pragma once
#ifndef TILE_PROCESSOR_H_
#define TILE_PROCESSOR_H_
#ifndef NUM_CAMS #ifndef NUM_CAMS
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
// Mask with one bit set for each of the eight direction bits
// (N, NE, E, SE, S, SW, W, NW); the TASK_TEXT_*_BIT positions are defined elsewhere.
#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
// log2 of CONVERT_DIRECT_INDEXING_THREADS
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
// 1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2, i.e. 32
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
// DTT_SIZE2 * (DTT_SIZE2 + 2); presumably the element count of a shared MCLT work buffer - TODO confirm against its users
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Extra per-kernel data: eight floats describing the kernel data displacement,
// the actual kernel center and the displacement derivatives (see per-field notes).
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
float center_x; // actual center X (use to find derivatives)
float center_y; // actual center Y (use to find derivatives)
float dxc_dx; // add this to data_x per each pixel X-shift relative to the kernel center location
float dxc_dy; // same per each Y-shift pixel
float dyc_dx; // presumably the data_y counterpart of dxc_dx (per X-shift pixel) - TODO confirm
float dyc_dy; // presumably the data_y counterpart of dxc_dy (per Y-shift pixel) - TODO confirm
};
extern "C" __global__ void convert_direct( // called with a single block, single thread extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
...@@ -102,7 +126,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -102,7 +126,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize( extern "C" __global__ void corr2D_normalize(
...@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA( ...@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA(
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int * twh);
// Declaration of the textures_accumulate kernel.
// Per the parameter notes below, it can produce both the gpu_texture_rbg and the
// gpu_texture_tiles outputs; each is calculated only if its pointer is not null.
// NOTE(review): "(8,4,1) (N,1,1)" on the first line presumably gives the block and grid
// dimensions expected at launch - confirm against the launch site.
extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
/// size_t num_texture_tiles, // number of texture tiles to process
int * pnum_texture_tiles, // pointer to a number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
const float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
float * gpu_diff_rgb_combo, // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx);
// Declaration of the clear_texture_list kernel, which takes the packed texture index
// array and the width/height (in tiles) of the area to process.
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
int width, // <= TILES-X, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
// Declaration of the create_nonoverlap_list kernel.
// NOTE(review): the descriptions of the two output parameters follow the parameter names
// (list of indices vs. pointer to its length); they previously appeared attached to the
// opposite parameters - confirm against the kernel body.
extern "C" __global__ void create_nonoverlap_list(
int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * nonoverlap_list, // indices to gpu_tasks
int * pnonoverlap_length); // pointer to the calculated number of non-zero tiles, should be initialized to zero
// Declaration of the mark_texture_tiles kernel, which takes the flattened task list
// and the packed texture index array.
extern "C" __global__ void mark_texture_tiles(
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7))
// Declaration of the mark_texture_neighbor_tiles kernel, which takes the flattened task
// list, the tile grid dimensions, the packed texture index array and a window of interest.
extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int * woi); // x,y,width,height of the woi
// Declaration of the gen_texture_list kernel, which takes the flattened task list, the tile
// grid dimensions, the packed texture index array, a texture tile counter and a window of interest.
// NOTE(review): woi is described here as min_x, min_y, max_x, max_y, whereas the
// mark_texture_neighbor_tiles declaration describes it as x,y,width,height - confirm
// which layout each kernel actually reads/writes.
extern "C" __global__ void gen_texture_list(
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input
#endif
...@@ -40,6 +40,14 @@ ...@@ -40,6 +40,14 @@
#include "tp_defines.h" #include "tp_defines.h"
#include "dtt8x8.h" #include "dtt8x8.h"
#include "geometry_correction.h" #include "geometry_correction.h"
// #include "TileProcessor.h"
#include <cuda_runtime.h>
// #include <helper_cuda.h>
// #include <helper_functions.h>
#endif // #ifndef JCUDA #endif // #ifndef JCUDA
#ifndef get_task_size #ifndef get_task_size
...@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}... ...@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}}, {{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
} }
}; };
// TODO: Calculate these offsets at compile time, to avoid the NVRTC (in Java) error: "dynamic initialization is not supported for a __constant__ variable"
__constant__ int angles_offsets [4] {15,0,30,30};
/*
__constant__ int angles_offsets [4] {
(int) (offsetof4(corr_vector, azimuth)),
(int) (offsetof4(corr_vector, tilt)),
(int) (offsetof4(corr_vector, roll)),
(int) (offsetof4(corr_vector, roll))};
*/
/*
__constant__ int angles_offsets [4] = {
(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
(int) (offsetof(corr_vector, tilt) /sizeof(float)),
(int) (offsetof(corr_vector, roll) /sizeof(float)),
(int) (offsetof(corr_vector, roll) /sizeof(float))};
__constant__ int angles_offsets [4] = { */
offsetof(corr_vector, azimuth)/sizeof(float),
offsetof(corr_vector, tilt) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float)};
__constant__ int mm_seq [3][3][3]={ __constant__ int mm_seq [3][3][3]={
{ {
{6,5,12}, // a_t * a_z -> tmp0 {6,5,12}, // a_t * a_z -> tmp0
...@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets( ...@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets(
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv); gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
} }
// __syncthreads();// __syncwarp();
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
} }
......
...@@ -51,6 +51,11 @@ ...@@ -51,6 +51,11 @@
((size_t)&(((st *)0)->m)) ((size_t)&(((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif #endif
#ifndef offsetof4
// Offset of member m inside struct st, expressed in 4-byte units (byte offset >> 2).
// Uses the null-pointer-cast form of offsetof.
// NOTE(review): this form may not be accepted as a compile-time constant when used to
// initialize a __constant__ variable - confirm with the target compiler before relying on it.
#define offsetof4(st, m) \
(((size_t)&(((st *)0)->m))>>2)
// Alternative kept for reference (disabled; written for offsetof, not offsetof4):
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm #define SCENE_UNITS_SCALE 0.001 // meters from mm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment