Commit 2516bcc1 authored by Andrey Filippov

updated to jcuda 12.6

parent 122a22bb
...@@ -58,7 +58,8 @@ ...@@ -58,7 +58,8 @@
<groupId>org.jcuda</groupId> <groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId> <artifactId>jcuda</artifactId>
<!-- <version>10.1.0</version> --> <!-- <version>10.1.0</version> -->
<version>11.2.0</version> <!--<version>11.2.0</version> -->
<version>12.6.0</version>
</dependency> </dependency>
<!-- <!--
As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2) As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
......
...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu; ...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
** GPU acceleration for the Tile Processor ** GPU acceleration for the Tile Processor
** **
** **
** Copyright (C) 2018 Elphel, Inc. ** Copyright (C) 2018-2025 Elphel, Inc.
** **
** -----------------------------------------------------------------------------** ** -----------------------------------------------------------------------------**
** **
...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc; ...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram; import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor { public class GPUTileProcessor {
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented) public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
// Can be downloaded and twice extracted from // Can be downloaded and twice extracted from
// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb // https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside // First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/ // Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
static String GPU_RESOURCE_DIR = "kernels"; static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
// static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
// "*" - generated defines, first index - separately compiled unit // "*" - generated defines, first index - separately compiled unit
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all"; static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
...@@ -89,7 +91,7 @@ public class GPUTileProcessor { ...@@ -89,7 +91,7 @@ public class GPUTileProcessor {
static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code
static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code
static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code //// *** Modified 2025 *** ////
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions
...@@ -100,7 +102,7 @@ public class GPUTileProcessor { ...@@ -100,7 +102,7 @@ public class GPUTileProcessor {
static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles"; static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list"; static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list";
static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga"; static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga";
static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; //// *** Modified 2025 *** ////
static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list"; static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles"; static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles";
...@@ -298,7 +300,7 @@ public class GPUTileProcessor { ...@@ -298,7 +300,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader(); ClassLoader classLoader = getClass().getClassLoader();
String [] kernelSources = new String[GPU_SRC_FILES.length]; String [] kernelSources = new String[GPU_SRC_FILES.length];
boolean show_source = false; // true; boolean show_source = true; // false; // true;
for (int cunit = 0; cunit < kernelSources.length; cunit++) { for (int cunit = 0; cunit < kernelSources.length; cunit++) {
kernelSources[cunit] = ""; // use StringBuffer? kernelSources[cunit] = ""; // use StringBuffer?
for (String src_file:GPU_SRC_FILES[cunit]) { for (String src_file:GPU_SRC_FILES[cunit]) {
...@@ -370,7 +372,7 @@ public class GPUTileProcessor { ...@@ -370,7 +372,7 @@ public class GPUTileProcessor {
GPU_CORR2D_COMBINE_kernel = functions[4]; GPU_CORR2D_COMBINE_kernel = functions[4];
GPU_CORR2D_NORMALIZE_kernel = functions[5]; GPU_CORR2D_NORMALIZE_kernel = functions[5];
GPU_TEXTURES_kernel= functions[6]; GPU_TEXTURES_kernel= functions[6];
GPU_RBGA_kernel= functions[7]; GPU_RBGA_kernel= functions[7]; //// *** Modified 2025 *** ////
GPU_ROT_DERIV_kernel = functions[8]; GPU_ROT_DERIV_kernel = functions[8];
GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9]; GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
GPU_CALC_REVERSE_DISTORTION_kernel = functions[10]; GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
...@@ -380,7 +382,7 @@ public class GPUTileProcessor { ...@@ -380,7 +382,7 @@ public class GPUTileProcessor {
GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13]; GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13];
GPU_GEN_TEXTURE_LIST_kernel = functions[14]; GPU_GEN_TEXTURE_LIST_kernel = functions[14];
GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15]; GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15];
GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; //// *** Modified 2025 *** ////
GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17]; GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17];
GPU_ERASE_CLT_TILES_kernel = functions[18]; GPU_ERASE_CLT_TILES_kernel = functions[18];
...@@ -504,7 +506,7 @@ public class GPUTileProcessor { ...@@ -504,7 +506,7 @@ public class GPUTileProcessor {
// Use the NVRTC to create a program by compiling the source code // Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram(); nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram( program, sourceCode, null, 0, null, null); nvrtcCreateProgram( program, sourceCode, null, 0, null, null);
String options[] = {"--gpu-architecture=compute_"+capability}; String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
try { try {
nvrtcCompileProgram(program, options.length, options); nvrtcCompileProgram(program, options.length, options);
......
package com.elphel.imagej.gpu; package com.elphel.imagej.gpu;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES; import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize; import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuLaunchKernel; import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc; import static jcuda.driver.JCudaDriver.cuMemAlloc;
...@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description ...@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
private CUdeviceptr gpu_color_weights; private CUdeviceptr gpu_color_weights;
private CUdeviceptr gpu_generate_RBGA_params; private CUdeviceptr gpu_generate_RBGA_params;
private CUdeviceptr gpu_woi; private CUdeviceptr gpu_woi;
private CUdeviceptr gpu_twh;
private CUdeviceptr gpu_num_texture_tiles; private CUdeviceptr gpu_num_texture_tiles;
private CUdeviceptr gpu_textures_rgba; private CUdeviceptr gpu_textures_rgba;
private CUdeviceptr gpu_correction_vector; private CUdeviceptr gpu_correction_vector;
...@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description ...@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
...@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description ...@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
...@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description ...@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description
// uses dynamic parallelization, top kernel is a single-thread one // uses dynamic parallelization, top kernel is a single-thread one
int [] GridFullWarps = {1, 1, 1}; int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1}; int [] ThreadsFullWarps = {1, 1, 1};
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
// cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams, Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description ...@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description
Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba)); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_twh)); // int * twh); allocate int[2] for width, heightin DP
cuCtxSynchronize(); cuCtxSynchronize();
// Call the kernel function // Call the kernel function
...@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description ...@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
if (DEBUG8A) { if (DEBUG8A) {
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
} }
int [] cpu_pnum_texture_tiles = {0}; //// debugging
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel // Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x 16 tiles overhang by 4 pixels) // oversize (border 16x 16 tiles overhang by 4 pixels)
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3)) int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
...@@ -2925,16 +2939,24 @@ public class GpuQuad{ // quad camera description ...@@ -2925,16 +2939,24 @@ public class GpuQuad{ // quad camera description
} }
System.out.println ("\n\n"); System.out.println ("\n\n");
} }
// debugging, copying single int back and forth
cpu_pnum_texture_tiles[0] = ntt;
cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
Pointer kp_textures_accumulate = Pointer.to(
// cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
Pointer.to(new int[] {num_cams}), // int num_cams, Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
// Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
// Pointer.to(gpu_num_texture_tiles[((pass & 3) << 1) + border_tile)]), // int * num_texture_tiles,// number of texture tiles to process // Pointer.to(gpu_num_texture_tiles[((pass & 3) << 1) + border_tile)]), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_num_texture_tiles).withByteOffset( //// Pointer.to(gpu_num_texture_tiles).withByteOffset(
(((pass & 3) << 1) + border_tile)*Sizeof.INT), // int * num_texture_tiles,// number of texture tiles to process //// (((pass & 3) << 1) + border_tile)*Sizeof.INT), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // int gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate Pointer.to(new int[] {ti_offset}), // int gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction, Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
...@@ -3077,7 +3099,14 @@ public class GpuQuad{ // quad camera description ...@@ -3077,7 +3099,14 @@ public class GpuQuad{ // quad camera description
int [] ThreadsFullWarps = {1, 1, 1}; int [] ThreadsFullWarps = {1, 1, 1};
// CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null; // CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams, Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, Pointer.to(gpu_ftasks), // float * gpu_ftasks,
...@@ -3180,14 +3209,17 @@ public class GpuQuad{ // quad camera description ...@@ -3180,14 +3209,17 @@ public class GpuQuad{ // quad camera description
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kp_textures_accumulate = Pointer.to( Pointer kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams, Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
// Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}), // size_t num_texture_tiles,// number of texture tiles to process
// Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_num_texture_tiles), // int * num_texture_tiles,// number of texture tiles to process // Pointer.to(gpu_num_texture_tiles), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction, Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment