Commit 0123f06e authored by Andrey Filippov

Unified GPU for DP2/no DP2 (12.6.0/11.2.0)

parent 8755d17e
......@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
** GPU acceleration for the Tile Processor
**
**
** Copyright (C) 2018 Elphel, Inc.
** Copyright (C) 2018-2025 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
......@@ -48,7 +48,7 @@ import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
import static jcuda.nvrtc.JNvrtc.nvrtcDestroyProgram;
import static jcuda.nvrtc.JNvrtc.nvrtcGetPTX;
import static jcuda.nvrtc.JNvrtc.nvrtcGetProgramLog;
import static jcuda.nvrtc.JNvrtc.nvrtcVersion;
//import static jcuda.nvrtc.JNvrtc.nvrtcVersion;
import static jcuda.nvrtc.JNvrtc.nvrtcGetNumSupportedArchs;
import static jcuda.nvrtc.JNvrtc.nvrtcGetSupportedArchs;
......@@ -62,6 +62,7 @@ import com.elphel.imagej.tileprocessor.Correlation2d;
import ij.IJ;
import ij.text.TextWindow;
import jcuda.JCudaVersion;
import jcuda.Pointer;
import jcuda.driver.CUcontext;
import jcuda.driver.CUdevice;
......@@ -75,16 +76,20 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor {
public static String CUDA_VERSION = JCudaVersion.get();
public static boolean USE_CUDA12 = CUDA_VERSION.startsWith("12.");
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
// libcudadevrt.a can be obtained by downloading the following package and extracting it twice:
// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
// First extract the deb itself, then data.tar.xz; the latter contains usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a
// The "cuda-cudart-dev" package was found on https://ubuntu.pkgs.org/
static String GPU_RESOURCE_DIR = "kernels";
static String GPU_RESOURCE_TOP_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
// "*" - generated defines, first index - separately compiled unit
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = USE_CUDA12?
(new String[][] {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}}):
(new String[][] {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}});
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
......@@ -270,7 +275,6 @@ public class GPUTileProcessor {
public GPUTileProcessor(
String cuda_project_directory) throws IOException
{
// From code by Marco Hutter - http://www.jcuda.org
// Enable exceptions and omit all subsequent error checks
JCudaDriver.setExceptionsEnabled(true);
......@@ -310,7 +314,7 @@ public class GPUTileProcessor {
}else {
File file = null;
if ((cuda_project_directory == null) || cuda_project_directory.isEmpty()) {
file = new File(classLoader.getResource(GPU_RESOURCE_DIR+"/"+src_file).getFile());
file = new File(classLoader.getResource(GPU_RESOURCE_TOP_DIR+"/"+CUDA_VERSION+"/"+src_file).getFile());
System.out.println("Loading resource "+file);
} else {
File src_dir = new File(cuda_project_directory, "src");
......@@ -507,22 +511,35 @@ public class GPUTileProcessor {
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram( program, sourceCode, null, 0, null, null);
String options[] = {"--gpu-architecture=compute_"+capability};
int [][] nvrtc_version = new int[2][];
// String options[] = {"--gpu-architecture=compute_"+capability};
// int [][] nvrtc_version = new int[2][];
int nvrtc_rslt = -1;
/*
nvrtc_rslt= nvrtcVersion(nvrtc_version[0],nvrtc_version[0]);
System.out.println("nvrtcVersion="+nvrtc_version[0][0]+"."+nvrtc_version[1][0]+" (returned "+nvrtc_rslt+").");
*/
int [] nvrtc_num_arch = new int[1];
nvrtc_rslt= nvrtcGetNumSupportedArchs(nvrtc_num_arch);
System.out.println("nvrtc_num_arch="+nvrtc_num_arch[0]+" (returned "+nvrtc_rslt+").");
int [] nvrtc_archs = new int[nvrtc_num_arch[0]];
nvrtc_rslt= nvrtcGetSupportedArchs(nvrtc_archs);
int max_arch = 0;
for (int sa: nvrtc_archs) {
max_arch = Math.max(max_arch, sa);
}
for (int sa: nvrtc_archs) {
System.out.println("Supported arch "+sa);
}
System.out.println();
System.out.println("Max supported arch is "+max_arch+", gpu capability = "+capability);
if (capability > max_arch) {
capability = max_arch;
System.out.println("Reduced capability to match NVRTC compiler to "+capability);
}
String options[] = new String[USE_CUDA12?2:1];
options[0] = "--gpu-architecture=compute_"+capability;
if (options.length > 1) {
options[1] = "--extensible-whole-program";
}
System.out.println("Running NVRTC with the following options:");
for (String s:options) {
System.out.println(s);
}
try {
nvrtcCompileProgram(program, options.length, options);
OK = true;
......
package com.elphel.imagej.gpu;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
......@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
private CUdeviceptr gpu_color_weights;
private CUdeviceptr gpu_generate_RBGA_params;
private CUdeviceptr gpu_woi;
private CUdeviceptr gpu_twh;
private CUdeviceptr gpu_num_texture_tiles;
private CUdeviceptr gpu_textures_rgba;
private CUdeviceptr gpu_correction_vector;
......@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT/allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
......@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT // allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
......@@ -2691,9 +2698,41 @@ public class GpuQuad{ // quad camera description
// uses dynamic parallelism; the top kernel is a single-thread one
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kernelParameters = Pointer.to(
Pointer kernelParameters;
if (GPUTileProcessor.USE_CUDA12) {
ThreadsFullWarps = new int[] {1, 1, 1};
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
// declare arrays in device code?
Pointer.to(gpu_texture_indices_ovlp), // int * gpu_texture_indices_ovlp,// packed tile + bits (now only (1 << 7)
Pointer.to(gpu_num_texture_ovlp), // int * num_texture_tiles, // number of texture tiles to process (8 elements)
Pointer.to(gpu_woi), // int * woi, // x,y,width,height of the woi
// set smaller for LWIR - it is used to reduce the work area
Pointer.to(new int[] {img_width / GPUTileProcessor.DTT_SIZE}), // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
Pointer.to(new int[] {img_height / GPUTileProcessor.DTT_SIZE}), // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(gpu_generate_RBGA_params), // float generate_RBGA_params[5],
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G
Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_twh)); // int * twh); allocate int[2] for width, heightin DP
} else {
cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
......@@ -2715,6 +2754,7 @@ public class GpuQuad{ // quad camera description
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba)); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
}
cuCtxSynchronize();
// Call the kernel function
......@@ -2994,7 +3034,7 @@ public class GpuQuad{ // quad camera description
if (DEBUG8A) {
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
}
int [] cpu_pnum_texture_tiles = {0}; //// debugging CDP2
// Run 8 passes: the first 4 process inner tiles at 1-tile offsets (without verifying margins), the next 4 run with
// margin verification and ignore the 4-pixel oversize (border 16x16 tiles overhang by 4 pixels)
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
......@@ -3005,12 +3045,20 @@ public class GpuQuad{ // quad camera description
int border_tile = (pass >> 2);
int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile];
if (ntt > 0) {
if (ntt > 0) {
int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile != 0){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
int ti_offset;
if (GPUTileProcessor.USE_CUDA12) { // for CDP2
ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile != 0){
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
} else { // Pre CDP2
ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile != 0){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
}
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
......@@ -3038,35 +3086,68 @@ public class GpuQuad{ // quad camera description
}
System.out.println ("\n\n");
}
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G (or {1.0,0.0,0.0}
Pointer.to(new int[] {idust_remove}), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // size_t texture_stride, // in floats (now 256*4 = 1024)
Pointer.to(new int[] {0}), // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
Pointer.to(new int[] {0}), //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
Pointer.to(new int[] {width}));
Pointer kp_textures_accumulate;
if (GPUTileProcessor.USE_CUDA12) { // for CDP2
// debugging, copying single int back and forth
cpu_pnum_texture_tiles[0] = ntt;
cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // int gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G (or {1.0,0.0,0.0}
Pointer.to(new int[] {idust_remove}), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // size_t texture_stride, // in floats (now 256*4 = 1024)
Pointer.to(new int[] {0}), // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
Pointer.to(new int[] {0}), //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
Pointer.to(new int[] {width}));
} else { // pre CDP2
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G (or {1.0,0.0,0.0}
Pointer.to(new int[] {idust_remove}), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // size_t texture_stride, // in floats (now 256*4 = 1024)
Pointer.to(new int[] {0}), // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {0}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
Pointer.to(new int[] {0}), //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
Pointer.to(new int[] {width}));
}
cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, // jcuda.CudaException: CUDA_ERROR_INVALID_VALUE
grid_texture[0], grid_texture[1], grid_texture[2], // Grid dimension
threads_texture[0], threads_texture[1], threads_texture[2], // Block dimension
......@@ -3185,9 +3266,16 @@ public class GpuQuad{ // quad camera description
// int keep_weights = 0; // 2 bits now, move to parameters
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
// CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
if (GPUTileProcessor.USE_CUDA12) { // for CDP2
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
} else {
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
}
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks,
......@@ -3290,8 +3378,39 @@ public class GpuQuad{ // quad camera description
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
Pointer kp_textures_accumulate;
if (GPUTileProcessor.USE_CUDA12) { // for CDP2
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(gpu_color_weights), // float weights[3], // scale for R,B,G (or {1.0,0.0,0.0}
Pointer.to(new int[] {idust_remove}), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_texture_weights}), // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
Pointer.to(new int[] {0}), // const size_t texture_rbga_stride, // in floats
Pointer.to(new int[] {0}), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {calc_textures? texture_stride : 0}), // size_t texture_stride, // in floats (now 256*4 = 1024)
Pointer.to(gpu_textures), // gpu_texture_tiles, // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(new int[] {ilinescan_order}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
Pointer.to(new int[] { tilesX }));
} else {
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kp_textures_accumulate = Pointer.to(
kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
......@@ -3317,6 +3436,7 @@ public class GpuQuad{ // quad camera description
Pointer.to(new int[] {ilinescan_order}), // 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
calc_extra ? Pointer.to(gpu_diff_rgb_combo) : Pointer.to(new int[] { 0 }),
Pointer.to(new int[] { tilesX }));
}
cuLaunchKernel(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel,
grid_texture[0], grid_texture[1], grid_texture[2], // Grid dimension
threads_texture[0], threads_texture[1], threads_texture[2], // Block dimension
......@@ -4315,6 +4435,9 @@ public class GpuQuad{ // quad camera description
final boolean [] valid_tiles,
final int threadsMax) // maximal number of threads to launch
{
// int num_pairs = Correlation2d.getNumPairs(num_cams);
//change to fixed 511?
// final int task_code = ((1 << num_pairs)-1) << GPUTileProcessor.TASK_CORR_BITS; // correlation only
final int task_code = (1 << GPUTileProcessor.TASK_CORR_EN) | (1 << GPUTileProcessor.TASK_INTER_EN);
final double min_px = margin;
final double max_px = geometryCorrection.getSensorWH()[0] - 1 - margin; // sensor width here, not window width
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment