Commit 0c53ff72 authored by Andrey Filippov's avatar Andrey Filippov

Merge branch 'foliage-2504' into foliage-gpu

parents 6381665c b4d8c441
...@@ -58,7 +58,8 @@ ...@@ -58,7 +58,8 @@
<groupId>org.jcuda</groupId> <groupId>org.jcuda</groupId>
<artifactId>jcuda</artifactId> <artifactId>jcuda</artifactId>
<!-- <version>10.1.0</version> --> <!-- <version>10.1.0</version> -->
<version>11.2.0</version> <!--<version>11.2.0</version> -->
<version>12.6.0</version>
</dependency> </dependency>
<!-- <!--
As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2) As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)
...@@ -113,6 +114,15 @@ ...@@ -113,6 +114,15 @@
<artifactId>loci_tools</artifactId> <artifactId>loci_tools</artifactId>
<version>6.1.0</version> <version>6.1.0</version>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/ome/pom-bio-formats -->
<!-- Was source in attic for development -->
<dependency>
<groupId>ome</groupId>
<artifactId>pom-bio-formats</artifactId>
<version>6.13.0</version>
<type>pom</type>
</dependency>
<!-- <!--
<dependency> <dependency>
<groupId>com.drewnoakes</groupId> <groupId>com.drewnoakes</groupId>
......
...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu; ...@@ -6,7 +6,7 @@ package com.elphel.imagej.gpu;
** GPU acceleration for the Tile Processor ** GPU acceleration for the Tile Processor
** **
** **
** Copyright (C) 2018 Elphel, Inc. ** Copyright (C) 2018-2025 Elphel, Inc.
** **
** -----------------------------------------------------------------------------** ** -----------------------------------------------------------------------------**
** **
...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc; ...@@ -72,16 +72,18 @@ import jcuda.nvrtc.JNvrtc;
import jcuda.nvrtc.nvrtcProgram; import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor { public class GPUTileProcessor {
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented) public static boolean USE_DS_DP = true; // false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
// Can be downloaded and twice extracted from // Can be downloaded and twice extracted from
// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb // https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
// First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside // First deb itself, then data.tar.xz, and it will have usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a inside
// Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/ // Found "cuda-cudart-dev" on https://ubuntu.pkgs.org/
static String GPU_RESOURCE_DIR = "kernels"; static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"}; // was never used and dtt8x8.cuh had incorrect name
// static String [] GPU_KERNEL_FILES = {"dtt8x8.cu","TileProcessor.cu"};
// "*" - generated defines, first index - separately compiled unit // "*" - generated defines, first index - separately compiled unit
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}};
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all"; static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
...@@ -89,7 +91,7 @@ public class GPUTileProcessor { ...@@ -89,7 +91,7 @@ public class GPUTileProcessor {
static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code static String GPU_CORR2D_COMBINE_NAME = "corr2D_combine"; // name in C code
static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code static String GPU_CORR2D_NORMALIZE_NAME = "corr2D_normalize"; // name in C code
static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code static String GPU_TEXTURES_NAME = "textures_nonoverlap"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code //// *** Modified 2025 *** ////
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions
...@@ -100,7 +102,7 @@ public class GPUTileProcessor { ...@@ -100,7 +102,7 @@ public class GPUTileProcessor {
static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles"; static String GPU_MARK_TEXTURE_NEIGHBOR_NAME = "mark_texture_neighbor_tiles";
static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list"; static String GPU_GEN_TEXTURE_LIST_NAME = "gen_texture_list";
static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga"; static String GPU_CLEAR_TEXTURE_RBGA_NAME = "clear_texture_rbga";
static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; static String GPU_TEXTURES_ACCUMULATE_NAME = "textures_accumulate"; //// *** Modified 2025 *** ////
static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list"; static String GPU_CREATE_NONOVERLAP_LIST_NAME ="create_nonoverlap_list";
static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles"; static String GPU_ERASE_CLT_TILES_NAME = "erase_clt_tiles";
...@@ -298,7 +300,7 @@ public class GPUTileProcessor { ...@@ -298,7 +300,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader(); ClassLoader classLoader = getClass().getClassLoader();
String [] kernelSources = new String[GPU_SRC_FILES.length]; String [] kernelSources = new String[GPU_SRC_FILES.length];
boolean show_source = false; // true; boolean show_source = true; // false; // true;
for (int cunit = 0; cunit < kernelSources.length; cunit++) { for (int cunit = 0; cunit < kernelSources.length; cunit++) {
kernelSources[cunit] = ""; // use StringBuffer? kernelSources[cunit] = ""; // use StringBuffer?
for (String src_file:GPU_SRC_FILES[cunit]) { for (String src_file:GPU_SRC_FILES[cunit]) {
...@@ -370,7 +372,7 @@ public class GPUTileProcessor { ...@@ -370,7 +372,7 @@ public class GPUTileProcessor {
GPU_CORR2D_COMBINE_kernel = functions[4]; GPU_CORR2D_COMBINE_kernel = functions[4];
GPU_CORR2D_NORMALIZE_kernel = functions[5]; GPU_CORR2D_NORMALIZE_kernel = functions[5];
GPU_TEXTURES_kernel= functions[6]; GPU_TEXTURES_kernel= functions[6];
GPU_RBGA_kernel= functions[7]; GPU_RBGA_kernel= functions[7]; //// *** Modified 2025 *** ////
GPU_ROT_DERIV_kernel = functions[8]; GPU_ROT_DERIV_kernel = functions[8];
GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9]; GPU_CALCULATE_TILES_OFFSETS_kernel = functions[9];
GPU_CALC_REVERSE_DISTORTION_kernel = functions[10]; GPU_CALC_REVERSE_DISTORTION_kernel = functions[10];
...@@ -380,7 +382,7 @@ public class GPUTileProcessor { ...@@ -380,7 +382,7 @@ public class GPUTileProcessor {
GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13]; GPU_MARK_TEXTURE_NEIGHBOR_kernel = functions[13];
GPU_GEN_TEXTURE_LIST_kernel = functions[14]; GPU_GEN_TEXTURE_LIST_kernel = functions[14];
GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15]; GPU_CLEAR_TEXTURE_RBGA_kernel = functions[15];
GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; GPU_TEXTURES_ACCUMULATE_kernel = functions[16]; //// *** Modified 2025 *** ////
GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17]; GPU_CREATE_NONOVERLAP_LIST_kernel = functions[17];
GPU_ERASE_CLT_TILES_kernel = functions[18]; GPU_ERASE_CLT_TILES_kernel = functions[18];
...@@ -504,7 +506,7 @@ public class GPUTileProcessor { ...@@ -504,7 +506,7 @@ public class GPUTileProcessor {
// Use the NVRTC to create a program by compiling the source code // Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram(); nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram( program, sourceCode, null, 0, null, null); nvrtcCreateProgram( program, sourceCode, null, 0, null, null);
String options[] = {"--gpu-architecture=compute_"+capability}; String options[] = {"--gpu-architecture=compute_"+capability,"--extensible-whole-program"};
try { try {
nvrtcCompileProgram(program, options.length, options); nvrtcCompileProgram(program, options.length, options);
......
package com.elphel.imagej.gpu; package com.elphel.imagej.gpu;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES; import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES;
import static jcuda.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT;
import static jcuda.driver.CUshared_carveout.CU_SHAREDMEM_CARVEOUT_MAX_SHARED;
import static jcuda.driver.JCudaDriver.cuCtxSynchronize; import static jcuda.driver.JCudaDriver.cuCtxSynchronize;
import static jcuda.driver.JCudaDriver.cuLaunchKernel; import static jcuda.driver.JCudaDriver.cuLaunchKernel;
import static jcuda.driver.JCudaDriver.cuMemAlloc; import static jcuda.driver.JCudaDriver.cuMemAlloc;
...@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description ...@@ -97,6 +99,7 @@ public class GpuQuad{ // quad camera description
private CUdeviceptr gpu_color_weights; private CUdeviceptr gpu_color_weights;
private CUdeviceptr gpu_generate_RBGA_params; private CUdeviceptr gpu_generate_RBGA_params;
private CUdeviceptr gpu_woi; private CUdeviceptr gpu_woi;
private CUdeviceptr gpu_twh;
private CUdeviceptr gpu_num_texture_tiles; private CUdeviceptr gpu_num_texture_tiles;
private CUdeviceptr gpu_textures_rgba; private CUdeviceptr gpu_textures_rgba;
private CUdeviceptr gpu_correction_vector; private CUdeviceptr gpu_correction_vector;
...@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description ...@@ -298,13 +301,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
...@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description ...@@ -511,13 +516,15 @@ public class GpuQuad{ // quad camera description
gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_ovlp =new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints gpu_num_texture_ovlp = new CUdeviceptr(); // 8 ints
gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.FLOAT gpu_texture_indices_len = new CUdeviceptr(); // allocate tilesX * tilesY * 1 * Sizeof.INT
gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int gpu_diff_rgb_combo = new CUdeviceptr(); // 1 int
gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT gpu_color_weights = new CUdeviceptr(); // allocate 3 * Sizeof.FLOAT
gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT gpu_generate_RBGA_params =new CUdeviceptr(); // allocate 5 * Sizeof.FLOAT
gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
gpu_twh = new CUdeviceptr(); // 2 integers (width, height) - just allocated space to be used by DP
cuMemAlloc (gpu_twh, 2 * Sizeof.INT);
gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers gpu_num_texture_tiles = new CUdeviceptr(); // 8 integers
gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
...@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description ...@@ -2570,8 +2577,14 @@ public class GpuQuad{ // quad camera description
// uses dynamic parallelization, top kernel is a single-thread one // uses dynamic parallelization, top kernel is a single-thread one
int [] GridFullWarps = {1, 1, 1}; int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1}; int [] ThreadsFullWarps = {1, 1, 1};
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
// cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_RBGA_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams, Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 Pointer.to(gpu_ftasks), // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description ...@@ -2593,7 +2606,8 @@ public class GpuQuad{ // quad camera description
Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA Pointer.to(new int[] {keep_weights}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba)); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles Pointer.to(gpu_textures_rgba), // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_twh)); // int * twh); allocate int[2] for width, heightin DP
cuCtxSynchronize(); cuCtxSynchronize();
// Call the kernel function // Call the kernel function
...@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description ...@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
if (DEBUG8A) { if (DEBUG8A) {
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp), gpu_texture_indices_ovlp, cpu_texture_indices_ovlp.length * Sizeof.INT); // hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
} }
int [] cpu_pnum_texture_tiles = {0}; //// debugging
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel // Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x 16 tiles overhang by 4 pixels) // oversize (border 16x 16 tiles overhang by 4 pixels)
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3)) int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
...@@ -2886,10 +2900,18 @@ public class GpuQuad{ // quad camera description ...@@ -2886,10 +2900,18 @@ public class GpuQuad{ // quad camera description
int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile]; int ntt = cpu_num_texture_tiles[((pass & 3) << 1) + border_tile];
if (ntt > 0) { if (ntt > 0) {
int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1 int [] grid_texture = {(ntt + GPUTileProcessor.TEXTURE_TILES_PER_BLOCK-1) / GPUTileProcessor.TEXTURE_TILES_PER_BLOCK,1,1}; // TEXTURE_TILES_PER_BLOCK = 1
/* Pre CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile != 0){ if (border_tile != 0){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt; ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
*/
// for CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile != 0){
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
} }
int shared_size = host_get_textures_shared_size( // in bytes int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
...@@ -2917,14 +2939,25 @@ public class GpuQuad{ // quad camera description ...@@ -2917,14 +2939,25 @@ public class GpuQuad{ // quad camera description
} }
System.out.println ("\n\n"); System.out.println ("\n\n");
} }
// debugging, copying single int back and forth
cpu_pnum_texture_tiles[0] = ntt;
cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuMemcpyHtoD(gpu_texture_indices_len, Pointer.to(cpu_pnum_texture_tiles), 1 * Sizeof.INT);
Pointer kp_textures_accumulate = Pointer.to(
// cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kp_textures_accumulate = Pointer.to( // CUDA_ERROR_ILLEGAL_ADDRESS
Pointer.to(new int[] {num_cams}), // int num_cams, Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(gpu_num_texture_tiles[((pass & 3) << 1) + border_tile)]), // int * num_texture_tiles,// number of texture tiles to process
//// Pointer.to(gpu_num_texture_tiles).withByteOffset(
//// (((pass & 3) << 1) + border_tile)*Sizeof.INT), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {ti_offset}), // int gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices Pointer.to(gpu_texture_indices_ovlp), // gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction, Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1) Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
...@@ -3066,7 +3099,14 @@ public class GpuQuad{ // quad camera description ...@@ -3066,7 +3099,14 @@ public class GpuQuad{ // quad camera description
int [] ThreadsFullWarps = {1, 1, 1}; int [] ThreadsFullWarps = {1, 1, 1};
// CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null; // CUdeviceptr gpu_diff_rgb_combo_local = calc_extra ? gpu_diff_rgb_combo : null;
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] { num_cams}), // int num_cams, Pointer.to(new int[] { num_cams}), // int num_cams,
Pointer.to(gpu_ftasks), // float * gpu_ftasks, Pointer.to(gpu_ftasks), // float * gpu_ftasks,
...@@ -3169,12 +3209,17 @@ public class GpuQuad{ // quad camera description ...@@ -3169,12 +3209,17 @@ public class GpuQuad{ // quad camera description
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
null); // int * offsets); // in floats null); // int * offsets); // in floats
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536); // cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 65536);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(this.gpuTileProcessor.GPU_TEXTURES_ACCUMULATE_kernel, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, CU_SHAREDMEM_CARVEOUT_MAX_SHARED);
Pointer kp_textures_accumulate = Pointer.to( Pointer kp_textures_accumulate = Pointer.to(
Pointer.to(new int[] {num_cams}), // int num_cams, Pointer.to(new int[] {num_cams}), // int num_cams,
Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y Pointer.to(new int[] {0}), // Pointer.to(gpu_woi), // int * woi, // min_x, min_y, max_x, max_y
Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] Pointer.to(gpu_clt), // float ** gpu_clt, // [num_cams] ->[TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}), // size_t num_texture_tiles,// number of texture tiles to process // Pointer.to(new int[] {cpu_pnum_texture_tiles[0]}), // size_t num_texture_tiles,// number of texture tiles to process
// Pointer.to(new int[] {ntt}), // size_t num_texture_tiles,// number of texture tiles to process
// Pointer.to(gpu_num_texture_tiles), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_len), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process Pointer.to(new int[] {0}), // size_t num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles Pointer.to(gpu_texture_indices), // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction, Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
......
...@@ -36,12 +36,17 @@ ...@@ -36,12 +36,17 @@
* \brief Top level of the Tile Processor for frequency domain * \brief Top level of the Tile Processor for frequency domain
*/ */
// Avoiding includes in jcuda, all source files will be merged // Avoiding includes in jcuda, all source files will be merged
#pragma once //#pragma once
#ifndef JCUDA #ifndef JCUDA
#include "tp_defines.h" #include "tp_defines.h"
#include "dtt8x8.h" #include "dtt8x8.h"
#include "geometry_correction.h" #include "geometry_correction.h"
#include "TileProcessor.h" #include "TileProcessor.h"
#include <cuda_runtime.h>
// #include <helper_cuda.h>
// #include <helper_functions.h>
#endif // #ifndef JCUDA #endif // #ifndef JCUDA
// CUDA fast math is slower! // CUDA fast math is slower!
...@@ -55,67 +60,16 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17. ...@@ -55,67 +60,16 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
//#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT)) //#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\ /*
| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
//#define IMCLT14
//#define NOICLT 1
//#define TEST_IMCLT
//#define SAVE_CLT
// Not enough shared memory to have more threads per block,even just for the result clt tiles
// What to do:
// 1) make single image aberration correction: 1/4 of the result tiles
// With 4 cameras = calculate correlations (9x9), reusing kernel or just clt ones after color reducing, then output them to device memory
//Average run time =1308.124146 ms
//#define TILES_PER_BLOCK 2
//Average run time =12502.638672 - with 2 tiles/block it is longer!
///12129.268555 ms
//Average run time =4704.506348 ms (syncwarp)
//Average run time =4705.612305 ms (syncthreads)
//Average run time =1051.411255 ms
//Average run time =861.866577 ms
//Average run time =850.871277 ms had bugs
//Average run time =857.947632 ms fixed bugs
// Something broke, even w/o LPF: Average run time =1093.115112 ms
// without clt copying to device memory - Average run time =965.342407 ms - still worse
//Average run time =965.880554 ms
// combined tx and ty into a single int : Average run time =871.017944 ms
//Average run time =873.386597 ms (reduced number of registers)
//__umul24 : Average run time =879.125122 ms
// without __umul24 - back to Average run time =871.315552 ms
// Added copying clt to device memory - Average run time =942.071960 ms
// Removed rest of NOICLT : Average run time =943.456177 ms
// Added lpf: Average run time =1046.101318 ms (0.1 sec, 10%) - can be combined with the PSF kernel
//#define USE_UMUL24
////#define TILES_PER_BLOCK 4
//Average run time =5155.922852 ms
//Average run time =1166.388306 ms
//Average run time =988.750977 ms
//#define TILES_PER_BLOCK 8
//Average run time =9656.743164 ms
// Average run time =9422.057617 ms (reducing divergence)
//#define TILES_PER_BLOCK 1
//#define THREADS_PER_TILE 8
//#define IMCLT_THREADS_PER_TILE 16
//#define IMCLT_TILES_PER_BLOCK 4
/// #define KERNELS_STEP (1 << KERNELS_LSTEP)
//#define TILES-X (IMG-WIDTH / DTT_SIZE)
//#define TILES-Y (IMG-HEIGHT / DTT_SIZE)
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5 #define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32 #define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
*/
// Make TILES-YA >= TILES-X and a multiple of 4 // Make TILES-YA >= TILES-X and a multiple of 4
//#define TILES-YA ((TILES-Y +3) & (~3)) //#define TILES-YA ((TILES-Y +3) & (~3))
// increase row length by 1 so vertical passes will use different ports // increase row length by 1 so vertical passes will use different ports
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2)) //#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
/*
struct CltExtra{ struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments) float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments) float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
...@@ -126,7 +80,7 @@ struct CltExtra{ ...@@ -126,7 +80,7 @@ struct CltExtra{
float dyc_dx; float dyc_dx;
float dyc_dy; float dyc_dy;
}; };
*/
/* /*
Python code to generate constant coefficients: Python code to generate constant coefficients:
def setup_hwindow(n=8, l=4): def setup_hwindow(n=8, l=4):
...@@ -724,7 +678,7 @@ __device__ void tile_combine_rgba( ...@@ -724,7 +678,7 @@ __device__ void tile_combine_rgba(
// next not used // next not used
// boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing) // boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float * chn_weights, // color channel weights, sum == 1.0 const float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms) int keep_weights, // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
int debug); int debug);
...@@ -735,43 +689,17 @@ __device__ void imclt_plane( // not implemented, not used ...@@ -735,43 +689,17 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels) const size_t dstride); // in floats (pixels)
extern "C" __global__ void clear_texture_list( extern "C" __global__ void update_woi(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) int texture_slices,
int width, // <= TILES-X, use for faster processing of LWIR images int * woi, // min_x, min_y, max_x, max_y input, not modified, max_x - not used
int height); // <= TILES-Y, use for faster processing of LWIR images int * twh); // 2-element in device global memory
extern "C" __global__ void mark_texture_tiles( extern "C" __global__ void clear_texture_rbga2( // version for CDP2
int num_cams, int * twh, // texture_width, // aligned to DTT_SIZE
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
//inline __device__ int get_task_size(int num_cams); //inline __device__ int get_task_size(int num_cams);
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams); inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams); inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams);
...@@ -807,14 +735,6 @@ __global__ void index_inter_correlate( ...@@ -807,14 +735,6 @@ __global__ void index_inter_correlate(
int * gpu_corr_indices, // array of correlation tasks int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array int * pnum_corr_tiles); // pointer to the length of correlation tasks array
extern "C" __global__ void create_nonoverlap_list(
int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length); // indices to gpu_tasks // should be initialized to zero
__global__ void convert_correct_tiles( __global__ void convert_correct_tiles(
int num_cams, // actual number of cameras int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
...@@ -824,7 +744,7 @@ __global__ void convert_correct_tiles( ...@@ -824,7 +744,7 @@ __global__ void convert_correct_tiles(
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task int * num_active_tiles, // number of tiles in task
float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels) size_t dstride, // in floats (pixels)
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
...@@ -836,7 +756,8 @@ __global__ void convert_correct_tiles( ...@@ -836,7 +756,8 @@ __global__ void convert_correct_tiles(
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all) int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum) // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain float * gpu_corrs); // correlation output data (either pixel domain or transform domain
...@@ -850,7 +771,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -850,7 +771,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
float scale0, // scale for R float scale0, // scale for R
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum) /// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain float * gpu_corrs); // correlation output data (either pixel domain or transform domain
...@@ -863,7 +785,8 @@ extern "C" __global__ void correlate2D_inner( ...@@ -863,7 +785,8 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute float fat_zero2, // here - absolute
int num_corr_tiles, // number of correlation tiles to process // int num_corr_tiles, // number of correlation tiles to process
int * num_corr_tiles, // pointer to a number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15) int corr_radius0, // radius of the output correlation (7 for 15x15)
...@@ -891,33 +814,6 @@ extern "C" __global__ void corr2D_combine_inner( ...@@ -891,33 +814,6 @@ extern "C" __global__ void corr2D_combine_inner(
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input) const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile) float * gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx);
__device__ int get_textures_shared_size( // in bytes __device__ int get_textures_shared_size( // in bytes
int num_cams, // actual number of cameras int num_cams, // actual number of cameras
...@@ -972,11 +868,16 @@ extern "C" __global__ void correlate2D( ...@@ -972,11 +868,16 @@ extern "C" __global__ void correlate2D(
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data float * gpu_corrs) // correlation output data
{ {
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); int num_pairs = num_cams * (num_cams-1) / 2;
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); // static
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); // static
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_tiles * num_pairs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_corr_tiles = 0; *pnum_corr_tiles = 0; // global, allocated by host
index_correlate<<<blocks0,threads0>>>( index_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
sel_pairs0, // int sel_pairs0, sel_pairs0, // int sel_pairs0,
sel_pairs1, // int sel_pairs1, sel_pairs1, // int sel_pairs1,
...@@ -988,10 +889,10 @@ extern "C" __global__ void correlate2D( ...@@ -988,10 +889,10 @@ extern "C" __global__ void correlate2D(
tilesx, // int width, // number of tiles in a row tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1); // dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); /// dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>( correlate2D_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1) colors, // int colors, // number of colors (3/1)
...@@ -999,7 +900,8 @@ extern "C" __global__ void correlate2D( ...@@ -999,7 +900,8 @@ extern "C" __global__ void correlate2D(
scale1, // float scale1, // scale for B scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G scale2, // float scale2, // scale for G
fat_zero2, // float fat_zero2, // here - absolute fat_zero2, // float fat_zero2, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process // *pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
pnum_corr_tiles, // size_t num_corr_tiles, // pointer to a number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15) corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
...@@ -1045,13 +947,19 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1045,13 +947,19 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
float * gpu_corrs) // correlation output data float * gpu_corrs) // correlation output data
{ {
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block if (threadIdx.x == 0) { // only 1 thread, 1 block
int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
if (num_sel_sensors > 0){ if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory // try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0; *pnum_corr_tiles = 0;
index_inter_correlate<<<blocks0,threads0>>>( index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
sel_sensors, // int sel_sensors, sel_sensors, // int sel_sensors,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
...@@ -1059,13 +967,12 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1059,13 +967,12 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
tilesx, // int width, // number of tiles in a row tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
int num_corr_tiles_with_sum = (*pnum_corr_tiles); /// __device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count /// int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1); /// dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); /// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
num_cams, // int num_cams, num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all) num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...@@ -1074,14 +981,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1074,14 +981,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
scale0, // float scale0, // scale for R scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G scale2, // float scale2, // scale for G
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format) pnum_corr_tiles, // num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum) gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); /// dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr>>>( // combine per-senor interscene correlations combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all) num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum) pnum_corr_tiles, //num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair NOT USED gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair NOT USED
corr_stride, // size_t corr_stride, // in floats corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data (either pixel domain or transform domain gpu_corrs); // float * gpu_corrs); // correlation output data (either pixel domain or transform domain
...@@ -1110,7 +1017,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -1110,7 +1017,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
*/ */
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all) int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum) // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain float * gpu_corrs) // correlation output data (either pixel domain or transform domain
...@@ -1118,7 +1026,7 @@ extern "C" __global__ void combine_inter( // combine per-senor interscene co ...@@ -1118,7 +1026,7 @@ extern "C" __global__ void combine_inter( // combine per-senor interscene co
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task; int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
if (corr_offset >= (num_corr_tiles - num_sel_sensors)) { if (corr_offset >= (*num_corr_tiles - num_sel_sensors)) { // was if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
return; return;
} }
// __syncthreads();// __syncwarp(); // __syncthreads();// __syncwarp();
...@@ -1184,7 +1092,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1184,7 +1092,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
float scale0, // scale for R float scale0, // scale for R
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum) /// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain float * gpu_corrs) // correlation output data (either pixel domain or transform domain
...@@ -1194,7 +1103,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1194,7 +1103,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4 int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
int tile_index = corr_num / num_sel_sensors; int tile_index = corr_num / num_sel_sensors;
int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles. int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
if (corr_offset >= num_corr_tiles){ // if (corr_offset >= num_corr_tiles){
if (corr_offset >= *num_corr_tiles){
return; // nothing to do return; // nothing to do
} }
...@@ -1304,7 +1214,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD, ...@@ -1304,7 +1214,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
* @param scale1 scale blue (if colors = 3) component before mixing * @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing * @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero2 add this value squared to the sum of squared components before normalization * @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process * @param num_corr_tiles number of correlation tiles to process => a pointer to!
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks * @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride stride (in floats) for correlation outputs. * @param corr_stride stride (in floats) for correlation outputs.
* @param corr_radius radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization * @param corr_radius radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
...@@ -1318,7 +1228,8 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1318,7 +1228,8 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute float fat_zero2, // here - absolute
int num_corr_tiles, // number of correlation tiles to process /// int num_corr_tiles, // number of correlation tiles to process
int * num_corr_tiles, // pointer to a number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15) int corr_radius0, // radius of the output correlation (7 for 15x15)
...@@ -1331,7 +1242,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1331,7 +1242,7 @@ extern "C" __global__ void correlate2D_inner(
float scales[3] = {scale0, scale1, scale2}; float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4 int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
if (corr_num >= num_corr_tiles){ if (corr_num >= *num_corr_tiles){
return; // nothing to do return; // nothing to do
} }
int pair_list_start = pairs_offsets[num_cams]; int pair_list_start = pairs_offsets[num_cams];
...@@ -2027,7 +1938,8 @@ extern "C" __global__ void generate_RBGA( ...@@ -2027,7 +1938,8 @@ extern "C" __global__ void generate_RBGA(
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int * twh)
{ {
float min_shot = params[0]; // 10.0 float min_shot = params[0]; // 10.0
float scale_shot = params[1]; // 3.0 float scale_shot = params[1]; // 3.0
...@@ -2038,30 +1950,56 @@ extern "C" __global__ void generate_RBGA( ...@@ -2038,30 +1950,56 @@ extern "C" __global__ void generate_RBGA(
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1); dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS; int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1); dim3 blocks0 (blocks_x, height, 1);
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
clear_texture_list<<<blocks0,threads0>>>( clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
gpu_texture_indices, gpu_texture_indices,
width, width,
height); height);
cudaDeviceSynchronize(); // not needed yet, just for testing /// cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1); /// dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;// /// int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1); /// dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory // mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>( mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list num_tiles, // number of tiles in task list
width, // number of tiles in a row width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7) gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = width; // TILES-X; /// *(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y; /// *(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x /// *(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y /// *(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>( mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list num_tiles, // number of tiles in task list
...@@ -2070,69 +2008,88 @@ extern "C" __global__ void generate_RBGA( ...@@ -2070,69 +2008,88 @@ extern "C" __global__ void generate_RBGA(
gpu_texture_indices, // packed tile + bits (now only (1 << 7) gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1 // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(num_texture_tiles+0) = 0; /// *(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0; /// *(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0; /// *(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0; /// *(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0; /// *(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0; /// *(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0; /// *(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0; /// *(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>( gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list num_tiles, // number of tiles in task list
width, // number of tiles in a row width, // number of tiles in a row
height, // int height, // number of tiles rows height, // int height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7) gpu_texture_indices, // packed tile + bits (now only (1 << 7)
num_texture_tiles, // number of texture tiles to process num_texture_tiles, // pointer to a number of texture tiles to process
woi); // x,y, here woi[2] = max_X, woi[3] - max-Y woi); // x,y, here woi[2] = max_X, woi[3] - max-Y input only, not modified, woi[2] not used
cudaDeviceSynchronize(); // not needed yet, just for testing /*
// TODO: Add small kernel to only modify *(woi + 2), *(woi + 3) and generate texture_width, texture_tiles_height ?
/// cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0); // width *(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height *(woi + 3) += 1 - *(woi + 1); // height
} }
__syncthreads(); __syncthreads(); // ?
// Zero output textures. Trim // Zero output textures. Trim
// texture_rbga_stride // texture_rbga_stride
int texture_width = (*(woi + 2) + 1)* DTT_SIZE; int texture_width = (*(woi + 2) + 1)* DTT_SIZE;
int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE; int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE;
int texture_slices = colors + 1; /// int texture_slices = colors + 1;
if (keep_weights & 2){ /// if (keep_weights & 2){
texture_slices += colors * num_cams; /// texture_slices += colors * num_cams;
} /// }
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
*/
/// __device__ int twh[2];
update_woi<<<1,1, 0, cudaStreamTailLaunch>>>(
texture_slices, // int texture_slices,
woi, // int * // min_x, min_y, max_x, max_y input, not modified, max_x - not used
twh); // int * twh) // 2-element in device global memory
// next kernels will see woi as {x,y,width,height}
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1); dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical // int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
clear_texture_rbga<<<blocks2,threads2>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0) // dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
int blocks_x = ((width+1) * DTT_SIZE + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, (height+1) * DTT_SIZE * texture_slices, 1); // each thread - 8 vertical
/*
clear_texture_rbga<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
texture_width, texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height, texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles); gpu_texture_tiles) ; // float * gpu_texture_tiles);
*/
clear_texture_rbga2<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
twh, // int * twh, // {texture_width, texture_hight*color_slices)// aligned to DTT_SIZE
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel // Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x116 tiles overhang by 4 pixels) // oversize (border 16x116 tiles overhang by 4 pixels)
cudaDeviceSynchronize(); // not needed yet, just for testing /// cudaDeviceSynchronize(); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){ for (int pass = 0; pass < 8; pass++){
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1); dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2; int border_tile = pass >> 2;
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile); // int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1 // dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
int * pntt = num_texture_tiles + ((pass & 3) << 1) + border_tile; // pointer to global memory
dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){ if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt; // ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
} }
#ifdef DEBUG12 #ifdef DEBUG12
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n", printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
...@@ -2146,15 +2103,16 @@ extern "C" __global__ void generate_RBGA( ...@@ -2146,15 +2103,16 @@ extern "C" __global__ void generate_RBGA(
printf("\n"); printf("\n");
#endif #endif
/* */ /* */
int shared_size = get_textures_shared_size( // in bytes /// int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras /// num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono /// colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats /// 0); // int * offsets); // in floats
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( textures_accumulate <<<grid_texture,threads_texture, shared_size, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
woi, // int * woi, // x, y, width,height woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process // ntt, // size_t num_texture_tiles, // number of texture tiles to process
pntt, // size_t * pnum_texture_tiles, // pointer to a number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
...@@ -2176,14 +2134,28 @@ extern "C" __global__ void generate_RBGA( ...@@ -2176,14 +2134,28 @@ extern "C" __global__ void generate_RBGA(
1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order 1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams] (float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width); width);
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
} }
} }
__syncthreads(); __syncthreads();
} }
/**
* Helper kernel for CDP2 to update woi (was {min_x,min_y,Max_x,max_y}
* and calculate texture_width and texture_tiles_height * texture_slices to be used in next kernel in stream
*/
__global__ void update_woi(
int texture_slices,
int * woi, // min_x, min_y, max_x, max_y input, not modified, max_x - not used
int * twh) // 2-element in device global memory
{
if (threadIdx.x == 0) { // always
*(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height
twh[0] = (*(woi + 2) + 1)* DTT_SIZE;
twh[1] = (*(woi + 3) + 1) * DTT_SIZE * texture_slices;
}
}
/** /**
* Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate) * Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate)
...@@ -2203,7 +2175,7 @@ __global__ void clear_texture_rbga( ...@@ -2203,7 +2175,7 @@ __global__ void clear_texture_rbga(
if (col > texture_width) { if (col > texture_width) {
return; return;
} }
int row = blockIdx.y;; // includes slices int row = blockIdx.y; // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride; float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll #pragma unroll
for (int n = 0; n < DTT_SIZE; n++) { for (int n = 0; n < DTT_SIZE; n++) {
...@@ -2211,6 +2183,36 @@ __global__ void clear_texture_rbga( ...@@ -2211,6 +2183,36 @@ __global__ void clear_texture_rbga(
} }
} }
/**
* Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate)
* @param twh {texture width in pixels, aligned to DTT_SIZE,
* full number of output rows: texture height in pixels, multiplied by number of color slices}
* @param texture_rbga_stride texture line stride in floats
* @param gpu_texture_tiles pointer to the texture output
*/
// blockDim.x * gridDim.x >= width
__global__ void clear_texture_rbga2(
int * twh, // {texture_width, texture_hight*color_slices)// aligned to DTT_SIZE
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
if (col > twh[0]) {
return;
}
int row = blockIdx.y; // includes slices
if (row > twh[1]) {
return;
}
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix++) = 0.0;
}
}
// not used - both in C++ and Java
/** /**
* Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal * Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
* neighbors for tiles (in 4 bits of the task field. Use 4x8=32 threads, * neighbors for tiles (in 4 bits of the task field. Use 4x8=32 threads,
...@@ -2246,29 +2248,41 @@ __global__ void prepare_texture_list( ...@@ -2246,29 +2248,41 @@ __global__ void prepare_texture_list(
dim3 blocks0 (blocks_x, height, 1); dim3 blocks0 (blocks_x, height, 1);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
clear_texture_list<<<blocks0,threads0>>>( *(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
gpu_texture_indices, gpu_texture_indices,
width, width,
height); height);
cudaDeviceSynchronize(); // not needed yet, just for testing /// cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1); dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;// int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1); dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory // mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>( mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, num_cams,
gpu_ftasks, gpu_ftasks,
// gpu_tasks, // gpu_tasks,
num_tiles, // number of tiles in task list num_tiles, // number of tiles in task list
width, width,
gpu_texture_indices); // packed tile + bits (now only (1 << 7) gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = width; // TILES-X; /// *(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y; /// *(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x /// *(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y /// *(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>( mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, num_cams,
gpu_ftasks, gpu_ftasks,
// gpu_tasks, // gpu_tasks,
...@@ -2277,18 +2291,18 @@ __global__ void prepare_texture_list( ...@@ -2277,18 +2291,18 @@ __global__ void prepare_texture_list(
height, // number of tiles rows height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7) gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1 // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(num_texture_tiles+0) = 0; /// *(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0; /// *(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0; /// *(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0; /// *(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0; /// *(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0; /// *(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0; /// *(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0; /// *(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>( gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, num_cams,
gpu_ftasks, gpu_ftasks,
// gpu_tasks, // gpu_tasks,
...@@ -2299,7 +2313,7 @@ __global__ void prepare_texture_list( ...@@ -2299,7 +2313,7 @@ __global__ void prepare_texture_list(
num_texture_tiles, // number of texture tiles to process num_texture_tiles, // number of texture tiles to process
woi); // x,y, here woi[2] = max_X, woi[3] - max-Y woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize(); // not needed yet, just for testing /// cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0); // width *(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height *(woi + 3) += 1 - *(woi + 1); // height
} }
...@@ -2377,7 +2391,7 @@ __global__ void mark_texture_tiles( ...@@ -2377,7 +2391,7 @@ __global__ void mark_texture_tiles(
* @param width number of tiles in a row * @param width number of tiles in a row
* @param height number of tiles rows * @param height number of tiles rows
* @param gpu_texture_indices allocated array - 1 integer per tile to process * @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param woi 4-element int array ( x,y,width,height of the woi, in tiles) * @param woi 4-element int array ( x,y,width,height of the woi, in tiles) -> minx, min_y, max_x, max_y
*/ */
// treads (*,1,1), blocks = (*,1,1) // treads (*,1,1), blocks = (*,1,1)
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__? __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...@@ -2445,8 +2459,8 @@ __global__ void gen_texture_list( ...@@ -2445,8 +2459,8 @@ __global__ void gen_texture_list(
int width, // number of tiles in a row int width, // number of tiles in a row
int height, // number of tiles rows int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7) int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process int * num_texture_tiles, // pointer to a number of texture tiles to process
int * woi) // min_x, min_y, max_x, max_y input int * woi) // min_x, min_y, max_x, max_y input, not modified, max_x - not used
{ {
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3)) int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
...@@ -2659,7 +2673,7 @@ __global__ void index_correlate( ...@@ -2659,7 +2673,7 @@ __global__ void index_correlate(
/** /**
* Helper kernel for correlateInter2D() - generates dense list of correlation tasks. * Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
* For interscene correlation. One correlation output for each selected sensor * For interscene correlation. One correlation output for each selected sensor
* plus a sum of them all. So for all 16 sensors selected ooutput will have 17 * plus a sum of them all. So for all 16 sensors selected output will have 17
* 2D correlations (with sum being the last one) * 2D correlations (with sum being the last one)
* All pairs for the same tile will always be in the same order: increasing sensor numbers * All pairs for the same tile will always be in the same order: increasing sensor numbers
* with sum being the last. Sum will be marked by 0xff in the LSB. * with sum being the last. Sum will be marked by 0xff in the LSB.
...@@ -2709,7 +2723,6 @@ __global__ void index_inter_correlate( ...@@ -2709,7 +2723,6 @@ __global__ void index_inter_correlate(
} }
} }
/** /**
* Direct MCLT transform and aberration correction with space-variant deconvolution * Direct MCLT transform and aberration correction with space-variant deconvolution
* kernels. Results are used to output aberration-corrected images, textures and * kernels. Results are used to output aberration-corrected images, textures and
...@@ -2755,19 +2768,20 @@ extern "C" __global__ void convert_direct( // called with a single block, singl ...@@ -2755,19 +2768,20 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // always 1 if (threadIdx.x == 0) { // always 1
*pnum_active_tiles = 0; //__device__
*pnum_active_tiles = 0; // already _device_
int task_size = get_task_size(num_cams); int task_size = get_task_size(num_cams);
index_direct<<<blocks0,threads0>>>( index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
task_size, // int task_size, // flattened task size in 4-byte floats task_size, // int task_size, // flattened task size in 4-byte floats
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, //int num_tiles, // number of tiles in task num_tiles, //int num_tiles, // number of tiles in task
gpu_active_tiles, //int * active_tiles, // pointer to the calculated number of non-zero tiles gpu_active_tiles, //int * active_tiles, // pointer to the calculated number of non-zero tiles
pnum_active_tiles); //int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero pnum_active_tiles); //int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
// now call actual convert_correct_tiles /// dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1); dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1); dim3 grid_tp((num_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1); // use static dimensions - maximal number of tiles
convert_correct_tiles<<<grid_tp,threads_tp>>>( convert_correct_tiles<<<grid_tp, threads_tp, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
((kernels_hor>0)?gpu_kernel_offsets:0), // float ** gpu_kernel_offsets, // [num_cams], ((kernels_hor>0)?gpu_kernel_offsets:0), // float ** gpu_kernel_offsets, // [num_cams],
...@@ -2775,16 +2789,15 @@ extern "C" __global__ void convert_direct( // called with a single block, singl ...@@ -2775,16 +2789,15 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
gpu_images, // float ** gpu_images, // [num_cams], gpu_images, // float ** gpu_images, // [num_cams],
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*pnum_active_tiles, // int num_active_tiles, // number of tiles in task pnum_active_tiles, // int * pnum_active_tiles, // number of tiles in task
gpu_clt, // float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
dstride, // size_t dstride, // in floats (pixels) dstride, // size_t dstride, // in floats (pixels)
lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width, // int woi_width, // varaible to swict between EO and LWIR woi_width, // int woi_width, // variable to switch between EO and LWIR
woi_height, // int woi_height, // varaible to swict between EO and LWIR woi_height, // int woi_height, // variable to switch between EO and LWIR
kernels_hor, // int kernels_hor, // varaible to swict between EO and LWIR kernels_hor, // int kernels_hor, // variable to switch between EO and LWIR
kernels_vert, // ); // int kernels_vert); // varaible to swict between EO and LWIR kernels_vert, // ); // int kernels_vert); // varaible to swict between EO and LWIR
tilesx); // int tilesx) tilesx); // int tilesx)
} }
} }
...@@ -2867,7 +2880,7 @@ __global__ void convert_correct_tiles( ...@@ -2867,7 +2880,7 @@ __global__ void convert_correct_tiles(
float ** gpu_images, // [num_cams], float ** gpu_images, // [num_cams],
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task int * num_active_tiles, // number of tiles in task
float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels) size_t dstride, // in floats (pixels)
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
...@@ -2877,11 +2890,12 @@ __global__ void convert_correct_tiles( ...@@ -2877,11 +2890,12 @@ __global__ void convert_correct_tiles(
int kernels_vert, int kernels_vert,
int tilesx) int tilesx)
{ {
// printf("\n1.7. convert_correct_tiles():3055\n");
// int tilesx = TILES-X; // int tilesx = TILES-X;
/// dim3 t = threadIdx; /// dim3 t = threadIdx;
int tile_in_block = threadIdx.y; int tile_in_block = threadIdx.y;
int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block; int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_indx >= num_active_tiles){ if (task_indx >= *num_active_tiles){
return; // nothing to do return; // nothing to do
} }
int task_num = gpu_active_tiles[task_indx]; int task_num = gpu_active_tiles[task_indx];
...@@ -2997,6 +3011,7 @@ __global__ void convert_correct_tiles( ...@@ -2997,6 +3011,7 @@ __global__ void convert_correct_tiles(
__syncthreads(); __syncthreads();
} }
} }
// return;
} }
/** /**
...@@ -3069,7 +3084,7 @@ extern "C" __global__ void textures_nonoverlap( ...@@ -3069,7 +3084,7 @@ extern "C" __global__ void textures_nonoverlap(
if (threadIdx.x == 0) { // only 1 thread, 1 block if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_texture_tiles = 0; *pnum_texture_tiles = 0;
create_nonoverlap_list<<<blocks0,threads0>>>( create_nonoverlap_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks, // gpu_tasks, // struct tp_task * gpu_tasks,
...@@ -3077,13 +3092,14 @@ extern "C" __global__ void textures_nonoverlap( ...@@ -3077,13 +3092,14 @@ extern "C" __global__ void textures_nonoverlap(
num_tilesx, // int width, // number of tiles in a row num_tilesx, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1); // dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1); dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1); // dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); /// dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // fixed-size grid
int shared_size = get_textures_shared_size( // in bytes int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
...@@ -3094,11 +3110,12 @@ extern "C" __global__ void textures_nonoverlap( ...@@ -3094,11 +3110,12 @@ extern "C" __global__ void textures_nonoverlap(
__syncthreads(); __syncthreads();
#endif #endif
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( // 65536>>>( // textures_accumulate <<<grid_texture,threads_texture, shared_size, cudaStreamTailLaunch>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height (int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
*pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process // *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
pnum_texture_tiles, // int * pnum_texture_tiles, // pointer to a number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices 0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
...@@ -3135,8 +3152,8 @@ extern "C" __global__ void textures_nonoverlap( ...@@ -3135,8 +3152,8 @@ extern "C" __global__ void textures_nonoverlap(
* @param num_cams Number of cameras used * @param num_cams Number of cameras used
* @param woi WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used * @param woi WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] * @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param num_texture_tiles number of texture tiles to process * @param pnum_texture_tiles pointer to a number of texture tiles to process
* @param gpu_texture_indices_offset add to gpu_texture_indices * @param gpu_texture_indices_offset add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
* @param gpu_texture_indices array - 1 integer per tile to process * @param gpu_texture_indices array - 1 integer per tile to process
* @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight * @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight
* @param colors number of colors used: 3 for RGB or 1 for monochrome * @param colors number of colors used: 3 for RGB or 1 for monochrome
...@@ -3162,8 +3179,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) ...@@ -3162,8 +3179,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used int num_cams, // number of cameras used
int * woi, // x, y, width,height int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process /// size_t num_texture_tiles, // number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices int * pnum_texture_tiles, // pointer to a number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
struct gc * gpu_geometry_correction, struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
...@@ -3173,7 +3191,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) ...@@ -3173,7 +3191,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float diff_sigma, // pixel value/pixel change float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weights[3], // scale for R,B,G const float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? Now +2 - output raw channels int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)? Now +2 - output raw channels
// combining both non-overlap and overlap (each calculated if pointer is not null ) // combining both non-overlap and overlap (each calculated if pointer is not null )
...@@ -3201,11 +3219,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) ...@@ -3201,11 +3219,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
// int camera_num = threadIdx.y; // int camera_num = threadIdx.y;
int tile_indx = blockIdx.x; // * TEXTURE_TILES_PER_BLOCK + tile_in_block; int tile_indx = blockIdx.x; // * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if (tile_indx >= num_texture_tiles){ if (tile_indx >= * pnum_texture_tiles){
return; // nothing to do return; // nothing to do
} }
// get number of tile // get number of tile
int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP // int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP (before CDP2)
int tile_offs = (gpu_texture_indices_offset >=0) ? gpu_texture_indices_offset : -(gpu_texture_indices_offset + *pnum_texture_tiles);
int tile_code = gpu_texture_indices[tile_indx + tile_offs]; // Added for Java, no DP
if ((tile_code & (1 << LIST_TEXTURE_BIT)) == 0){ if ((tile_code & (1 << LIST_TEXTURE_BIT)) == 0){
return; // nothing to do return; // nothing to do
} }
...@@ -3900,7 +3920,7 @@ __global__ void imclt_rbg_all( ...@@ -3900,7 +3920,7 @@ __global__ void imclt_rbg_all(
int woi_theight, int woi_theight,
const size_t dstride) // in floats (pixels) const size_t dstride) // in floats (pixels)
{ {
// all OK - do not depend on previous kernels
// int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]); // int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]);
dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1); dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1);
dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1); dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1);
...@@ -3911,11 +3931,11 @@ __global__ void imclt_rbg_all( ...@@ -3911,11 +3931,11 @@ __global__ void imclt_rbg_all(
for (int color = 0; color < colors; color++) { for (int color = 0; color < colors; color++) {
// clear right and bottom 8-pixel column and row // clear right and bottom 8-pixel column and row
float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE); float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE);
erase8x8<<<grid_erase8x8_right_col,threads_erase8x8>>>( erase8x8<<<grid_erase8x8_right_col,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
right_col, // float * gpu_top_left, right_col, // float * gpu_top_left,
dstride); // const size_t dstride); dstride); // const size_t dstride);
float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + dstride * (woi_theight * DTT_SIZE); float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + dstride * (woi_theight * DTT_SIZE);
erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8>>>( erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
bottom_row, // float * gpu_top_left, bottom_row, // float * gpu_top_left,
dstride); // const size_t dstride); dstride); // const size_t dstride);
...@@ -3926,7 +3946,7 @@ __global__ void imclt_rbg_all( ...@@ -3926,7 +3946,7 @@ __global__ void imclt_rbg_all(
int tiles_in_pass = tilesy_half * tilesx_half; int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1); dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z); // printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>( imclt_rbg<<<grid_imclt,threads_imclt, 0, cudaStreamTailLaunch>>>(
gpu_clt[ncam], // float * gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt[ncam], // float * gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT gpu_corr_images[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf, 1, // int apply_lpf,
...@@ -3937,7 +3957,7 @@ __global__ void imclt_rbg_all( ...@@ -3937,7 +3957,7 @@ __global__ void imclt_rbg_all(
woi_twidth, // int woi_twidth, // will increase by DTT_SIZE (todo - cut away?) woi_twidth, // int woi_twidth, // will increase by DTT_SIZE (todo - cut away?)
woi_theight, // int woi_theight, // will increase by DTT_SIZE (todo - cut away?) woi_theight, // int woi_theight, // will increase by DTT_SIZE (todo - cut away?)
dstride); // const size_t dstride); // in floats (pixels) dstride); // const size_t dstride); // in floats (pixels)
cudaDeviceSynchronize(); /// cudaDeviceSynchronize();
} }
} }
} }
...@@ -5343,7 +5363,7 @@ __device__ void tile_combine_rgba( ...@@ -5343,7 +5363,7 @@ __device__ void tile_combine_rgba(
// next not used // next not used
// boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing) // boolean diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float * chn_weights, // color channel weights, sum == 1.0 const float * chn_weights, // color channel weights, sum == 1.0
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated, not so for the crms) int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
int debug) int debug)
......
...@@ -37,10 +37,34 @@ ...@@ -37,10 +37,34 @@
*/ */
#pragma once #pragma once
#ifndef TILE_PROCESSOR_H_
#define TILE_PROCESSOR_H_
#ifndef NUM_CAMS #ifndef NUM_CAMS
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
#define TASK_TEXTURE_BITS ((1 << TASK_TEXT_N_BIT) | (1 << TASK_TEXT_NE_BIT) | (1 << TASK_TEXT_E_BIT) | (1 << TASK_TEXT_SE_BIT)\
| (1 << TASK_TEXT_S_BIT) | (1 << TASK_TEXT_SW_BIT) | (1 << TASK_TEXT_W_BIT) | (1 << TASK_TEXT_NW_BIT))
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
float center_x; // actual center X (use to find derivatives)
float center_y; // actual center X (use to find derivatives)
float dxc_dx; // add this to data_x per each pixel X-shift relative to the kernel center location
float dxc_dy; // same per each Y-shift pixel
float dyc_dx;
float dyc_dy;
};
extern "C" __global__ void convert_direct( // called with a single block, single thread extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
...@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA( ...@@ -216,5 +240,84 @@ extern "C" __global__ void generate_RBGA(
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int * twh);
extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
/// size_t num_texture_tiles, // number of texture tiles to process
int * pnum_texture_tiles, // pointer to a number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
const float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx);
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILES-X, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void create_nonoverlap_list(
int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length); // indices to gpu_tasks // should be initialized to zero
extern "C" __global__ void mark_texture_tiles(
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input
#endif
...@@ -40,6 +40,14 @@ ...@@ -40,6 +40,14 @@
#include "tp_defines.h" #include "tp_defines.h"
#include "dtt8x8.h" #include "dtt8x8.h"
#include "geometry_correction.h" #include "geometry_correction.h"
// #include "TileProcessor.h"
#include <cuda_runtime.h>
// #include <helper_cuda.h>
// #include <helper_functions.h>
#endif // #ifndef JCUDA #endif // #ifndef JCUDA
#ifndef get_task_size #ifndef get_task_size
...@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}... ...@@ -104,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}}, {{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
} }
}; };
// TODO: Make offsets calculate in compile time, to avoid NVRTC(in java): " error: dynamic initialization is not supported for a __constant__ variable"
__constant__ int angles_offsets [4] {15,0,30,30};
/*
__constant__ int angles_offsets [4] {
(int) (offsetof4(corr_vector, azimuth)),
(int) (offsetof4(corr_vector, tilt)),
(int) (offsetof4(corr_vector, roll)),
(int) (offsetof4(corr_vector, roll))};
*/
/*
__constant__ int angles_offsets [4] = {
(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
(int) (offsetof(corr_vector, tilt) /sizeof(float)),
(int) (offsetof(corr_vector, roll) /sizeof(float)),
(int) (offsetof(corr_vector, roll) /sizeof(float))};
__constant__ int angles_offsets [4] = { */
offsetof(corr_vector, azimuth)/sizeof(float),
offsetof(corr_vector, tilt) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float)};
__constant__ int mm_seq [3][3][3]={ __constant__ int mm_seq [3][3][3]={
{ {
{6,5,12}, // a_t * a_z -> tmp0 {6,5,12}, // a_t * a_z -> tmp0
...@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets( ...@@ -337,9 +356,6 @@ extern "C" __global__ void calculate_tiles_offsets(
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv); gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
} }
// __syncthreads();// __syncwarp();
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
} }
......
...@@ -51,6 +51,11 @@ ...@@ -51,6 +51,11 @@
((size_t)&(((st *)0)->m)) ((size_t)&(((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif #endif
#ifndef offsetof4
#define offsetof4(st, m) \
(((size_t)&(((st *)0)->m))>>2)
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm #define SCENE_UNITS_SCALE 0.001 // meters from mm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment