Commit 4e5e1f22 authored by Andrey Filippov's avatar Andrey Filippov

Fixed inter-scene correlation, updated 12.6.0 kernel sources

parent e08db32b
......@@ -78,7 +78,8 @@ import jcuda.nvrtc.nvrtcProgram;
public class GPUTileProcessor {
public static String CUDA_VERSION = JCudaVersion.get();
public static boolean USE_CUDA12 = CUDA_VERSION.startsWith("12.");
public static boolean USE_DS_DP = false; // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
public static boolean USE_DS_DP = USE_CUDA12; // false; // Use Dynamic Shared memory with Dynamic Parallelism TODO: Split for DP2/shared memory
public static String SOURCE_FOLDER = ""; // where the sources were loaded from - debug feature
String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
// Can be downloaded and twice extracted from
// https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
......@@ -321,6 +322,7 @@ public class GPUTileProcessor {
file = new File(src_dir.getPath(), src_file);
System.out.println("Loading resource "+file);
}
SOURCE_FOLDER = file.getPath();
// System.out.println(file.getAbsolutePath());
String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);
......
......@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
......@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0;
dim3 grid_corr((num_tiles * num_sel_sensors + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
......@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
/// cudaDeviceSynchronize();
/// __device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
/// int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
/// dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
/// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalizations and back conversion
num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
......@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
/// dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
pnum_corr_tiles, //num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
......@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
//__device__
*pnum_active_tiles = 0; // already _device_
int task_size = get_task_size(num_cams);
index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
task_size, // int task_size, // flattened task size in 4-byte floats
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, //int num_tiles, // number of tiles in task
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment