Commit 8612a64e authored by Andrey Filippov

fixed inter-scene correlation

parent d8e9a454
...@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
float * gpu_corrs) // correlation output data float * gpu_corrs) // correlation output data
{ {
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1); dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block if (threadIdx.x == 0) { // only 1 thread, 1 block
...@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
if (num_sel_sensors > 0){ if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory // try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0; *pnum_corr_tiles = 0;
dim3 grid_corr((num_tiles * num_sel_sensors + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
sel_sensors, // int sel_sensors, sel_sensors, // int sel_sensors,
...@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
tilesx, // int width, // number of tiles in a row tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
/// cudaDeviceSynchronize();
/// __device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles); correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalizations and back conversion
/// int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
/// dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
/// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
num_cams, // int num_cams, num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all) num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD ...@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum) gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
/// dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-senor interscene correlations combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-senor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all) num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
pnum_corr_tiles, //num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum) pnum_corr_tiles, //num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
...@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct( // called with a single block, singl ...@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
//__device__ //__device__
*pnum_active_tiles = 0; // already _device_ *pnum_active_tiles = 0; // already _device_
int task_size = get_task_size(num_cams); int task_size = get_task_size(num_cams);
index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>( index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
task_size, // int task_size, // flattened task size in 4-byte floats task_size, // int task_size, // flattened task size in 4-byte floats
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, //int num_tiles, // number of tiles in task num_tiles, //int num_tiles, // number of tiles in task
......
...@@ -46,6 +46,7 @@ ...@@ -46,6 +46,7 @@
#include <cstdlib> #include <cstdlib>
#include <cstdio> #include <cstdio>
#include <cuda_runtime.h> // cudaFree #include <cuda_runtime.h> // cudaFree
//#define __CUDA_RUNTIME_H__ // turn on, then off to fix missing findCudaDevice()
#include <helper_cuda.h> // for checkCudaErrors #include <helper_cuda.h> // for checkCudaErrors
#include <helper_functions.h> // timer functions #include <helper_functions.h> // timer functions
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment