Commit de3c497a authored by Andrey Filippov

refactoring for CDP2

parent f8839287
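The change set moves the tile processor's dynamic parallelism to the CDP2 model: device-side cudaDeviceSynchronize() calls are commented out, child kernels are launched into the named cudaStreamFireAndForget and cudaStreamTailLaunch streams, counts produced by one child are handed to the next child as pointers into device memory, and launch grids are sized from host-known maxima while the real count is checked inside the child. A minimal sketch of that pattern, assuming a CUDA 12+ toolkit, compilation with -rdc=true, and hypothetical kernels produce_count()/consume() that are not part of this commit:

__device__ int d_count;                        // device-global counter, like pnum_corr_tiles

__global__ void produce_count(int n) {         // stands in for index_correlate() and friends
    if (threadIdx.x == 0) d_count = n / 2;     // some data-dependent count
}

__global__ void consume(const int * pcount) {  // stands in for correlate2D_inner() and friends
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= *pcount) return;                  // grid is oversized; trim against the real count
    // ... per-item work ...
}

__global__ void parent(int max_items) {        // stands in for correlate2D(), convert_direct(), ...
    if (threadIdx.x == 0) {
        d_count = 0;                           // parent-thread writes are visible to its children
        produce_count<<<1, 1, 0, cudaStreamFireAndForget>>>(max_items);
        // A tail launch starts only after the parent grid and its fire-and-forget work
        // complete, so *pcount is ready; the grid is sized from the host-known maximum.
        consume<<<(max_items + 255) / 256, 256, 0, cudaStreamTailLaunch>>>(&d_count);
    }
}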
......@@ -766,12 +766,23 @@ extern "C" __global__ void gen_texture_list(
int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input
extern "C" __global__ void update_woi(
int texture_slices,
int * woi, // {min_x, min_y, max_x, max_y} on input; updated in place to {min_x, min_y, width, height}
int * twh); // 2-element array in device global memory: {texture_width, texture_height * texture_slices} in pixels
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void clear_texture_rbga2( // version for CDP2
int * twh, // {texture_width, texture_height * color_slices}, aligned to DTT_SIZE
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
//inline __device__ int get_task_size(int num_cams);
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams);
......@@ -815,6 +826,7 @@ extern "C" __global__ void create_nonoverlap_list(
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length); // indices to gpu_tasks // should be initialized to zero
__global__ void convert_correct_tiles(
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
......@@ -824,7 +836,7 @@ __global__ void convert_correct_tiles(
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
int * num_active_tiles, // pointer to the number of tiles in task
float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
......@@ -836,7 +848,8 @@ __global__ void convert_correct_tiles(
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
......@@ -850,7 +863,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
/// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
......@@ -863,7 +877,8 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute
int num_corr_tiles, // number of correlation tiles to process
// int num_corr_tiles, // number of correlation tiles to process
int * num_corr_tiles, // pointer to a number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15)
......@@ -895,7 +910,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
/// size_t num_texture_tiles, // number of texture tiles to process
int * pnum_texture_tiles, // pointer to a number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
......@@ -972,11 +988,16 @@ extern "C" __global__ void correlate2D(
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int num_pairs = num_cams * (num_cams-1) / 2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1); // static
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1); // static
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_tiles * num_pairs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_corr_tiles = 0;
index_correlate<<<blocks0,threads0>>>(
*pnum_corr_tiles = 0; // global, allocated by host
index_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams,
sel_pairs0, // int sel_pairs0,
sel_pairs1, // int sel_pairs1,
......@@ -988,10 +1009,10 @@ extern "C" __global__ void correlate2D(
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>(
/// cudaDeviceSynchronize();
// dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
/// dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1)
......@@ -999,7 +1020,8 @@ extern "C" __global__ void correlate2D(
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
fat_zero2, // float fat_zero2, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
// *pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
pnum_corr_tiles, // int * num_corr_tiles, // pointer to the number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
......@@ -1045,13 +1067,19 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
float * gpu_corrs) // correlation output data
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 blocks0 ((num_tiles*num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_corr((num_cams + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
if (threadIdx.x == 0) { // only 1 thread, 1 block
int num_sel_sensors = __popc (sel_sensors); // number of non-zero bits
if (num_sel_sensors > 0){
// try with null tp_tasks to use same sequence from GPU memory
*pnum_corr_tiles = 0;
index_inter_correlate<<<blocks0,threads0>>>(
index_inter_correlate<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams,
sel_sensors, // int sel_sensors,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
......@@ -1059,13 +1087,12 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize();
int num_corr_tiles_with_sum = (*pnum_corr_tiles);
int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr>>>( // will only process to TD, no normalisations and back conversion
/// cudaDeviceSynchronize();
/// __device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
/// int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
/// dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
/// dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inter_inner<<<grid_corr,threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
num_cams, // int num_cams,
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
......@@ -1074,14 +1101,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
pnum_corr_tiles, // num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr>>>( // combine per-sensor interscene correlations
/// dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
combine_inter<<<grid_combine,threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-sensor interscene correlations
num_sel_sensors, // int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
pnum_corr_tiles, //num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair NOT USED
corr_stride, // size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data (either pixel domain or transform domain
......@@ -1110,7 +1137,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
*/
extern "C" __global__ void combine_inter( // combine per-senor interscene correlations
int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
......@@ -1184,7 +1212,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
/// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
int * num_corr_tiles, // pointer to number of correlation tiles to process (here it includes sum)
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
......@@ -1194,7 +1223,8 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
int tile_index = corr_num / num_sel_sensors;
int corr_offset = tile_index + corr_num; // added for missing sum correlation tiles.
if (corr_offset >= num_corr_tiles){
// if (corr_offset >= num_corr_tiles){
if (corr_offset >= *num_corr_tiles){
return; // nothing to do
}
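// Worked example of the sum-slot skipping above (hypothetical numbers, assuming
// num_sel_sensors == 16, so the output keeps 17 correlation tiles per task tile:
// 16 per-sensor tiles followed by their sum):
//   corr_num = 20 (tile 1, sensor 20 % 16 = 4)  ->  tile_index = 20 / 16 = 1
//   corr_offset = 1 + 20 = 21 = 17 * 1 + 4, leaving slots 16 (tile 0 sum) and
//   33 (tile 1 sum) untouched here for combine_inter() to fill.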
......@@ -1304,7 +1334,7 @@ extern "C" __global__ void correlate2D_inter_inner( // will only process to TD,
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process
* @param num_corr_tiles pointer to the number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride stride (in floats) for correlation outputs.
* @param corr_radius radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
......@@ -1318,7 +1348,8 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute
int num_corr_tiles, // number of correlation tiles to process
/// int num_corr_tiles, // number of correlation tiles to process
int * num_corr_tiles, // pointer to a number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15)
......@@ -1331,7 +1362,7 @@ extern "C" __global__ void correlate2D_inner(
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
if (corr_num >= num_corr_tiles){
if (corr_num >= *num_corr_tiles){
return; // nothing to do
}
int pair_list_start = pairs_offsets[num_cams];
......@@ -2038,30 +2069,56 @@ extern "C" __global__ void generate_RBGA(
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
if (threadIdx.x == 0) {
clear_texture_list<<<blocks0,threads0>>>(
clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
gpu_texture_indices,
width,
height);
cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
/// cudaDeviceSynchronize(); // not needed yet, just for testing
/// dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
/// int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
/// dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
/// *(woi + 0) = width; // TILES-X;
/// *(woi + 1) = height; // TILES-Y;
/// *(woi + 2) = 0; // maximal x
/// *(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
......@@ -2070,69 +2127,89 @@ extern "C" __global__ void generate_RBGA(
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
/// *(num_texture_tiles+0) = 0;
/// *(num_texture_tiles+1) = 0;
/// *(num_texture_tiles+2) = 0;
/// *(num_texture_tiles+3) = 0;
/// *(num_texture_tiles+4) = 0;
/// *(num_texture_tiles+5) = 0;
/// *(num_texture_tiles+6) = 0;
/// *(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list
width, // number of tiles in a row
height, // int height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
num_texture_tiles, // number of texture tiles to process
woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
num_texture_tiles, // pointer to a number of texture tiles to process
woi); // {min_x, min_y, max_x, max_y}; input only, not modified here; woi[2] (max_x) not used
cudaDeviceSynchronize(); // not needed yet, just for testing
/*
// TODO: Add small kernel to only modify *(woi + 2), *(woi + 3) and generate texture_width, texture_tiles_height ?
/// cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height
}
__syncthreads();
__syncthreads(); // ?
// Zero output textures. Trim
// texture_rbga_stride
int texture_width = (*(woi + 2) + 1)* DTT_SIZE;
int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE;
int texture_slices = colors + 1;
if (keep_weights & 2){
texture_slices += colors * num_cams;
}
/// int texture_slices = colors + 1;
/// if (keep_weights & 2){
/// texture_slices += colors * num_cams;
/// }
if (threadIdx.x == 0) {
*/
__device__ int twh[2];
update_woi<<<1,1, 0, cudaStreamTailLaunch>>>(
texture_slices, // int texture_slices,
woi, // int * woi, // {min_x, min_y, max_x, max_y} on input; updated in place to {min_x, min_y, width, height}
twh); // int * twh, // 2-element array in device global memory
// next kernels will see woi as {x,y,width,height}
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
clear_texture_rbga<<<blocks2,threads2>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
// int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
// dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
int blocks_x = ((width+1) * DTT_SIZE + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, (height+1) * DTT_SIZE * texture_slices, 1); // each thread - 8 vertical
/*
clear_texture_rbga<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles);
*/
clear_texture_rbga2<<<blocks2,threads2, 0, cudaStreamTailLaunch>>>( // add clearing of multi-sensor output (keep_weights & 2 !=0)
twh, // int * twh, // {texture_width, texture_height * color_slices} // aligned to DTT_SIZE
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x16 tiles overhang by 4 pixels)
cudaDeviceSynchronize(); // not needed yet, just for testing
/// cudaDeviceSynchronize(); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
// int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
// dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
int * pntt = num_texture_tiles + ((pass & 3) << 1) + border_tile; // pointer to global memory
dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
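// The negated ti_offset above is what lets the offset be computed before gen_texture_list()
// has produced ntt; textures_accumulate() undoes it (see its tile_offs computation).
// Worked example (hypothetical numbers, assuming width = 80 tiles, tilesya = 64, ntt = 25;
// one quadrant of the index array is width * (tilesya >> 2) = 80 * 16 = 1280 entries):
//   pass = 5  ->  (pass & 3) = 1, border_tile = 1
//   pre-CDP2:  ti_offset = 1 * 1280 + 1280 - 25 = 2535    (needs the child-computed ntt)
//   CDP2:      ti_offset = -(1 * 1280 + 1280)   = -2560   (known before the children run),
//              and textures_accumulate() restores 2535 as -(-2560 + *pntt).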
#ifdef DEBUG12
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
......@@ -2146,15 +2223,16 @@ extern "C" __global__ void generate_RBGA(
printf("\n");
#endif
/* */
int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
/// int shared_size = get_textures_shared_size( // in bytes
/// num_cams, // int num_cams, // actual number of cameras
/// colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
/// 0); // int * offsets); // in floats
textures_accumulate <<<grid_texture,threads_texture, shared_size, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, // number of cameras used
woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process
// ntt, // size_t num_texture_tiles, // number of texture tiles to process
pntt, // int * pnum_texture_tiles, // pointer to the number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
......@@ -2176,14 +2254,28 @@ extern "C" __global__ void generate_RBGA(
1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width);
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
}
}
__syncthreads();
}
/**
* Helper kernel for CDP2 to update woi (was {min_x, min_y, max_x, max_y}, becomes {min_x, min_y, width, height})
* and to calculate texture_width and texture_tiles_height * texture_slices for the next kernel in the stream
*/
__global__ void update_woi(
int texture_slices,
int * woi, // {min_x, min_y, max_x, max_y} on input; updated in place to {min_x, min_y, width, height}
int * twh) // 2-element in device global memory
{
if (threadIdx.x == 0) { // always
*(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height
twh[0] = (*(woi + 2) + 1)* DTT_SIZE;
twh[1] = (*(woi + 3) + 1) * DTT_SIZE * texture_slices;
}
}
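// Worked example of what update_woi() leaves behind for the tail-launched clear_texture_rbga2()
// (hypothetical values, assuming DTT_SIZE == 8 and texture_slices == 4, i.e. RGBA):
//   woi in   = {min_x = 2, min_y = 3, max_x = 20, max_y = 15}   (in tiles)
//   woi out  = {2, 3, 19, 13}          // woi[2] = 20 + 1 - 2, woi[3] = 15 + 1 - 3
//   twh[0]   = (19 + 1) * 8     = 160  // texture width in pixels
//   twh[1]   = (13 + 1) * 8 * 4 = 448  // rows to clear: height in pixels times color slices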
/**
* Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate)
......@@ -2203,7 +2295,7 @@ __global__ void clear_texture_rbga(
if (col > texture_width) {
return;
}
int row = blockIdx.y;; // includes slices
int row = blockIdx.y; // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
......@@ -2211,6 +2303,36 @@ __global__ void clear_texture_rbga(
}
}
/**
* Helper kernel for generate_RBGA() - zeroes output array (next passes accumulate)
* @param twh {texture width in pixels, aligned to DTT_SIZE,
* full number of output rows: texture height in pixels, multiplied by number of color slices}
* @param texture_rbga_stride texture line stride in floats
* @param gpu_texture_tiles pointer to the texture output
*/
// blockDim.x * gridDim.x >= width
__global__ void clear_texture_rbga2(
int * twh, // {texture_width, texture_height * color_slices} // aligned to DTT_SIZE
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
if (col > twh[0]) {
return;
}
int row = blockIdx.y; // includes slices
if (row > twh[1]) {
return;
}
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix++) = 0.0;
}
}
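// Worked example of the oversized clear grid vs. the twh[] trim (hypothetical values,
// assuming DTT_SIZE == 8, THREADS_DYNAMIC_BITS == 5, an 80 x 64 tile frame, texture_slices == 4):
//   generate_RBGA() launches blocks_x = ((80 + 1) * 8 + 255) >> 8 = 3 blocks of 32 threads
//   (each thread clears 8 floats) over (64 + 1) * 8 * 4 = 2080 rows - the whole frame;
//   after update_woi() set twh = {160, 448} (see the example above), threads outside that
//   window return immediately, so only the used part of the texture is actually cleared.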
// not used - both in C++ and Java
/**
* Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
* neighbors for tiles (in 4 bits of the task field. Use 4x8=32 threads,
......@@ -2246,29 +2368,41 @@ __global__ void prepare_texture_list(
dim3 blocks0 (blocks_x, height, 1);
if (threadIdx.x == 0) {
clear_texture_list<<<blocks0,threads0>>>(
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
clear_texture_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
gpu_texture_indices,
width,
height);
cudaDeviceSynchronize(); // not needed yet, just for testing
/// cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
mark_texture_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams,
gpu_ftasks,
// gpu_tasks,
num_tiles, // number of tiles in task list
width,
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = width; // TILES-X;
*(woi + 1) = height; // TILES-Y;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
/// *(woi + 0) = width; // TILES-X;
/// *(woi + 1) = height; // TILES-Y;
/// *(woi + 2) = 0; // maximal x
/// *(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams,
gpu_ftasks,
// gpu_tasks,
......@@ -2277,18 +2411,18 @@ __global__ void prepare_texture_list(
height, // number of tiles rows
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
/// *(num_texture_tiles+0) = 0;
/// *(num_texture_tiles+1) = 0;
/// *(num_texture_tiles+2) = 0;
/// *(num_texture_tiles+3) = 0;
/// *(num_texture_tiles+4) = 0;
/// *(num_texture_tiles+5) = 0;
/// *(num_texture_tiles+6) = 0;
/// *(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads, 0, cudaStreamTailLaunch>>>(
num_cams,
gpu_ftasks,
// gpu_tasks,
......@@ -2299,7 +2433,7 @@ __global__ void prepare_texture_list(
num_texture_tiles, // number of texture tiles to process
woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize(); // not needed yet, just for testing
/// cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height
}
......@@ -2377,7 +2511,7 @@ __global__ void mark_texture_tiles(
* @param width number of tiles in a row
* @param height number of tiles rows
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param woi 4-element int array ( x,y,width,height of the woi, in tiles)
* @param woi 4-element int array (min_x, min_y, max_x, max_y of the woi, in tiles)
*/
// threads (*,1,1), blocks = (*,1,1)
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
......@@ -2445,8 +2579,8 @@ __global__ void gen_texture_list(
int width, // number of tiles in a row
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi) // min_x, min_y, max_x, max_y input
int * num_texture_tiles, // pointer to a number of texture tiles to process
int * woi) // min_x, min_y, max_x, max_y input, not modified, max_x - not used
{
int tilesya = ((height +3) & (~3)); //#define TILES-YA ((TILES-Y +3) & (~3))
......@@ -2659,7 +2793,7 @@ __global__ void index_correlate(
/**
* Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
* For interscene correlation. One correlation output for each selected sensor
* plus a sum of them all. So for all 16 sensors selected ooutput will have 17
* plus a sum of them all. So for all 16 sensors selected output will have 17
* 2D correlations (with sum being the last one)
* All pairs for the same tile will always be in the same order: increasing sensor numbers
* with sum being the last. Sum will be marked by 0xff in the LSB.
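* Worked example of the packed format, assuming the convention used elsewhere in this file
* (tile index in the upper 24 bits, sensor / pair code in the low 8 bits):
*   tile 123, sensor 5    ->  (123 << 8) | 0x05 = 31493
*   tile 123, sum of all  ->  (123 << 8) | 0xff = 31743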
......@@ -2709,7 +2843,6 @@ __global__ void index_inter_correlate(
}
}
/**
* Direct MCLT transform and aberration correction with space-variant deconvolution
* kernels. Results are used to output aberration-corrected images, textures and
......@@ -2755,19 +2888,20 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // always 1
*pnum_active_tiles = 0;
//__device__
*pnum_active_tiles = 0; // already _device_
int task_size = get_task_size(num_cams);
index_direct<<<blocks0,threads0>>>(
index_direct<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
task_size, // int task_size, // flattened task size in 4-byte floats
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, //int num_tiles, // number of tiles in task
gpu_active_tiles, //int * active_tiles, // pointer to the calculated number of non-zero tiles
pnum_active_tiles); //int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
// now call actual convert_correct_tiles
/// cudaDeviceSynchronize();
/// dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
convert_correct_tiles<<<grid_tp,threads_tp>>>(
dim3 grid_tp((num_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1); // use static dimensions - maximal number of tiles
convert_correct_tiles<<<grid_tp, threads_tp, 0, cudaStreamTailLaunch>>>(
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
((kernels_hor>0)?gpu_kernel_offsets:0), // float ** gpu_kernel_offsets, // [num_cams],
......@@ -2775,16 +2909,15 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
gpu_images, // float ** gpu_images, // [num_cams],
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*pnum_active_tiles, // int num_active_tiles, // number of tiles in task
pnum_active_tiles, // int * pnum_active_tiles, // number of tiles in task
gpu_clt, // float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
dstride, // size_t dstride, // in floats (pixels)
lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width, // int woi_width, // varaible to swict between EO and LWIR
woi_height, // int woi_height, // varaible to swict between EO and LWIR
kernels_hor, // int kernels_hor, // varaible to swict between EO and LWIR
woi_width, // int woi_width, // variable to switch between EO and LWIR
woi_height, // int woi_height, // variable to switch between EO and LWIR
kernels_hor, // int kernels_hor, // variable to switch between EO and LWIR
kernels_vert, // ); // int kernels_vert); // varaible to swict between EO and LWIR
tilesx); // int tilesx)
}
}
......@@ -2867,7 +3000,7 @@ __global__ void convert_correct_tiles(
float ** gpu_images, // [num_cams],
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
int * num_active_tiles, // pointer to the number of tiles in task
float ** gpu_clt, // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
......@@ -2877,11 +3010,12 @@ __global__ void convert_correct_tiles(
int kernels_vert,
int tilesx)
{
// printf("\n1.7. convert_correct_tiles():3055\n");
// int tilesx = TILES-X;
/// dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_indx >= num_active_tiles){
if (task_indx >= *num_active_tiles){
return; // nothing to do
}
int task_num = gpu_active_tiles[task_indx];
......@@ -2997,6 +3131,7 @@ __global__ void convert_correct_tiles(
__syncthreads();
}
}
// return;
}
/**
......@@ -3069,7 +3204,7 @@ extern "C" __global__ void textures_nonoverlap(
if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_texture_tiles = 0;
create_nonoverlap_list<<<blocks0,threads0>>>(
create_nonoverlap_list<<<blocks0,threads0, 0, cudaStreamFireAndForget>>>(
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
......@@ -3077,13 +3212,14 @@ extern "C" __global__ void textures_nonoverlap(
num_tilesx, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
/// dim3 grid_texture((*pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
dim3 grid_texture((num_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // fixed-size grid
int shared_size = get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
......@@ -3094,11 +3230,12 @@ extern "C" __global__ void textures_nonoverlap(
__syncthreads();
#endif
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( // 65536>>>( //
textures_accumulate <<<grid_texture,threads_texture, shared_size, cudaStreamTailLaunch>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
*pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
// *pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
pnum_texture_tiles, // int * pnum_texture_tiles, // pointer to a number of texture tiles to process
0, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
......@@ -3135,8 +3272,8 @@ extern "C" __global__ void textures_nonoverlap(
* @param num_cams Number of cameras used
* @param woi WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param num_texture_tiles number of texture tiles to process
* @param gpu_texture_indices_offset add to gpu_texture_indices
* @param pnum_texture_tiles pointer to the number of texture tiles to process
* @param gpu_texture_indices_offset add to gpu_texture_indices (if negative: add *pnum_texture_tiles and negate)
* @param gpu_texture_indices array - 1 integer per tile to process
* @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight
* @param colors number of colors used: 3 for RGB or 1 for monochrome
......@@ -3162,8 +3299,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
int * woi, // x, y, width,height
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices
/// size_t num_texture_tiles, // number of texture tiles to process
int * pnum_texture_tiles, // pointer to a number of texture tiles to process
int gpu_texture_indices_offset,// add to gpu_texture_indices (if negative: add *pnum_texture_tiles and negate)
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
......@@ -3201,11 +3339,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
// int camera_num = threadIdx.y;
int tile_indx = blockIdx.x; // * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if (tile_indx >= num_texture_tiles){
if (tile_indx >= * pnum_texture_tiles){
return; // nothing to do
}
// get number of tile
int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP
// int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP (before CDP2)
int tile_offs = (gpu_texture_indices_offset >=0) ? gpu_texture_indices_offset : -(gpu_texture_indices_offset + *pnum_texture_tiles);
int tile_code = gpu_texture_indices[tile_indx + tile_offs]; // Added for Java, no DP
if ((tile_code & (1 << LIST_TEXTURE_BIT)) == 0){
return; // nothing to do
}
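// tile_offs above undoes the negative encoding used by the callers: a non-negative offset
// is taken as-is (textures_nonoverlap() passes 0), a negative one becomes
// -(gpu_texture_indices_offset + *pnum_texture_tiles). With the hypothetical numbers from
// the generate_RBGA() example above (offset = -2560, *pnum_texture_tiles = 25) this yields
// -(-2560 + 25) = 2535, the same index base the pre-CDP2 code computed directly from ntt.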
......@@ -3900,7 +4040,7 @@ __global__ void imclt_rbg_all(
int woi_theight,
const size_t dstride) // in floats (pixels)
{
// all OK - do not depend on previous kernels
// int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]);
dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1);
dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1);
......@@ -3911,11 +4051,11 @@ __global__ void imclt_rbg_all(
for (int color = 0; color < colors; color++) {
// clear right and bottom 8-pixel column and row
float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE);
erase8x8<<<grid_erase8x8_right_col,threads_erase8x8>>>(
erase8x8<<<grid_erase8x8_right_col,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
right_col, // float * gpu_top_left,
dstride); // const size_t dstride);
float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + dstride * (woi_theight * DTT_SIZE);
erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8>>>(
erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8, 0, cudaStreamTailLaunch>>>(
bottom_row, // float * gpu_top_left,
dstride); // const size_t dstride);
......@@ -3926,7 +4066,7 @@ __global__ void imclt_rbg_all(
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
imclt_rbg<<<grid_imclt,threads_imclt, 0, cudaStreamTailLaunch>>>(
gpu_clt[ncam], // float * gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf,
......@@ -3937,7 +4077,7 @@ __global__ void imclt_rbg_all(
woi_twidth, // int woi_twidth, // will increase by DTT_SIZE (todo - cut away?)
woi_theight, // int woi_theight, // will increase by DTT_SIZE (todo - cut away?)
dstride); // const size_t dstride); // in floats (pixels)
cudaDeviceSynchronize();
/// cudaDeviceSynchronize();
}
}
}
......
......@@ -40,6 +40,9 @@
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
// #include "TileProcessor.h"
#endif // #ifndef JCUDA
#ifndef get_task_size
......@@ -337,9 +340,6 @@ extern "C" __global__ void calculate_tiles_offsets(
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
}
// __syncthreads();// __syncwarp();
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
}
......
......@@ -33,9 +33,9 @@
// all of the next 5 were disabled
#define NOCORR
#define NOCORR_TD
#define NOTEXTURES
#define NOTEXTURE_RGBA
#define NOTEXTURE_RGBAXXX
//#define NOTEXTURES
//#define NOTEXTURE_RGBA
//#define NOTEXTURE_RGBAXXX
#define SAVE_CLT
......@@ -574,11 +574,23 @@ void generate_RBGA_host(
int border_tile = pass >> 2;
int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
/* before CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
}
*/
// for CDP2
int ti_offset = (pass & 3) * (width * (tilesya >> 2)); // (TILES-X * (TILES-YA >> 2)); // 1/4
if (border_tile){
// ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2) - ntt;
ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
}
#ifdef DEBUG8A
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
......@@ -604,7 +616,7 @@ void generate_RBGA_host(
num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process
pntt, // ntt, // int * num_texture_tiles, // number of texture tiles to process
ti_offset, // gpu_texture_indices_offset,// add to gpu_texture_indices
gpu_texture_indices, // + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......@@ -1451,10 +1463,11 @@ int main(int argc, char **argv)
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
......