Commit a51d6a77 authored by Andrey Filippov

More editing to make the number of cameras dynamic

parent ee0cfc3b
@@ -983,27 +983,33 @@ __global__ void clear_texture_list(
 		int       height);               // <= TILES-Y, use for faster processing of LWIR images
 __global__ void mark_texture_tiles(
-		struct tp_task * gpu_tasks,
+		int       num_cams,
+		float   * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int       num_tiles,             // number of tiles in task list
 		int       width,                 // number of tiles in a row
 		int     * gpu_texture_indices);  // packed tile + bits (now only (1 << 7)
-__global__ void mark_texture_neighbor_tiles(
-		struct tp_task * gpu_tasks,
+__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
+		int       num_cams,
+		float   * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int       num_tiles,             // number of tiles in task list
 		int       width,                 // number of tiles in a row
 		int       height,                // number of tiles rows
 		int     * gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
 		int     * woi);                  // x,y,width,height of the woi
 __global__ void gen_texture_list(
-		struct tp_task * gpu_tasks,
+		int       num_cams,
+		float   * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task * gpu_tasks,
 		int       num_tiles,             // number of tiles in task list
 		int       width,                 // number of tiles in a row
 		int       height,                // number of tiles rows
 		int     * gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
 		int     * num_texture_tiles,     // number of texture tiles to process
-		int     * woi);                  // x,y,width,height of the woi
+		int     * woi);                  // min_x, min_y, max_x, max_y input
 __global__ void clear_texture_rbga(
 		int       texture_width,
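
The "27 floats for quad EO, 99 floats for LWIR16" sizes quoted throughout this commit follow from the flattened record layout the kernels below read: one task word, one packed txy word, one target disparity, then four disp_dist floats and two xy floats per camera. A minimal host-side sketch of that arithmetic (the helper name is illustrative, not part of the source):

#include <stdio.h>

/* Per-tile size of a flattened task in floats, assuming the layout
 * +0 task, +1 txy, +2 target_disparity,
 * +3 .. +3+4*num_cams-1  disp_dist[num_cams][4],
 * +3+4*num_cams .. end   xy[num_cams][2]                  */
static int flat_task_floats(int num_cams) {
    return 3 + 6 * num_cams; /* 3 scalars + (4 + 2) floats per camera */
}

int main(void) {
    printf("quad EO (4 cams): %d floats\n", flat_task_floats(4));  /* 27 */
    printf("LWIR16 (16 cams): %d floats\n", flat_task_floats(16)); /* 99 */
    return 0;
}
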
@@ -1011,7 +1017,7 @@ __global__ void clear_texture_rbga(
 		const size_t      texture_rbga_stride,  // in floats 8*stride
 		float           * gpu_texture_tiles);   // (number of colors +1 + ?)*16*16 rgba texture tiles
-inline __device__ int get_task_size(int num_cams);
+//inline __device__ int get_task_size(int num_cams);
 inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
 inline __device__ int get_task_txy (int num_tile, float * gpu_ftasks, int num_cams);
@@ -1034,11 +1040,13 @@ __global__ void index_correlate(
 		int             * pnum_corr_tiles);     // pointer to the length of correlation tasks array
 __global__ void create_nonoverlap_list(
-		struct tp_task  * gpu_tasks,
+		int               num_cams,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task
 		int               width,                // number of tiles in a row
 		int             * nonoverlap_list,      // pointer to the calculated number of non-zero tiles
 		int             * pnonoverlap_length);  // indices to gpu_tasks // should be initialized to zero
 __global__ void convert_correct_tiles(
 		int               num_cams,             // actual number of cameras
@@ -1869,7 +1877,8 @@ extern "C" __global__ void corr2D_normalize_inner(
 * This kernel launches others with CDP, from CPU it is just <<<1,1>>>
 *
 * @param num_cams             number of cameras
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
 * @param num_texture_tiles    allocated array - 8 integers (may be reduced to 4 later)
@@ -1895,7 +1904,8 @@ extern "C" __global__ void corr2D_normalize_inner(
 extern "C" __global__ void generate_RBGA(
 		int               num_cams,             // number of cameras used
 		// Parameters to generate texture tasks
-		struct tp_task  * gpu_tasks,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 		// declare arrays in device code?
 		int             * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
@@ -1937,7 +1947,9 @@ extern "C" __global__ void generate_RBGA(
 		dim3 blocks (blocks_t, 1, 1);
 		// mark used tiles in gpu_texture_indices memory
 		mark_texture_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int               num_cams,
+				gpu_ftasks,           // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
@@ -1948,7 +1960,9 @@ extern "C" __global__ void generate_RBGA(
 		*(woi + 2) = 0; // maximal x
 		*(woi + 3) = 0; // maximal y
 		mark_texture_neighbor_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int               num_cams,
+				gpu_ftasks,           // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // number of tiles rows
@@ -1967,7 +1981,9 @@ extern "C" __global__ void generate_RBGA(
 		*(num_texture_tiles+7) = 0;
 		gen_texture_list <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,             // int               num_cams,
+				gpu_ftasks,           // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // int height, // number of tiles rows
@@ -2003,7 +2019,7 @@ extern "C" __global__ void generate_RBGA(
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-		int num_cams_per_thread = TEXTURE_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+		int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
@@ -2094,7 +2110,9 @@ __global__ void clear_texture_rbga(
 * Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
 * neighbors for tiles (in 4 bits of the task field). Use 4x8=32 threads,
 *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
 * @param num_texture_tiles    number of texture tiles to process (allocated 8-element integer array)
@@ -2103,15 +2121,17 @@ __global__ void clear_texture_rbga(
 * @param height               full image height in tiles <= TILES-Y, use for faster processing of LWIR images
 */
 __global__ void prepare_texture_list(
-		struct tp_task * gpu_tasks,
+		int       num_cams,              // number of cameras used
+		float   * gpu_ftasks,            // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task * gpu_tasks,
 		int       num_tiles,             // number of tiles in task list
 		int     * gpu_texture_indices,   // packed tile + bits (now only (1 << 7)
 		// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
 		// border - down from the same 1/4 of the buffer
 		int     * num_texture_tiles,     // number of texture tiles to process (8 separate elements for accumulation)
 		int     * woi,                   // x,y,width,height of the woi
 		int       width,                 // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
 		int       height)                // <= TILES-Y, use for faster processing of LWIR images
 {
 //	int task_num = blockIdx.x;
 //	int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
@@ -2132,7 +2152,9 @@ __global__ void prepare_texture_list(
 		dim3 blocks (blocks_t, 1, 1);
 		// mark used tiles in gpu_texture_indices memory
 		mark_texture_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,
+				gpu_ftasks,
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,
 				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
@@ -2143,7 +2165,9 @@ __global__ void prepare_texture_list(
 		*(woi + 2) = 0; // maximal x
 		*(woi + 3) = 0; // maximal y
 		mark_texture_neighbor_tiles <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,
+				gpu_ftasks,
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // number of tiles rows
@@ -2161,7 +2185,9 @@ __global__ void prepare_texture_list(
 		*(num_texture_tiles+7) = 0;
 		gen_texture_list <<<blocks,threads>>>(
-				gpu_tasks,
+				num_cams,
+				gpu_ftasks,
+//				gpu_tasks,
 				num_tiles,            // number of tiles in task list
 				width,                // number of tiles in a row
 				height,               // int height, // number of tiles rows
@@ -2201,7 +2227,9 @@ __global__ void clear_texture_list(
 * Helper kernel for prepare_texture_list() (for generate_RBGA) - mark used tiles in
 * gpu_texture_indices memory
 *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras <= NUM_CAMS
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
@@ -2209,7 +2237,9 @@ __global__ void clear_texture_list(
 // threads (*,1,1), blocks = (*,1,1)
 __global__ void mark_texture_tiles(
-		struct tp_task  * gpu_tasks,
+		int               num_cams,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 		int               width,                // number of tiles in a row
 		int             * gpu_texture_indices)  // packed tile + bits (now only (1 << 7)
@@ -2218,11 +2248,15 @@ __global__ void mark_texture_tiles(
 	if (task_num >= num_tiles) {
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task;
+///	int task = gpu_tasks[task_num].task;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+///	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * width) = 1; // TILES-X) = 1;
 }
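
mark_texture_tiles() unpacks the txy word with the bit layout used throughout this commit: tile X in the low 16 bits, tile Y in the high 16. A tiny self-check of that packing (the helper name is illustrative):

#include <assert.h>

static int pack_txy(int x, int y) { return (y << 16) | (x & 0xffff); }

int main(void) {
    int cxy = pack_txy(5, 3);
    assert((cxy & 0xffff) == 5); /* x, exactly as the kernel decodes it */
    assert((cxy >> 16)    == 3); /* y */
    /* the kernel then marks gpu_texture_indices[x + y * width] */
    return 0;
}
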
@@ -2231,7 +2265,9 @@ __global__ void mark_texture_tiles(
 * bitmap of available neighbors in 4 directions (needed for alpha generation of
 * the result textures to fade along the border).
 *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param height               number of tiles rows
@@ -2240,7 +2276,9 @@ __global__ void mark_texture_tiles(
 */
 // threads (*,1,1), blocks = (*,1,1)
 __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
-		struct tp_task  * gpu_tasks,
+		int               num_cams,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 		int               width,                // number of tiles in a row
 		int               height,               // number of tiles rows
@@ -2252,11 +2290,14 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	if (task_num >= num_tiles) {
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task;
+///	int task = gpu_tasks[task_num].task;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+///	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	int x = (cxy & 0xffff);
 	int y = (cxy >> 16);
 	atomicMin(woi+0, x);
@@ -2264,16 +2305,12 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 	atomicMax(woi+2, x);
 	atomicMax(woi+3, y);
 	int d = 0;
-//	if ((y > 0)             && *(gpu_texture_indices +  x      + (y - 1) * TILES-X)) d |= (1 << TASK_TEXTURE_N_BIT);
-//	if ((x < (TILES-X - 1)) && *(gpu_texture_indices + (x + 1) +  y      * TILES-X)) d |= (1 << TASK_TEXTURE_E_BIT);
-//	if ((y < (TILES-Y - 1)) && *(gpu_texture_indices +  x      + (y + 1) * TILES-X)) d |= (1 << TASK_TEXTURE_S_BIT);
-//	if ((x > 0)             && *(gpu_texture_indices + (x - 1) +  y      * TILES-X)) d |= (1 << TASK_TEXTURE_W_BIT);
 	if ((y > 0)            && *(gpu_texture_indices +  x      + (y - 1) * width)) d |= (1 << TASK_TEXTURE_N_BIT);
 	if ((x < (width - 1))  && *(gpu_texture_indices + (x + 1) +  y      * width)) d |= (1 << TASK_TEXTURE_E_BIT);
 	if ((y < (height - 1)) && *(gpu_texture_indices +  x      + (y + 1) * width)) d |= (1 << TASK_TEXTURE_S_BIT);
 	if ((x > 0)            && *(gpu_texture_indices + (x - 1) +  y      * width)) d |= (1 << TASK_TEXTURE_W_BIT);
-	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
+///	gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
+	*(int *) (gpu_ftasks + get_task_size(num_cams) * task_num) = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
 }
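
The final write uses the XOR-AND-XOR idiom ((task ^ d) & TASK_TEXTURE_BITS) ^ task, which replaces exactly the TASK_TEXTURE_BITS field of the task word with the neighbor bitmap d while leaving all other bits intact. A quick check with a stand-in mask:

#include <assert.h>

int main(void) {
    const int MASK = 0x0f0;      /* stand-in for TASK_TEXTURE_BITS */
    int task = 0xabc, d = 0x050; /* arbitrary test values */
    int merged = ((task ^ d) & MASK) ^ task;
    assert((merged & ~MASK) == (task & ~MASK)); /* outside mask: unchanged */
    assert((merged &  MASK) == (d    &  MASK)); /* inside mask: taken from d */
    return 0;
}
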
 /**
@@ -2282,15 +2319,18 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
 * of non-overlapping tiles (odd/even rows/columns). At first made 8 lists, with pairs of
 * growing up and down for inner and border tiles, but now border attribute is not
 * used anymore.
- *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
 * @param num_texture_tiles    number of texture tiles to process (allocated 8-element integer array)
 * @param woi                  4-element int array (x,y,width,height of the woi, in tiles)
 */
 __global__ void gen_texture_list(
-		struct tp_task  * gpu_tasks,
+		int               num_cams,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+///		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 		int               width,                // number of tiles in a row
 		int               height,               // number of tiles rows
@@ -2305,11 +2345,13 @@ __global__ void gen_texture_list(
 		return; // nothing to do
 	}
-	int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
+///	int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
+	int task = get_task_task(task_num, gpu_ftasks, num_cams);
 	if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
 		return; // NOP tile
 	}
-	int cxy = gpu_tasks[task_num].txy;
+//	int cxy = gpu_tasks[task_num].txy;
+	int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
 	int x = (cxy & 0xffff);
 	int y = (cxy >> 16);
@@ -2325,22 +2367,18 @@ __global__ void gen_texture_list(
 // don't care if calculate extra pixels that still fit into memory
-//	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILES-X - 1)) || (y == woi[3]);
 	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (width - 1)) || (y == woi[3]);
 	int buff_head = 0;
 	int num_offset = 0;
 	if (x & 1) {
-//		buff_head += TILES-X * (TILES-YA >> 2); //TILES-YA - 2 LSB == 00
 		buff_head += width * (tilesya >> 2); //TILES-YA - 2 LSB == 00
 		num_offset += 2; // int *
 	}
 	if (y & 1) {
-//		buff_head += TILES-X * (TILES-YA >> 1);
 		buff_head += width * (tilesya >> 1);
 		num_offset += 4; // int *
 	}
 	if (is_border){
-//		buff_head += (TILES-X * (TILES-YA >> 2) - 1); // end of the buffer
 		buff_head += (width * (tilesya >> 2) - 1); // end of the buffer
 		num_offset += 1; // int *
 	}
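
The three if-blocks place each tile into one of eight sublists: x parity and y parity select a quarter of the index buffer, border tiles grow down from the end of that quarter while inner tiles grow up, and num_offset selects the matching counter in num_texture_tiles[]. A host-side sketch of the placement, assuming width = 80 and tilesya = 64 (tile rows rounded up so the two LSBs are zero, as the comments suggest):

#include <stdio.h>

int main(void) {
    int width = 80, tilesya = 64; /* assumed example dimensions */
    for (int sel = 0; sel < 8; sel++) {
        int xodd = (sel >> 2) & 1, yodd = (sel >> 1) & 1, border = sel & 1;
        int buff_head = 0, num_offset = 0;
        if (xodd)   { buff_head += width * (tilesya >> 2);     num_offset += 2; }
        if (yodd)   { buff_head += width * (tilesya >> 1);     num_offset += 4; }
        if (border) { buff_head += width * (tilesya >> 2) - 1; num_offset += 1; }
        printf("x&1=%d y&1=%d border=%d -> start %5d, counter %d\n",
               xodd, yodd, border, buff_head, num_offset);
    }
    return 0;
}
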
@@ -2360,13 +2398,11 @@ __global__ void gen_texture_list(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG12
-//	*(gpu_texture_indices + buf_offset) = task | ((x + y * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 	*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 }
-inline __device__ int get_task_size(int num_cams){
-	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
-}
+//inline __device__ int get_task_size(int num_cams){
+//	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+//}
 inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams) {
 	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
@@ -2374,7 +2410,6 @@ inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_ca
 inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams) {
 	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
 }
-
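
These accessors read the integer fields (task word, packed txy) out of a float array by reinterpreting the bit pattern, so host code has to build gpu_ftasks the same way: copy the int bits, never assign through a float conversion. A small host-side sketch under that assumption (helper names are illustrative):

#include <string.h>
#include <assert.h>

static int flat_task_size(int num_cams) { return 3 + 6 * num_cams; }

int main(void) {
    enum { NUM_CAMS_USED = 4, TILE = 1 };
    float ftasks[2 * (3 + 6 * NUM_CAMS_USED)] = {0};
    int task = 0x80, txy = (3 << 16) | 5; /* texture bit set, tile (5,3) */
    memcpy(ftasks + flat_task_size(NUM_CAMS_USED) * TILE + 0, &task, sizeof task);
    memcpy(ftasks + flat_task_size(NUM_CAMS_USED) * TILE + 1, &txy,  sizeof txy);
    int rb_task, rb_txy; /* read back like get_task_task()/get_task_txy() */
    memcpy(&rb_task, ftasks + flat_task_size(NUM_CAMS_USED) * TILE + 0, sizeof rb_task);
    memcpy(&rb_txy,  ftasks + flat_task_size(NUM_CAMS_USED) * TILE + 1, sizeof rb_txy);
    assert(rb_task == task && rb_txy == txy);
    return 0;
}
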
 /**
 * Helper kernel for convert_direct() - generates dense list of tiles for direct MCLT.
 * Tile order from the original (sparse) list is not preserved
@@ -2408,14 +2443,18 @@ __global__ void index_direct(
 * Helper kernel for textures_nonoverlap() - generates dense list of tiles for non-overlap
 * (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
 *
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param num_cams             number of cameras <= NUM_CAMS
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//* @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param nonoverlap_list      integer array to place the generated list
 * @param pnonoverlap_length   single-element integer array to return generated list length
 */
 __global__ void create_nonoverlap_list(
-		struct tp_task  * gpu_tasks,
+		int               num_cams,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task
 		int               width,                // number of tiles in a row
 		int             * nonoverlap_list,      // pointer to the calculated number of non-zero tiles
@@ -2425,13 +2464,16 @@ __global__ void create_nonoverlap_list(
 	if (num_tile >= num_tiles){
 		return;
 	}
-	if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
+	int task_task = get_task_task(num_tile, gpu_ftasks, num_cams);
+///	if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
+	if ((task_task & TASK_TEXTURE_BITS) == 0){
 		return; // nothing to do
 	}
-	int cxy = gpu_tasks[num_tile].txy;
-//	int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
+///	int cxy = gpu_tasks[num_tile].txy;
+	int cxy = get_task_txy(num_tile, gpu_ftasks, num_cams);
 	int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
-	if (gpu_tasks[num_tile].task != 0) {
+//	if (gpu_tasks[num_tile].task != 0) {
+	if (task_task != 0) {
 		nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
 	}
 }
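
texture_task_code packs the linear tile index above CORR_NTILE_SHIFT and ORs in the list and texture bits. A decode check with stand-in constants (the real values live in tp_defines.h):

#include <assert.h>

int main(void) {
    const int CORR_NTILE_SHIFT = 8, LIST_TEXTURE_BIT = 7; /* stand-ins */
    const int TASK_TEXTURE_BITS = 0x0f;                   /* stand-in  */
    int x = 5, y = 3, width = 80;
    int code = ((x + y * width) << CORR_NTILE_SHIFT)
             | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
    assert((code >> CORR_NTILE_SHIFT) == x + y * width); /* tile index back */
    assert(code & (1 << LIST_TEXTURE_BIT));              /* list-entry flag */
    return 0;
}
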
@@ -2441,13 +2483,13 @@ __global__ void create_nonoverlap_list(
 * With the quad camera each tile may generate up to 6 pairs (int array elements)
 * Tiles are not ordered, but the correlation pairs for each tile are
 *
 * @param num_cams             number of cameras <= NUM_CAMS
 * @param sel_pairs            array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
 * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param gpu_corr_indices     integer array to place the generated list
 * @param pnum_corr_tiles      single-element integer array to return generated list length
 */
 __global__ void index_correlate(
 		int               num_cams,
@@ -2746,7 +2788,8 @@ __global__ void convert_correct_tiles(
 * This kernel launches others with CDP, from CPU it is just <<<1,1>>>
 *
 * @param num_cams             number of cameras
- * @param gpu_tasks            array of per-tile tasks (struct tp_task)
+ * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles in gpu_tasks array prepared for processing
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
 * @param num_texture_tiles    allocated array - 8 integers (may be reduced to 4 later)
@@ -2768,7 +2811,8 @@ __global__ void convert_correct_tiles(
 */
 extern "C" __global__ void textures_nonoverlap(
 		int               num_cams,             // number of cameras
-		struct tp_task  * gpu_tasks,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 //		int               num_tilesx,           // number of tiles in a row
 		// declare arrays in device code?
@@ -2802,13 +2846,15 @@ extern "C" __global__ void textures_nonoverlap(
 	if (threadIdx.x == 0) { // only 1 thread, 1 block
 		*pnum_texture_tiles = 0;
 		create_nonoverlap_list<<<blocks0,threads0>>>(
-				gpu_tasks,            // struct tp_task  * gpu_tasks,
+				num_cams,             // int               num_cams,
+				gpu_ftasks,           // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,            // struct tp_task  * gpu_tasks,
 				num_tiles,            // int               num_tiles,          // number of tiles in task
 				num_tilesx,           // int               width,              // number of tiles in a row
 				gpu_texture_indices,  // int             * nonoverlap_list,    // pointer to the calculated number of non-zero tiles
 				pnum_texture_tiles);  // int             * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
 		cudaDeviceSynchronize();
-		int num_cams_per_thread = TEXTURE_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+		int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
 //		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 		dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 //		dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
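
The switch from TEXTURE_THREADS to NUM_THREADS keeps a fixed 32-thread tile geometry: with 8 threads per tile row, 4 cameras are processed in parallel and larger rigs repeat the pass ("4 cameras parallel, then repeat"). A sketch of the resulting pass counts:

#include <stdio.h>

int main(void) {
    const int NUM_THREADS = 32, TEXTURE_THREADS_PER_TILE = 8;
    int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; /* 4 */
    for (int num_cams = 4; num_cams <= 16; num_cams *= 2)
        printf("num_cams=%2d -> dim3(%d,%d,1), %d pass(es) over cameras\n",
               num_cams, TEXTURE_THREADS_PER_TILE, num_cams_per_thread,
               (num_cams + num_cams_per_thread - 1) / num_cams_per_thread);
    return 0;
}
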
@@ -104,8 +104,9 @@ extern "C" __global__ void corr2D_combine(
 		float           * gpu_corrs_combo);     // combined correlation output (one per tile)
 extern "C" __global__ void textures_nonoverlap(
-		int               num_cams,             // number of cameras used
-		struct tp_task  * gpu_tasks,
+		int               num_cams,             // number of cameras
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 //		int               num_tilesx,           // number of tiles in a row
 		// declare arrays in device code?
@@ -151,7 +152,8 @@ extern "C" __global__ void imclt_rbg(
 extern "C" __global__ void generate_RBGA(
 		int               num_cams,             // number of cameras used
 		// Parameters to generate texture tasks
-		struct tp_task  * gpu_tasks,
+		float           * gpu_ftasks,           // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task  * gpu_tasks,
 		int               num_tiles,            // number of tiles in task list
 		// declare arrays in device code?
 		int             * gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
@@ -44,16 +44,18 @@
 // Using NUM_CAMS threads per tile
 #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
-#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+///#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 //#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+//#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 #define DBG_CAM 3

 __device__ void printGeometryCorrection(struct gc * g);
 __device__ void printExtrinsicCorrection(corr_vector * cv);

 /**
 * Calculate non-distorted radius from distorted using table approximation
 * @param rDist distorted radius
@@ -123,11 +125,28 @@ __constant__ int offset_derivs = 1; // 1..4 // should be nex
 __constant__ int offset_matrices = 5; // 5..11
 __constant__ int offset_tmp = 12; // 12..15

+//inline __device__ int get_task_size_gc(int num_cams);
+inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams);
+inline __device__ int get_task_txy_gc (int num_tile, float * gpu_ftasks, int num_cams);
+
+//inline __device__ int get_task_size_gc(int num_cams){
+//	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+//}
+
+inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
+}
+inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+	return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
+}
+
 /**
 * Calculate rotation matrices and derivatives by az, tilt, roll, zoom
 * NUM_CAMS blocks of 3,3,3 tiles
 */
 extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv         * gpu_rot_deriv)
 {
@@ -282,18 +301,27 @@ extern "C" __global__ void calc_rot_deriv(
 extern "C" __global__ void calculate_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  num_cams,
+		float              * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task     * gpu_tasks,
 		int                  num_tiles,         // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
 		float              * gpu_rByRDist,      // length should match RBYRDIST_LEN
 		trot_deriv         * gpu_rot_deriv)
 {
-	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
-	dim3 grid_geom   ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
+///	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
+///	dim3 grid_geom   ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
+	int tiles_per_block_geom = NUM_THREADS/ num_cams;
+	dim3 threads_geom(num_cams,tiles_per_block_geom, 1);
+	dim3 grid_geom   ((num_tiles + tiles_per_block_geom - 1)/tiles_per_block_geom, 1, 1);
+//#define NUM_THREADS 32
 	if (threadIdx.x == 0) { // always 1
 		get_tiles_offsets<<<grid_geom,threads_geom>>> (
-				gpu_tasks,                // struct tp_task     * gpu_tasks,
+				num_cams,                 // int                  num_cams,
+				gpu_ftasks,               // float              * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//				gpu_tasks,                // struct tp_task     * gpu_tasks,
 				num_tiles,                // int                  num_tiles, // number of tiles in task list
 				gpu_geometry_correction,  // struct gc          * gpu_geometry_correction,
 				gpu_correction_vector,    // struct corr_vector * gpu_correction_vector,
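
Replacing the compile-time TILES_PER_BLOCK_GEOM, the launch above packs NUM_THREADS(=32) threads per block as num_cams columns by NUM_THREADS/num_cams tile rows. A quick table of the geometry this produces, assuming a 1000-tile task list:

#include <stdio.h>

int main(void) {
    const int NUM_THREADS = 32;
    int num_tiles = 1000; /* assumed task-list length */
    for (int num_cams = 4; num_cams <= 16; num_cams *= 2) {
        int tiles_per_block_geom = NUM_THREADS / num_cams;
        int blocks = (num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom;
        printf("num_cams=%2d -> threads(%2d,%d,1), %3d blocks\n",
               num_cams, num_cams, tiles_per_block_geom, blocks);
    }
    return 0;
}
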
@@ -313,66 +341,76 @@ extern "C" __global__ void calculate_tiles_offsets(
 */
 extern "C" __global__ void get_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  num_cams,
+//		struct tp_task     * gpu_tasks,
+		float              * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 		int                  num_tiles,         // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
 		float              * gpu_rByRDist,      // length should match RBYRDIST_LEN
 		trot_deriv         * gpu_rot_deriv)
 {
+	int task_size = get_task_size(num_cams);
 	int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
 	int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
+	int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
 	__shared__ struct gc geometry_correction;
 	__shared__ float rByRDist [RBYRDIST_LEN];
 	__shared__ struct corr_vector extrinsic_corr;
 	__shared__ trot_deriv rot_deriv;
-	__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
+///	__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
+	__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
 	float pXY[2]; // result to be copied to task
+//blockDim.y
 	// copy data common to all threads
 	{
+		int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy);
 		float * gcp_local  = (float *) &geometry_correction;
 		float * gcp_global = (float *) gpu_geometry_correction;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_GC; i++){
+		for (int i = 0; i < cycles_copy_gc; i++){
 			if (offset < sizeof(struct gc)/sizeof(float)) {
 				*(gcp_local + offset) = *(gcp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	{
+		int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy);
 		float * cvp_local  = (float *) &extrinsic_corr;
 		float * cvp_global = (float *) gpu_correction_vector;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_CV; i++){
+		for (int i = 0; i < cycles_copy_cv; i++){
 			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
 				*(cvp_local + offset) = *(cvp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	// TODO: maybe it is better to use system memory and not read all table?
 	{
+		int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
 		float * rByRDistp_local  = (float *) rByRDist;
 		float * rByRDistp_global = (float *) gpu_rByRDist;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_RBRD; i++){
+		for (int i = 0; i < cycles_copy_rbrd; i++){
 			if (offset < RBYRDIST_LEN) {
 				*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
 	{
+		int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy;
 		float * rots_local  = (float *) &rot_deriv;
 		float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
 		int offset = thread_xy;
-		for (int i = 0; i < CYCLES_COPY_ROTS; i++){
+		for (int i = 0; i < cycles_copy_rot; i++){
 			if (offset < sizeof(trot_deriv)/sizeof(float)) {
 				*(rots_local + offset) = *(rots_global + offset);
 			}
-			offset += THREADS_PER_BLOCK_GEOM;
+			offset += dim_xy;
 		}
 	}
 	__syncthreads();
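
All four copy blocks above are the same pattern: dim_xy cooperating threads stride through an N-float source, so the former compile-time CYCLES_COPY_* constants become runtime cycle counts. A hedged sketch of the pattern as a device helper (the source inlines it rather than defining such a function):

__device__ static void copy_to_shared(float *dst, const float *src,
                                      int nfloats, int thread_xy, int dim_xy)
{
    int cycles = (nfloats + dim_xy - 1) / dim_xy; // replaces CYCLES_COPY_*
    int offset = thread_xy;                       // each thread starts at its index
    for (int i = 0; i < cycles; i++) {
        if (offset < nfloats)
            dst[offset] = src[offset];
        offset += dim_xy;                         // stride by the block size
    }
    // caller issues __syncthreads() before reading dst, as the kernel does
}
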
@@ -411,8 +449,10 @@ extern "C" __global__ void get_tiles_offsets(
 */
 	// common code, calculated in parallel
-	int cxy = gpu_tasks[task_num].txy;
-	float disparity = gpu_tasks[task_num].target_disparity;
+///	int cxy = gpu_tasks[task_num].txy;
+///	float disparity = gpu_tasks[task_num].target_disparity;
+	int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
+	float disparity = * (gpu_ftasks + task_size * task_num + 2);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
 #ifdef DEBUG23
@@ -638,11 +678,15 @@ extern "C" __global__ void get_tiles_offsets(
 	}
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21
-	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
-	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
-	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
-	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+///	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
+///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
+///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
+///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+	float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
+	*(disp_dist_p++) = disp_dist[0]; // global memory
+	*(disp_dist_p++) = disp_dist[1];
+	*(disp_dist_p++) = disp_dist[2];
+	*(disp_dist_p++) = disp_dist[3];

 //	imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
 //	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
@@ -697,8 +741,11 @@ extern "C" __global__ void get_tiles_offsets(
 		}
 	}
 	// copy results to global memory pXY, disp_dist
-	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
-	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+//	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
+//	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+	float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
+	*(tile_xy_p++) = pXY[0]; // global memory
+	*(tile_xy_p++) = pXY[1]; // global memory
 }
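
The two pointer computations above fix where each thread writes inside the flattened record: for camera ncam, disp_dist starts at float offset 3 + ncam*4 and xy at 3 + num_cams*4 + ncam*2. A host-side printout of those offsets for a 16-camera task (record stride 99 floats):

#include <stdio.h>

int main(void) {
    int num_cams = 16;                /* LWIR16 */
    int task_size = 3 + 6 * num_cams; /* 99-float record stride */
    for (int ncam = 0; ncam < num_cams; ncam += 5)
        printf("cam %2d: disp_dist at +%2d..+%2d, xy at +%2d..+%2d of %d\n",
               ncam, 3 + ncam * 4, 3 + ncam * 4 + 3,
               3 + num_cams * 4 + ncam * 2, 3 + num_cams * 4 + ncam * 2 + 1,
               task_size);
    return 0;
}
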
extern "C" __global__ void calcReverseDistortionTable( extern "C" __global__ void calcReverseDistortionTable(
@@ -41,6 +41,8 @@
 #include "tp_defines.h"
 #endif

+#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+
 #define NVRTC_BUG 1
 #ifndef M_PI
 #define M_PI 3.14159265358979323846 /* pi */
@@ -60,8 +62,9 @@ struct tp_task {
 		int txy;
 		unsigned short sxy[2];
 	};
-	float xy[NUM_CAMS][2];
 	float target_disparity;
+	float xy[NUM_CAMS][2];
+//	float target_disparity;
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
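
The new get_task_size(x) macro works because the NUM_CAMS-sized struct holds 3 + 6*NUM_CAMS floats, so trimming 6 floats per unused camera leaves 3 + 6*x. A standalone check, assuming NUM_CAMS = 16, 4-byte int/float, no struct padding, and the field set shown above:

#include <stdio.h>

#define NUM_CAMS 16
struct tp_task {
    int   task;
    union { int txy; unsigned short sxy[2]; };
    float target_disparity;
    float xy[NUM_CAMS][2];
    float disp_dist[NUM_CAMS][4];
};
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))

int main(void) {
    printf("get_task_size(4)  = %zu\n", get_task_size(4));  /* 27 (quad EO) */
    printf("get_task_size(16) = %zu\n", get_task_size(16)); /* 99 (LWIR16)  */
    return 0;
}
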
@@ -142,7 +145,9 @@ struct gc {
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  num_cams,
+//		struct tp_task     * gpu_tasks,
+		float              * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 		int                  num_tiles,         // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -150,7 +155,9 @@ extern "C" __global__ void get_tiles_offsets(
 		trot_deriv         * gpu_rot_deriv);

 extern "C" __global__ void calculate_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  num_cams,
+		float              * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task     * gpu_tasks,
 		int                  num_tiles,         // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -160,6 +167,7 @@ extern "C" __global__ void calculate_tiles_offsets(
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv         * gpu_rot_deriv);
@@ -170,3 +178,4 @@ extern "C" __global__ void calcReverseDistortionTable(
 		float * rByRDist);
+
@@ -715,6 +715,7 @@ int main(int argc, char **argv)
 	}
 	calc_rot_deriv<<<grid_rot,threads_rot>>> (
+			num_cams,                // int                  num_cams,
 			gpu_correction_vector ,  // struct corr_vector * gpu_correction_vector,
 			gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
@@ -821,6 +822,7 @@ int main(int argc, char **argv)
 	}
 /*
 	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+			num_cams,                // int                  num_cams,
 			gpu_tasks,               // struct tp_task     * gpu_tasks,
 			tp_task_size,            // int                  num_tiles, // number of tiles in task list
 			gpu_geometry_correction, // struct gc          * gpu_geometry_correction,
@@ -829,7 +831,9 @@ int main(int argc, char **argv)
 			gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
 */
 	calculate_tiles_offsets<<<1,1>>> (
-			gpu_tasks,               // struct tp_task     * gpu_tasks,
+			num_cams,                // int                  num_cams,
+			gpu_ftasks,              // float              * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//			gpu_tasks,               // struct tp_task     * gpu_tasks,
 			tp_task_size,            // int                  num_tiles, // number of tiles in task list
 			gpu_geometry_correction, // struct gc          * gpu_geometry_correction,
 			gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
@@ -1273,7 +1277,8 @@ int main(int argc, char **argv)
 	cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
 	textures_nonoverlap<<<1,1>>> (
 			num_cams,            // int               num_cams,  // number of cameras used
-			gpu_tasks,           // struct tp_task  * gpu_tasks,
+			gpu_ftasks,          // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
+//			gpu_tasks,           // struct tp_task  * gpu_tasks,
 			tp_task_size,        // int               num_tiles, // number of tiles in task list
 			// declare arrays in device code?
 			gpu_texture_indices, // int             * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
@@ -1365,7 +1370,8 @@ int main(int argc, char **argv)
 	generate_RBGA<<<1,1>>> (
 			num_cams,            // int               num_cams,  // number of cameras used
 			// Parameters to generate texture tasks
-			gpu_tasks,           // struct tp_task  * gpu_tasks,
+			gpu_ftasks,          // float           * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//			gpu_tasks,           // struct tp_task  * gpu_tasks,
 			tp_task_size,        // int               num_tiles, // number of tiles in task list
 			// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
 			gpu_texture_indices, // int             * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
@@ -56,7 +56,8 @@
 #define CORR_TILES_PER_BLOCK           4
 #define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
 #define CORR_TILES_PER_BLOCK_COMBINE   4 // increase to 16?
-#define TEXTURE_THREADS               32 //
+//#define TEXTURE_THREADS             32 //
+#define NUM_THREADS                   32
 #define TEXTURE_THREADS_PER_TILE       8
 #define TEXTURE_TILES_PER_BLOCK        1
 #define IMCLT_THREADS_PER_TILE        16