Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
a51d6a77
Commit
a51d6a77
authored
Nov 25, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
More editing to make dynamic number of cameras
parent
ee0cfc3b
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
231 additions
and
120 deletions
+231
-120
TileProcessor.cuh
src/TileProcessor.cuh
+128
-82
TileProcessor.h
src/TileProcessor.h
+5
-3
geometry_correction.cu
src/geometry_correction.cu
+75
-28
geometry_correction.h
src/geometry_correction.h
+12
-3
test_tp.cu
src/test_tp.cu
+9
-3
tp_defines.h
src/tp_defines.h
+2
-1
No files found.
src/TileProcessor.cuh
View file @
a51d6a77
...
@@ -983,27 +983,33 @@ __global__ void clear_texture_list(
...
@@ -983,27 +983,33 @@ __global__ void clear_texture_list(
int height); // <= TILES-Y, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images
__global__ void mark_texture_tiles(
__global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_cams,
int num_tiles, // number of tiles in task list
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int width, // number of tiles in a row
/// struct tp_task * gpu_tasks,
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
__global__ void mark_texture_neighbor_tiles(
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
int * woi);
// x,y,width,height of the woi
__global__ void gen_texture_list(
__global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_cams,
int num_tiles, // number of tiles in task list
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int width, // number of tiles in a row
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices,
// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * num_texture_tiles,
// number of texture tiles to process
int * woi);
// x,y,width,height of the woi
int * woi);
// min_x, min_y, max_x, max_y input
__global__ void clear_texture_rbga(
__global__ void clear_texture_rbga(
int texture_width,
int texture_width,
...
@@ -1011,7 +1017,7 @@ __global__ void clear_texture_rbga(
...
@@ -1011,7 +1017,7 @@ __global__ void clear_texture_rbga(
const size_t texture_rbga_stride, // in floats 8*stride
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
inline __device__ int get_task_size(int num_cams);
//
inline __device__ int get_task_size(int num_cams);
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams);
...
@@ -1034,11 +1040,13 @@ __global__ void index_correlate(
...
@@ -1034,11 +1040,13 @@ __global__ void index_correlate(
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void create_nonoverlap_list(
__global__ void create_nonoverlap_list(
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
int width,
// number of tiles in a row
int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * pnonoverlap_length);
// indices to gpu_tasks // should be initialized to zero
int * pnonoverlap_length); // indices to gpu_tasks // should be initialized to zero
__global__ void convert_correct_tiles(
__global__ void convert_correct_tiles(
int num_cams, // actual number of cameras
int num_cams, // actual number of cameras
...
@@ -1869,7 +1877,8 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1869,7 +1877,8 @@ extern "C" __global__ void corr2D_normalize_inner(
* This kernel launches others with CDP, from CPU it is just <<<1,1>>>
* This kernel launches others with CDP, from CPU it is just <<<1,1>>>
*
*
* @param num_cams number of cameras
* @param num_cams number of cameras
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
//* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param num_texture_tiles allocated array - 8 integers (may be reduced to 4 later)
* @param num_texture_tiles allocated array - 8 integers (may be reduced to 4 later)
...
@@ -1895,7 +1904,8 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1895,7 +1904,8 @@ extern "C" __global__ void corr2D_normalize_inner(
extern "C" __global__ void generate_RBGA(
extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
int num_cams, // number of cameras used
// Parameters to generate texture tasks
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
// declare arrays in device code?
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
...
@@ -1937,7 +1947,9 @@ extern "C" __global__ void generate_RBGA(
...
@@ -1937,7 +1947,9 @@ extern "C" __global__ void generate_RBGA(
dim3 blocks(blocks_t, 1, 1);
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
mark_texture_tiles <<<blocks,threads>>>(
gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width, // number of tiles in a row
width, // number of tiles in a row
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
...
@@ -1948,7 +1960,9 @@ extern "C" __global__ void generate_RBGA(
...
@@ -1948,7 +1960,9 @@ extern "C" __global__ void generate_RBGA(
*(woi + 2) = 0; // maximal x
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
mark_texture_neighbor_tiles <<<blocks,threads>>>(
gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width, // number of tiles in a row
width, // number of tiles in a row
height, // number of tiles rows
height, // number of tiles rows
...
@@ -1967,7 +1981,9 @@ extern "C" __global__ void generate_RBGA(
...
@@ -1967,7 +1981,9 @@ extern "C" __global__ void generate_RBGA(
*(num_texture_tiles+7) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
gen_texture_list <<<blocks,threads>>>(
gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width, // number of tiles in a row
width, // number of tiles in a row
height, // int height, // number of tiles rows
height, // int height, // number of tiles rows
...
@@ -2003,7 +2019,7 @@ extern "C" __global__ void generate_RBGA(
...
@@ -2003,7 +2019,7 @@ extern "C" __global__ void generate_RBGA(
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int num_cams_per_thread =
TEXTURE
_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
int num_cams_per_thread =
NUM
_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
...
@@ -2094,7 +2110,9 @@ __global__ void clear_texture_rbga(
...
@@ -2094,7 +2110,9 @@ __global__ void clear_texture_rbga(
* Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
* Helper kernel for generate_RBGA() - prepare list of texture tiles, woi, and calculate orthogonal
* neighbors for tiles (in 4 bits of the task field). Use 4x8=32 threads,
* neighbors for tiles (in 4 bits of the task field). Use 4x8=32 threads,
*
*
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_cams number of cameras
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
//* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param num_texture_tiles number of texture tiles to process (allocated 8-element integer array)
* @param num_texture_tiles number of texture tiles to process (allocated 8-element integer array)
...
@@ -2103,15 +2121,17 @@ __global__ void clear_texture_rbga(
...
@@ -2103,15 +2121,17 @@ __global__ void clear_texture_rbga(
* @param height full image height in tiles <= TILES-Y, use for faster processing of LWIR images
* @param height full image height in tiles <= TILES-Y, use for faster processing of LWIR images
*/
*/
__global__ void prepare_texture_list(
__global__ void prepare_texture_list(
struct tp_task * gpu_tasks,
int num_cams, // number of cameras used
int num_tiles, // number of tiles in task list
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// struct tp_task * gpu_tasks,
// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
int num_tiles, // number of tiles in task list
// border - down from the same 1/4 of the buffer
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
int * woi, // x,y,width,height of the woi
// border - down from the same 1/4 of the buffer
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int height) // <= TILES-Y, use for faster processing of LWIR images
int * woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height) // <= TILES-Y, use for faster processing of LWIR images
{
{
// int task_num = blockIdx.x;
// int task_num = blockIdx.x;
// int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
// int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
...
@@ -2132,7 +2152,9 @@ __global__ void prepare_texture_list(
...
@@ -2132,7 +2152,9 @@ __global__ void prepare_texture_list(
dim3 blocks(blocks_t, 1, 1);
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
mark_texture_tiles <<<blocks,threads>>>(
gpu_tasks,
num_cams,
gpu_ftasks,
// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width,
width,
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
...
@@ -2143,7 +2165,9 @@ __global__ void prepare_texture_list(
...
@@ -2143,7 +2165,9 @@ __global__ void prepare_texture_list(
*(woi + 2) = 0; // maximal x
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
mark_texture_neighbor_tiles <<<blocks,threads>>>(
gpu_tasks,
num_cams,
gpu_ftasks,
// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width, // number of tiles in a row
width, // number of tiles in a row
height, // number of tiles rows
height, // number of tiles rows
...
@@ -2161,7 +2185,9 @@ __global__ void prepare_texture_list(
...
@@ -2161,7 +2185,9 @@ __global__ void prepare_texture_list(
*(num_texture_tiles+7) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
gen_texture_list <<<blocks,threads>>>(
gpu_tasks,
num_cams,
gpu_ftasks,
// gpu_tasks,
num_tiles, // number of tiles in task list
num_tiles, // number of tiles in task list
width, // number of tiles in a row
width, // number of tiles in a row
height, // int height, // number of tiles rows
height, // int height, // number of tiles rows
...
@@ -2201,7 +2227,9 @@ __global__ void clear_texture_list(
...
@@ -2201,7 +2227,9 @@ __global__ void clear_texture_list(
* Helper kernel for prepare_texture_list() (for generate_RBGA) - mark used tiles in
* Helper kernel for prepare_texture_list() (for generate_RBGA) - mark used tiles in
* gpu_texture_indices memory
* gpu_texture_indices memory
*
*
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_cams number of cameras <= NUM_CAMS
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
//* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param width number of tiles in a row
* @param width number of tiles in a row
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param gpu_texture_indices allocated array - 1 integer per tile to process
...
@@ -2209,7 +2237,9 @@ __global__ void clear_texture_list(
...
@@ -2209,7 +2237,9 @@ __global__ void clear_texture_list(
// threads (*,1,1), blocks = (*,1,1)
// threads (*,1,1), blocks = (*,1,1)
__global__ void mark_texture_tiles(
__global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int * gpu_texture_indices) // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices) // packed tile + bits (now only (1 << 7)
...
@@ -2218,11 +2248,15 @@ __global__ void mark_texture_tiles(
...
@@ -2218,11 +2248,15 @@ __global__ void mark_texture_tiles(
if (task_num >= num_tiles) {
if (task_num >= num_tiles) {
return; // nothing to do
return; // nothing to do
}
}
int task = gpu_tasks[task_num].task;
/// int task = gpu_tasks[task_num].task;
int task = get_task_task(task_num, gpu_ftasks, num_cams);
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
return; // NOP tile
}
}
int cxy = gpu_tasks[task_num].txy;
/// int cxy = gpu_tasks[task_num].txy;
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * width) = 1; // TILES-X) = 1;
*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * width) = 1; // TILES-X) = 1;
}
}
...
@@ -2231,7 +2265,9 @@ __global__ void mark_texture_tiles(
...
@@ -2231,7 +2265,9 @@ __global__ void mark_texture_tiles(
* bitmap of available neighbors in 4 directions (needed for alpha generation of
* bitmap of available neighbors in 4 directions (needed for alpha generation of
* the result textures to fade along the border).
* the result textures to fade along the border).
*
*
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_cams number of cameras
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param width number of tiles in a row
* @param width number of tiles in a row
* @param height number of tiles rows
* @param height number of tiles rows
...
@@ -2240,7 +2276,9 @@ __global__ void mark_texture_tiles(
...
@@ -2240,7 +2276,9 @@ __global__ void mark_texture_tiles(
*/
*/
// threads (*,1,1), blocks = (*,1,1)
// threads (*,1,1), blocks = (*,1,1)
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
...
@@ -2252,11 +2290,14 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...
@@ -2252,11 +2290,14 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
if (task_num >= num_tiles) {
if (task_num >= num_tiles) {
return; // nothing to do
return; // nothing to do
}
}
int task = gpu_tasks[task_num].task;
/// int task = gpu_tasks[task_num].task;
int task = get_task_task(task_num, gpu_ftasks, num_cams);
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
return; // NOP tile
}
}
int cxy = gpu_tasks[task_num].txy;
/// int cxy = gpu_tasks[task_num].txy;
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
int x = (cxy & 0xffff);
int x = (cxy & 0xffff);
int y = (cxy >> 16);
int y = (cxy >> 16);
atomicMin(woi+0, x);
atomicMin(woi+0, x);
...
@@ -2264,16 +2305,12 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...
@@ -2264,16 +2305,12 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
atomicMax(woi+2, x);
atomicMax(woi+2, x);
atomicMax(woi+3, y);
atomicMax(woi+3, y);
int d = 0;
int d = 0;
// if ((y > 0) && *(gpu_texture_indices + x + (y - 1) * TILES-X)) d |= (1 << TASK_TEXTURE_N_BIT);
// if ((x < (TILES-X - 1)) && *(gpu_texture_indices + (x + 1) + y * TILES-X)) d |= (1 << TASK_TEXTURE_E_BIT);
// if ((y < (TILES-Y - 1)) && *(gpu_texture_indices + x + (y + 1) * TILES-X)) d |= (1 << TASK_TEXTURE_S_BIT);
// if ((x > 0) && *(gpu_texture_indices + (x - 1) + y * TILES-X)) d |= (1 << TASK_TEXTURE_W_BIT);
if ((y > 0) && *(gpu_texture_indices + x + (y - 1) * width)) d |= (1 << TASK_TEXTURE_N_BIT);
if ((y > 0) && *(gpu_texture_indices + x + (y - 1) * width)) d |= (1 << TASK_TEXTURE_N_BIT);
if ((x < (width - 1)) && *(gpu_texture_indices + (x + 1) + y * width)) d |= (1 << TASK_TEXTURE_E_BIT);
if ((x < (width - 1)) && *(gpu_texture_indices + (x + 1) + y * width)) d |= (1 << TASK_TEXTURE_E_BIT);
if ((y < (height - 1)) && *(gpu_texture_indices + x + (y + 1) * width)) d |= (1 << TASK_TEXTURE_S_BIT);
if ((y < (height - 1)) && *(gpu_texture_indices + x + (y + 1) * width)) d |= (1 << TASK_TEXTURE_S_BIT);
if ((x > 0) && *(gpu_texture_indices + (x - 1) + y * width)) d |= (1 << TASK_TEXTURE_W_BIT);
if ((x > 0) && *(gpu_texture_indices + (x - 1) + y * width)) d |= (1 << TASK_TEXTURE_W_BIT);
gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
/// gpu_tasks[task_num].task = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
*(int *) (gpu_ftasks + get_task_size(num_cams) * task_num) = ((task ^ d) & TASK_TEXTURE_BITS) ^ task;
}
}
/**
/**
...
@@ -2282,15 +2319,18 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...
@@ -2282,15 +2319,18 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
* of non-overlapping tiles (odd/even rows/columns). At first made 8 lists, with pairs of
* of non-overlapping tiles (odd/even rows/columns). At first made 8 lists, with pairs of
* growing up and down for inner and border tiles, but now border attribute is not
* growing up and down for inner and border tiles, but now border attribute is not
* used anymore.
* used anymore.
*
* @param num_cams number of cameras
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param num_texture_tiles number of texture tiles to process (allocated 8-element integer array)
* @param num_texture_tiles number of texture tiles to process (allocated 8-element integer array)
* @param woi 4-element int array ( x,y,width,height of the woi, in tiles)
* @param woi 4-element int array ( x,y,width,height of the woi, in tiles)
*/
*/
__global__ void gen_texture_list(
__global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
...
@@ -2305,11 +2345,13 @@ __global__ void gen_texture_list(
...
@@ -2305,11 +2345,13 @@ __global__ void gen_texture_list(
return; // nothing to do
return; // nothing to do
}
}
int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
/// int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
int task = get_task_task(task_num, gpu_ftasks, num_cams);
if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
return; // NOP tile
}
}
int cxy = gpu_tasks[task_num].txy;
// int cxy = gpu_tasks[task_num].txy;
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
int x = (cxy & 0xffff);
int x = (cxy & 0xffff);
int y = (cxy >> 16);
int y = (cxy >> 16);
...
@@ -2325,22 +2367,18 @@ __global__ void gen_texture_list(
...
@@ -2325,22 +2367,18 @@ __global__ void gen_texture_list(
// don't care if calculate extra pixels that still fit into memory
// don't care if calculate extra pixels that still fit into memory
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILES-X - 1)) || (y == woi[3]);
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (width - 1)) || (y == woi[3]);
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (width - 1)) || (y == woi[3]);
int buff_head = 0;
int buff_head = 0;
int num_offset = 0;
int num_offset = 0;
if (x & 1) {
if (x & 1) {
// buff_head += TILES-X * (TILES-YA >> 2); //TILES-YA - 2 LSB == 00
buff_head += width * (tilesya >> 2); //TILES-YA - 2 LSB == 00
buff_head += width * (tilesya >> 2); //TILES-YA - 2 LSB == 00
num_offset += 2; // int *
num_offset += 2; // int *
}
}
if (y & 1) {
if (y & 1) {
// buff_head += TILES-X * (TILES-YA >> 1);
buff_head += width * (tilesya >> 1);
buff_head += width * (tilesya >> 1);
num_offset += 4; // int *
num_offset += 4; // int *
}
}
if (is_border){
if (is_border){
// buff_head += (TILES-X * (TILES-YA >> 2) - 1); // end of the buffer
buff_head += (width * (tilesya >> 2) - 1); // end of the buffer
buff_head += (width * (tilesya >> 2) - 1); // end of the buffer
num_offset += 1; // int *
num_offset += 1; // int *
}
}
...
@@ -2360,13 +2398,11 @@ __global__ void gen_texture_list(
...
@@ -2360,13 +2398,11 @@ __global__ void gen_texture_list(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG12
#endif // DEBUG12
// *(gpu_texture_indices + buf_offset) = task | ((x + y * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
//inline __device__ int get_task_size(int num_cams){
inline __device__ int get_task_size(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
//}
}
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams) {
inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_cams) {
return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile);
...
@@ -2374,7 +2410,6 @@ inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_ca
...
@@ -2374,7 +2410,6 @@ inline __device__ int get_task_task(int num_tile, float * gpu_ftasks, int num_ca
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams) {
inline __device__ int get_task_txy(int num_tile, float * gpu_ftasks, int num_cams) {
return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
return *(int *) (gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
}
}
/**
/**
* Helper kernel for convert_direct() - generates dense list of tiles for direct MCLT.
* Helper kernel for convert_direct() - generates dense list of tiles for direct MCLT.
* Tile order from the original (sparse) list is not preserved
* Tile order from the original (sparse) list is not preserved
...
@@ -2408,14 +2443,18 @@ __global__ void index_direct(
...
@@ -2408,14 +2443,18 @@ __global__ void index_direct(
* Helper kernel for textures_nonoverlap() - generates dense list of tiles for non-overlap
* Helper kernel for textures_nonoverlap() - generates dense list of tiles for non-overlap
* (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
* (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
*
*
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_cams number of cameras <= NUM_CAMS
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
//* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param num_tiles number of tiles in gpu_tasks array prepared for processing
* @param width number of tiles in a row
* @param width number of tiles in a row
* @param nonoverlap_list integer array to place the generated list
* @param nonoverlap_list integer array to place the generated list
* @param pnonoverlap_length single-element integer array to return the generated list length
* @param pnonoverlap_length single-element integer array to return the generated list length
*/
*/
__global__ void create_nonoverlap_list(
__global__ void create_nonoverlap_list(
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
...
@@ -2425,13 +2464,16 @@ __global__ void create_nonoverlap_list(
...
@@ -2425,13 +2464,16 @@ __global__ void create_nonoverlap_list(
if (num_tile >= num_tiles){
if (num_tile >= num_tiles){
return;
return;
}
}
if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
int task_task = get_task_task(num_tile, gpu_ftasks, num_cams);
/// if ((gpu_tasks[num_tile].task & TASK_TEXTURE_BITS) == 0){
if ((task_task & TASK_TEXTURE_BITS) == 0){
return; // nothing to do
return; // nothing to do
}
}
int cxy = gpu_tasks[num_tile].txy;
///
int cxy = gpu_tasks[num_tile].txy;
// int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * TILES-X) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS
;
int cxy = get_task_txy(num_tile, gpu_ftasks, num_cams)
;
int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
int texture_task_code = (((cxy & 0xffff) + (cxy >> 16) * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT) | TASK_TEXTURE_BITS;
if (gpu_tasks[num_tile].task != 0) {
// if (gpu_tasks[num_tile].task != 0) {
if (task_task != 0) {
nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
nonoverlap_list[atomicAdd(pnonoverlap_length, 1)] = texture_task_code;
}
}
}
}
...
@@ -2441,13 +2483,13 @@ __global__ void create_nonoverlap_list(
...
@@ -2441,13 +2483,13 @@ __global__ void create_nonoverlap_list(
* With the quad camera each tile may generate up to 6 pairs (int array elements)
* With the quad camera each tile may generate up to 6 pairs (int array elements)
* Tiles are not ordered, but the correlation pairs for each tile are
* Tiles are not ordered, but the correlation pairs for each tile are
*
*
* @param num_cams number of cameras <= NUM_CAMS
* @param num_cams
number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param sel_pairs
array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks
flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks
array of per-tile tasks (struct tp_task)
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles
number of tiles int gpu_tasks array prepared for processing
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices
integer array to place the generated list
* @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles
single-element integer array return generated list length
* @param pnum_corr_tiles single-element integer array return generated list length
*/
*/
__global__ void index_correlate(
__global__ void index_correlate(
int num_cams,
int num_cams,
...
@@ -2746,7 +2788,8 @@ __global__ void convert_correct_tiles(
...
@@ -2746,7 +2788,8 @@ __global__ void convert_correct_tiles(
* This kernel launches others with CDP, from CPU it is just <<<1,1>>>
* This kernel launches others with CDP, from CPU it is just <<<1,1>>>
*
*
* @param num_cams number of cameras
* @param num_cams number of cameras
* @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles in the gpu_tasks array prepared for processing
* @param num_tiles number of tiles in the gpu_tasks array prepared for processing
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param gpu_texture_indices allocated array - 1 integer per tile to process
* @param num_texture_tiles allocated array - 8 integers (may be reduced to 4 later)
* @param num_texture_tiles allocated array - 8 integers (may be reduced to 4 later)
...
@@ -2768,7 +2811,8 @@ __global__ void convert_correct_tiles(
...
@@ -2768,7 +2811,8 @@ __global__ void convert_correct_tiles(
*/
*/
extern "C" __global__ void textures_nonoverlap(
extern "C" __global__ void textures_nonoverlap(
int num_cams, // number of cameras
int num_cams, // number of cameras
struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row
// int num_tilesx, // number of tiles in a row
// declare arrays in device code?
// declare arrays in device code?
...
@@ -2802,13 +2846,15 @@ extern "C" __global__ void textures_nonoverlap(
...
@@ -2802,13 +2846,15 @@ extern "C" __global__ void textures_nonoverlap(
if (threadIdx.x == 0) { // only 1 thread, 1 block
if (threadIdx.x == 0) { // only 1 thread, 1 block
*pnum_texture_tiles = 0;
*pnum_texture_tiles = 0;
create_nonoverlap_list<<<blocks0,threads0>>>(
create_nonoverlap_list<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task
num_tiles, // int num_tiles, // number of tiles in task
num_tilesx, // int width, // number of tiles in a row
num_tilesx, // int width, // number of tiles in a row
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
gpu_texture_indices, // int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
pnum_texture_tiles); // int * pnonoverlap_length) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
cudaDeviceSynchronize();
int num_cams_per_thread =
TEXTURE
_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
int num_cams_per_thread =
NUM
_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
// dim3 threads_texture(TEXTURE_THREADS/num_cams, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
...
...
src/TileProcessor.h
View file @
a51d6a77
...
@@ -104,8 +104,9 @@ extern "C" __global__ void corr2D_combine(
...
@@ -104,8 +104,9 @@ extern "C" __global__ void corr2D_combine(
float
*
gpu_corrs_combo
);
// combined correlation output (one per tile)
float
*
gpu_corrs_combo
);
// combined correlation output (one per tile)
extern
"C"
__global__
void
textures_nonoverlap
(
extern
"C"
__global__
void
textures_nonoverlap
(
int
num_cams
,
// number of cameras used
int
num_cams
,
// number of cameras
struct
tp_task
*
gpu_tasks
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats
// struct tp_task * gpu_tasks,
int
num_tiles
,
// number of tiles in task list
int
num_tiles
,
// number of tiles in task list
// int num_tilesx, // number of tiles in a row
// int num_tilesx, // number of tiles in a row
// declare arrays in device code?
// declare arrays in device code?
...
@@ -151,7 +152,8 @@ extern "C" __global__ void imclt_rbg(
...
@@ -151,7 +152,8 @@ extern "C" __global__ void imclt_rbg(
extern
"C"
__global__
void
generate_RBGA
(
extern
"C"
__global__
void
generate_RBGA
(
int
num_cams
,
// number of cameras used
int
num_cams
,
// number of cameras used
// Parameters to generate texture tasks
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int
num_tiles
,
// number of tiles in task list
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
...
...
src/geometry_correction.cu
View file @
a51d6a77
...
@@ -44,16 +44,18 @@
...
@@ -44,16 +44,18 @@
// Using NUM_CAMS threads per tile
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
///
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
///
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
///
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//
#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define DBG_CAM 3
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc * g);
__device__ void printGeometryCorrection(struct gc * g);
__device__ void printExtrinsicCorrection(corr_vector * cv);
__device__ void printExtrinsicCorrection(corr_vector * cv);
/**
/**
* Calculate non-distorted radius from distorted using table approximation
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @param rDist distorted radius
...
@@ -123,11 +125,28 @@ __constant__ int offset_derivs = 1; // 1..4 // should be nex
...
@@ -123,11 +125,28 @@ __constant__ int offset_derivs = 1; // 1..4 // should be nex
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12; // 12..15
__constant__ int offset_tmp = 12; // 12..15
//inline __device__ int get_task_size_gc(int num_cams);
inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams);
//inline __device__ int get_task_size_gc(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
//}
/**
 * Read the integer "task" word of one tile from the flattened task array.
 *
 * Each tile occupies get_task_size(num_cams) consecutive floats in gpu_ftasks;
 * the first word of a tile's record is reinterpreted as an int and returned.
 *
 * @param num_tile   index of the tile record in gpu_ftasks
 * @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 * @param num_cams   number of cameras, selects the per-tile record length
 * @return           the first word of the tile record, read as int
 */
inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams) {
	float * tile_record = gpu_ftasks + get_task_size(num_cams) * num_tile; // start of this tile's record
	return *(int *) tile_record; // word 0 holds integer bits, not a float value
}
/**
 * Read the integer "txy" word of one tile from the flattened task array.
 *
 * Each tile occupies get_task_size(num_cams) consecutive floats in gpu_ftasks;
 * the second word (offset 1) of a tile's record is reinterpreted as an int and
 * returned. Callers in this file split it as tileX = txy & 0xffff, tileY = txy >> 16.
 *
 * @param num_tile   index of the tile record in gpu_ftasks
 * @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 * @param num_cams   number of cameras, selects the per-tile record length
 * @return           the second word of the tile record, read as int
 */
inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams) {
	float * tile_record = gpu_ftasks + get_task_size(num_cams) * num_tile; // start of this tile's record
	return *(int *) (tile_record + 1); // word 1 holds integer bits, not a float value
}
/**
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
* NUM_CAMS blocks of 3,3,3 tiles
*/
*/
extern "C" __global__ void calc_rot_deriv(
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector * gpu_correction_vector,
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv)
trot_deriv * gpu_rot_deriv)
{
{
...
@@ -282,18 +301,27 @@ extern "C" __global__ void calc_rot_deriv(
...
@@ -282,18 +301,27 @@ extern "C" __global__ void calc_rot_deriv(
extern "C" __global__ void calculate_tiles_offsets(
extern "C" __global__ void calculate_tiles_offsets(
struct tp_task * gpu_tasks,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv)
trot_deriv * gpu_rot_deriv)
{
{
dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
/// dim3 grid_geom ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
int tiles_per_block_geom = NUM_THREADS/ num_cams;
dim3 threads_geom(num_cams,tiles_per_block_geom, 1);
dim3 grid_geom ((num_tiles + tiles_per_block_geom - 1)/tiles_per_block_geom, 1, 1);
//#define NUM_THREADS 32
if (threadIdx.x == 0) { // always 1
if (threadIdx.x == 0) { // always 1
get_tiles_offsets<<<grid_geom,threads_geom>>> (
get_tiles_offsets<<<grid_geom,threads_geom>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task list
num_tiles, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
...
@@ -313,66 +341,76 @@ extern "C" __global__ void calculate_tiles_offsets(
...
@@ -313,66 +341,76 @@ extern "C" __global__ void calculate_tiles_offsets(
*/
*/
extern "C" __global__ void get_tiles_offsets(
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv
*
gpu_rot_deriv)
trot_deriv
*
gpu_rot_deriv)
{
{
int task_size = get_task_size(num_cams);
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
__shared__ struct gc geometry_correction;
__shared__ struct gc geometry_correction;
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ struct corr_vector extrinsic_corr;
__shared__ trot_deriv rot_deriv;
__shared__ trot_deriv rot_deriv;
__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
float pXY[2]; // result to be copied to task
float pXY[2]; // result to be copied to task
//blockDim.y
// copy data common to all threads
// copy data common to all threads
{
{
int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy);
float * gcp_local = (float *) &geometry_correction;
float * gcp_local = (float *) &geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction;
int offset = thread_xy;
int offset = thread_xy;
for (int i = 0; i <
CYCLES_COPY_GC
; i++){
for (int i = 0; i <
cycles_copy_gc
; i++){
if (offset < sizeof(struct gc)/sizeof(float)) {
if (offset < sizeof(struct gc)/sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
*(gcp_local + offset) = *(gcp_global + offset);
}
}
offset +=
THREADS_PER_BLOCK_GEOM
;
offset +=
dim_xy
;
}
}
}
}
{
{
int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy);
float * cvp_local = (float *) &extrinsic_corr;
float * cvp_local = (float *) &extrinsic_corr;
float * cvp_global = (float *) gpu_correction_vector;
float * cvp_global = (float *) gpu_correction_vector;
int offset = thread_xy;
int offset = thread_xy;
for (int i = 0; i <
CYCLES_COPY_CV
; i++){
for (int i = 0; i <
cycles_copy_cv
; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
*(cvp_local + offset) = *(cvp_global + offset);
}
}
offset +=
THREADS_PER_BLOCK_GEOM
;
offset +=
dim_xy
;
}
}
}
}
// TODO: maybe it is better to use system memory and not read all table?
// TODO: maybe it is better to use system memory and not read all table?
{
{
int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
float * rByRDistp_local = (float *) rByRDist;
float * rByRDistp_local = (float *) rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist;
int offset = thread_xy;
int offset = thread_xy;
for (int i = 0; i <
CYCLES_COPY_RBRD
; i++){
for (int i = 0; i <
cycles_copy_rbrd
; i++){
if (offset < RBYRDIST_LEN) {
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
}
offset +=
THREADS_PER_BLOCK_GEOM
;
offset +=
dim_xy
;
}
}
}
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
{
int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy;
float * rots_local = (float *) &rot_deriv;
float * rots_local = (float *) &rot_deriv;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
int offset = thread_xy;
for (int i = 0; i <
CYCLES_COPY_ROTS
; i++){
for (int i = 0; i <
cycles_copy_rot
; i++){
if (offset < sizeof(trot_deriv)/sizeof(float)) {
if (offset < sizeof(trot_deriv)/sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
*(rots_local + offset) = *(rots_global + offset);
}
}
offset +=
THREADS_PER_BLOCK_GEOM
;
offset +=
dim_xy
;
}
}
}
}
__syncthreads();
__syncthreads();
...
@@ -411,8 +449,10 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -411,8 +449,10 @@ extern "C" __global__ void get_tiles_offsets(
*/
*/
// common code, calculated in parallel
// common code, calculated in parallel
int cxy = gpu_tasks[task_num].txy;
/// int cxy = gpu_tasks[task_num].txy;
float disparity = gpu_tasks[task_num].target_disparity;
/// float disparity = gpu_tasks[task_num].target_disparity;
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
float disparity = * (gpu_ftasks + task_size * task_num + 2);
int tileX = (cxy & 0xffff);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
int tileY = (cxy >> 16);
#ifdef DEBUG23
#ifdef DEBUG23
...
@@ -638,11 +678,15 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -638,11 +678,15 @@ extern "C" __global__ void get_tiles_offsets(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG21
#endif // DEBUG21
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
*(disp_dist_p++) = disp_dist[3];
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
...
@@ -697,8 +741,11 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -697,8 +741,11 @@ extern "C" __global__ void get_tiles_offsets(
}
}
}
}
// copy results to global memory pXY, disp_dist
// copy results to global memory pXY, disp_dist
gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
}
extern "C" __global__ void calcReverseDistortionTable(
extern "C" __global__ void calcReverseDistortionTable(
...
...
src/geometry_correction.h
View file @
a51d6a77
...
@@ -41,6 +41,8 @@
...
@@ -41,6 +41,8 @@
#include "tp_defines.h"
#include "tp_defines.h"
#endif
#endif
// Length (in floats) of one flattened task record for x cameras.
// struct tp_task is sized for NUM_CAMS cameras; each camera contributes 6 floats
// (xy[..][2] + disp_dist[..][4]), so unused cameras are subtracted from the full size.
// The argument is parenthesized so expressions such as get_task_size(a + b) expand correctly.
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - (x)))
#define NVRTC_BUG 1
#define NVRTC_BUG 1
#ifndef M_PI
#ifndef M_PI
#define M_PI 3.14159265358979323846
/* pi */
#define M_PI 3.14159265358979323846
/* pi */
...
@@ -60,8 +62,9 @@ struct tp_task {
...
@@ -60,8 +62,9 @@ struct tp_task {
int
txy
;
int
txy
;
unsigned
short
sxy
[
2
];
unsigned
short
sxy
[
2
];
};
};
float
xy
[
NUM_CAMS
][
2
];
float
target_disparity
;
float
target_disparity
;
float
xy
[
NUM_CAMS
][
2
];
// float target_disparity;
float
disp_dist
[
NUM_CAMS
][
4
];
// calculated with getPortsCoordinates()
float
disp_dist
[
NUM_CAMS
][
4
];
// calculated with getPortsCoordinates()
};
};
...
@@ -142,7 +145,9 @@ struct gc {
...
@@ -142,7 +145,9 @@ struct gc {
};
};
#define RAD_COEFF_LEN 7
#define RAD_COEFF_LEN 7
extern
"C"
__global__
void
get_tiles_offsets
(
extern
"C"
__global__
void
get_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_cams
,
// struct tp_task * gpu_tasks,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
struct
corr_vector
*
gpu_correction_vector
,
...
@@ -150,7 +155,9 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -150,7 +155,9 @@ extern "C" __global__ void get_tiles_offsets(
trot_deriv
*
gpu_rot_deriv
);
trot_deriv
*
gpu_rot_deriv
);
extern
"C"
__global__
void
calculate_tiles_offsets
(
extern
"C"
__global__
void
calculate_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_cams
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
struct
corr_vector
*
gpu_correction_vector
,
...
@@ -160,6 +167,7 @@ extern "C" __global__ void calculate_tiles_offsets(
...
@@ -160,6 +167,7 @@ extern "C" __global__ void calculate_tiles_offsets(
// uses NUM_CAMS blocks, (3,3,3) threads
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
extern
"C"
__global__
void
calc_rot_deriv
(
int
num_cams
,
struct
corr_vector
*
gpu_correction_vector
,
struct
corr_vector
*
gpu_correction_vector
,
trot_deriv
*
gpu_rot_deriv
);
trot_deriv
*
gpu_rot_deriv
);
...
@@ -170,3 +178,4 @@ extern "C" __global__ void calcReverseDistortionTable(
...
@@ -170,3 +178,4 @@ extern "C" __global__ void calcReverseDistortionTable(
float
*
rByRDist
);
float
*
rByRDist
);
src/test_tp.cu
View file @
a51d6a77
...
@@ -715,6 +715,7 @@ int main(int argc, char **argv)
...
@@ -715,6 +715,7 @@ int main(int argc, char **argv)
}
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
calc_rot_deriv<<<grid_rot,threads_rot>>> (
num_cams, // int num_cams,
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
...
@@ -821,6 +822,7 @@ int main(int argc, char **argv)
...
@@ -821,6 +822,7 @@ int main(int argc, char **argv)
}
}
/*
/*
get_tiles_offsets<<<grid_geom,threads_geom>>> (
get_tiles_offsets<<<grid_geom,threads_geom>>> (
num_cams, // int num_cams,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
...
@@ -829,7 +831,9 @@ int main(int argc, char **argv)
...
@@ -829,7 +831,9 @@ int main(int argc, char **argv)
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
*/
calculate_tiles_offsets<<<1,1>>> (
calculate_tiles_offsets<<<1,1>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
...
@@ -1273,7 +1277,8 @@ int main(int argc, char **argv)
...
@@ -1273,7 +1277,8 @@ int main(int argc, char **argv)
cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
textures_nonoverlap<<<1,1>>> (
textures_nonoverlap<<<1,1>>> (
num_cams, // int num_cams, // number of cameras used
num_cams, // int num_cams, // number of cameras used
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
...
@@ -1365,7 +1370,8 @@ int main(int argc, char **argv)
...
@@ -1365,7 +1370,8 @@ int main(int argc, char **argv)
generate_RBGA<<<1,1>>> (
generate_RBGA<<<1,1>>> (
num_cams, // int num_cams, // number of cameras used
num_cams, // int num_cams, // number of cameras used
// Parameters to generate texture tasks
// Parameters to generate texture tasks
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
tp_task_size, // int num_tiles, // number of tiles in task list
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
// Does not require initialized gpu_texture_indices to be initialized - just allocated, will generate.
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
...
...
src/tp_defines.h
View file @
a51d6a77
...
@@ -56,7 +56,8 @@
...
@@ -56,7 +56,8 @@
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define TEXTURE_THREADS 32 //
//#define TEXTURE_THREADS 32 //
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_THREADS_PER_TILE 16
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment