Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
12758c3e
Commit
12758c3e
authored
Dec 11, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Before adding offset parameter to textures_accumulate
parent
0638c622
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
38 additions
and
29 deletions
+38
-29
TileProcessor.cuh
src/TileProcessor.cuh
+15
-21
test_tp.cu
src/test_tp.cu
+23
-8
No files found.
src/TileProcessor.cuh
View file @
12758c3e
...
@@ -978,33 +978,30 @@ __device__ void imclt_plane( // not implemented, not used
...
@@ -978,33 +978,30 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
const size_t dstride); // in floats (pixels)
__global__ void clear_texture_list(
extern "C"
__global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILES-X, use for faster processing of LWIR images
int width, // <= TILES-X, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images
__global__ void mark_texture_tiles(
extern "C"
__global__ void mark_texture_tiles(
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
extern "C"
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
int * woi); // x,y,width,height of the woi
__global__ void gen_texture_list(
extern "C"
__global__ void gen_texture_list(
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
...
@@ -1012,7 +1009,7 @@ __global__ void gen_texture_list(
...
@@ -1012,7 +1009,7 @@ __global__ void gen_texture_list(
int * num_texture_tiles, // number of texture tiles to process
int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input
int * woi); // min_x, min_y, max_x, max_y input
__global__ void clear_texture_rbga(
extern "C"
__global__ void clear_texture_rbga(
int texture_width,
int texture_width,
int texture_slice_height,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
const size_t texture_rbga_stride, // in floats 8*stride
...
@@ -1044,10 +1041,9 @@ __global__ void index_correlate(
...
@@ -1044,10 +1041,9 @@ __global__ void index_correlate(
int * gpu_corr_indices, // array of correlation tasks
int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void create_nonoverlap_list(
extern "C"
__global__ void create_nonoverlap_list(
int num_cams,
int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
int width, // number of tiles in a row
int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
...
@@ -2279,8 +2275,7 @@ __global__ void mark_texture_tiles(
...
@@ -2279,8 +2275,7 @@ __global__ void mark_texture_tiles(
* the result textures to fade along the border.
* the result textures to fade along the border.
*
*
* @param num_cams number of cameras
* @param num_cams number of cameras
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param width number of tiles in a row
* @param width number of tiles in a row
* @param height number of tiles rows
* @param height number of tiles rows
...
@@ -2291,7 +2286,6 @@ __global__ void mark_texture_tiles(
...
@@ -2291,7 +2286,6 @@ __global__ void mark_texture_tiles(
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
...
@@ -2304,12 +2298,10 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...
@@ -2304,12 +2298,10 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
return; // nothing to do
return; // nothing to do
}
}
/// int task = gpu_tasks[task_num].task;
int task = get_task_task(task_num, gpu_ftasks, num_cams);
int task = get_task_task(task_num, gpu_ftasks, num_cams);
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
return; // NOP tile
}
}
/// int cxy = gpu_tasks[task_num].txy;
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
int x = (cxy & 0xffff);
int x = (cxy & 0xffff);
...
@@ -2345,7 +2337,6 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
...
@@ -2345,7 +2337,6 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
__global__ void gen_texture_list(
__global__ void gen_texture_list(
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int num_tiles, // number of tiles in task list
int width, // number of tiles in a row
int width, // number of tiles in a row
int height, // number of tiles rows
int height, // number of tiles rows
...
@@ -2413,7 +2404,10 @@ __global__ void gen_texture_list(
...
@@ -2413,7 +2404,10 @@ __global__ void gen_texture_list(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG12
#endif // DEBUG12
*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
// *(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
// keep only 8 LSBs of task, use higher 24 for task number
*(gpu_texture_indices + buf_offset) = (task & ((1 << CORR_NTILE_SHIFT) -1)) | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
//CORR_NTILE_SHIFT
}
}
//inline __device__ int get_task_size(int num_cams){
//inline __device__ int get_task_size(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
...
@@ -2465,7 +2459,7 @@ __global__ void index_direct(
...
@@ -2465,7 +2459,7 @@ __global__ void index_direct(
* @param nonoverlap_list integer array to place the generated list
* @param nonoverlap_list integer array to place the generated list
* @param pnonoverlap_length single-element integer array return generated list length
* @param pnonoverlap_length single-element integer array return generated list length
*/
*/
__global__ void create_nonoverlap_list(
extern "C"
__global__ void create_nonoverlap_list(
int num_cams,
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
// struct tp_task * gpu_tasks,
...
@@ -3334,7 +3328,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
...
@@ -3334,7 +3328,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG12
#endif // DEBUG12
int alpha_mode = alphaIndex[tile_code];
int alpha_mode = alphaIndex[tile_code];
// only 4 lowest bits
if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
for (int pass = 0; pass < 8; pass ++) {
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int row = pass * 2 + (threadIdx.y >> 1);
...
...
src/test_tp.cu
View file @
12758c3e
...
@@ -32,7 +32,7 @@
...
@@ -32,7 +32,7 @@
#define NOCORR
#define NOCORR
#define NOCORR_TD
#define NOCORR_TD
//
#define NOTEXTURES_HOST
#define NOTEXTURES_HOST
#define NOTEXTURES
#define NOTEXTURES
#define NOTEXTURE_RGBA
#define NOTEXTURE_RGBA
#define SAVE_CLT
#define SAVE_CLT
...
@@ -431,6 +431,22 @@ void generate_RBGA_host(
...
@@ -431,6 +431,22 @@ void generate_RBGA_host(
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
#ifdef DEBUG8A
int cpu_texture_indices [TILESX*TILESYA];
checkCudaErrors(cudaMemcpy(
(float * ) cpu_texture_indices,
gpu_texture_indices,
TILESX*TILESYA * sizeof(float),
cudaMemcpyDeviceToHost));
for (int i = 0; i < 256; i++){
int indx = cpu_texture_indices[i];
printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
}
#endif // #ifdef DEBUG8A
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_tiles_height * texture_slices, // int texture_slice_height,
...
@@ -461,22 +477,21 @@ void generate_RBGA_host(
...
@@ -461,22 +477,21 @@ void generate_RBGA_host(
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
printf("\n");
#endif
#endif
/* */
/* */
int shared_size = host_get_textures_shared_size( // in bytes
int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras
num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats
0); // int * offsets); // in floats
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used
num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height
gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process
ntt, // size_t num_texture_tiles, // number of texture tiles to process
// TODO: add int parameter ti_offset (can not pass to the middle of a device array)?
// alternatively (to minimize dynamic parallelism mods) temporarily copy the 1/4 of a gpu_texture_indices
// or still pass parameter?
// first - try to add offset to GPU address
gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
colors, // int colors, // number of colors (3/1)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment