Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
a6844c60
Project 'Elphel/master' was moved to 'Elphel/image-compression'. Please update any links and bookmarks that may still have the old path.
Commit
a6844c60
authored
Apr 13, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
reconciled geometry with java
parent
dba4dfce
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
341 additions
and
130 deletions
+341
-130
TileProcessor.cuh
src/TileProcessor.cuh
+40
-12
TileProcessor.h
src/TileProcessor.h
+3
-2
geometry_correction.cu
src/geometry_correction.cu
+237
-106
geometry_correction.h
src/geometry_correction.h
+5
-2
test_tp.cu
src/test_tp.cu
+39
-5
tp_defines.h
src/tp_defines.h
+17
-3
No files found.
src/TileProcessor.cuh
View file @
a6844c60
...
...
@@ -110,14 +110,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
...
...
@@ -1150,6 +1142,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
...
...
@@ -1900,11 +1893,11 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
...
...
@@ -2006,14 +1999,21 @@ __global__ void textures_accumulate(
}
__syncthreads();// __syncwarp();
#endif
// perform idct
#ifdef DBG_TILE // perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
#else
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
0);
#endif
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
...
...
@@ -2024,6 +2024,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
#ifdef DBG_TILE
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
...
...
@@ -2032,6 +2033,16 @@ __global__ void textures_accumulate(
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
#else
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
0); // int debug);
#endif
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
...
...
@@ -2105,6 +2116,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
...
...
@@ -2120,7 +2132,23 @@ __global__ void textures_accumulate(
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
#else
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0); //int debug );
#endif
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
...
...
src/TileProcessor.h
View file @
a6844c60
...
...
@@ -80,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
textures_accumulate
(
// int border_tile, // if 1 - watch for border
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
...
...
@@ -127,6 +127,7 @@ __global__ void generate_RBGA(
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
...
...
src/geometry_correction.cu
View file @
a6844c60
This diff is collapsed.
Click to expand it.
src/geometry_correction.h
View file @
a6844c60
...
...
@@ -42,6 +42,7 @@
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct
tp_task
{
int
task
;
union
{
...
...
@@ -114,17 +115,19 @@ struct gc {
float
cameraRadius
;
// =0; // average distance from the "mass center" of the sensors to the sensors
float
disparityRadius
;
// =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
extern
"C"
__global__
void
get_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
float
*
gpu_rByRDist
);
// length should match RBYRDIST_LEN
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
union
trot_deriv
*
gpu_rot_deriv
);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
...
...
src/test_tp.cu
View file @
a6844c60
...
...
@@ -339,7 +339,8 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
...
...
@@ -634,8 +635,8 @@ struct tp_task {
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
(corr_vector * )
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
(trot_deriv * )
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
...
...
@@ -683,7 +684,7 @@ struct tp_task {
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
dim3 threads_geom(
TILES_PER_BLOCK_GEOM,1
, 1);
dim3 threads_geom(
NUM_CAMS,TILES_PER_BLOCK_GEOM
, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
...
...
@@ -703,7 +704,8 @@ struct tp_task {
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
...
...
@@ -714,6 +716,38 @@ struct tp_task {
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
// gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
/// DBG_TILE
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
&task_data1,
gpu_tasks,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost));
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][0], task_data1[DBG_TILE].xy[ncam][0],
task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
printf("camera %d pY old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][1], task_data1[DBG_TILE].xy[ncam][1],
task_data [DBG_TILE].xy[ncam][1]- task_data1[DBG_TILE].xy[ncam][1]);
}
#if 0
// temporarily restore tasks
checkCudaErrors(cudaMemcpy(
gpu_tasks,
&task_data,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyHostToDevice));
#endif
#endif // TEST_GEOM_CORR
...
...
src/tp_defines.h
View file @
a6844c60
...
...
@@ -72,10 +72,22 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY
32
.0 // disparity for which to calculate offsets (not needed in Java)
#define DBG_DISPARITY
56
.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM 32 // each tile has NUM_CAMS threads
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 151 // 161 // 49
#define DBG_TILE_Y 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//#undef HAS_PRINTF
...
...
@@ -99,7 +111,9 @@
//#define USE_textures_gen
//#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
//#define DEBUG20 1
#define DEBUG21 1
#endif //#ifndef JCUDA
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment