Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
d85c42b5
Commit
d85c42b5
authored
Nov 30, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added possibility to provide virtual camet centerXY[] instead of the
uniform grid
parent
e1bf6e5c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
71 additions
and
47 deletions
+71
-47
TileProcessor.cuh
src/TileProcessor.cuh
+17
-18
geometry_correction.cu
src/geometry_correction.cu
+35
-19
geometry_correction.h
src/geometry_correction.h
+8
-1
test_tp.cu
src/test_tp.cu
+11
-9
No files found.
src/TileProcessor.cuh
View file @
d85c42b5
...
...
@@ -2697,9 +2697,9 @@ __global__ void convert_correct_tiles(
int thread0 = threadIdx.x & 1; // 0,1
int thread12 = threadIdx.x >>1; // now 0..3 (total number == (DTT_SIZE), will not change
float * tp = tp0 +
3
+ threadIdx.x;
float * tp = tp0 +
tp_task_xy_offset
+ threadIdx.x;
if (thread12 < num_cams) {
tt[tile_in_block].xy[thread12][thread0] = *(tp); // gpu_task -> xy[thread12][thread0];
tt[tile_in_block].xy[thread12][thread0] = *(tp);
// gpu_task -> xy[thread12][thread0];
}
if (num_cams > 4){ // was unlikely, now 16
for (int nc0 = 4; nc0 < num_cams; nc0 += 4){
...
...
@@ -2714,9 +2714,9 @@ __global__ void convert_correct_tiles(
if (threadIdx.x == 0){ // only one thread calculates, others - wait
tt[tile_in_block].task = *(int *) (tp0++); // get first integer value
tt[tile_in_block].txy = *(int *) (tp0++); // get second integer value
tt[tile_in_block].target_disparity = *(tp0++); //
tt[tile_in_block].target_disparity = *(tp0++);
//
}
// float centerXY[2] is not used/copied here
__syncthreads();// __syncwarp();
__shared__ float clt_tile [TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
...
...
@@ -2732,7 +2732,12 @@ __global__ void convert_correct_tiles(
for (int ncam = 0; ncam < num_cams; ncam++){
for (int color = 0; color < num_colors; color++){
convertCorrectTile(
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
#endif
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets,
gpu_kernels[ncam], // float * gpu_kernels,
gpu_images[ncam], // float * gpu_images,
...
...
@@ -4251,16 +4256,10 @@ __device__ void convertCorrectTile(
dtt_buf += DTT_SIZE1;
}
__syncthreads();// __syncwarp();
int colorX = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
int colorY = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
int color0 = colorX & 1;
int color1 = (colorX >>1) & 1;
int color0 = color & 1;
int color1 = (color >>1) & 1;
for (int gpass = 0; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
// for (int gpass = 0; gpass < 1; gpass++) { // Only once for R, B, twice - for G
// for (int gpass = 1; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
int col_tl = int_topleft[0]; // + (threadIdx.x << 1);
int row_tl = int_topleft[1];
// for red, blue and green, pass 0
...
...
@@ -4325,7 +4324,7 @@ __device__ void convertCorrectTile(
}
__syncthreads();// __syncwarp();
#endif
if (color
Y
== BAYER_GREEN) {
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
...
...
@@ -4347,7 +4346,7 @@ __device__ void convertCorrectTile(
#endif
dttiv_color_2d(
clt_tile,
color
Y
);
color);
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x) == 0){
printf("\nDTT Tiles after vertical pass (both passes), color = %d\n",color);
...
...
@@ -4361,8 +4360,8 @@ __device__ void convertCorrectTile(
int negate; // , dst_inc;
// Replicate horizontally (for R and B only):
if (color
Y
!= BAYER_GREEN) {
negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ color
Y
)) << 1); // +1/-1
if (color != BAYER_GREEN) {
negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ color)) << 1); // +1/-1
src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x ];
dst = clt_tile + (DTT_SIZE1 * DTT_SIZE) + (threadIdx.x ^ 7); // &clt_tile[1][0][threadIdx.x ^ 7];
#pragma unroll
...
...
@@ -4382,7 +4381,7 @@ __device__ void convertCorrectTile(
}
// replicate all colors down diagonal
negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (color
Y
>> 1))) << 1); // +1/-1 // 1 -
negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (color >> 1))) << 1); // +1/-1 // 1 -
// CC -> SS
src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x ];
dst = clt_tile + (DTT_SIZE1 * (DTT_SIZE * 3 + 7)) + (threadIdx.x ^ 7); // &clt_tile[3][7][threadIdx.x ^ 7];
...
...
src/geometry_correction.cu
View file @
d85c42b5
...
...
@@ -307,6 +307,7 @@ extern "C" __global__ void calc_rot_deriv(
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
...
...
@@ -325,6 +326,7 @@ extern "C" __global__ void calculate_tiles_offsets(
if (threadIdx.x == 0) { // always 1
get_tiles_offsets<<<grid_geom,threads_geom>>> (
uniform_grid, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
...
...
@@ -347,6 +349,7 @@ extern "C" __global__ void calculate_tiles_offsets(
*/
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...
...
@@ -457,19 +460,30 @@ extern "C" __global__ void get_tiles_offsets(
// common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
float disparity = * (gpu_ftasks + task_size * task_num + 2);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float px = *(centerXY);
float py = *(centerXY + 1);
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
printf("\n");
__syncthreads();
}
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
printf("\n");
__syncthreads();
}
#endif //#ifdef DEBUG23
float px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
float py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
*(centerXY) = px;
*(centerXY + 1) = py;
}
__syncthreads();
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
...
...
@@ -496,15 +510,17 @@ extern "C" __global__ void get_tiles_offsets(
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
}
__syncthreads();// __syncwarp();
...
...
@@ -689,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num +
3
+ num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
float * disp_dist_p = gpu_ftasks + task_size * task_num +
tp_task_xy_offset
+ num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
...
...
@@ -752,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy goes right after 3 commonn (tak, txy and target_disparity
float * tile_xy_p = gpu_ftasks + task_size * task_num +
3
+ ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
float * tile_xy_p = gpu_ftasks + task_size * task_num +
tp_task_xy_offset
+ ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
...
...
src/geometry_correction.h
View file @
d85c42b5
...
...
@@ -41,7 +41,6 @@
#include "tp_defines.h"
#endif
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define NVRTC_BUG 1
#ifndef M_PI
...
...
@@ -63,11 +62,17 @@ struct tp_task {
unsigned
short
sxy
[
2
];
};
float
target_disparity
;
float
centerXY
[
2
];
// "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float
xy
[
NUM_CAMS
][
2
];
// float target_disparity;
float
disp_dist
[
NUM_CAMS
][
4
];
// calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct
corr_vector
{
float
tilt
[
NUM_CAMS
-
1
];
// 0..2
float
azimuth
[
NUM_CAMS
-
1
];
// 3..5
...
...
@@ -145,6 +150,7 @@ struct gc {
};
#define RAD_COEFF_LEN 7
extern
"C"
__global__
void
get_tiles_offsets
(
int
uniform_grid
,
//==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int
num_cams
,
// struct tp_task * gpu_tasks,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...
...
@@ -155,6 +161,7 @@ extern "C" __global__ void get_tiles_offsets(
trot_deriv
*
gpu_rot_deriv
);
extern
"C"
__global__
void
calculate_tiles_offsets
(
int
uniform_grid
,
//==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int
num_cams
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
...
...
src/test_tp.cu
View file @
d85c42b5
...
...
@@ -489,7 +489,7 @@ int main(int argc, char **argv)
/// static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
/// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data =
(float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
trot_deriv rot_deriv;
...
...
@@ -691,6 +691,7 @@ int main(int argc, char **argv)
*(tp++) = *(float *) &task_task;
*(tp++) = *(float *) &task_txy;
*(tp++) = task_target_disparity;
tp += 2; // skip centerX, centerY
for (int ncam = 0; ncam < num_cams; ncam++) {
*(tp++) = tile_coords_h[ncam][nt][0];
*(tp++) = tile_coords_h[ncam][nt][1];
...
...
@@ -982,6 +983,7 @@ int main(int argc, char **argv)
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
...
...
@@ -1037,15 +1039,15 @@ int main(int argc, char **argv)
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
for (int ncam = 0; ncam < num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE +
3
+ 2*ncam + 0),
*(ftask_data1 + task_size * DBG_TILE +
3
+ 2*ncam + 0),
(*(ftask_data + task_size * DBG_TILE +
3
+ 2*ncam + 0)) -
(*(ftask_data1 + task_size * DBG_TILE +
3
+ 2*ncam + 0)));
*(ftask_data + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 0),
*(ftask_data1 + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 0),
(*(ftask_data + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 0)) -
(*(ftask_data1 + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE +
3
+ 2*ncam + 1),
*(ftask_data1 + task_size * DBG_TILE +
3
+ 2*ncam + 1),
(*(ftask_data + task_size * DBG_TILE +
3
+ 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE +
3
+ 2*ncam + 1)));
*(ftask_data + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 1),
*(ftask_data1 + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 1),
(*(ftask_data + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE +
tp_task_xy_offset
+ 2*ncam + 1)));
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment