Commit d85c42b5 authored by Andrey Filippov's avatar Andrey Filippov

Added possibility to provide virtual camet centerXY[] instead of the

uniform grid
parent e1bf6e5c
......@@ -2697,9 +2697,9 @@ __global__ void convert_correct_tiles(
int thread0 = threadIdx.x & 1; // 0,1
int thread12 = threadIdx.x >>1; // now 0..3 (total number == (DTT_SIZE), will not change
float * tp = tp0 + 3 + threadIdx.x;
float * tp = tp0 + tp_task_xy_offset + threadIdx.x;
if (thread12 < num_cams) {
tt[tile_in_block].xy[thread12][thread0] = *(tp); // gpu_task -> xy[thread12][thread0];
tt[tile_in_block].xy[thread12][thread0] = *(tp); // gpu_task -> xy[thread12][thread0];
}
if (num_cams > 4){ // was unlikely, now 16
for (int nc0 = 4; nc0 < num_cams; nc0 += 4){
......@@ -2714,9 +2714,9 @@ __global__ void convert_correct_tiles(
if (threadIdx.x == 0){ // only one thread calculates, others - wait
tt[tile_in_block].task = *(int *) (tp0++); // get first integer value
tt[tile_in_block].txy = *(int *) (tp0++); // get second integer value
tt[tile_in_block].target_disparity = *(tp0++); //
tt[tile_in_block].target_disparity = *(tp0++); //
}
// float centerXY[2] is not used/copied here
__syncthreads();// __syncwarp();
__shared__ float clt_tile [TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
......@@ -2732,7 +2732,12 @@ __global__ void convert_correct_tiles(
for (int ncam = 0; ncam < num_cams; ncam++){
for (int color = 0; color < num_colors; color++){
convertCorrectTile(
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
#endif
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets,
gpu_kernels[ncam], // float * gpu_kernels,
gpu_images[ncam], // float * gpu_images,
......@@ -4251,16 +4256,10 @@ __device__ void convertCorrectTile(
dtt_buf += DTT_SIZE1;
}
__syncthreads();// __syncwarp();
int colorX = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
int colorY = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
int color0 = colorX & 1;
int color1 = (colorX >>1) & 1;
int color0 = color & 1;
int color1 = (color >>1) & 1;
for (int gpass = 0; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
// for (int gpass = 0; gpass < 1; gpass++) { // Only once for R, B, twice - for G
// for (int gpass = 1; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
int col_tl = int_topleft[0]; // + (threadIdx.x << 1);
int row_tl = int_topleft[1];
// for red, blue and green, pass 0
......@@ -4325,7 +4324,7 @@ __device__ void convertCorrectTile(
}
__syncthreads();// __syncwarp();
#endif
if (colorY == BAYER_GREEN) {
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
......@@ -4347,7 +4346,7 @@ __device__ void convertCorrectTile(
#endif
dttiv_color_2d(
clt_tile,
colorY);
color);
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x) == 0){
printf("\nDTT Tiles after vertical pass (both passes), color = %d\n",color);
......@@ -4361,8 +4360,8 @@ __device__ void convertCorrectTile(
int negate; // , dst_inc;
// Replicate horizontally (for R and B only):
if (colorY != BAYER_GREEN) {
negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ colorY)) << 1); // +1/-1
if (color != BAYER_GREEN) {
negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ color)) << 1); // +1/-1
src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x ];
dst = clt_tile + (DTT_SIZE1 * DTT_SIZE) + (threadIdx.x ^ 7); // &clt_tile[1][0][threadIdx.x ^ 7];
#pragma unroll
......@@ -4382,7 +4381,7 @@ __device__ void convertCorrectTile(
}
// replicate all colors down diagonal
negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (colorY >> 1))) << 1); // +1/-1 // 1 -
negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (color >> 1))) << 1); // +1/-1 // 1 -
// CC -> SS
src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x ];
dst = clt_tile + (DTT_SIZE1 * (DTT_SIZE * 3 + 7)) + (threadIdx.x ^ 7); // &clt_tile[3][7][threadIdx.x ^ 7];
......
......@@ -307,6 +307,7 @@ extern "C" __global__ void calc_rot_deriv(
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
......@@ -325,6 +326,7 @@ extern "C" __global__ void calculate_tiles_offsets(
if (threadIdx.x == 0) { // always 1
get_tiles_offsets<<<grid_geom,threads_geom>>> (
uniform_grid, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
......@@ -347,6 +349,7 @@ extern "C" __global__ void calculate_tiles_offsets(
*/
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
......@@ -457,19 +460,30 @@ extern "C" __global__ void get_tiles_offsets(
// common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
float disparity = * (gpu_ftasks + task_size * task_num + 2);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float px = *(centerXY);
float py = *(centerXY + 1);
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
printf("\n");
__syncthreads();
}
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
printf("\n");
__syncthreads();
}
#endif //#ifdef DEBUG23
float px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
float py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
*(centerXY) = px;
*(centerXY + 1) = py;
}
__syncthreads();
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
......@@ -496,15 +510,17 @@ extern "C" __global__ void get_tiles_offsets(
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
}
__syncthreads();// __syncwarp();
......@@ -689,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num + 3 + num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
float * disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
......@@ -752,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy goes right after 3 commonn (tak, txy and target_disparity
float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
float * tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
......
......@@ -41,7 +41,6 @@
#include "tp_defines.h"
#endif
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define NVRTC_BUG 1
#ifndef M_PI
......@@ -63,11 +62,17 @@ struct tp_task {
unsigned short sxy[2];
};
float target_disparity;
float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float xy[NUM_CAMS][2];
// float target_disparity;
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5
......@@ -145,6 +150,7 @@ struct gc {
};
#define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
......@@ -155,6 +161,7 @@ extern "C" __global__ void get_tiles_offsets(
trot_deriv * gpu_rot_deriv);
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
......
......@@ -489,7 +489,7 @@ int main(int argc, char **argv)
/// static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
/// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
trot_deriv rot_deriv;
......@@ -691,6 +691,7 @@ int main(int argc, char **argv)
*(tp++) = *(float *) &task_task;
*(tp++) = *(float *) &task_txy;
*(tp++) = task_target_disparity;
tp += 2; // skip centerX, centerY
for (int ncam = 0; ncam < num_cams; ncam++) {
*(tp++) = tile_coords_h[ncam][nt][0];
*(tp++) = tile_coords_h[ncam][nt][1];
......@@ -982,6 +983,7 @@ int main(int argc, char **argv)
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
......@@ -1037,15 +1039,15 @@ int main(int argc, char **argv)
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
for (int ncam = 0; ncam < num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 0),
*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 0),
(*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 0)) -
(*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 0)));
*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
(*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)) -
(*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 1),
*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 1),
(*(ftask_data + task_size * DBG_TILE + 3 + 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE + 3 + 2*ncam + 1)));
*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1),
(*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment