Commit a6844c60 authored by Andrey Filippov's avatar Andrey Filippov

reconciled geometry with java

parent dba4dfce
......@@ -110,14 +110,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
......@@ -1150,6 +1142,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -1900,11 +1893,11 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -2006,14 +1999,21 @@ __global__ void textures_accumulate(
}
__syncthreads();// __syncwarp();
#endif
// perform idct
#ifdef DBG_TILE // perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
#else
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
0);
#endif
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
......@@ -2024,6 +2024,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
#ifdef DBG_TILE
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
......@@ -2032,6 +2033,16 @@ __global__ void textures_accumulate(
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
#else
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
0); // int debug);
#endif
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
......@@ -2105,6 +2116,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
......@@ -2120,7 +2132,23 @@ __global__ void textures_accumulate(
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
#else
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0); //int debug );
#endif
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
......
......@@ -80,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
......@@ -127,6 +127,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......
......@@ -47,7 +47,8 @@
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define DBG_CAM 3
......@@ -115,7 +116,7 @@ __constant__ int mm_seq [3][3][3]={
{-1,-1,-1} // do nothing
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
......@@ -309,7 +310,7 @@ extern "C" __global__ void calc_rot_matrices(
}
#endif
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
......@@ -452,6 +453,12 @@ extern "C" __global__ void calc_rot_deriv(
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
}
__syncthreads();
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// All done - read/verify all arrays
......@@ -468,7 +475,8 @@ extern "C" __global__ void get_tiles_offsets(
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist) // length should match RBYRDIST_LEN
float * gpu_rByRDist, // length should match RBYRDIST_LEN
union trot_deriv * gpu_rot_deriv)
{
// int task_num = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
......@@ -478,8 +486,8 @@ extern "C" __global__ void get_tiles_offsets(
__shared__ struct gc geometry_correction;
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ float rots[NUM_CAMS][3][3];
__shared__ float pXY[NUM_CAMS][2]; // result to be copied to task
__shared__ trot_deriv rot_deriv;
float pXY[2]; // result to be copied to task
// copy data common to all threads
{
float * gcp_local = (float *) &geometry_correction;
......@@ -515,31 +523,37 @@ extern "C" __global__ void get_tiles_offsets(
offset += THREADS_PER_BLOCK_GEOM;
}
}
// copy rotational matrices
// __shared__ float rots[NUM_CAMS][3][3];
//__device__ float rot_matrices [NUM_CAMS][3][3];
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
float * rots_local = (float *) rots;
float * rots_global = (float *) rot_matrices;
float * rots_local = (float *) &rot_deriv;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_ROTS; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
if (offset < sizeof(trot_deriv)/sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && ( blockIdx.x == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
int imu_exists = // todo - calculate once with rot_deriv?
(extrinsic_corr.imu_rot[0] != 0.0) ||
(extrinsic_corr.imu_rot[1] != 0.0) ||
(extrinsic_corr.imu_rot[2] != 0.0) ||
(extrinsic_corr.imu_move[0] != 0.0) ||
(extrinsic_corr.imu_move[1] != 0.0) ||
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
printGeometryCorrection(&geometry_correction);
printExtrinsicCorrection(&extrinsic_corr);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
......@@ -561,11 +575,14 @@ extern "C" __global__ void get_tiles_offsets(
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY [NUM_CAMS][2];
// float rXY [NUM_CAMS][2];
float rXY [2];
// for (int i = 0; i < NUM_CAMS;i++){
rXY[ncam][0] = geometry_correction.rXY[ncam][0];
rXY[ncam][1] = geometry_correction.rXY[ncam][1];
// rXY[ncam][0] = geometry_correction.rXY[ncam][0];
// rXY[ncam][1] = geometry_correction.rXY[ncam][1];
rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1];
// }
float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
......@@ -573,7 +590,8 @@ extern "C" __global__ void get_tiles_offsets(
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz [3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius / (disparity * 0.001*geometry_correction.pixelSize); // "+" - near, "-" far
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
......@@ -581,23 +599,40 @@ extern "C" __global__ void get_tiles_offsets(
float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float pXci0 = pXc - disparity * rXY[ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[ncam][1];
float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[1]; // [ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
#pragma unroll
for (int j = 0; j< 3; j++){
rvi[j] = rots[ncam][j][0] * pXci0 + rots[ncam][j][1] * pYci0 + rots[ncam][j][2] * fl_pix;
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix/rvi[2];
......@@ -619,66 +654,93 @@ extern "C" __global__ void get_tiles_offsets(
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND;
pXY[ncam][0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[ncam][1] = pYid + geometry_correction.pXY0[ncam][1];
pXY[0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[1] = pYid + geometry_correction.pXY0[ncam][1];
// used when calculating derivatives, TODO: combine calculations !
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
printf("rNDi = %f, ri = %f\n", rNDi, ri);
printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// float rvi[3];
float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for (int j = 0; j< 3; j++){
// drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * rvi[0] + rot_deriv.d_daz[ncam][j][1] * rvi[1] + rot_deriv.d_daz[ncam][j][2] * rvi[2];
// drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
// drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
}
// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// }//for (int i = 0; i < NUM_CAMS; i++){
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
float dpXci_pYci_imu_lin[2][3];
/*
// used when calculating derivatives, TODO: combine calculations !
double drD2rND_dri = 0.0;
Matrix drvi_daz = null;
Matrix drvi_dtl = null;
Matrix drvi_drl = null;
double dpXci_dazimuth = 0.0;
double dpYci_dazimuth = 0.0;
double dpXci_dtilt = 0.0;
double dpYci_dtilt = 0.0;
double dpXci_droll = 0.0;
double dpYci_droll = 0.0;
if ((disp_dist != null) || (pXYderiv != null)) {
rri = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
drD2rND_dri += rad_coeff[j] * (j+1) * rri;
rri *= ri;
}
if (deriv_rots != null) {
// needed for derivatives and IMU
drvi_daz = deriv_rots[i][0].times(vi);
drvi_dtl = deriv_rots[i][1].times(vi);
drvi_drl = deriv_rots[i][2].times(vi);
dpXci_dazimuth = drvi_daz.get(0, 0) * norm_z - pXci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpYci_dazimuth = drvi_daz.get(1, 0) * norm_z - pYci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpXci_dtilt = drvi_dtl.get(0, 0) * norm_z - pXci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpYci_dtilt = drvi_dtl.get(1, 0) * norm_z - pYci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpXci_droll = drvi_drl.get(0, 0) * norm_z - pXci * drvi_drl.get(2, 0) / rvi.get(2, 0);
dpYci_droll = drvi_drl.get(1, 0) * norm_z - pYci * drvi_drl.get(2, 0) / rvi.get(2, 0);
}
}
double delta_t = 0.0;
double [] imu = null;
double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if (disp_dist != null) {
disp_dist[i] = new double [4]; // dx/d_disp, dx_d_ccw_disp
// Not clear - what should be in Z direction before rotation here?
double [][] add0 = {
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
//// Matrix dd1 = dd0.getMatrix(0, 1,0,1); // get top left 2x2 sub-matrix
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
double c_dist = pXci/rNDi;
double s_dist = pYci/rNDi;
*/
float dd1[2][2];// get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("dd1[0][0] = %f, dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float c_dist = pXci/rNDi;
float s_dist = pYci/rNDi;
/*
double [][] arot2= {
{c_dist, s_dist},
{-s_dist, c_dist}};
......@@ -696,43 +758,112 @@ extern "C" __global__ void get_tiles_offsets(
disp_dist[i][2] = dd2.get(1, 0); // d_py/d_disp
disp_dist[i][3] = dd2.get(1, 1);
imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// ERS linear does not yet use per-port rotations, probably not needed
// double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if ((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0) ||(imu[3] != 0.0) ||(imu[4] != 0.0) ||(imu[5] != 0.0)) {
delta_t = dd2.get(1, 0) * disparity * line_time; // positive for top cameras, negative - for bottom
double ers_Xci = delta_t* (dpXci_dtilt * imu[0] + dpXci_dazimuth * imu[1] + dpXci_droll * imu[2]);
double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1] + dpYci_droll * imu[2]);
if (xyz != null) {
double k = SCENE_UNITS_SCALE * this.disparityRadius;
double wdisparity = disparity;
double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_Xci += delta_t* (dpXci_pYci_imu_lin[0][0] * imu[3] + dpXci_pYci_imu_lin[0][2] * imu[5]);
ers_Yci += delta_t* (dpXci_pYci_imu_lin[1][1] * imu[4] + dpXci_pYci_imu_lin[1][2] * imu[5]);
}
pXY[i][0] += ers_Xci * rD2rND; // added correction to pixel X
pXY[i][1] += ers_Yci * rD2rND; // added correction to pixel Y
} else {
imu = null;
}
*/
float drD2rND_dri = 0.0;
{
float rri = 1.0;
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
rri *= ri;
}
}
float scale_distort00 = rD2rND + ri* drD2rND_dri;
float scale_distort11 = rD2rND;
// float rot2Xdd1[2][2];
// rot2Xdd1[0][0] = c_dist * dd1[0][0] + s_dist * dd1[1][0];
// rot2Xdd1[0][1] = c_dist * dd1[0][1] + s_dist * dd1[1][1];
// rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
// rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists){
float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
float ers_Xci = delta_t * (
dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
dpXci_droll * extrinsic_corr.imu_rot[2]);
float ers_Yci = delta_t* (
dpYci_dtilt * extrinsic_corr.imu_rot[0] +
dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
dpYci_droll * extrinsic_corr.imu_rot[2]);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("delta_t = %f, ers_Xci = %f, ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
if (disparity >= MIN_DISPARITY){ // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_Xci += delta_t* (
dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
ers_Yci += delta_t* (
dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
pXY[0] += ers_Xci * rD2rND; // added correction to pixel X
pXY[1] += ers_Yci * rD2rND; // added correction to pixel Y
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_Xci = %f, ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist
gpu_tasks[task_num].xy[ncam][0] = pXY[0];
gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// TODO: calculate derivatives of pX, pY by 3 imu omegas
}
}
*/
}
/**
* Calculate non-distorted radius from distorted using table approximation
......
......@@ -42,6 +42,7 @@
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
int task;
union {
......@@ -114,17 +115,19 @@ struct gc {
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist); // length should match RBYRDIST_LEN
float * gpu_rByRDist, // length should match RBYRDIST_LEN
union trot_deriv * gpu_rot_deriv);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
......
......@@ -339,7 +339,8 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
......@@ -634,8 +635,8 @@ struct tp_task {
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
(corr_vector * ) gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
(trot_deriv * ) gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
......@@ -683,7 +684,7 @@ struct tp_task {
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
......@@ -703,7 +704,8 @@ struct tp_task {
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
......@@ -714,6 +716,38 @@ struct tp_task {
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
// gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
/// DBG_TILE
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
&task_data1,
gpu_tasks,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost));
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][0], task_data1[DBG_TILE].xy[ncam][0],
task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
printf("camera %d pY old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][1], task_data1[DBG_TILE].xy[ncam][1],
task_data [DBG_TILE].xy[ncam][1]- task_data1[DBG_TILE].xy[ncam][1]);
}
#if 0
// temporarily restore tasks
checkCudaErrors(cudaMemcpy(
gpu_tasks,
&task_data,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyHostToDevice));
#endif
#endif // TEST_GEOM_CORR
......
......@@ -72,10 +72,22 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 32.0 // disparity for which to calculate offsets (not needed in Java)
#define DBG_DISPARITY 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM 32 // each tile has NUM_CAMS threads
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 151 // 161 // 49
#define DBG_TILE_Y 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//#undef HAS_PRINTF
......@@ -99,7 +111,9 @@
//#define USE_textures_gen
//#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
//#define DEBUG20 1
#define DEBUG21 1
#endif //#ifndef JCUDA
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment