Commit 3c033192 authored by Andrey Filippov's avatar Andrey Filippov

updated kernels

parent 514057c6
......@@ -45,6 +45,15 @@
#include "TileProcessor.h"
#endif // #ifndef JCUDA
// CUDA fast math is slower!
//#define FASTMATH 1
/*
fast
GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
nofast
GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
*/
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
......@@ -101,14 +110,6 @@
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
......@@ -1019,19 +1020,7 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp();
#endif
#endif
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
......@@ -1083,23 +1072,6 @@ __global__ void correlate2D(
#endif
#endif
dttii_2d(clt_corr);
/*
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DEBUG6
......@@ -1170,6 +1142,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -1920,11 +1893,11 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -2026,14 +1999,21 @@ __global__ void textures_accumulate(
}
__syncthreads();// __syncwarp();
#endif
// perform idct
#ifdef DBG_TILE // perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
#else
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
0);
#endif
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
......@@ -2044,6 +2024,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
#ifdef DBG_TILE
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
......@@ -2052,6 +2033,16 @@ __global__ void textures_accumulate(
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
#else
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
0); // int debug);
#endif
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
......@@ -2125,6 +2116,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
......@@ -2140,7 +2132,23 @@ __global__ void textures_accumulate(
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
#else
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0); //int debug );
#endif
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
......@@ -2655,7 +2663,11 @@ __device__ void normalizeTileAmplitude(
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3);
#ifdef FASTMATH
float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
#else
float scale = rsqrtf(s2); // 1.0/sqrt(s2)
#endif
*(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale;
......@@ -3333,7 +3345,12 @@ __device__ void debayer_shot(
if (scale_shot > 0.0) {
#ifdef FASTMATH
float k = __frsqrt_rn(min_shot);
#else
float k = rsqrtf(min_shot);
#endif
// double k = 1.0/Math.sqrt(min_shot); //sqrtf
//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
......@@ -3343,7 +3360,14 @@ __device__ void debayer_shot(
#pragma unroll
for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
float d = *mcltp;
#ifdef FASTMATH
*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
#else
*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
#endif
mcltp += DTT_SIZE;
}
mcltp += (DTT_SIZE21-DTT_SIZE2);
......@@ -3549,10 +3573,19 @@ __device__ void tile_combine_rgba(
s2 += d * d;
}
float mse = (s0*s2 - s1*s1) / (s0 * s0);
#ifdef FASTMATH
* crms_col_i = __fsqrt_rn(mse);
#else
* crms_col_i = sqrtf(mse);
#endif
sw += *(chn_weights +ncol) * mse;
}
#ifdef FASTMATH
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
#else
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
#endif
}
#ifdef DEBUG9
}
......@@ -3605,7 +3638,12 @@ __device__ void tile_combine_rgba(
dc *= wnd2_inv; // to compensate fading near the edges
d+= *(chn_weights + ncol) * dc * dc;
}
#ifdef FASTMATH
d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#else
d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#endif
// Add weight to both channels in a pair
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
......@@ -3711,7 +3749,13 @@ __device__ void tile_combine_rgba(
}
// TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#else
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#endif
}
// and now make a new average with those weights
// Inserting dust remove here
......@@ -3879,7 +3923,11 @@ __device__ void tile_combine_rgba(
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
mx = fmaxf(mx, max_diff_tmp[cam][i]);
}
#ifdef FASTMATH
max_diff[cam] = __fsqrt_rn(mx);
#else
max_diff[cam] = sqrtf(mx);
#endif
}
}
......
......@@ -41,6 +41,18 @@
#include "tp_defines.h"
#endif
extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask); // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
......@@ -68,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
......@@ -102,5 +114,35 @@ extern "C" __global__ void imclt_rbg(
int h_offset,
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
......@@ -72,9 +72,9 @@
// kernels (not used so far)
#ifdef BBBB
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #ifdef BBBB
#endif// #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
......
/**
**
** geometry_correction.cu
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** geometry_correction.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file geometry_correction.cu
* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc * g);
__device__ void printExtrinsicCorrection(corr_vector * cv);
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
{{ 0, 0,0},{0, 0,1},{ 0, 0,0}},
{{ 0, 1,0},{0, 0,0},{ 1, 0,0}},
},{ // tilt
{{ 0, 0,1},{0, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{1, 0,0},{ 0, 1,0}},
{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
},{ // roll*zoom
{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,1}},
},{ // d_azimuth
{{ 0,-1,0},{0, 0,0},{-1, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
},{ // d_tilt
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
{{ 0, 0,0},{-1,0,0},{ 0,-1,0}},
},{ // d_roll
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{-1, 0,0},{0,-1,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
},{ // d_zoom
{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
}
};
__constant__ int angles_offsets [4] = {
offsetof(corr_vector, azimuth)/sizeof(float),
offsetof(corr_vector, tilt) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float)};
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot - bad
{13,8,1}, // tmp1 * a_daz -> deriv0 - good
{14,5,2}, // tmp2 * a_az -> deriv1 - good
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2 - good
{11,12,4}, // a_dzoom * tnmp0 -> deriv3 - good
{-1,-1,-1} // do nothing
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12; // 12..15
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
*/
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv)
{
// __shared__ float zoom;
__shared__ float sincos [4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 +4][3][3];
float angle;
float zoom;
int ncam = blockIdx.x; // threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
if (ncam == (NUM_CAMS-1)){ // for the whole block
angle = 0.0;
zoom = 0.0;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
zoom -= gpu_correction_vector->zoom[n];
}
if (nangle >= 2){ // diverging for roll (last two)
angle = *(gangles + ncam);
}
} else {
angle = *(gangles + ncam);
zoom = gpu_correction_vector->zoom[ncam];
}
if (!is_sin){
angle += M_PI/2;
}
float sc = sinf(angle);
if (nangle ==2) {
sc *= 1.0 + zoom;
}
sincos[nangle][is_sin]= sc;
}
__syncthreads();
#ifdef DEBUG20
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
printf( " Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf( " Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf( " Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices+threadIdx.z; // 0..2
int const_index = threadIdx.z; // 0..2
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
axis += 3; // skip index == 3
const_index +=3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
if (threadIdx.z == 0){
axis += 3;
const_index +=3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20
const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int i = 0; i < 7; i++) {
printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
for (int row = 0; row < 3; row++){
for (int col = 0; col < 3; col++){
printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot
{13,8,1}, // tmp1 * a_daz -> deriv0
{14,5,2}, // tmp2 * a_az -> deriv1
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tnmp0 -> deriv3
}};
*/
for (int i = 0; i < 3; i++){
int srcl = mm_seq[i][threadIdx.z][0];
int srcr = mm_seq[i][threadIdx.z][1];
int dst = mm_seq[i][threadIdx.z][2];
if (srcl >= 0){
matrices[dst][threadIdx.y][threadIdx.x] =
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
}
__syncthreads();
}
// copy results to global memory
int gindx = threadIdx.z;
int lindx = offset_rots + threadIdx.z;
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
gindx +=3;
lindx+=3;
if (lindx < 5) {
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
}
__syncthreads();
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// All done - read/verify all arrays
}
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
*/
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv)
{
// int task_num = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
if (task_num >= num_tiles){
return;
}
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int ncam = threadIdx.x;
// threadIdx.x - numcam, used for per-camera
__shared__ struct gc geometry_correction;
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ trot_deriv rot_deriv;
float pXY[2]; // result to be copied to task
// copy data common to all threads
{
float * gcp_local = (float *) &geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_GC; i++){
if (offset < sizeof(struct gc)/sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
{
float * cvp_local = (float *) &extrinsic_corr;
float * cvp_global = (float *) gpu_correction_vector;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_CV; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
// TODO: maybe it is better to use system memory and not read all table?
{
float * rByRDistp_local = (float *) rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_RBRD; i++){
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
float * rots_local = (float *) &rot_deriv;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_ROTS; i++){
if (offset < sizeof(trot_deriv)/sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
__syncthreads();
int imu_exists = // todo - calculate once with rot_deriv?
(extrinsic_corr.imu_rot[0] != 0.0) ||
(extrinsic_corr.imu_rot[1] != 0.0) ||
(extrinsic_corr.imu_rot[2] != 0.0) ||
(extrinsic_corr.imu_move[0] != 0.0) ||
(extrinsic_corr.imu_move[1] != 0.0) ||
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
printGeometryCorrection(&geometry_correction);
printExtrinsicCorrection(&extrinsic_corr);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// common code, calculated in parallel
int cxy = gpu_tasks[task_num].txy;
float disparity = gpu_tasks[task_num].target_disparity;
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
float px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
float py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
// float rXY [NUM_CAMS][2];
float rXY [2];
// for (int i = 0; i < NUM_CAMS;i++){
// rXY[ncam][0] = geometry_correction.rXY[ncam][0];
// rXY[ncam][1] = geometry_correction.rXY[ncam][1];
rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1];
// }
float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz [3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[1]; // [ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
#pragma unroll
for (int j = 0; j< 3; j++){
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix/rvi[2];
float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z;
// Re-apply distortion
float rNDi = sqrtf(pXci*pXci + pYci*pYci); // in pixels
float ri = rNDi* ri_scale; // relative to distortion radius
float rD2rND = 1.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){
rri *= ri;
rD2rND += ((float *) &geometry_correction.distortionC)[j]*(rri - 1.0);
}
#else
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
rri *= ri;
rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
}
#endif
}
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND;
pXY[0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[1] = pYid + geometry_correction.pXY0[ncam][1];
// used when calculating derivatives, TODO: combine calculations !
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
printf("rNDi = %f, ri = %f\n", rNDi, ri);
printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// float rvi[3];
float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for (int j = 0; j< 3; j++){
// drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * rvi[0] + rot_deriv.d_daz[ncam][j][1] * rvi[1] + rot_deriv.d_daz[ncam][j][2] * rvi[2];
// drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
// drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
}
// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
float dpXci_pYci_imu_lin[2][3];
/*
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/
float dd1[2][2];// get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("dd1[0][0] = %f, dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float c_dist = pXci/rNDi;
float s_dist = pYci/rNDi;
/*
double [][] arot2= {
{c_dist, s_dist},
{-s_dist, c_dist}};
Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
double [][] ascale_distort = {
{rD2rND + ri* drD2rND_dri, 0 },
{0, rD2rND}};
Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
disp_dist[i][0] = dd2.get(0, 0);
disp_dist[i][1] = dd2.get(0, 1);
disp_dist[i][2] = dd2.get(1, 0); // d_py/d_disp
disp_dist[i][3] = dd2.get(1, 1);
*/
//#undef NVRTC_BUG
float drD2rND_dri = 0.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){
drD2rND_dri += ((float *) &geometry_correction.distortionC)[j] * (j+1) * rri;
rri *= ri;
}
#else
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
rri *= ri;
}
#endif
}
float scale_distort00 = rD2rND + ri* drD2rND_dri;
float scale_distort11 = rD2rND;
// float rot2Xdd1[2][2];
// rot2Xdd1[0][0] = c_dist * dd1[0][0] + s_dist * dd1[1][0];
// rot2Xdd1[0][1] = c_dist * dd1[0][1] + s_dist * dd1[1][1];
// rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
// rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists){
float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
float ers_Xci = delta_t * (
dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
dpXci_droll * extrinsic_corr.imu_rot[2]);
float ers_Yci = delta_t* (
dpYci_dtilt * extrinsic_corr.imu_rot[0] +
dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
dpYci_droll * extrinsic_corr.imu_rot[2]);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("delta_t = %f, ers_Xci = %f, ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
if (disparity >= MIN_DISPARITY){ // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_Xci += delta_t* (
dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
ers_Yci += delta_t* (
dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
pXY[0] += ers_Xci * rD2rND; // added correction to pixel X
pXY[1] += ers_Yci * rD2rND; // added correction to pixel Y
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_Xci = %f, ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads();// __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist
gpu_tasks[task_num].xy[ncam][0] = pXY[0];
gpu_tasks[task_num].xy[ncam][1] = pXY[1];
}
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]) //shared memory
{
if (rDist < 0) {
return 0.0f; // normally should not happen
}
float findex = rDist/RBYRDIST_STEP;
int index= (int) floorf(findex);
if (index < 0){
index = 0;
}
if (index > (RBYRDIST_LEN - 3)) {
index = RBYRDIST_LEN - 3;
}
float mu = fmaxf(findex - index, 0.0f);
float mu2 = mu * mu;
float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
// use Catmull-Rom
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index+1] - 0.5 * rByRDist[index+2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index+1];
float a3 = rByRDist[index];
float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
return result;
}
__device__ void printGeometryCorrection(struct gc * g){
#ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n","pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n","line_time", g->line_time);
printf("%22s: %f\n","focalLength", g->focalLength);
printf("%22s: %f\n","pixelSize", g->pixelSize);
printf("%22s: %f\n","distortionRadius",g->distortionRadius);
printf("%22s: %f\n","distortionC", g->distortionC);
printf("%22s: %f\n","distortionB", g->distortionB);
printf("%22s: %f\n","distortionA", g->distortionA);
printf("%22s: %f\n","distortionA5",g->distortionA5);
printf("%22s: %f\n","distortionA6",g->distortionA6);
printf("%22s: %f\n","distortionA7",g->distortionA7);
printf("%22s: %f\n","distortionA8",g->distortionA8);
printf("%22s: %f\n","elevation", g->elevation);
printf("%22s: %f\n","heading", g->heading);
printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s: %f\n","common_right", g->common_right);
printf("%22s: %f\n","common_forward", g->common_forward);
printf("%22s: %f\n","common_height", g->common_height);
printf("%22s: %f\n","common_roll", g->common_roll);
printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
printf("%22s: %f\n","cameraRadius", g->cameraRadius);
printf("%22s: %f\n","disparityRadius", g->disparityRadius);
#endif //ifndef JCUDA
}
__device__ void printExtrinsicCorrection(corr_vector * cv)
{
#ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif //ifndef JCUDA
}
......@@ -41,6 +41,19 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
#ifndef offsetof
#define offsetof(st, m) \
((size_t)&(((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
int task;
union {
......@@ -61,19 +74,50 @@ struct corr_vector{
float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
};
#ifdef NVRTC_BUG
struct trot_deriv{
float rots [NUM_CAMS][3][3];
float d_daz [NUM_CAMS][3][3];
float d_tilt [NUM_CAMS][3][3];
float d_roll [NUM_CAMS][3][3];
float d_zoom [NUM_CAMS][3][3];
};
#else
union trot_deriv{
struct {
float rots [NUM_CAMS][3][3];
float d_daz [NUM_CAMS][3][3];
float d_tilt [NUM_CAMS][3][3];
float d_roll [NUM_CAMS][3][3];
float d_zoom [NUM_CAMS][3][3];
};
float matrices [5][NUM_CAMS][3][3];
};
#endif
struct gc {
float pixelCorrectionWidth; // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float pixelCorrectionHeight; // =1936;
float line_time; // duration of one scan line readout (for ERS)
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionB; // r^3
float distortionC; // r^2
#ifndef NVRTC_BUG
union {
struct {
#endif
float distortionC; // r^2
float distortionB; // r^3
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
// };
// float rad_coeff [7];
// };
#endif
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
......@@ -81,19 +125,37 @@ struct gc {
float forward [NUM_CAMS];
float right [NUM_CAMS];
float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0 [NUM_CAMS][2];
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
float rXY [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
#define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment