Commit dba4dfce authored by Andrey Filippov's avatar Andrey Filippov

implemented calc_rot_deriv()

parent b0f7d665
......@@ -45,6 +45,15 @@
#include "TileProcessor.h"
#endif // #ifndef JCUDA
// CUDA fast math is slower!
//#define FASTMATH 1
/*
fast
GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
nofast
GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
*/
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
......@@ -1019,19 +1028,7 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp();
#endif
#endif
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
......@@ -1083,23 +1080,6 @@ __global__ void correlate2D(
#endif
#endif
dttii_2d(clt_corr);
/*
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DEBUG6
......@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude(
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3);
#ifdef FASTMATH
float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
#else
float scale = rsqrtf(s2); // 1.0/sqrt(s2)
#endif
*(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale;
......@@ -3333,7 +3317,12 @@ __device__ void debayer_shot(
if (scale_shot > 0.0) {
#ifdef FASTMATH
float k = __frsqrt_rn(min_shot);
#else
float k = rsqrtf(min_shot);
#endif
// double k = 1.0/Math.sqrt(min_shot); //sqrtf
//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
......@@ -3343,7 +3332,14 @@ __device__ void debayer_shot(
#pragma unroll
for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
float d = *mcltp;
#ifdef FASTMATH
*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
#else
*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
#endif
mcltp += DTT_SIZE;
}
mcltp += (DTT_SIZE21-DTT_SIZE2);
......@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba(
s2 += d * d;
}
float mse = (s0*s2 - s1*s1) / (s0 * s0);
#ifdef FASTMATH
* crms_col_i = __fsqrt_rn(mse);
#else
* crms_col_i = sqrtf(mse);
#endif
sw += *(chn_weights +ncol) * mse;
}
#ifdef FASTMATH
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
#else
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
#endif
}
#ifdef DEBUG9
}
......@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba(
dc *= wnd2_inv; // to compensate fading near the edges
d+= *(chn_weights + ncol) * dc * dc;
}
#ifdef FASTMATH
d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#else
d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#endif
// Add weight to both channels in a pair
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
......@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba(
}
// TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#else
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#endif
}
// and now make a new average with those weights
// Inserting dust remove here
......@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba(
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
mx = fmaxf(mx, max_diff_tmp[cam][i]);
}
#ifdef FASTMATH
max_diff[cam] = __fsqrt_rn(mx);
#else
max_diff[cam] = sqrtf(mx);
#endif
}
}
......
......@@ -41,6 +41,18 @@
#include "tp_defines.h"
#endif
extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask); // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
......@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg(
int h_offset,
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
......@@ -72,9 +72,9 @@
// kernels (not used so far)
#ifdef BBBB
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #ifdef BBBB
#endif// #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
......
/**
**
** geometry_correction.cu
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** geometry_correction.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file geometry_correction.cu
* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc * g);
__device__ void printExtrinsicCorrection(corr_vector * cv);
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
{{ 0, 0,0},{0, 0,1},{ 0, 0,0}},
{{ 0, 1,0},{0, 0,0},{ 1, 0,0}},
},{ // tilt
{{ 0, 0,1},{0, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{1, 0,0},{ 0, 1,0}},
{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
},{ // roll*zoom
{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,1}},
},{ // d_azimuth
{{ 0,-1,0},{0, 0,0},{-1, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
},{ // d_tilt
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
{{ 0, 0,0},{-1,0,0},{ 0,-1,0}},
},{ // d_roll
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{-1, 0,0},{0,-1,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
},{ // d_zoom
{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
}
};
__constant__ int angles_offsets [4] = {
offsetof(corr_vector, azimuth)/sizeof(float),
offsetof(corr_vector, tilt) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float),
offsetof(corr_vector, roll) /sizeof(float)};
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot - bad
{13,8,1}, // tmp1 * a_daz -> deriv0 - good
{14,5,2}, // tmp2 * a_az -> deriv1 - good
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2 - good
{11,12,4}, // a_dzoom * tnmp0 -> deriv3 - good
{-1,-1,-1} // do nothing
}};
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12; // 12..15
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
*/
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
union trot_deriv * gpu_rot_deriv)
{
// __shared__ float zoom;
__shared__ float sincos [4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 +4][3][3];
float angle;
float zoom;
int ncam = blockIdx.x; // threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
if (ncam == (NUM_CAMS-1)){ // for the whole block
angle = 0.0;
zoom = 0.0;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
zoom -= gpu_correction_vector->zoom[n];
}
if (nangle >= 2){ // diverging for roll (last two)
angle = *(gangles + ncam);
}
} else {
angle = *(gangles + ncam);
zoom = gpu_correction_vector->zoom[ncam];
}
if (!is_sin){
angle += M_PI/2;
}
float sc = sinf(angle);
if (nangle ==2) {
sc *= 1.0 + zoom;
}
sincos[nangle][is_sin]= sc;
}
__syncthreads();
#ifdef DEBUG20
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
printf( " Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf( " Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf( " Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices+threadIdx.z; // 0..2
int const_index = threadIdx.z; // 0..2
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
axis += 3; // skip index == 3
const_index +=3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
if (threadIdx.z == 0){
axis += 3;
const_index +=3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20
const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int i = 0; i < 7; i++) {
printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
for (int row = 0; row < 3; row++){
for (int col = 0; col < 3; col++){
printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot
{13,8,1}, // tmp1 * a_daz -> deriv0
{14,5,2}, // tmp2 * a_az -> deriv1
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tnmp0 -> deriv3
}};
*/
for (int i = 0; i < 3; i++){
int srcl = mm_seq[i][threadIdx.z][0];
int srcr = mm_seq[i][threadIdx.z][1];
int dst = mm_seq[i][threadIdx.z][2];
if (srcl >= 0){
matrices[dst][threadIdx.y][threadIdx.x] =
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
}
__syncthreads();
}
// copy results to global memory
int gindx = threadIdx.z;
int lindx = offset_rots + threadIdx.z;
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
gindx +=3;
lindx+=3;
if (lindx < 5) {
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
}
__syncthreads();
// All done - read/verify all arrays
}
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
*/
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist) // length should match RBYRDIST_LEN
{
// int task_num = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int ncam = threadIdx.x;
// threadIdx.x - numcam, used for per-camera
__shared__ struct gc geometry_correction;
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ float rots[NUM_CAMS][3][3];
__shared__ float pXY[NUM_CAMS][2]; // result to be copied to task
// copy data common to all threads
{
float * gcp_local = (float *) &geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_GC; i++){
if (offset < sizeof(struct gc)/sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
{
float * cvp_local = (float *) &extrinsic_corr;
float * cvp_global = (float *) gpu_correction_vector;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_CV; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
// TODO: maybe it is better to use system memory and not read all table?
{
float * rByRDistp_local = (float *) rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_RBRD; i++){
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
// copy rotational matrices
// __shared__ float rots[NUM_CAMS][3][3];
//__device__ float rot_matrices [NUM_CAMS][3][3];
{
float * rots_local = (float *) rots;
float * rots_global = (float *) rot_matrices;
int offset = thread_xy;
for (int i = 0; i < CYCLES_COPY_ROTS; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += THREADS_PER_BLOCK_GEOM;
}
}
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && ( blockIdx.x == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printGeometryCorrection(&geometry_correction);
printExtrinsicCorrection(&extrinsic_corr);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// common code, calculated in parallel
int cxy = gpu_tasks[task_num].txy;
float disparity = gpu_tasks[task_num].target_disparity;
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
float px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
float py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY [NUM_CAMS][2];
// for (int i = 0; i < NUM_CAMS;i++){
rXY[ncam][0] = geometry_correction.rXY[ncam][0];
rXY[ncam][1] = geometry_correction.rXY[ncam][1];
// }
float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz [3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius / (disparity * 0.001*geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float pXci0 = pXc - disparity * rXY[ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
#pragma unroll
for (int j = 0; j< 3; j++){
rvi[j] = rots[ncam][j][0] * pXci0 + rots[ncam][j][1] * pYci0 + rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix/rvi[2];
float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z;
// Re-apply distortion
float rNDi = sqrtf(pXci*pXci + pYci*pYci); // in pixels
float ri = rNDi* ri_scale; // relative to distortion radius
float rD2rND = 1.0;
{
float rri = 1.0;
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
rri *= ri;
rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
}
}
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND;
pXY[ncam][0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[ncam][1] = pYid + geometry_correction.pXY0[ncam][1];
// }//for (int i = 0; i < NUM_CAMS; i++){
/*
// used when calculating derivatives, TODO: combine calculations !
double drD2rND_dri = 0.0;
Matrix drvi_daz = null;
Matrix drvi_dtl = null;
Matrix drvi_drl = null;
double dpXci_dazimuth = 0.0;
double dpYci_dazimuth = 0.0;
double dpXci_dtilt = 0.0;
double dpYci_dtilt = 0.0;
double dpXci_droll = 0.0;
double dpYci_droll = 0.0;
if ((disp_dist != null) || (pXYderiv != null)) {
rri = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
drD2rND_dri += rad_coeff[j] * (j+1) * rri;
rri *= ri;
}
if (deriv_rots != null) {
// needed for derivatives and IMU
drvi_daz = deriv_rots[i][0].times(vi);
drvi_dtl = deriv_rots[i][1].times(vi);
drvi_drl = deriv_rots[i][2].times(vi);
dpXci_dazimuth = drvi_daz.get(0, 0) * norm_z - pXci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpYci_dazimuth = drvi_daz.get(1, 0) * norm_z - pYci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpXci_dtilt = drvi_dtl.get(0, 0) * norm_z - pXci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpYci_dtilt = drvi_dtl.get(1, 0) * norm_z - pYci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpXci_droll = drvi_drl.get(0, 0) * norm_z - pXci * drvi_drl.get(2, 0) / rvi.get(2, 0);
dpYci_droll = drvi_drl.get(1, 0) * norm_z - pYci * drvi_drl.get(2, 0) / rvi.get(2, 0);
}
}
double delta_t = 0.0;
double [] imu = null;
double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if (disp_dist != null) {
disp_dist[i] = new double [4]; // dx/d_disp, dx_d_ccw_disp
// Not clear - what should be in Z direction before rotation here?
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
//// Matrix dd1 = dd0.getMatrix(0, 1,0,1); // get top left 2x2 sub-matrix
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
double c_dist = pXci/rNDi;
double s_dist = pYci/rNDi;
double [][] arot2= {
{c_dist, s_dist},
{-s_dist, c_dist}};
Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
double [][] ascale_distort = {
{rD2rND + ri* drD2rND_dri, 0 },
{0, rD2rND}};
Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
disp_dist[i][0] = dd2.get(0, 0);
disp_dist[i][1] = dd2.get(0, 1);
disp_dist[i][2] = dd2.get(1, 0); // d_py/d_disp
disp_dist[i][3] = dd2.get(1, 1);
imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// ERS linear does not yet use per-port rotations, probably not needed
// double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if ((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0) ||(imu[3] != 0.0) ||(imu[4] != 0.0) ||(imu[5] != 0.0)) {
delta_t = dd2.get(1, 0) * disparity * line_time; // positive for top cameras, negative - for bottom
double ers_Xci = delta_t* (dpXci_dtilt * imu[0] + dpXci_dazimuth * imu[1] + dpXci_droll * imu[2]);
double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1] + dpYci_droll * imu[2]);
if (xyz != null) {
double k = SCENE_UNITS_SCALE * this.disparityRadius;
double wdisparity = disparity;
double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_Xci += delta_t* (dpXci_pYci_imu_lin[0][0] * imu[3] + dpXci_pYci_imu_lin[0][2] * imu[5]);
ers_Yci += delta_t* (dpXci_pYci_imu_lin[1][1] * imu[4] + dpXci_pYci_imu_lin[1][2] * imu[5]);
}
pXY[i][0] += ers_Xci * rD2rND; // added correction to pixel X
pXY[i][1] += ers_Yci * rD2rND; // added correction to pixel Y
} else {
imu = null;
}
// TODO: calculate derivatives of pX, pY by 3 imu omegas
}
*/
}
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]) //shared memory
{
if (rDist < 0) {
return 0.0f; // normally should not happen
}
float findex = rDist/RBYRDIST_STEP;
int index= (int) floorf(findex);
if (index < 0){
index = 0;
}
if (index > (RBYRDIST_LEN - 3)) {
index = RBYRDIST_LEN - 3;
}
float mu = fmaxf(findex - index, 0.0f);
float mu2 = mu * mu;
float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
// use Catmull-Rom
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index+1] - 0.5 * rByRDist[index+2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index+1];
float a3 = rByRDist[index];
float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
return result;
}
__device__ void printGeometryCorrection(struct gc * g){
#ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n","pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n","line_time", g->line_time);
printf("%22s: %f\n","focalLength", g->focalLength);
printf("%22s: %f\n","pixelSize", g->pixelSize);
printf("%22s: %f\n","distortionRadius",g->distortionRadius);
printf("%22s: %f\n","distortionC", g->distortionC);
printf("%22s: %f\n","distortionB", g->distortionB);
printf("%22s: %f\n","distortionA", g->distortionA);
printf("%22s: %f\n","distortionA5",g->distortionA5);
printf("%22s: %f\n","distortionA6",g->distortionA6);
printf("%22s: %f\n","distortionA7",g->distortionA7);
printf("%22s: %f\n","distortionA8",g->distortionA8);
printf("%22s: %f\n","elevation", g->elevation);
printf("%22s: %f\n","heading", g->heading);
printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s: %f\n","common_right", g->common_right);
printf("%22s: %f\n","common_forward", g->common_forward);
printf("%22s: %f\n","common_height", g->common_height);
printf("%22s: %f\n","common_roll", g->common_roll);
printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
printf("%22s: %f\n","cameraRadius", g->cameraRadius);
printf("%22s: %f\n","disparityRadius", g->disparityRadius);
#endif //ifndef JCUDA
}
__device__ void printExtrinsicCorrection(corr_vector * cv)
{
#ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif //ifndef JCUDA
}
......@@ -41,6 +41,7 @@
#include "tp_defines.h"
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
struct tp_task {
int task;
union {
......@@ -62,18 +63,36 @@ struct corr_vector{
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
};
union trot_deriv{
struct {
float rots [NUM_CAMS][3][3];
float d_daz [NUM_CAMS][3][3];
float d_tilt [NUM_CAMS][3][3];
float d_roll [NUM_CAMS][3][3];
float d_zoom [NUM_CAMS][3][3];
};
float matrices [5][NUM_CAMS][3][3];
};
struct gc {
float pixelCorrectionWidth; // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float pixelCorrectionHeight; // =1936;
float line_time; // duration of one scan line readout (for ERS)
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionB; // r^3
float distortionC; // r^2
union {
struct {
float distortionC; // r^2
float distortionB; // r^3
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
};
float rad_coeff [7];
};
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
......@@ -81,19 +100,34 @@ struct gc {
float forward [NUM_CAMS];
float right [NUM_CAMS];
float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0 [NUM_CAMS][2];
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
float rXY [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist); // length should match RBYRDIST_LEN
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
union trot_deriv * gpu_rot_deriv);
......@@ -44,6 +44,7 @@
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
......@@ -339,6 +340,7 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
......@@ -386,13 +388,13 @@ struct tp_task {
struct gc fgeometry_correction;
float* correction_vector;
int correction_vector_length;
// float rByRDist
float * rByRDist;
int rByRDist_length;
float * gpu_geometry_correction;
float * gpu_correction_vector;
float * gpu_rByRDist;
struct gc * gpu_geometry_correction;
struct corr_vector * gpu_correction_vector;
float * gpu_rByRDist;
union trot_deriv * gpu_rot_deriv;
readFloatsFromFile(
(float *) &fgeometry_correction, // float * data, // allocated array
......@@ -405,11 +407,11 @@ struct tp_task {
correction_vector_file, // const char * path,
&correction_vector_length); // int * len_in_floats)
gpu_geometry_correction = copyalloc_kernel_gpu(
gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &fgeometry_correction,
sizeof(fgeometry_correction)/sizeof(float));
gpu_correction_vector = copyalloc_kernel_gpu(
gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
correction_vector,
correction_vector_length);
......@@ -417,6 +419,8 @@ struct tp_task {
rByRDist,
rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
......@@ -597,6 +601,125 @@ struct tp_task {
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
// dim3 threads_rot(3,3,NUM_CAMS);
// dim3 grid_rot (1, 1, 1);
dim3 threads_rot(3,3,3);
dim3 grid_rot (NUM_CAMS, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
// calc_rot_matrices<<<grid_rot,threads_rot>>> (
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
(corr_vector * ) gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
(trot_deriv * ) gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
checkCudaErrors(cudaMemcpy(
&rot_deriv,
gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
const char* matrices_names[] = {
"rot","d_daz","d_tilt","d_roll","d_zoom"};
for (int i = 0; i < 5;i++){
printf("Matrix %s for camera\n",matrices_names[i]);
for (int row = 0; row<3; row++){
for (int ncam = 0; ncam<NUM_CAMS;ncam++){
for (int col = 0; col <3; col++){
printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
if (col == 2){
if (ncam == (NUM_CAMS-1)){
printf("\n");
} else {
printf(" ");
}
} else {
printf(" ");
}
}
}
}
}
#endif // TEST_ROT_MATRICES
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
get_tiles_offsets<<<grid_geom,threads_geom>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
#endif // TEST_GEOM_CORR
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
......@@ -607,28 +730,23 @@ struct tp_task {
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
......@@ -638,11 +756,11 @@ struct tp_task {
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
......@@ -1154,6 +1272,8 @@ struct tp_task {
checkCudaErrors(cudaFree(gpu_geometry_correction));
checkCudaErrors(cudaFree(gpu_correction_vector));
checkCudaErrors(cudaFree(gpu_rByRDist));
checkCudaErrors(cudaFree(gpu_rot_deriv));
free (rByRDist);
free (correction_vector);
......
......@@ -39,6 +39,7 @@
// Avoiding includes in jcuda, all source files will be merged
#pragma once
#ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
......@@ -72,7 +73,11 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 32.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 20001 // length of
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM 32 // each tile has NUM_CAMS threads
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
......@@ -87,10 +92,15 @@
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//textures
//#define DEBUG10 1
//#define DEBUG11 1
//#define DEBUG12 1
//#define USE_textures_gen
#define DEBUG_OOB1 1
//#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
#endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment