Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
3c033192
Commit
3c033192
authored
Apr 13, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated kernels
parent
514057c6
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
1196 additions
and
56 deletions
+1196
-56
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+89
-41
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+44
-2
dtt8x8.h
src/main/resources/kernels/dtt8x8.h
+2
-2
geometry_correction.cu
src/main/resources/kernels/geometry_correction.cu
+988
-0
geometry_correction.h
src/main/resources/kernels/geometry_correction.h
+73
-11
No files found.
src/main/resources/kernels/TileProcessor.cuh
View file @
3c033192
...
...
@@ -45,6 +45,15 @@
#include "TileProcessor.h"
#endif // #ifndef JCUDA
// CUDA fast math is slower!
//#define FASTMATH 1
/*
fast
GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
nofast
GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
*/
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
...
...
@@ -101,14 +110,6 @@
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
...
...
@@ -1019,19 +1020,7 @@ __global__ void correlate2D(
__syncthreads
();
// __syncwarp();
#endif
#endif
}
// if (color == 1){ // LPF only after B (nothing in mono)
}
// for (int color = 0; color < colors; color++){
normalizeTileAmplitude
(
clt_corr
,
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
...
...
@@ -1083,23 +1072,6 @@ __global__ void correlate2D(
#endif
#endif
dttii_2d
(
clt_corr
);
/*
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DEBUG6
...
...
@@ -1170,6 +1142,7 @@ __global__ void generate_RBGA(
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
...
...
@@ -1920,11 +1893,11 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen
extern
"C"
__global__
void
textures_accumulate
(
// int border_tile, // if 1 - watch for border
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
...
...
@@ -2026,14 +1999,21 @@ __global__ void textures_accumulate(
}
__syncthreads
();
// __syncwarp();
#endif
// perform idct
#ifdef DBG_TILE // perform idct
imclt8threads
(
0
,
// int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile
,
// float * mclt_tile )
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
#else
imclt8threads
(
0
,
// int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile
,
// float * mclt_tile )
0
);
#endif
__syncthreads
();
// __syncwarp();
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen mclt color = %d
\n
"
,
color
);
...
...
@@ -2044,6 +2024,7 @@ __global__ void textures_accumulate(
__syncthreads
();
// __syncwarp();
#endif
if
(
colors
>
1
)
{
#ifdef DBG_TILE
debayer_shot
(
(
color
<
2
),
// const int rb_mode, // 0 - green, 1 - r/b
min_shot
,
// float min_shot, // 10.0
...
...
@@ -2052,6 +2033,16 @@ __global__ void textures_accumulate(
mclt_dst
,
// float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp
,
// float * mclt_tmp,
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
// int debug);
#else
debayer_shot
(
(
color
<
2
),
// const int rb_mode, // 0 - green, 1 - r/b
min_shot
,
// float min_shot, // 10.0
scale_shot
,
// float scale_shot, // 3.0 (0.0 for mono)
mclt_tile
,
// float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst
,
// float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp
,
// float * mclt_tmp,
0
);
// int debug);
#endif
__syncthreads
();
// __syncwarp();
}
else
{
// copy? - no, just remember to use mclt_tile, not mclt_dst
...
...
@@ -2125,6 +2116,7 @@ __global__ void textures_accumulate(
__syncthreads
();
// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
tile_combine_rgba
(
colors
,
// int colors, // number of colors
(
float
*
)
shr
.
mclt_debayer
,
// float * mclt_tile, // debayer // has gaps to align with union !
...
...
@@ -2140,7 +2132,23 @@ __global__ void textures_accumulate(
dust_remove
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights
,
// int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(
tile_num
==
DBG_TILE
)
);
//int debug );
#else
tile_combine_rgba
(
colors
,
// int colors, // number of colors
(
float
*
)
shr
.
mclt_debayer
,
// float * mclt_tile, // debayer // has gaps to align with union !
(
float
*
)
mclt_tiles
,
// float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(
float
*
)
shr1
.
rgbaw
,
// float * rgba, // result
(
float
*
)
0
,
// float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(
float
*
)
0
,
// float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(
float
*
)
port_offsets
,
// float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma
,
// float diff_sigma, // pixel value/pixel change
diff_threshold
,
// float diff_threshold, // pixel value/pixel change
min_agree
,
// float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights
,
// float * chn_weights, // color channel weights, sum == 1.0
dust_remove
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights
,
// int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0
);
//int debug );
#endif
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
...
...
@@ -2655,7 +2663,11 @@ __device__ void normalizeTileAmplitude(
*
(
clt_tile_j1
)
*
*
(
clt_tile_j1
)
+
*
(
clt_tile_j2
)
*
*
(
clt_tile_j2
)
+
*
(
clt_tile_j3
)
*
*
(
clt_tile_j3
);
#ifdef FASTMATH
float
scale
=
__frsqrt_rn
(
s2
);
// 1.0/sqrt(s2)
#else
float
scale
=
rsqrtf
(
s2
);
// 1.0/sqrt(s2)
#endif
*
(
clt_tile_j0
)
*=
scale
;
*
(
clt_tile_j1
)
*=
scale
;
*
(
clt_tile_j2
)
*=
scale
;
...
...
@@ -3333,7 +3345,12 @@ __device__ void debayer_shot(
if
(
scale_shot
>
0.0
)
{
#ifdef FASTMATH
float
k
=
__frsqrt_rn
(
min_shot
);
#else
float
k
=
rsqrtf
(
min_shot
);
#endif
// double k = 1.0/Math.sqrt(min_shot); //sqrtf
//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
...
...
@@ -3343,7 +3360,14 @@ __device__ void debayer_shot(
#pragma unroll
for
(
int
col
=
0
;
col
<
DTT_SIZE2
;
col
+=
DTT_SIZE
){
float
d
=
*
mcltp
;
#ifdef FASTMATH
*
mcltp
=
scale_shot
*
((
d
>
min_shot
)
?
__fsqrt_rn
(
d
)
:
(
k
*
d
));
#else
*
mcltp
=
scale_shot
*
((
d
>
min_shot
)
?
sqrtf
(
d
)
:
(
k
*
d
));
#endif
mcltp
+=
DTT_SIZE
;
}
mcltp
+=
(
DTT_SIZE21
-
DTT_SIZE2
);
...
...
@@ -3549,10 +3573,19 @@ __device__ void tile_combine_rgba(
s2
+=
d
*
d
;
}
float
mse
=
(
s0
*
s2
-
s1
*
s1
)
/
(
s0
*
s0
);
#ifdef FASTMATH
*
crms_col_i
=
__fsqrt_rn
(
mse
);
#else
*
crms_col_i
=
sqrtf
(
mse
);
#endif
sw
+=
*
(
chn_weights
+
ncol
)
*
mse
;
}
#ifdef FASTMATH
*
(
crms_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
colors
)
=
__fsqrt_rn
(
sw
);
// will fade as window
#else
*
(
crms_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
colors
)
=
sqrtf
(
sw
);
// will fade as window
#endif
}
#ifdef DEBUG9
}
...
...
@@ -3605,7 +3638,12 @@ __device__ void tile_combine_rgba(
dc
*=
wnd2_inv
;
// to compensate fading near the edges
d
+=
*
(
chn_weights
+
ncol
)
*
dc
*
dc
;
}
#ifdef FASTMATH
d
=
__expf
(
-
pair_dist2r
[
ipair
]
*
d
)
+
(
FAT_ZERO_WEIGHT
);
// 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#else
d
=
expf
(
-
pair_dist2r
[
ipair
]
*
d
)
+
(
FAT_ZERO_WEIGHT
);
// 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#endif
// Add weight to both channels in a pair
*
(
port_weights_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
pair_ports
[
ipair
][
0
])
+=
d
;
*
(
port_weights_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
pair_ports
[
ipair
][
1
])
+=
d
;
...
...
@@ -3711,7 +3749,13 @@ __device__ void tile_combine_rgba(
}
// TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*
(
port_weights_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
cam
)
=
__expf
(
-
ksigma
*
d2_ip
)
+
(
FAT_ZERO_WEIGHT
);
#else
*
(
port_weights_i
+
(
DTT_SIZE2
*
DTT_SIZE21
)
*
cam
)
=
expf
(
-
ksigma
*
d2_ip
)
+
(
FAT_ZERO_WEIGHT
);
#endif
}
// and now make a new average with those weights
// Inserting dust remove here
...
...
@@ -3879,7 +3923,11 @@ __device__ void tile_combine_rgba(
for
(
int
i
=
0
;
i
<
TEXTURE_THREADS_PER_TILE
;
i
++
){
mx
=
fmaxf
(
mx
,
max_diff_tmp
[
cam
][
i
]);
}
#ifdef FASTMATH
max_diff
[
cam
]
=
__fsqrt_rn
(
mx
);
#else
max_diff
[
cam
]
=
sqrtf
(
mx
);
#endif
}
}
...
...
src/main/resources/kernels/TileProcessor.h
View file @
3c033192
...
...
@@ -41,6 +41,18 @@
#include "tp_defines.h"
#endif
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
);
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
...
...
@@ -68,11 +80,11 @@ extern "C" __global__ void clear_texture_rbga(
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
textures_accumulate
(
// int border_tile, // if 1 - watch for border
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
...
...
@@ -102,5 +114,35 @@ extern "C" __global__ void imclt_rbg(
int
h_offset
,
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
src/main/resources/kernels/dtt8x8.h
View file @
3c033192
...
...
@@ -72,9 +72,9 @@
// kernels (not used so far)
#if
def BBBB
#if
0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #if
def BBBB
#endif// #if
0
//=========================== 2D functions ===============
extern
__device__
void
corrUnfoldTile
(
...
...
src/main/resources/kernels/geometry_correction.cu
0 → 100644
View file @
3c033192
/**
**
** geometry_correction.cu
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** geometry_correction.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file geometry_correction.cu
* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_CV ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
#define DBG_CAM 3
__device__
void
printGeometryCorrection
(
struct
gc
*
g
);
__device__
void
printExtrinsicCorrection
(
corr_vector
*
cv
);
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline
__device__
float
getRByRDist
(
float
rDist
,
float
rByRDist
[
RBYRDIST_LEN
]);
//shared memory
__constant__
float
ROTS_TEMPLATE
[
7
][
3
][
3
][
3
]
=
{
// ...{cos,sin,const}...
{
// azimuth
{{
1
,
0
,
0
},{
0
,
0
,
0
},{
0
,
-
1
,
0
}},
{{
0
,
0
,
0
},{
0
,
0
,
1
},{
0
,
0
,
0
}},
{{
0
,
1
,
0
},{
0
,
0
,
0
},{
1
,
0
,
0
}},
},{
// tilt
{{
0
,
0
,
1
},{
0
,
0
,
0
},{
0
,
0
,
0
}},
{{
0
,
0
,
0
},{
1
,
0
,
0
},{
0
,
1
,
0
}},
{{
0
,
0
,
0
},{
0
,
-
1
,
0
},{
1
,
0
,
0
}},
},{
// roll*zoom
{{
1
,
0
,
0
},{
0
,
1
,
0
},{
0
,
0
,
0
}},
{{
0
,
-
1
,
0
},{
1
,
0
,
0
},{
0
,
0
,
0
}},
{{
0
,
0
,
0
},{
0
,
0
,
0
},{
0
,
0
,
1
}},
},{
// d_azimuth
{{
0
,
-
1
,
0
},{
0
,
0
,
0
},{
-
1
,
0
,
0
}},
{{
0
,
0
,
0
},{
0
,
0
,
0
},{
0
,
0
,
0
}},
{{
1
,
0
,
0
},{
0
,
0
,
0
},{
0
,
-
1
,
0
}},
},{
// d_tilt
{{
0
,
0
,
0
},{
0
,
0
,
0
},{
0
,
0
,
0
}},
{{
0
,
0
,
0
},{
0
,
-
1
,
0
},{
1
,
0
,
0
}},
{{
0
,
0
,
0
},{
-
1
,
0
,
0
},{
0
,
-
1
,
0
}},
},{
// d_roll
{{
0
,
-
1
,
0
},{
1
,
0
,
0
},{
0
,
0
,
0
}},
{{
-
1
,
0
,
0
},{
0
,
-
1
,
0
},{
0
,
0
,
0
}},
{{
0
,
0
,
0
},{
0
,
0
,
0
},{
0
,
0
,
0
}},
},{
// d_zoom
{{
1
,
0
,
0
},{
0
,
1
,
0
},{
0
,
0
,
0
}},
{{
0
,
-
1
,
0
},{
1
,
0
,
0
},{
0
,
0
,
0
}},
{{
0
,
0
,
0
},{
0
,
0
,
0
},{
0
,
0
,
0
}},
}
};
__constant__
int
angles_offsets
[
4
]
=
{
offsetof
(
corr_vector
,
azimuth
)
/
sizeof
(
float
),
offsetof
(
corr_vector
,
tilt
)
/
sizeof
(
float
),
offsetof
(
corr_vector
,
roll
)
/
sizeof
(
float
),
offsetof
(
corr_vector
,
roll
)
/
sizeof
(
float
)};
__constant__
int
mm_seq
[
3
][
3
][
3
]
=
{
{
{
6
,
5
,
12
},
// a_t * a_z -> tmp0
{
7
,
6
,
13
},
// a_r * a_t -> tmp1
{
7
,
9
,
14
},
// a_r * a_dt -> tmp2
},
{
{
7
,
12
,
0
},
// a_r * tmp0 -> rot - bad
{
13
,
8
,
1
},
// tmp1 * a_daz -> deriv0 - good
{
14
,
5
,
2
},
// tmp2 * a_az -> deriv1 - good
},
{
{
10
,
12
,
3
},
// a_dr * tmp0 -> deriv2 - good
{
11
,
12
,
4
},
// a_dzoom * tnmp0 -> deriv3 - good
{
-
1
,
-
1
,
-
1
}
// do nothing
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__
int
offset_rots
=
0
;
//0
__constant__
int
offset_derivs
=
1
;
// 1..4 // should be next
__constant__
int
offset_matrices
=
5
;
// 5..11
__constant__
int
offset_tmp
=
12
;
// 12..15
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
*/
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
trot_deriv
*
gpu_rot_deriv
)
{
// __shared__ float zoom;
__shared__
float
sincos
[
4
][
2
];
// {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__
float
matrices
[
5
+
7
+
4
][
3
][
3
];
float
angle
;
float
zoom
;
int
ncam
=
blockIdx
.
x
;
// threadIdx.z;
int
nangle1
=
threadIdx
.
x
+
threadIdx
.
y
*
blockDim
.
x
;
// * >> 1;
int
nangle
=
nangle1
>>
1
;
int
is_sin
=
nangle1
&
1
;
if
((
threadIdx
.
z
==
0
)
&&
(
nangle
<
4
)){
// others just idle here
float
*
gangles
=
(
float
*
)
gpu_correction_vector
+
angles_offsets
[
nangle
];
// pointer for channel 0
if
(
ncam
==
(
NUM_CAMS
-
1
)){
// for the whole block
angle
=
0.0
;
zoom
=
0.0
;
#pragma unroll
for
(
int
n
=
0
;
n
<
(
NUM_CAMS
-
1
);
n
++
){
angle
-=
*
(
gangles
+
n
);
zoom
-=
gpu_correction_vector
->
zoom
[
n
];
}
if
(
nangle
>=
2
){
// diverging for roll (last two)
angle
=
*
(
gangles
+
ncam
);
}
}
else
{
angle
=
*
(
gangles
+
ncam
);
zoom
=
gpu_correction_vector
->
zoom
[
ncam
];
}
if
(
!
is_sin
){
angle
+=
M_PI
/
2
;
}
float
sc
=
sinf
(
angle
);
if
(
nangle
==
2
)
{
sc
*=
1.0
+
zoom
;
}
sincos
[
nangle
][
is_sin
]
=
sc
;
}
__syncthreads
();
#ifdef DEBUG20
if
((
ncam
==
DBG_CAM
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)
&&
(
threadIdx
.
z
==
0
)){
printf
(
"
\n
Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f
\n
"
,
ncam
,
sincos
[
0
][
0
],
sincos
[
0
][
1
],
zoom
);
printf
(
" Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f
\n
"
,
ncam
,
sincos
[
1
][
0
],
sincos
[
1
][
1
]);
printf
(
" Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f
\n
"
,
ncam
,
sincos
[
2
][
0
],
sincos
[
2
][
1
]);
printf
(
" Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f
\n
"
,
ncam
,
sincos
[
3
][
0
],
sincos
[
3
][
1
]);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int
axis
=
offset_matrices
+
threadIdx
.
z
;
// 0..2
int
const_index
=
threadIdx
.
z
;
// 0..2
matrices
[
axis
][
threadIdx
.
y
][
threadIdx
.
x
]
=
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
0
]
*
sincos
[
threadIdx
.
z
][
0
]
+
// cos
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
1
]
*
sincos
[
threadIdx
.
z
][
1
]
+
// sin
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
2
];
// const
axis
+=
3
;
// skip index == 3
const_index
+=
3
;
matrices
[
axis
][
threadIdx
.
y
][
threadIdx
.
x
]
=
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
0
]
*
sincos
[
threadIdx
.
z
][
0
]
+
// cos
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
1
]
*
sincos
[
threadIdx
.
z
][
1
]
+
// sin
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
2
];
// const
if
(
threadIdx
.
z
==
0
){
axis
+=
3
;
const_index
+=
3
;
matrices
[
axis
][
threadIdx
.
y
][
threadIdx
.
x
]
=
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
0
]
*
sincos
[
3
][
0
]
+
// cos
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
1
]
*
sincos
[
3
][
1
]
+
// sin
ROTS_TEMPLATE
[
const_index
][
threadIdx
.
y
][
threadIdx
.
x
][
2
];
// const
}
__syncthreads
();
#ifdef DEBUG20
const
char
*
matrices_names
[]
=
{
"az"
,
"tilt"
,
"roll*zoom"
,
"d_daz"
,
"d_tilt"
,
"d_roll"
,
"d_zoom"
};
if
((
ncam
==
DBG_CAM
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)
&&
(
threadIdx
.
z
==
0
)){
for
(
int
i
=
0
;
i
<
7
;
i
++
)
{
printf
(
"
\n
----Matrix %s for camera %d:
\n
"
,
matrices_names
[
i
],
ncam
);
for
(
int
row
=
0
;
row
<
3
;
row
++
){
for
(
int
col
=
0
;
col
<
3
;
col
++
){
printf
(
"%9.6f, "
,
matrices
[
offset_matrices
+
i
][
row
][
col
]);
}
printf
(
"
\n
"
);
}
}
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG20
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot
{13,8,1}, // tmp1 * a_daz -> deriv0
{14,5,2}, // tmp2 * a_az -> deriv1
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tnmp0 -> deriv3
}};
*/
for
(
int
i
=
0
;
i
<
3
;
i
++
){
int
srcl
=
mm_seq
[
i
][
threadIdx
.
z
][
0
];
int
srcr
=
mm_seq
[
i
][
threadIdx
.
z
][
1
];
int
dst
=
mm_seq
[
i
][
threadIdx
.
z
][
2
];
if
(
srcl
>=
0
){
matrices
[
dst
][
threadIdx
.
y
][
threadIdx
.
x
]
=
matrices
[
srcl
][
threadIdx
.
y
][
0
]
*
matrices
[
srcr
][
0
][
threadIdx
.
x
]
+
matrices
[
srcl
][
threadIdx
.
y
][
1
]
*
matrices
[
srcr
][
1
][
threadIdx
.
x
]
+
matrices
[
srcl
][
threadIdx
.
y
][
2
]
*
matrices
[
srcr
][
2
][
threadIdx
.
x
];
}
__syncthreads
();
}
// copy results to global memory
int
gindx
=
threadIdx
.
z
;
int
lindx
=
offset_rots
+
threadIdx
.
z
;
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv
->
rots
[
ncam
+
gindx
*
NUM_CAMS
][
threadIdx
.
y
][
threadIdx
.
x
]
=
matrices
[
lindx
][
threadIdx
.
y
][
threadIdx
.
x
];
#else
gpu_rot_deriv
->
matrices
[
gindx
][
ncam
][
threadIdx
.
y
][
threadIdx
.
x
]
=
matrices
[
lindx
][
threadIdx
.
y
][
threadIdx
.
x
];
#endif
gindx
+=
3
;
lindx
+=
3
;
if
(
lindx
<
5
)
{
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv
->
rots
[
ncam
+
gindx
*
NUM_CAMS
][
threadIdx
.
y
][
threadIdx
.
x
]
=
matrices
[
lindx
][
threadIdx
.
y
][
threadIdx
.
x
];
#else
gpu_rot_deriv
->
matrices
[
gindx
][
ncam
][
threadIdx
.
y
][
threadIdx
.
x
]
=
matrices
[
lindx
][
threadIdx
.
y
][
threadIdx
.
x
];
#endif
}
__syncthreads
();
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)
&&
(
threadIdx
.
z
==
0
)){
printf
(
"
\n
----All Done with calc_rot_deriv() for ncam=%d
\n
"
,
ncam
);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG20
// All done - read/verify all arrays
}
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
*/
extern
"C"
__global__
void
get_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
trot_deriv
*
gpu_rot_deriv
)
{
// int task_num = blockIdx.x * blockDim.x + threadIdx.x; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
int
task_num
=
blockIdx
.
x
*
blockDim
.
y
+
threadIdx
.
y
;
// blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
if
(
task_num
>=
num_tiles
){
return
;
}
int
thread_xy
=
blockDim
.
x
*
threadIdx
.
y
+
threadIdx
.
x
;
int
ncam
=
threadIdx
.
x
;
// threadIdx.x - numcam, used for per-camera
__shared__
struct
gc
geometry_correction
;
__shared__
float
rByRDist
[
RBYRDIST_LEN
];
__shared__
struct
corr_vector
extrinsic_corr
;
__shared__
trot_deriv
rot_deriv
;
float
pXY
[
2
];
// result to be copied to task
// copy data common to all threads
{
float
*
gcp_local
=
(
float
*
)
&
geometry_correction
;
float
*
gcp_global
=
(
float
*
)
gpu_geometry_correction
;
int
offset
=
thread_xy
;
for
(
int
i
=
0
;
i
<
CYCLES_COPY_GC
;
i
++
){
if
(
offset
<
sizeof
(
struct
gc
)
/
sizeof
(
float
))
{
*
(
gcp_local
+
offset
)
=
*
(
gcp_global
+
offset
);
}
offset
+=
THREADS_PER_BLOCK_GEOM
;
}
}
{
float
*
cvp_local
=
(
float
*
)
&
extrinsic_corr
;
float
*
cvp_global
=
(
float
*
)
gpu_correction_vector
;
int
offset
=
thread_xy
;
for
(
int
i
=
0
;
i
<
CYCLES_COPY_CV
;
i
++
){
if
(
offset
<
sizeof
(
struct
corr_vector
)
/
sizeof
(
float
))
{
*
(
cvp_local
+
offset
)
=
*
(
cvp_global
+
offset
);
}
offset
+=
THREADS_PER_BLOCK_GEOM
;
}
}
// TODO: maybe it is better to use system memory and not read all table?
{
float
*
rByRDistp_local
=
(
float
*
)
rByRDist
;
float
*
rByRDistp_global
=
(
float
*
)
gpu_rByRDist
;
int
offset
=
thread_xy
;
for
(
int
i
=
0
;
i
<
CYCLES_COPY_RBRD
;
i
++
){
if
(
offset
<
RBYRDIST_LEN
)
{
*
(
rByRDistp_local
+
offset
)
=
*
(
rByRDistp_global
+
offset
);
}
offset
+=
THREADS_PER_BLOCK_GEOM
;
}
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
float
*
rots_local
=
(
float
*
)
&
rot_deriv
;
float
*
rots_global
=
(
float
*
)
gpu_rot_deriv
;
// rot_matrices;
int
offset
=
thread_xy
;
for
(
int
i
=
0
;
i
<
CYCLES_COPY_ROTS
;
i
++
){
if
(
offset
<
sizeof
(
trot_deriv
)
/
sizeof
(
float
))
{
*
(
rots_local
+
offset
)
=
*
(
rots_global
+
offset
);
}
offset
+=
THREADS_PER_BLOCK_GEOM
;
}
}
__syncthreads
();
int
imu_exists
=
// todo - calculate once with rot_deriv?
(
extrinsic_corr
.
imu_rot
[
0
]
!=
0.0
)
||
(
extrinsic_corr
.
imu_rot
[
1
]
!=
0.0
)
||
(
extrinsic_corr
.
imu_rot
[
2
]
!=
0.0
)
||
(
extrinsic_corr
.
imu_move
[
0
]
!=
0.0
)
||
(
extrinsic_corr
.
imu_move
[
1
]
!=
0.0
)
||
(
extrinsic_corr
.
imu_move
[
2
]
!=
0.0
);
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"
\n
Tile = %d, camera= %d
\n
"
,
task_num
,
ncam
);
printf
(
"
\n
get_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d
\n
"
,
(
int
)
threadIdx
.
x
,
(
int
)
threadIdx
.
y
,
(
int
)
blockIdx
.
x
);
printGeometryCorrection
(
&
geometry_correction
);
printExtrinsicCorrection
(
&
extrinsic_corr
);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// common code, calculated in parallel
int
cxy
=
gpu_tasks
[
task_num
].
txy
;
float
disparity
=
gpu_tasks
[
task_num
].
target_disparity
;
int
tileX
=
(
cxy
&
0xffff
);
int
tileY
=
(
cxy
>>
16
);
float
px
=
tileX
*
DTT_SIZE
+
DTT_SIZE
/
2
;
// - shiftX;
float
py
=
tileY
*
DTT_SIZE
+
DTT_SIZE
/
2
;
// - shiftY;
float
pXcd
=
px
-
0.5
*
geometry_correction
.
pixelCorrectionWidth
;
float
pYcd
=
py
-
0.5
*
geometry_correction
.
pixelCorrectionHeight
;
// float rXY [NUM_CAMS][2];
float
rXY
[
2
];
// for (int i = 0; i < NUM_CAMS;i++){
// rXY[ncam][0] = geometry_correction.rXY[ncam][0];
// rXY[ncam][1] = geometry_correction.rXY[ncam][1];
rXY
[
0
]
=
geometry_correction
.
rXY
[
ncam
][
0
];
rXY
[
1
]
=
geometry_correction
.
rXY
[
ncam
][
1
];
// }
float
rD
=
sqrtf
(
pXcd
*
pXcd
+
pYcd
*
pYcd
)
*
0.001
*
geometry_correction
.
pixelSize
;
// distorted radius in a virtual center camera
float
rND2R
=
getRByRDist
(
rD
/
geometry_correction
.
distortionRadius
,
rByRDist
);
float
pXc
=
pXcd
*
rND2R
;
// non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float
pYc
=
pYcd
*
rND2R
;
// in pixels
float
xyz
[
3
];
// getWorldCoordinates
xyz
[
2
]
=
-
SCENE_UNITS_SCALE
*
geometry_correction
.
focalLength
*
geometry_correction
.
disparityRadius
/
(
disparity
*
0.001
*
geometry_correction
.
pixelSize
);
// "+" - near, "-" far
xyz
[
0
]
=
SCENE_UNITS_SCALE
*
pXc
*
geometry_correction
.
disparityRadius
/
disparity
;
xyz
[
1
]
=
-
SCENE_UNITS_SCALE
*
pYc
*
geometry_correction
.
disparityRadius
/
disparity
;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float
fl_pix
=
geometry_correction
.
focalLength
/
(
0.001
*
geometry_correction
.
pixelSize
);
// focal length in pixels - this camera
float
ri_scale
=
0.001
*
geometry_correction
.
pixelSize
/
geometry_correction
.
distortionRadius
;
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"
\n
Tile = %d, camera= %d
\n
"
,
task_num
,
ncam
);
printf
(
"tileX = %d, tileY = %d
\n
"
,
tileX
,
tileY
);
printf
(
"px = %f, py = %f
\n
"
,
px
,
py
);
printf
(
"pXcd = %f, pYcd = %f
\n
"
,
pXcd
,
pYcd
);
printf
(
"rXY[0] = %f, rXY[1] = %f
\n
"
,
rXY
[
0
],
rXY
[
1
]);
printf
(
"rD = %f, rND2R = %f
\n
"
,
rD
,
rND2R
);
printf
(
"pXc = %f, pYc = %f
\n
"
,
pXc
,
pYc
);
printf
(
"fl_pix = %f, ri_scale = %f
\n
"
,
fl_pix
,
ri_scale
);
printf
(
"xyz[0] = %f, xyz[1] = %f, xyz[2] = %f
\n
"
,
xyz
[
0
],
xyz
[
1
],
xyz
[
2
]);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float
pXci0
=
pXc
-
disparity
*
rXY
[
0
];
// [ncam][0]; // in pixels
float
pYci0
=
pYc
-
disparity
*
rXY
[
1
];
// [ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float
rvi
[
3
];
#pragma unroll
for
(
int
j
=
0
;
j
<
3
;
j
++
){
rvi
[
j
]
=
rot_deriv
.
rots
[
ncam
][
j
][
0
]
*
pXci0
+
rot_deriv
.
rots
[
ncam
][
j
][
1
]
*
pYci0
+
rot_deriv
.
rots
[
ncam
][
j
][
2
]
*
fl_pix
;
}
// get back to the projection plane by normalizing vector
float
norm_z
=
fl_pix
/
rvi
[
2
];
float
pXci
=
rvi
[
0
]
*
norm_z
;
float
pYci
=
rvi
[
1
]
*
norm_z
;
// Re-apply distortion
float
rNDi
=
sqrtf
(
pXci
*
pXci
+
pYci
*
pYci
);
// in pixels
float
ri
=
rNDi
*
ri_scale
;
// relative to distortion radius
float
rD2rND
=
1.0
;
{
float
rri
=
1.0
;
#ifdef NVRTC_BUG
#pragma unroll
for
(
int
j
=
0
;
j
<
RAD_COEFF_LEN
;
j
++
){
rri
*=
ri
;
rD2rND
+=
((
float
*
)
&
geometry_correction
.
distortionC
)[
j
]
*
(
rri
-
1.0
);
}
#else
for
(
int
j
=
0
;
j
<
sizeof
(
geometry_correction
.
rad_coeff
)
/
sizeof
(
float
);
j
++
){
rri
*=
ri
;
rD2rND
+=
geometry_correction
.
rad_coeff
[
j
]
*
(
rri
-
1.0
);
}
#endif
}
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float
pXid
=
pXci
*
rD2rND
;
float
pYid
=
pYci
*
rD2rND
;
pXY
[
0
]
=
pXid
+
geometry_correction
.
pXY0
[
ncam
][
0
];
pXY
[
1
]
=
pYid
+
geometry_correction
.
pXY0
[
ncam
][
1
];
// used when calculating derivatives, TODO: combine calculations !
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"pXci0 = %f, pYci0 = %f
\n
"
,
pXci0
,
pYci0
);
printf
(
"rvi[0] = %f, rvi[1] = %f, rvi[2] = %f
\n
"
,
rvi
[
0
],
rvi
[
1
],
rvi
[
2
]);
printf
(
"norm_z = %f, pXci = %f, pYci = %f
\n
"
,
norm_z
,
pXci
,
pYci
);
printf
(
"rNDi = %f, ri = %f
\n
"
,
rNDi
,
ri
);
printf
(
"rD2rND = %f
\n
"
,
rD2rND
);
printf
(
"pXid = %f, pYid = %f
\n
"
,
pXid
,
pYid
);
printf
(
"pXY[0] = %f, pXY[1] = %f
\n
"
,
pXY
[
0
],
pXY
[
1
]);
// OK
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
// float rvi[3];
float
drvi_daz
[
3
];
// drvi_daz = deriv_rots[i][0].times(vi);
float
drvi_dtl
[
3
];
// drvi_dtl = deriv_rots[i][1].times(vi);
float
drvi_drl
[
3
];
// drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for
(
int
j
=
0
;
j
<
3
;
j
++
){
// drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * rvi[0] + rot_deriv.d_daz[ncam][j][1] * rvi[1] + rot_deriv.d_daz[ncam][j][2] * rvi[2];
// drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
// drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
drvi_daz
[
j
]
=
rot_deriv
.
d_daz
[
ncam
][
j
][
0
]
*
pXci0
+
rot_deriv
.
d_daz
[
ncam
][
j
][
1
]
*
pYci0
+
rot_deriv
.
d_daz
[
ncam
][
j
][
2
]
*
fl_pix
;
drvi_dtl
[
j
]
=
rot_deriv
.
d_tilt
[
ncam
][
j
][
0
]
*
pXci0
+
rot_deriv
.
d_tilt
[
ncam
][
j
][
1
]
*
pYci0
+
rot_deriv
.
d_tilt
[
ncam
][
j
][
2
]
*
fl_pix
;
drvi_drl
[
j
]
=
rot_deriv
.
d_roll
[
ncam
][
j
][
0
]
*
pXci0
+
rot_deriv
.
d_roll
[
ncam
][
j
][
1
]
*
pYci0
+
rot_deriv
.
d_roll
[
ncam
][
j
][
2
]
*
fl_pix
;
}
// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
float
dpXci_dazimuth
=
drvi_daz
[
0
]
*
norm_z
-
pXci
*
drvi_daz
[
2
]
/
rvi
[
2
];
float
dpYci_dazimuth
=
drvi_daz
[
1
]
*
norm_z
-
pYci
*
drvi_daz
[
2
]
/
rvi
[
2
];
float
dpXci_dtilt
=
drvi_dtl
[
0
]
*
norm_z
-
pXci
*
drvi_dtl
[
2
]
/
rvi
[
2
];
float
dpYci_dtilt
=
drvi_dtl
[
1
]
*
norm_z
-
pYci
*
drvi_dtl
[
2
]
/
rvi
[
2
];
float
dpXci_droll
=
drvi_drl
[
0
]
*
norm_z
-
pXci
*
drvi_drl
[
2
]
/
rvi
[
2
];
float
dpYci_droll
=
drvi_drl
[
1
]
*
norm_z
-
pYci
*
drvi_drl
[
2
]
/
rvi
[
2
];
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f
\n
"
,
drvi_daz
[
0
],
drvi_daz
[
1
],
drvi_daz
[
2
]);
printf
(
"drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f
\n
"
,
drvi_dtl
[
0
],
drvi_dtl
[
1
],
drvi_dtl
[
2
]);
printf
(
"drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f
\n
"
,
drvi_drl
[
0
],
drvi_drl
[
1
],
drvi_drl
[
2
]);
printf
(
"dpXci_dazimuth = %f, dpYci_dazimuth = %f
\n
"
,
dpXci_dazimuth
,
dpYci_dazimuth
);
printf
(
"dpXci_dtilt = %f, dpYci_dtilt = %f
\n
"
,
dpXci_dtilt
,
dpYci_dtilt
);
printf
(
"dpXci_droll = %f, dpYci_droll = %f
\n
"
,
dpXci_droll
,
dpYci_droll
);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
float
disp_dist
[
4
];
// only for this channel, to be copied to global gpu_tasks in the end
float
dpXci_pYci_imu_lin
[
2
][
3
];
/*
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/
float
dd1
[
2
][
2
];
// get top left 2x2 sub-matrix
dd1
[
0
][
0
]
=
(
-
rot_deriv
.
rots
[
ncam
][
0
][
0
]
*
rXY
[
0
]
-
rot_deriv
.
rots
[
ncam
][
0
][
1
]
*
rXY
[
1
])
*
norm_z
;
dd1
[
0
][
1
]
=
(
rot_deriv
.
rots
[
ncam
][
0
][
0
]
*
rXY
[
1
]
-
rot_deriv
.
rots
[
ncam
][
0
][
1
]
*
rXY
[
0
])
*
norm_z
;
dd1
[
1
][
0
]
=
(
-
rot_deriv
.
rots
[
ncam
][
1
][
0
]
*
rXY
[
0
]
-
rot_deriv
.
rots
[
ncam
][
1
][
1
]
*
rXY
[
1
])
*
norm_z
;
dd1
[
1
][
1
]
=
(
rot_deriv
.
rots
[
ncam
][
1
][
0
]
*
rXY
[
1
]
-
rot_deriv
.
rots
[
ncam
][
1
][
1
]
*
rXY
[
0
])
*
norm_z
;
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"dd1[0][0] = %f, dd1[0][1] = %f
\n
"
,
dd1
[
0
][
0
],
dd1
[
0
][
1
]);
printf
(
"dd1[1][0] = %f, dd1[1][1] = %f
\n
"
,
dd1
[
1
][
0
],
dd1
[
1
][
1
]);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float
c_dist
=
pXci
/
rNDi
;
float
s_dist
=
pYci
/
rNDi
;
/*
double [][] arot2= {
{c_dist, s_dist},
{-s_dist, c_dist}};
Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
double [][] ascale_distort = {
{rD2rND + ri* drD2rND_dri, 0 },
{0, rD2rND}};
Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
disp_dist[i][0] = dd2.get(0, 0);
disp_dist[i][1] = dd2.get(0, 1);
disp_dist[i][2] = dd2.get(1, 0); // d_py/d_disp
disp_dist[i][3] = dd2.get(1, 1);
*/
//#undef NVRTC_BUG
float
drD2rND_dri
=
0.0
;
{
float
rri
=
1.0
;
#ifdef NVRTC_BUG
#pragma unroll
for
(
int
j
=
0
;
j
<
RAD_COEFF_LEN
;
j
++
){
drD2rND_dri
+=
((
float
*
)
&
geometry_correction
.
distortionC
)[
j
]
*
(
j
+
1
)
*
rri
;
rri
*=
ri
;
}
#else
#pragma unroll
for
(
int
j
=
0
;
j
<
sizeof
(
geometry_correction
.
rad_coeff
)
/
sizeof
(
float
);
j
++
){
drD2rND_dri
+=
geometry_correction
.
rad_coeff
[
j
]
*
(
j
+
1
)
*
rri
;
rri
*=
ri
;
}
#endif
}
float
scale_distort00
=
rD2rND
+
ri
*
drD2rND_dri
;
float
scale_distort11
=
rD2rND
;
// float rot2Xdd1[2][2];
// rot2Xdd1[0][0] = c_dist * dd1[0][0] + s_dist * dd1[1][0];
// rot2Xdd1[0][1] = c_dist * dd1[0][1] + s_dist * dd1[1][1];
// rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
// rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
float
scale_distortXrot2Xdd1
[
2
][
2
];
scale_distortXrot2Xdd1
[
0
][
0
]
=
(
c_dist
*
dd1
[
0
][
0
]
+
s_dist
*
dd1
[
1
][
0
])
*
scale_distort00
;
scale_distortXrot2Xdd1
[
0
][
1
]
=
(
c_dist
*
dd1
[
0
][
1
]
+
s_dist
*
dd1
[
1
][
1
])
*
scale_distort00
;
scale_distortXrot2Xdd1
[
1
][
0
]
=
(
-
s_dist
*
dd1
[
0
][
0
]
+
c_dist
*
dd1
[
1
][
0
])
*
scale_distort11
;
scale_distortXrot2Xdd1
[
1
][
1
]
=
(
-
s_dist
*
dd1
[
0
][
1
]
+
c_dist
*
dd1
[
1
][
1
])
*
scale_distort11
;
disp_dist
[
0
]
=
c_dist
*
scale_distortXrot2Xdd1
[
0
][
0
]
-
s_dist
*
scale_distortXrot2Xdd1
[
1
][
0
];
disp_dist
[
1
]
=
c_dist
*
scale_distortXrot2Xdd1
[
0
][
1
]
-
s_dist
*
scale_distortXrot2Xdd1
[
1
][
1
];
disp_dist
[
2
]
=
s_dist
*
scale_distortXrot2Xdd1
[
0
][
0
]
+
c_dist
*
scale_distortXrot2Xdd1
[
1
][
0
];
disp_dist
[
3
]
=
s_dist
*
scale_distortXrot2Xdd1
[
0
][
1
]
+
c_dist
*
scale_distortXrot2Xdd1
[
1
][
1
];
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f
\n
"
,
scale_distortXrot2Xdd1
[
0
][
0
],
scale_distortXrot2Xdd1
[
0
][
1
]);
printf
(
"scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f
\n
"
,
scale_distortXrot2Xdd1
[
1
][
0
],
scale_distortXrot2Xdd1
[
1
][
1
]);
printf
(
"disp_dist[0] = %f
\n
"
,
disp_dist
[
0
]);
printf
(
"disp_dist[1] = %f
\n
"
,
disp_dist
[
1
]);
printf
(
"disp_dist[2] = %f
\n
"
,
disp_dist
[
2
]);
printf
(
"disp_dist[3] = %f
\n
"
,
disp_dist
[
3
]);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
gpu_tasks
[
task_num
].
disp_dist
[
ncam
][
0
]
=
disp_dist
[
0
];
gpu_tasks
[
task_num
].
disp_dist
[
ncam
][
1
]
=
disp_dist
[
1
];
gpu_tasks
[
task_num
].
disp_dist
[
ncam
][
2
]
=
disp_dist
[
2
];
gpu_tasks
[
task_num
].
disp_dist
[
ncam
][
3
]
=
disp_dist
[
3
];
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if
(
imu_exists
){
float
delta_t
=
disp_dist
[
2
]
*
disparity
*
geometry_correction
.
line_time
;
// positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
float
ers_Xci
=
delta_t
*
(
dpXci_dtilt
*
extrinsic_corr
.
imu_rot
[
0
]
+
dpXci_dazimuth
*
extrinsic_corr
.
imu_rot
[
1
]
+
dpXci_droll
*
extrinsic_corr
.
imu_rot
[
2
]);
float
ers_Yci
=
delta_t
*
(
dpYci_dtilt
*
extrinsic_corr
.
imu_rot
[
0
]
+
dpYci_dazimuth
*
extrinsic_corr
.
imu_rot
[
1
]
+
dpYci_droll
*
extrinsic_corr
.
imu_rot
[
2
]);
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"delta_t = %f, ers_Xci = %f, ers_Yci = %f
\n
"
,
delta_t
,
ers_Xci
,
ers_Yci
);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
if
(
disparity
>=
MIN_DISPARITY
){
// all threads together
float
k
=
SCENE_UNITS_SCALE
*
geometry_correction
.
disparityRadius
;
float
wdisparity
=
disparity
;
float
dwdisp_dz
=
(
k
*
geometry_correction
.
focalLength
/
(
0.001
*
geometry_correction
.
pixelSize
))
/
(
xyz
[
2
]
*
xyz
[
2
]);
dpXci_pYci_imu_lin
[
0
][
0
]
=
-
wdisparity
/
k
;
// dpx/ dworld_X
dpXci_pYci_imu_lin
[
1
][
1
]
=
wdisparity
/
k
;
// dpy/ dworld_Y
dpXci_pYci_imu_lin
[
0
][
2
]
=
(
xyz
[
0
]
/
k
)
*
dwdisp_dz
;
// dpx/ dworld_Z
dpXci_pYci_imu_lin
[
1
][
2
]
=
(
xyz
[
1
]
/
k
)
*
dwdisp_dz
;
// dpy/ dworld_Z
ers_Xci
+=
delta_t
*
(
dpXci_pYci_imu_lin
[
0
][
0
]
*
extrinsic_corr
.
imu_move
[
0
]
+
dpXci_pYci_imu_lin
[
0
][
2
]
*
extrinsic_corr
.
imu_move
[
2
]);
ers_Yci
+=
delta_t
*
(
dpXci_pYci_imu_lin
[
1
][
1
]
*
extrinsic_corr
.
imu_move
[
1
]
+
dpXci_pYci_imu_lin
[
1
][
2
]
*
extrinsic_corr
.
imu_move
[
2
]);
pXY
[
0
]
+=
ers_Xci
*
rD2rND
;
// added correction to pixel X
pXY
[
1
]
+=
ers_Yci
*
rD2rND
;
// added correction to pixel Y
#ifdef DEBUG21
if
((
ncam
==
DBG_CAM
)
&&
(
task_num
==
DBG_TILE
)){
printf
(
"k = %f, wdisparity = %f, dwdisp_dz = %f
\n
"
,
k
,
wdisparity
,
dwdisp_dz
);
printf
(
"dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f
\n
"
,
dpXci_pYci_imu_lin
[
0
][
0
],
dpXci_pYci_imu_lin
[
0
][
2
]);
printf
(
"dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f
\n
"
,
dpXci_pYci_imu_lin
[
1
][
1
],
dpXci_pYci_imu_lin
[
1
][
2
]);
printf
(
"delta_t = %f, ers_Xci = %f, ers_Yci = %f
\n
"
,
delta_t
,
ers_Xci
,
ers_Yci
);
printf
(
"pXY[0] = %f, pXY[1] = %f
\n
"
,
pXY
[
0
],
pXY
[
1
]);
// OK
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist
gpu_tasks
[
task_num
].
xy
[
ncam
][
0
]
=
pXY
[
0
];
gpu_tasks
[
task_num
].
xy
[
ncam
][
1
]
=
pXY
[
1
];
}
/**
* Calculate non-distorted radius from distorted using table approximation
* @param rDist distorted radius
* @return corresponding non-distorted radius
*/
inline
__device__
float
getRByRDist
(
float
rDist
,
float
rByRDist
[
RBYRDIST_LEN
])
//shared memory
{
if
(
rDist
<
0
)
{
return
0.0
f
;
// normally should not happen
}
float
findex
=
rDist
/
RBYRDIST_STEP
;
int
index
=
(
int
)
floorf
(
findex
);
if
(
index
<
0
){
index
=
0
;
}
if
(
index
>
(
RBYRDIST_LEN
-
3
))
{
index
=
RBYRDIST_LEN
-
3
;
}
float
mu
=
fmaxf
(
findex
-
index
,
0.0
f
);
float
mu2
=
mu
*
mu
;
float
y0
=
(
index
>
0
)
?
rByRDist
[
index
-
1
]
:
(
2
*
rByRDist
[
index
]
-
rByRDist
[
index
+
1
]);
// use Catmull-Rom
float
a0
=
-
0.5
*
y0
+
1.5
*
rByRDist
[
index
]
-
1.5
*
rByRDist
[
index
+
1
]
+
0.5
*
rByRDist
[
index
+
2
];
float
a1
=
y0
-
2.5
*
rByRDist
[
index
]
+
2
*
rByRDist
[
index
+
1
]
-
0.5
*
rByRDist
[
index
+
2
];
float
a2
=
-
0.5
*
y0
+
0.5
*
rByRDist
[
index
+
1
];
float
a3
=
rByRDist
[
index
];
float
result
=
a0
*
mu
*
mu2
+
a1
*
mu2
+
a2
*
mu
+
a3
;
return
result
;
}
__device__
void
printGeometryCorrection
(
struct
gc
*
g
){
#ifndef JCUDA
printf
(
"
\n
Geometry Correction
\n
------------------
\n
"
);
printf
(
"%22s: %f
\n
"
,
"pixelCorrectionWidth"
,
g
->
pixelCorrectionWidth
);
printf
(
"%22s: %f
\n
"
,
"pixelCorrectionHeight"
,
g
->
pixelCorrectionHeight
);
printf
(
"%22s: %f
\n
"
,
"line_time"
,
g
->
line_time
);
printf
(
"%22s: %f
\n
"
,
"focalLength"
,
g
->
focalLength
);
printf
(
"%22s: %f
\n
"
,
"pixelSize"
,
g
->
pixelSize
);
printf
(
"%22s: %f
\n
"
,
"distortionRadius"
,
g
->
distortionRadius
);
printf
(
"%22s: %f
\n
"
,
"distortionC"
,
g
->
distortionC
);
printf
(
"%22s: %f
\n
"
,
"distortionB"
,
g
->
distortionB
);
printf
(
"%22s: %f
\n
"
,
"distortionA"
,
g
->
distortionA
);
printf
(
"%22s: %f
\n
"
,
"distortionA5"
,
g
->
distortionA5
);
printf
(
"%22s: %f
\n
"
,
"distortionA6"
,
g
->
distortionA6
);
printf
(
"%22s: %f
\n
"
,
"distortionA7"
,
g
->
distortionA7
);
printf
(
"%22s: %f
\n
"
,
"distortionA8"
,
g
->
distortionA8
);
printf
(
"%22s: %f
\n
"
,
"elevation"
,
g
->
elevation
);
printf
(
"%22s: %f
\n
"
,
"heading"
,
g
->
heading
);
printf
(
"%22s: %f, %f, %f, %f
\n
"
,
"forward"
,
g
->
forward
[
0
],
g
->
forward
[
1
],
g
->
forward
[
2
],
g
->
forward
[
3
]);
printf
(
"%22s: %f, %f, %f, %f
\n
"
,
"right"
,
g
->
right
[
0
],
g
->
right
[
1
],
g
->
right
[
2
],
g
->
right
[
3
]);
printf
(
"%22s: %f, %f, %f, %f
\n
"
,
"height"
,
g
->
height
[
0
],
g
->
height
[
1
],
g
->
height
[
2
],
g
->
height
[
3
]);
printf
(
"%22s: %f, %f, %f, %f
\n
"
,
"roll"
,
g
->
roll
[
0
],
g
->
roll
[
1
],
g
->
roll
[
2
],
g
->
roll
[
3
]);
printf
(
"%22s: %f, %f
\n
"
,
"pXY0[0]"
,
g
->
pXY0
[
0
][
0
],
g
->
pXY0
[
0
][
1
]);
printf
(
"%22s: %f, %f
\n
"
,
"pXY0[1]"
,
g
->
pXY0
[
1
][
0
],
g
->
pXY0
[
1
][
1
]);
printf
(
"%22s: %f, %f
\n
"
,
"pXY0[2]"
,
g
->
pXY0
[
2
][
0
],
g
->
pXY0
[
2
][
1
]);
printf
(
"%22s: %f, %f
\n
"
,
"pXY0[3]"
,
g
->
pXY0
[
3
][
0
],
g
->
pXY0
[
3
][
1
]);
printf
(
"%22s: %f
\n
"
,
"common_right"
,
g
->
common_right
);
printf
(
"%22s: %f
\n
"
,
"common_forward"
,
g
->
common_forward
);
printf
(
"%22s: %f
\n
"
,
"common_height"
,
g
->
common_height
);
printf
(
"%22s: %f
\n
"
,
"common_roll"
,
g
->
common_roll
);
printf
(
"%22s: x=%f, y=%f
\n
"
,
"rXY[0]"
,
g
->
rXY
[
0
][
0
],
g
->
rXY
[
0
][
1
]);
printf
(
"%22s: x=%f, y=%f
\n
"
,
"rXY[1]"
,
g
->
rXY
[
1
][
0
],
g
->
rXY
[
1
][
1
]);
printf
(
"%22s: x=%f, y=%f
\n
"
,
"rXY[2]"
,
g
->
rXY
[
2
][
0
],
g
->
rXY
[
2
][
1
]);
printf
(
"%22s: x=%f, y=%f
\n
"
,
"rXY[3]"
,
g
->
rXY
[
3
][
0
],
g
->
rXY
[
3
][
1
]);
printf
(
"%22s: %f
\n
"
,
"cameraRadius"
,
g
->
cameraRadius
);
printf
(
"%22s: %f
\n
"
,
"disparityRadius"
,
g
->
disparityRadius
);
#endif //ifndef JCUDA
}
__device__
void
printExtrinsicCorrection
(
corr_vector
*
cv
)
{
#ifndef JCUDA
printf
(
"
\n
Extrinsic Correction Vector
\n
---------------------------
\n
"
);
printf
(
"%22s: %f, %f, %f
\n
"
,
"tilt"
,
cv
->
tilt
[
0
],
cv
->
tilt
[
1
],
cv
->
tilt
[
2
]);
printf
(
"%22s: %f, %f, %f
\n
"
,
"azimuth"
,
cv
->
azimuth
[
0
],
cv
->
azimuth
[
1
],
cv
->
azimuth
[
2
]);
printf
(
"%22s: %f, %f, %f, %f
\n
"
,
"roll"
,
cv
->
roll
[
0
],
cv
->
roll
[
1
],
cv
->
roll
[
2
],
cv
->
roll
[
3
]);
printf
(
"%22s: %f, %f, %f
\n
"
,
"zoom"
,
cv
->
zoom
[
0
],
cv
->
zoom
[
1
],
cv
->
zoom
[
2
]);
printf
(
"%22s: %f(t), %f(a), %f(r)
\n
"
,
"imu_rot"
,
cv
->
imu_rot
[
0
],
cv
->
imu_rot
[
1
],
cv
->
imu_rot
[
2
]);
printf
(
"%22s: %f(x), %f(y), %f(z)
\n
"
,
"imu_move"
,
cv
->
imu_move
[
0
],
cv
->
imu_move
[
1
],
cv
->
imu_move
[
2
]);
#endif //ifndef JCUDA
}
src/main/resources/kernels/geometry_correction.h
View file @
3c033192
...
...
@@ -41,6 +41,19 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846
/* pi */
#endif
#ifndef offsetof
#define offsetof(st, m) \
((size_t)&(((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct
tp_task
{
int
task
;
union
{
...
...
@@ -61,19 +74,50 @@ struct corr_vector{
float
imu_rot
[
3
];
// d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float
imu_move
[
3
];
// dx/dt, dy/dt, dz/dt 16..19
};
#ifdef NVRTC_BUG
struct
trot_deriv
{
float
rots
[
NUM_CAMS
][
3
][
3
];
float
d_daz
[
NUM_CAMS
][
3
][
3
];
float
d_tilt
[
NUM_CAMS
][
3
][
3
];
float
d_roll
[
NUM_CAMS
][
3
][
3
];
float
d_zoom
[
NUM_CAMS
][
3
][
3
];
};
#else
union
trot_deriv
{
struct
{
float
rots
[
NUM_CAMS
][
3
][
3
];
float
d_daz
[
NUM_CAMS
][
3
][
3
];
float
d_tilt
[
NUM_CAMS
][
3
][
3
];
float
d_roll
[
NUM_CAMS
][
3
][
3
];
float
d_zoom
[
NUM_CAMS
][
3
][
3
];
};
float
matrices
[
5
][
NUM_CAMS
][
3
][
3
];
};
#endif
struct
gc
{
float
pixelCorrectionWidth
;
// =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float
pixelCorrectionHeight
;
// =1936;
float
line_time
;
// duration of one scan line readout (for ERS)
float
focalLength
;
// =FOCAL_LENGTH;
float
pixelSize
;
// = PIXEL_SIZE; //um
float
distortionRadius
;
// = DISTORTION_RADIUS; // mm - half width of the sensor
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA6
;
//r^6 (normalized to focal length or to sensor half width?)
float
distortionA5
;
//r^5 (normalized to focal length or to sensor half width?)
float
distortionA
;
// r^4 (normalized to focal length or to sensor half width?)
float
distortionB
;
// r^3
#ifndef NVRTC_BUG
union
{
struct
{
#endif
float
distortionC
;
// r^2
float
distortionB
;
// r^3
float
distortionA
;
// r^4 (normalized to focal length or to sensor half width?)
float
distortionA5
;
//r^5 (normalized to focal length or to sensor half width?)
float
distortionA6
;
//r^6 (normalized to focal length or to sensor half width?)
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
// };
// float rad_coeff [7];
// };
#endif
// parameters, common for all sensors
float
elevation
;
// degrees, up - positive;
float
heading
;
// degrees, CW (from top) - positive
...
...
@@ -82,18 +126,36 @@ struct gc {
float
right
[
NUM_CAMS
];
float
height
[
NUM_CAMS
];
float
roll
[
NUM_CAMS
];
// degrees, CW (to target) - positive
float
pXY0
[
NUM_CAMS
][
2
];
float
common_right
;
// mm right, camera center
float
common_forward
;
// mm forward (to target), camera center
float
common_height
;
// mm up, camera center
float
common_roll
;
// degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float
rXY
[
NUM_CAMS
][
3
];
// XY pairs of the in a normal plane, relative to disparityRadius
float
rXY
[
NUM_CAMS
][
2
];
// XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float
cameraRadius
;
// =0; // average distance from the "mass center" of the sensors to the sensors
float
disparityRadius
;
// =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
#define RAD_COEFF_LEN 7
extern
"C"
__global__
void
get_tiles_offsets
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
struct
gc
*
gpu_geometry_correction
,
struct
corr_vector
*
gpu_correction_vector
,
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
trot_deriv
*
gpu_rot_deriv
);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
trot_deriv
*
gpu_rot_deriv
);
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment