Commit 6c76931e authored by Palani Johnson

ran formatter

parent 4648cb20
@@ -41,147 +41,152 @@
#include "tp_defines.h"
#endif
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float** gpu_kernel_offsets, // [NUM_CAMS],
float** gpu_kernels, // [NUM_CAMS],
float** gpu_images, // [NUM_CAMS],
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int* gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int* pnum_active_tiles, // indices to gpu_tasks
int tilesx);
extern "C" __global__ void correlate2D(
int num_cams,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute, squared
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int* gpu_corr_indices, // packed tile+pair
int* pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
// int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float* gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
float* gpu_corrs_td, // correlation tiles in transform domain
float* corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats
float* gpu_corrs, // correlation output data (either pixel domain or transform domain)
float fat_zero2, // here - absolute, squared
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int* gpu_corr_indices, // packed tile+pair
int* gpu_combo_indices, // output if not null: packed tile+pairs_mask (will point to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float* gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float* gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap(
int num_cams, // number of cameras
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row
// declare arrays in device code?
int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int* pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float params[5],
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
float* gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
int linescan_order, // 0 - low-res tiles have the same order as gpu_texture_indices, 1 - in linescan order
float* gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
int num_tilesx);
extern "C" __global__ void imclt_rbg_all(
int num_cams,
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
int colors,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C" __global__ void erase8x8(
float* gpu_top_left,
const size_t dstride);
extern "C" __global__ void imclt_rbg(
float* gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float* gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int* num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int* woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float params[5], // mitigating CUDA_ERROR_INVALID_PTX
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float* gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void accumulate_correlations(
int tilesY,
int tilesX,
int pairs,
float* num_acc, // number of accumulated tiles [tilesY][tilesX][pair]
float* fcorr_td, // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
float* fcorr_td_acc); // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
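For reference, the declarations above are prototypes only; no host code is part of this commit. A minimal launch sketch, assuming hypothetical device buffers and a wrapper name of our own (the header documents convert_direct as "called with a single block, single thread", hence <<<1, 1>>>):
// Hypothetical host-side wrapper, not from this repository; all pointer
// arguments are assumed to be device buffers prepared elsewhere.
void run_convert_direct(
	int num_cams, int num_colors,
	float **gpu_kernel_offsets, float **gpu_kernels, float **gpu_images,
	float *gpu_ftasks, float **gpu_clt, size_t dstride, int num_tiles,
	int woi_width, int woi_height, int kernels_hor, int kernels_vert,
	int *gpu_active_tiles, int *pnum_active_tiles, int tilesx) {
	convert_direct<<<1, 1>>>(num_cams, num_colors,
		gpu_kernel_offsets, gpu_kernels, gpu_images, gpu_ftasks, gpu_clt,
		dstride, num_tiles,
		0, // lpf_mask: "Now - always 0 !" per the declaration above
		woi_width, woi_height, kernels_hor, kernels_vert,
		gpu_active_tiles, pnum_active_tiles, tilesx);
	cudaDeviceSynchronize(); // simplistic; production code would also check errors
}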
@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f;
__constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f;
__constant__ float SQRT1_8 = 0.353553f;
__constant__ float COSN1[] = {0.980785f, 0.831470f};
__constant__ float COSN2[] = {0.995185f, 0.956940f, 0.881921f, 0.773010f};
__constant__ float SINN1[] = {0.195090f, 0.555570f};
__constant__ float SINN2[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f};
__constant__ int imclt_indx9[16] = {0x28, 0x29, 0x2a, 0x2b, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x24, 0x25, 0x26, 0x27};
__constant__ float idct_signs[4][4][4] = {
{// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, -1, -1, -1},
{-1, 1, 1, 1},
{-1, 1, 1, 1},
{-1, 1, 1, 1}},
{// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, 1, 1, -1},
{-1, -1, -1, 1},
{-1, -1, -1, 1},
{-1, -1, -1, 1}},
{// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, -1, -1, -1},
{1, -1, -1, -1},
{1, -1, -1, -1},
{-1, 1, 1, 1}},
{// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, 1, 1, -1},
{1, 1, 1, -1},
{1, 1, 1, -1},
{-1, -1, -1, 1}}};
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
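For reference (an inference from the printed values, not stated in the source): COSN1[k] = cos((2k+1)*pi/16) with SINN1[k] the matching sine, COSN2[k] = cos((2k+1)*pi/32) with SINN2[k] its sine, and HWINDOW2[k] = 0.5*sin((2k+1)*pi/32). A small host-side generator reproduces the tables:
#include <math.h>
#include <stdio.h>

// Regenerates the constant tables above from their apparent closed forms.
int main(void) {
	for (int k = 0; k < 2; k++)
		printf("COSN1[%d]=%.6ff SINN1[%d]=%.6ff\n", k, cos((2 * k + 1) * M_PI / 16), k, sin((2 * k + 1) * M_PI / 16));
	for (int k = 0; k < 4; k++)
		printf("COSN2[%d]=%.6ff SINN2[%d]=%.6ff\n", k, cos((2 * k + 1) * M_PI / 32), k, sin((2 * k + 1) * M_PI / 32));
	for (int k = 0; k < 8; k++)
		printf("HWINDOW2[%d]=%.6ff\n", k, 0.5 * sin((2 * k + 1) * M_PI / 32));
	return 0;
}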
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct); // not used
inline __device__ void dctiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dct_ii8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
* \return None
*/
#ifdef BBBB
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) {
int dtt_mode0 = dtt_mode & 1;
int dtt_mode1 = (dtt_mode >> 1) & 1;
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
@@ -162,1185 +157,1151 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
__syncthreads();
// horizontal pass
if (dtt_mode > 3) {
dttii_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else {
dttiv_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
}
__syncthreads();
// vertical pass
if (dtt_mode > 3) {
dttii_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else {
dttiv_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
}
__syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
#endif //#ifdef BBBB
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00 = (COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10 = (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u01 = (COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11 = -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float u02 = (COSN2[2] * x[2] + SINN2[2] * x[5]);
float u12 = (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u03 = (COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13 = -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
y[0] = SQRT_2 * v00; // w0[0];
y[1] = v01 - vb11; // w1[0];
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
__device__ void _dttiv(float x0, float x1, float x2, float x3, float x4, float x5, float x6, float x7,
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) {
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00 = (COSN2[0] * x7 + SINN2[0] * x0);
u10 = (-SINN2[3] * x4 + COSN2[3] * x3);
u01 = (COSN2[1] * x6 + SINN2[1] * x1);
u11 = -(-SINN2[2] * x5 + COSN2[2] * x2);
u02 = (COSN2[2] * x5 + SINN2[2] * x2);
u12 = (-SINN2[1] * x6 + COSN2[1] * x1);
u03 = (COSN2[3] * x4 + SINN2[3] * x3);
u13 = -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00 = (COSN2[0] * x0 + SINN2[0] * x7);
u10 = (-SINN2[3] * x3 + COSN2[3] * x4);
u01 = (COSN2[1] * x1 + SINN2[1] * x6);
u11 = -(-SINN2[2] * x2 + COSN2[2] * x5);
u02 = (COSN2[2] * x2 + SINN2[2] * x5);
u12 = (-SINN2[1] * x1 + COSN2[1] * x6);
u03 = (COSN2[3] * x3 + SINN2[3] * x4);
u13 = -(-SINN2[0] * x0 + COSN2[0] * x7);
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
// j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
*y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
} else {
*y1 = (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
} else {
*x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00 = (COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10 = (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
u01 = (COSN2[1] * (*x6) + SINN2[1] * (*x1));
u11 = -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
u02 = (COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12 = (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
u03 = (COSN2[3] * (*x4) + SINN2[3] * (*x3));
u13 = -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
} else { // DCTIV
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*x1 = (vb11 - v01) * SQRT1_8; // w1[0];
*x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = -vb00 * 0.5f; // w1[3];
} else {
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct) {
float sgn = 1 - 2 * dst_not_dct;
float *y0 = x;
float *y1 = y0 + inc;
float *y2 = y1 + inc;
float *y3 = y2 + inc;
float *y4 = y3 + inc;
float *y5 = y4 + inc;
float *y6 = y5 + inc;
float *y7 = y6 + inc;
float *x0 = x + dst_not_dct * 7 * inc;
// negate inc, replace
inc *= sgn;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
*y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = sgn * vb00 * 0.5f; // w1[3];
}
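Aside: dttiv_nodiverg above folds the DST-IV/DCT-IV difference into data instead of a divergent branch: sgn = 1 - 2 * dst_not_dct maps 0 to +1 and 1 to -1, and x0 = x + dst_not_dct * 7 * inc together with inc *= sgn reads the input in reverse for the DST case. A self-contained host demonstration of just that indexing trick (hypothetical test code, not from the repository):
#include <stdio.h>

// dst_not_dct = 0 reads elements 0..7 forward; 1 reads 7..0 backward.
int main(void) {
	const int inc0 = 1;
	for (int dst_not_dct = 0; dst_not_dct < 2; dst_not_dct++) {
		int sgn = 1 - 2 * dst_not_dct;      // 0 -> +1, 1 -> -1
		int start = dst_not_dct * 7 * inc0; // 0 -> index 0, 1 -> index 7
		int inc = inc0 * sgn;
		printf("dst_not_dct=%d:", dst_not_dct);
		for (int k = 0; k < 8; k++)
			printf(" %d", start + k * inc);
		printf("\n");
	}
	return 0;
}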
inline __device__ void dctiv_nodiverg(float *x0, int inc) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
inline __device__ void dstiv_nodiverg(float *x, int inc) {
float *x0 = x + 7 * inc;
// negate inc, replace
inc = -inc;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x7 = v00 * 0.5f; // w0[0];
*x5 = (v01 + vb11) * SQRT1_8; // w0[1];
*x3 = (v02 - vb01) * SQRT1_8; // w0[2];
*x1 = (v03 + vb10) * SQRT1_8; // w0[3];
*x6 = (vb11 - v01) * SQRT1_8; // w1[0];
*x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x2 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x0 = -vb00 * 0.5f; // w1[3];
}
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00 = (x[0] + x[7]);
float u10 = (x[0] - x[7]);
float u01 = (x[1] + x[6]);
float u11 = (x[1] - x[6]);
float u02 = (x[2] + x[5]);
float u12 = (x[2] - x[5]);
float u03 = (x[3] + x[4]);
float u13 = (x[3] - x[4]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v00 = w00 + w01;
float v02 = w00 - w01;
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
// _dctii_nrecurs2(u00, u01, &v00, &v01);
float z00 = w20 + w21;
float z01 = w20 - w21;
// _dctii_nrecurs2(u10, u11, &v10, &v11);
float z10 = w30 + w31;
float z11 = w30 - w31;
float v10 = SQRT_2 * z00;
float v11 = z01 - z11;
float v12 = z01 + z11;
float v13 = SQRT_2 * z10;
y[0] = v00;
y[1] = v10;
y[2] = v01;
y[3] = v11;
y[4] = v02;
y[5] = v12;
y[6] = v03;
y[7] = v13;
}
inline __device__ void dct_ii8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctii_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8; i++) {
y[i] *= SQRT1_8;
}
}
__device__ void dct_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctiv_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8; i++) {
y[i] *= SQRT1_8;
}
}
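Aside: dct_iv8 is the recursive butterfly _dctiv_nrecurs8 followed by a 1/sqrt(8) scale, which appears to match the orthonormal DCT-IV convention (spot-checked for a constant input; treat the claim as an inference). A naive reference for testing, under that assumption and with a function name of our own:
#include <math.h>

// Hypothetical orthonormal 8-point DCT-IV reference, not from the repository:
// y[k] = sqrt(2/8) * sum_n x[n] * cos(pi/8 * (n + 0.5) * (k + 0.5))
static void dct_iv8_ref(const float x[8], float y[8]) {
	for (int k = 0; k < 8; k++) {
		double s = 0.0;
		for (int n = 0; n < 8; n++)
			s += x[n] * cos(M_PI / 8.0 * (n + 0.5) * (k + 0.5));
		y[k] = (float)(0.5 * s); // sqrt(2/8) == 0.5
	}
}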
inline __device__ void dst_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float xr[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
xr[i] = x[7 - i];
}
_dctiv_nrecurs8(xr, y);
#pragma unroll
for (int i = 0; i < 8; i += 2) {
y[i] *= SQRT1_8;
y[i + 1] *= -SQRT1_8;
}
}
//=========================== 2D functions ===============
__device__ void corrUnfoldTile(
int corr_radius,
float *qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float *rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int size2r1 = 2 * corr_radius + 1; // 15
int crp1 = corr_radius + 1; // 8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
float *qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float *qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float *qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
if (i > corr_radius) {
return; // not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float corr_pixscale = 0.25f;
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
/// int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_row_offs = i * size2r1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
/// for (int j = 1; j < DTT_SIZE; j++) {
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
rslt[rslt_base_index_pp] = corr_pixscale * (qdata0[i_transform_size + j] +
qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (qdata0[i_transform_size + j] -
qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
}
if (i == 0) {
return;
}
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float d = corr_pixscale * qdata2[im1_transform_size];
rslt[rslt_base_index_p] += d;
rslt[rslt_base_index_m] -= d;
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
int rslt_base_index_mp = rslt_base_index_m + j;
int rslt_base_index_mm = rslt_base_index_m - j;
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
float d3 = corr_pixscale * qdata3[im1_transform_size + j - 1];
// rslt[rslt_base_index_mp], rslt[rslt_base_index_mm] are partially calculated in the cycle common with i=0
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
rslt[rslt_base_index_pp] += d2 + d3;
rslt[rslt_base_index_pm] += d2 - d3;
}
}
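// Worked index check (illustrative, not from the original source): for
// corr_radius == 7 the unfolded tile is 15x15 and
// rslt_base_index == 15 * 8 - 8 == 112 == 7 * 15 + 7, i.e. the tile center;
// thread i then fills the +i and -i rows symmetrically from the four quadrants.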
__device__ void dttii_2d(
float *clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
{
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++) {
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x, DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++) {
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1, 1, is_sin); // horizontal pass, thread is row
}
__syncthreads();
}
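// (the 2D DTT-II is separable: each of the 4 quadrants gets 8 column
// transforms with stride DTT_SIZE1, then 8 row transforms with stride 1; the
// +1 padding of DTT_SIZE1 staggers columns across shared-memory ports)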
__device__ void dttiv_color_2d(
float *clt_tile,
int color) {
dctiv_nodiverg( // all colors
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
1); // int inc);
// __syncthreads();// worsened
if (color == BAYER_GREEN) {
dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
1); // int inc);
}
__syncthreads(); // __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
// __syncthreads();// worsened
if (color == BAYER_GREEN) {
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
}
__syncthreads(); // __syncwarp();
}
__device__ void dttiv_mono_2d(
float *clt_tile) {
// Copy 0-> 1
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
__syncthreads(); // __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after horizontal pass (mono)\n");
debug_print_clt1(clt_tile, -1, 0xf); // all 4 quadrants for mono
}
__syncthreads(); // __syncwarp();
#endif
dctiv_nodiverg( // CC
clt_tile + threadIdx.x,
DTT_SIZE1); // int inc,
dctiv_nodiverg( // SC
clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
DTT_SIZE1); // int inc,
dstiv_nodiverg( // CS
clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
dstiv_nodiverg( // SS
clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
__syncthreads(); // __syncwarp();
}
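// (mono path: unlike dttiv_color_2d(), all four quadrants are produced - the
// horizontal passes are C/S/C/S and the vertical passes C/C/S/S, yielding the
// CC, SC, CS and SS combinations labeled above)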
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping quadrants 1 and 2) and then unfolds with the window,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be zeroed
// before the first pass.
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int thr3 = threadIdx.x >> 3;
int column = threadIdx.x; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7;
int column4 = threadIdx.x >> 2;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int wcolumn = ((thr3 << 3) - thr3) ^ thr012; // 0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG3
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
clt_tile + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
clt_tile1 + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
DTT_SIZE1);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg(
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
__syncthreads(); // __syncwarp();
#ifdef DEBUG3
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float *rslt = mclt_tile + column;
#pragma unroll
for (int i = 0; i < 4; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val = __fmaf_rd(w, d0, val); // w*d0 + val
*rslt = val;
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#ifdef DEBUG3
__syncthreads(); // __syncwarp();
if ((threadIdx.x) == 0) {
printf("\nMCLT Tiles after IMCLT\n");
debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping quadrants 1 and 2) and then unfolds to the 16x16 tile,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be zeroed
// before the first pass.
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt8threads(
int do_acc, // 1 - add to previous value, 0 - overwrite
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug) {
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg( // quadrant 0
clt_tile + threadIdx.x, // pointer to start of row for quadrant 0
DTT_SIZE1);
dctiv_nodiverg( // quadrant 1
clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
DTT_SIZE1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // quadrant 2
clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
DTT_SIZE1);
dstiv_nodiverg( // quadrant 3
clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
DTT_SIZE1);
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg( // quadrant 0
clt_tile + DTT_SIZE1 * threadIdx.x, // pointer to start of row for quadrant 0
1);
dctiv_nodiverg( // quadrant 2
clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
1);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg( // quadrant 1
clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
1);
dstiv_nodiverg( // quadrant 3
clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
1);
__syncthreads(); // __syncwarp();
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
for (int thr3 = 0; thr3 < 2; thr3++) {
int thr3m = (thr3 << 3);
int column = threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7; // == threadIdx.x
int column4 = column >> 2; // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
int wcolumn = (thr3m - thr3) ^ thr012; // 0..7,7,..0
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float *rslt = mclt_tile + column;
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
#pragma unroll
for (int i = 0; i < 4; i++) {
float val = *rslt;
// facc
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// val =__fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = val;
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
}
#ifdef DEBUG7
__syncthreads(); // __syncwarp();
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)) {
printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads(); // __syncwarp();
}
__syncthreads(); // __syncwarp();
#endif
}
//#endif
@@ -45,57 +45,56 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
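// Derived values for DTT_SIZE_LOG2 == 3 (illustrative, not from the original
// source): DTT_SIZE = 8, DTT_SIZE1 = 9, DTT_SIZE2 = 16, DTT_SIZE21 = 17,
// DTT_SIZE4 = 32, DTT_SIZE2M1 = 15 - the 15x15 correlation and 16x16 unfolded
// MCLT tile dimensions used by the functions above.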
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH + 1)
// extern __constant__ float idct_signs[4][4][4];
// extern __constant__ int imclt_indx9[16];
// extern __constant__ float HWINDOW2[];
// kernels (not used so far)
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif // #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
int corr_radius,
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
extern __device__ void dttii_2d(
float* clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]
extern __device__ void dttiv_color_2d(
float* clt_tile,
int color);
extern __device__ void dttiv_mono_2d(
float* clt_tile);
extern __device__ void imclt(
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float* mclt_tile);
extern __device__ void imclt8threads(
int do_acc, // 1 - add to previous value, 0 - overwrite
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float* mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug);
@@ -37,16 +37,15 @@
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#ifndef get_task_size
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
#endif
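#if 0
// Worked example (illustrative sketch, not used by the build): assuming
// NUM_CAMS == 16 and sizeof(struct tp_task) / sizeof(float) == 101, each
// camera missing from a task saves 6 floats, so
// get_task_size(16) == 101 (the "101 floats for LWIR16" elsewhere) and
// get_task_size(4) == 101 - 6 * 12 == 29 (the "29 floats for quad EO").
static_assert(get_task_size(NUM_CAMS) == sizeof(struct tp_task) / sizeof(float),
"a full-camera task spans the whole tp_task struct");
#endif // illustration only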
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
@@ -57,9 +56,8 @@
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc *g, int num_cams);
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams);
/**
* Calculate non-distorted radius from distorted using table approximation
@@ -67,774 +65,777 @@ __device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist[RBYRDIST_LEN]); // shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = { // ...{cos,sin,const}...
{
// azimuth
{{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
{{0, 0, 0}, {0, 0, 1}, {0, 0, 0}},
{{0, 1, 0}, {0, 0, 0}, {1, 0, 0}},
},
{
// tilt
{{0, 0, 1}, {0, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {1, 0, 0}, {0, 1, 0}},
{{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
},
{
// roll*zoom
{{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 1}},
},
{
// d_azimuth
{{0, -1, 0}, {0, 0, 0}, {-1, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
{{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
},
{
// d_tilt
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
{{0, 0, 0}, {-1, 0, 0}, {0, -1, 0}},
},
{
// d_roll
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{-1, 0, 0}, {0, -1, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
},
{
// d_zoom
{{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
}};
__constant__ int angles_offsets[4] = {
offsetof(corr_vector, azimuth) / sizeof(float),
offsetof(corr_vector, tilt) / sizeof(float),
offsetof(corr_vector, roll) / sizeof(float),
offsetof(corr_vector, roll) / sizeof(float)};
__constant__ int mm_seq[3][3][3] = {
{
{6, 5, 12}, // a_t * a_z -> tmp0
{7, 6, 13}, // a_r * a_t -> tmp1
{7, 9, 14}, // a_r * a_dt -> tmp2
},
{
{7, 12, 0}, // a_r * tmp0 -> rot - bad
{13, 8, 1}, // tmp1 * a_daz -> deriv0 - good
{14, 5, 2}, // tmp2 * a_az -> deriv1 - good
},
{
{10, 12, 3}, // a_dr * tmp0 -> deriv2 - good
{11, 12, 4}, // a_dzoom * tmp0 -> deriv3 - good
{-1, -1, -1} // do nothing
}};
__constant__ int offset_rots = 0; // 0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12; // 12..15
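// Reading of mm_seq (illustrative, not from the original source): with
// matrices[5..11] = {az, tilt, roll*zoom, d_az, d_tilt, d_roll, d_zoom} and
// matrices[12..15] as temporaries, the three passes compute one 3x3 product
// per (pass, threadIdx.z) step:
// rot = (roll*zoom) * (tilt * az)
// deriv0 = ((roll*zoom) * tilt) * d_az
// deriv1 = ((roll*zoom) * d_tilt) * az
// deriv2 = d_roll * (tilt * az)
// deriv3 = d_zoom * (tilt * az)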
// inline __device__ int get_task_size_gc(int num_cams);
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams);
// inline __device__ int get_task_size_gc(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// }
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams) {
return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile);
}
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams) {
return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
}
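#if 0
// Decoding sketch (illustrative, not used by the build): the txy word returned
// by get_task_txy_gc() packs the tile grid position as (tileY << 16) | tileX,
// matching the decode used in get_tiles_offsets():
int cxy = get_task_txy_gc(num_tile, gpu_ftasks, num_cams);
int tileX = (cxy & 0xffff); // low 16 bits - horizontal tile index
int tileY = (cxy >> 16); // high 16 bits - vertical tile index
#endif // illustration only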
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
*/
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv) {
__shared__ float sincos[4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 + 4][3][3];
float angle;
float zoom;
int ncam = blockIdx.x; // threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1; // 0: az, 1: tilt, 2: roll, 3:roll
int is_sin = nangle1 & 1;
if ((threadIdx.z == 0) && (nangle < 4)) { // others just idle here
float *gangles = (float *)gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
/// if (ncam == (NUM_CAMS-1)){ // for the whole block
if (ncam == (num_cams - 1)) { // for the whole block
angle = 0.0;
zoom = 0.0;
/// for (int n = 0; n < (NUM_CAMS-1); n++){
for (int n = 0; n < (num_cams - 1); n++) {
angle -= *(gangles + n);
zoom -= gpu_correction_vector->zoom[n];
}
if (nangle >= 2) { // diverging for roll (last two)
angle = *(gangles + ncam);
}
} else {
angle = *(gangles + ncam);
zoom = gpu_correction_vector->zoom[ncam];
}
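// cos(a) == sin(a + pi/2): the is_sin == 0 lane pre-offsets the angle so the
// single sinf() below yields both the cosine and the sine entries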
if (!is_sin) {
angle += M_PI / 2;
}
float sc = sinf(angle);
if (nangle == 2) {
sc *= 1.0 + zoom;
}
sincos[nangle][is_sin] = sc;
}
__syncthreads();
#ifdef DEBUG20
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf(" Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices + threadIdx.z; // 0..2
int const_index = threadIdx.z; // 0..2
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
axis += 3; // skip index == 3
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
if (threadIdx.z == 0) {
axis += 3;
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20
const char *matrices_names[] = {"az", "tilt", "roll*zoom", "d_daz", "d_tilt", "d_roll", "d_zoom"};
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
for (int i = 0; i < 7; i++) {
printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
for (int row = 0; row < 3; row++) {
for (int col = 0; col < 3; col++) {
printf("%9.6f, ", matrices[offset_matrices + i][row][col]);
}
printf("\n");
}
}
}
__syncthreads(); // __syncwarp();
#endif // DEBUG20
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot
{13,8,1}, // tmp1 * a_daz -> deriv0
{14,5,2}, // tmp2 * a_az -> deriv1
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tmp0 -> deriv3
}};
*/
for (int i = 0; i < 3; i++) {
int srcl = mm_seq[i][threadIdx.z][0];
int srcr = mm_seq[i][threadIdx.z][1];
int dst = mm_seq[i][threadIdx.z][2];
if (srcl >= 0) {
matrices[dst][threadIdx.y][threadIdx.x] =
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x] +
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x] +
matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
}
__syncthreads();
}
// copy results to global memory
int gindx = threadIdx.z;
int lindx = offset_rots + threadIdx.z;
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
gindx += 3;
lindx += 3;
if (lindx < 5) {
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
}
__syncthreads();
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// All done - read/verify all arrays
}
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv) {
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
/// dim3 grid_geom ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
int tiles_per_block_geom = NUM_THREADS / num_cams;
dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);
//#define NUM_THREADS 32
if (threadIdx.x == 0) { // always 1
get_tiles_offsets<<<grid_geom, threads_geom>>>(
uniform_grid, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
}
// __syncthreads();// __syncwarp();
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
}
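// (calculate_tiles_offsets() is a single-thread wrapper: it sizes the grid as
// NUM_THREADS / num_cams tiles per block and launches get_tiles_offsets()
// from the device via CUDA dynamic parallelism)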
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
*/
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv) {
int task_size = get_task_size(num_cams);
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
__shared__ struct gc geometry_correction;
__shared__ float rByRDist[RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ trot_deriv rot_deriv;
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
float pXY[2]; // result to be copied to task
// blockDim.y
// copy data common to all threads
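// (each block below is a grid-stride copy: the dim_xy threads of the block
// step through the source structure dim_xy floats at a time until the whole
// struct is mirrored in shared memory)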
{
int cycles_copy_gc = ((sizeof(struct gc) / sizeof(float) + dim_xy - 1) / dim_xy);
float *gcp_local = (float *)&geometry_correction;
float *gcp_global = (float *)gpu_geometry_correction;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_gc; i++) {
if (offset < sizeof(struct gc) / sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
}
offset += dim_xy;
}
}
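// Cooperative strided copy: all dim_xy threads copy a flat float array from
// global to shared memory, thread t handling elements t, t + dim_xy, t + 2*dim_xy, ...
// For example (illustrative sizes only): for a 60-float struct and dim_xy = 32,
// cycles_copy = ceil(60 / 32) = 2, so thread 5 copies elements 5 and 37.
// The same pattern repeats for the three blocks below.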
{
int cycles_copy_cv = ((sizeof(struct corr_vector) / sizeof(float) + dim_xy - 1) / dim_xy);
float *cvp_local = (float *)&extrinsic_corr;
float *cvp_global = (float *)gpu_correction_vector;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_cv; i++) {
if (offset < sizeof(struct corr_vector) / sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
}
offset += dim_xy;
}
}
// TODO: maybe it is better to use system memory and not read the whole table?
{
int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
float *rByRDistp_local = (float *)rByRDist;
float *rByRDistp_global = (float *)gpu_rByRDist;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rbrd; i++) {
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
offset += dim_xy;
}
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
int cycles_copy_rot = ((sizeof(trot_deriv) / sizeof(float)) + dim_xy - 1) / dim_xy;
float *rots_local = (float *)&rot_deriv;
float *rots_global = (float *)gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rot; i++) {
if (offset < sizeof(trot_deriv) / sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += dim_xy;
}
}
__syncthreads();
int ncam = threadIdx.x;
if (task_num >= num_tiles) {
return;
}
int imu_exists = // todo - calculate once with rot_deriv?
(extrinsic_corr.imu_rot[0] != 0.0) ||
(extrinsic_corr.imu_rot[1] != 0.0) ||
(extrinsic_corr.imu_rot[2] != 0.0) ||
(extrinsic_corr.imu_move[0] != 0.0) ||
(extrinsic_corr.imu_move[1] != 0.0) ||
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int)blockIdx.x);
printGeometryCorrection(&geometry_correction, num_cams);
printExtrinsicCorrection(&extrinsic_corr, num_cams);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
float disparity = *(gpu_ftasks + task_size * task_num + 2);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float px = *(centerXY);
float py = *(centerXY + 1);
int cxy = *(int *)(gpu_ftasks + task_size * task_num + 1);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
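// txy packs both tile indices into one int: tileX in the low 16 bits, tileY in
// the high 16. E.g. cxy = 0x00130032 decodes to tileX = 0x32 = 50, tileY = 0x13 = 19.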
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)) {
printf("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX, tileY, ncam);
printf("\n");
__syncthreads();
}
#endif //#ifdef DEBUG23
px = tileX * DTT_SIZE + DTT_SIZE / 2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE / 2; // - shiftY;
*(centerXY) = px;
*(centerXY + 1) = py;
}
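// With uniform_grid set, the tile center falls on the regular DTT grid and is
// written back to the task. E.g., assuming DTT_SIZE = 8, tile (50, 19) gets
// center px = 50*8 + 4 = 404, py = 19*8 + 4 = 156.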
__syncthreads();
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY[2];
rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1];
float rD = sqrtf(pXcd * pXcd + pYcd * pYcd) * 0.001 * geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R = getRByRDist(rD / geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz[3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
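// Note: rND2R converts the distorted center-camera radius to its non-distorted
// equivalent via the shared reverse table, and xyz[] is the implied world point:
// |Z| = SCENE_UNITS_SCALE * focalLength * disparityRadius / (disparity * 0.001 * pixelSize),
// so doubling the disparity halves the distance.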
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0], xyz[1], xyz[2]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was a loop in Java: for (int ncam = 0; ncam < NUM_CAMS; ncam++){...}; here each camera is its own thread)
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[1]; // [ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
#pragma unroll
for (int j = 0; j < 3; j++) {
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix / rvi[2];
float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z;
// Re-apply distortion
float rNDi = sqrtf(pXci * pXci + pYci * pYci); // in pixels
float ri = rNDi * ri_scale; // relative to distortion radius
float rD2rND = 1.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++) {
rri *= ri;
rD2rND += ((float *)&geometry_correction.distortionC)[j] * (rri - 1.0);
}
#else
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
rri *= ri;
rD2rND += geometry_correction.rad_coeff[j] * (rri - 1.0);
}
#endif
}
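// The block above evaluates the radial distortion ratio rD/rND as a power
// series: rD2rND = 1 + sum_j c[j] * (ri^(j+1) - 1) with c[] = {C, B, A, A5, A6, A7, A8}.
// This equals d + C*ri + B*ri^2 + ... + A8*ri^7 (d = 1 - sum(c)), i.e. the same
// polynomial k(r) used in calcReverseDistortionTable(), and is exactly 1.0 at
// ri == 1 (the distortion radius).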
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND;
pXY[0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[1] = pYid + geometry_correction.pXY0[ncam][1];
// new for ERS
pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam];
__syncthreads();
// Each thread re-calculate same sum
float lines_avg = 0;
for (int i = 0; i < num_cams; i++) {
lines_avg += pY_offsets[threadIdx.y][i];
}
lines_avg *= (1.0 / num_cams);
// used when calculating derivatives, TODO: combine calculations !
float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
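// pY_offset is this camera's scan line relative to the average over all cameras
// (each thread redundantly recomputes the same sum from shared pY_offsets); it
// becomes the time base, in scan lines, for the ERS correction below.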
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
printf("rNDi = %f, ri = %f\n", rNDi, ri);
printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // *
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float drvi_daz[3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl[3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl[3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for (int j = 0; j < 3; j++) {
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
}
float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
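// Quotient rule: since pXci = fl_pix * rvi[0] / rvi[2], each derivative is
// d(pXci)/dp = (drvi[0]/dp) * norm_z - pXci * (drvi[2]/dp) / rvi[2], and
// likewise for pYci - matching the six expressions above.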
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
float dpXci_pYci_imu_lin[2][3];
/*
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/
float dd1[2][2]; // get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0] * rXY[0] - rot_deriv.rots[ncam][0][1] * rXY[1]) * norm_z;
dd1[0][1] = (rot_deriv.rots[ncam][0][0] * rXY[1] - rot_deriv.rots[ncam][0][1] * rXY[0]) * norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0] * rXY[0] - rot_deriv.rots[ncam][1][1] * rXY[1]) * norm_z;
dd1[1][1] = (rot_deriv.rots[ncam][1][0] * rXY[1] - rot_deriv.rots[ncam][1][1] * rXY[0]) * norm_z;
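// dd1 is the top-left 2x2 of rots[ncam] * dd0, scaled by norm_z (see the Java
// excerpt above): its first column is d(pXci,pYci)/d(disparity), the second the
// derivative along the direction orthogonal to disparity, as noted after the
// debug block below.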
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("dd1[0][0] = %f, dd1[0][1] = %f\n", dd1[0][0], dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n", dd1[1][0], dd1[1][1]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float c_dist = pXci / rNDi;
float s_dist = pYci / rNDi;
//#undef NVRTC_BUG
float drD2rND_dri = 0.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++) {
drD2rND_dri += ((float *)&geometry_correction.distortionC)[j] * (j + 1) * rri;
rri *= ri;
}
#else
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
drD2rND_dri += geometry_correction.rad_coeff[j] * (j + 1) * rri;
rri *= ri;
}
#endif
}
float scale_distort00 = rD2rND + ri * drD2rND_dri;
float scale_distort11 = rD2rND;
float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = (c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = (c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
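// disp_dist is built by rotating dd1 into the radial/tangential frame
// (c_dist, s_dist), scaling radially by d(rD)/d(rND) = rD2rND + ri * drD2rND_dri
// and tangentially by rD2rND, then rotating back - a 2x2 transform stored
// row-major as {[0],[1]; [2],[3]}.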
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n", scale_distortXrot2Xdd1[0][0], scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n", scale_distortXrot2Xdd1[1][0], scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float *disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams * 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
*(disp_dist_p++) = disp_dist[3];
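// Per-task float layout (see struct tp_task / tp_task_xy_offset): task, txy,
// target_disparity, centerXY[2], then xy[num_cams][2], then
// disp_dist[num_cams][4] - hence the offset 5 + 2*num_cams + 4*ncam above.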
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists) {
float ers_x =
dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
dpXci_droll * extrinsic_corr.imu_rot[2];
float ers_y =
dpYci_dtilt * extrinsic_corr.imu_rot[0] +
dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
dpYci_droll * extrinsic_corr.imu_rot[2];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("ers_x = %f, ers_y = %f\n", ers_x, ers_y);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
if (disparity >= MIN_DISPARITY) { // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
//// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
float delta_t = (pY_offset / (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
pXY[0] += delta_t * ers_x * rD2rND; // added correction to pixel X
pXY[1] += delta_t * ers_y * rD2rND; // added correction to pixel Y
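// ERS: delta_t estimates when this pixel row is read out - pY_offset scan lines
// at line_time seconds each, with the 1/(1 - line_time * ers_y) factor
// apparently compensating for the row itself drifting at ers_y pixels/s during
// readout; the point is then shifted by delta_t * (ers_x, ers_y), scaled back
// to distorted pixels by rD2rND.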
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0], dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1], dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_x = %f, ers_y = %f\n", delta_t, ers_x, ers_y);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist (already copied)
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy goes right after the 5 common floats (task, txy, target_disparity and centerXY[2])
float *tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
extern "C" __global__ void calcReverseDistortionTable(
struct gc *geometry_correction,
float *rByRDist) {
// int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double delta = 1E-10; // 12; // 10; // -8; 0.730000 ms
double minDerivative = 0.01;
int numIterations = 1000;
double drDistDr = 1.0;
double d = 1.0 - geometry_correction->distortionA8 - geometry_correction->distortionA7 -
geometry_correction->distortionA6 - geometry_correction->distortionA5 -
geometry_correction->distortionA - geometry_correction->distortionB -
geometry_correction->distortionC;
double rPrev = 0.0;
int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
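// Each thread inverts rDist = r * k(r) for its chunk of the table by Newton's
// method: r <- r + (rDist - r*k(r)) / (d(r*k(r))/dr), warm-started from the
// previous entry (rPrev) and stopping once |rD - rDist| < delta.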
for (int p = 0; p < num_points; p++) {
int i = indx * num_points + p;
if (i >= RBYRDIST_LEN) {
return;
}
if (i == 0) {
rByRDist[0] = (float)1.0 / d;
continue;
}
double rDist = RBYRDIST_STEP * i;
double r = (p == 0) ? rDist : rPrev;
for (int iteration = 0; iteration < numIterations; iteration++) {
double k = (((((((geometry_correction->distortionA8) * r +
geometry_correction->distortionA7) * r +
geometry_correction->distortionA6) * r +
geometry_correction->distortionA5) * r +
geometry_correction->distortionA) * r +
geometry_correction->distortionB) * r +
geometry_correction->distortionC) * r + d;
drDistDr = (((((((8 * geometry_correction->distortionA8) * r +
7 * geometry_correction->distortionA7) * r +
6 * geometry_correction->distortionA6) * r +
5 * geometry_correction->distortionA5) * r +
4 * geometry_correction->distortionA) * r +
3 * geometry_correction->distortionB) * r +
2 * geometry_correction->distortionC) * r + d;
if (drDistDr < minDerivative) { // folds backwards !
return; // too high distortion
}
double rD = r * k;
if (fabs(rD - rDist) < delta) {
break;
}
r += (rDist - rD) / drDistDr;
}
rPrev = r;
rByRDist[i] = (float)r / rDist;
}
}
/**
* Interpolates the reverse distortion table with a Catmull-Rom spline.
* @param rDist distorted radius, relative to distortionRadius
* @param rByRDist reverse distortion table in shared memory (RBYRDIST_LEN entries)
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist[RBYRDIST_LEN]) // shared memory
{
if (rDist < 0) {
return 0.0f; // normally should not happen
}
float findex = rDist / RBYRDIST_STEP;
int index = (int)floorf(findex);
if (index < 0) {
index = 0;
}
if (index > (RBYRDIST_LEN - 3)) {
index = RBYRDIST_LEN - 3;
}
float mu = fmaxf(findex - index, 0.0f);
float mu2 = mu * mu;
float y0 = (index > 0) ? rByRDist[index - 1] : (2 * rByRDist[index] - rByRDist[index + 1]);
// use Catmull-Rom
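// Catmull-Rom spline through the 4 table samples around findex; at index 0 the
// missing left neighbor y0 is linearly extrapolated as 2*y[0] - y[1] above. The
// cubic a0*mu^3 + a1*mu^2 + a2*mu + a3 reproduces y[index] at mu = 0 and
// y[index+1] at mu = 1, with continuous first derivatives across segments.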
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index + 1] + 0.5 * rByRDist[index + 2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index + 1] - 0.5 * rByRDist[index + 2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index + 1];
float a3 = rByRDist[index];
float result = a0 * mu * mu2 + a1 * mu2 + a2 * mu + a3;
return result;
}
__device__ void printGeometryCorrection(struct gc *g, int num_cams) {
#ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n","pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n","line_time", g->line_time);
printf("%22s: %f\n","focalLength", g->focalLength);
printf("%22s: %f\n","pixelSize", g->pixelSize);
printf("%22s: %f\n","distortionRadius",g->distortionRadius);
printf("%22s: %f\n","distortionC", g->distortionC);
printf("%22s: %f\n","distortionB", g->distortionB);
printf("%22s: %f\n","distortionA", g->distortionA);
printf("%22s: %f\n","distortionA5",g->distortionA5);
printf("%22s: %f\n","distortionA6",g->distortionA6);
printf("%22s: %f\n","distortionA7",g->distortionA7);
printf("%22s: %f\n","distortionA8",g->distortionA8);
printf("%22s: %f\n","elevation", g->elevation);
printf("%22s: %f\n","heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s:","forward"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]); printf("\n");
printf("%22s:","right"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right [ncam]); printf("\n");
printf("%22s:","height"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height [ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll [ncam]); printf("\n");
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "pXY0[",ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
}
printf("%22s: %f\n","common_right", g->common_right);
printf("%22s: %f\n","common_forward", g->common_forward);
printf("%22s: %f\n","common_height", g->common_height);
printf("%22s: %f\n","common_roll", g->common_roll);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
}
printf("%22s: %f\n","cameraRadius", g->cameraRadius);
printf("%22s: %f\n","disparityRadius", g->disparityRadius);
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:","woi_tops"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]); printf("\n");
#endif //ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n", "pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n", "pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n", "line_time", g->line_time);
printf("%22s: %f\n", "focalLength", g->focalLength);
printf("%22s: %f\n", "pixelSize", g->pixelSize);
printf("%22s: %f\n", "distortionRadius", g->distortionRadius);
printf("%22s: %f\n", "distortionC", g->distortionC);
printf("%22s: %f\n", "distortionB", g->distortionB);
printf("%22s: %f\n", "distortionA", g->distortionA);
printf("%22s: %f\n", "distortionA5", g->distortionA5);
printf("%22s: %f\n", "distortionA6", g->distortionA6);
printf("%22s: %f\n", "distortionA7", g->distortionA7);
printf("%22s: %f\n", "distortionA8", g->distortionA8);
printf("%22s: %f\n", "elevation", g->elevation);
printf("%22s: %f\n", "heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s:", "forward");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]);
printf("\n");
printf("%22s:", "right");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right[ncam]);
printf("\n");
printf("%22s:", "height");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll[ncam]);
printf("\n");
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "pXY0[", ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
}
printf("%22s: %f\n", "common_right", g->common_right);
printf("%22s: %f\n", "common_forward", g->common_forward);
printf("%22s: %f\n", "common_height", g->common_height);
printf("%22s: %f\n", "common_roll", g->common_roll);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
}
printf("%22s: %f\n", "cameraRadius", g->cameraRadius);
printf("%22s: %f\n", "disparityRadius", g->disparityRadius);
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:", "woi_tops");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]);
printf("\n");
#endif // ifndef JCUDA
}
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams) {
#ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s:","tilt"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->tilt[ncam]); printf("\n");
printf("%22s:","azimuth"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->azimuth[ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]); printf("\n");
printf("%22s:","zoom"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->zoom[ncam]); printf("\n");
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif //ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s:", "tilt");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->tilt[ncam]);
printf("\n");
printf("%22s:", "azimuth");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->azimuth[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]);
printf("\n");
printf("%22s:", "zoom");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->zoom[ncam]);
printf("\n");
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif // ifndef JCUDA
}
@@ -41,147 +41,141 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
#ifndef offsetof
#define offsetof(st, m) \
((size_t)&(((st *)0)->m))
((size_t) & (((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float target_disparity;
float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float xy[NUM_CAMS][2];
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
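// get_task_size(x) trims the unused camera slots: each camera contributes
// 6 floats (xy[2] + disp_dist[4]). With NUM_CAMS = 16 the full struct is
// 101 floats (1 + 1 + 1 + 2 + 32 + 64), so get_task_size(16) = 101 (LWIR16)
// and get_task_size(4) = 101 - 6*12 = 29 (quad EO), matching the
// "29/101 floats" notes on gpu_ftasks.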
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct corr_vector {
float tilt[NUM_CAMS - 1]; // 0..2
float azimuth[NUM_CAMS - 1]; // 3..5
float roll[NUM_CAMS]; // 6..9
float zoom[NUM_CAMS - 1]; // 10..12
// for ERS correction:
float imu_rot[3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
};
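// Note: tilt, azimuth and zoom carry only NUM_CAMS-1 entries - the last
// camera's value is presumably constrained by the others (e.g. so they sum to
// zero), while roll is stored per camera; the index ranges in the comments
// (0..19) give the flattened float layout used when copying to shared memory.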
#ifdef NVRTC_BUG
struct trot_deriv {
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
#else
union trot_deriv {
struct {
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
float matrices[5][NUM_CAMS][3][3];
};
#endif
struct gc {
float pixelCorrectionWidth; // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float pixelCorrectionHeight; // =1936;
float line_time; // duration of one scan line readout (for ERS)
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
#ifndef NVRTC_BUG
union {
struct {
#endif
float distortionC; // r^2
float distortionB; // r^3
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA5; // r^5 (normalized to focal length or to sensor half width?)
float distortionA6; // r^6 (normalized to focal length or to sensor half width?)
float distortionA7; // r^7 (normalized to focal length or to sensor half width?)
float distortionA8; // r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
};
float rad_coeff[7];
};
#endif
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
float forward[NUM_CAMS];
float right[NUM_CAMS];
float height[NUM_CAMS];
float roll[NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0[NUM_CAMS][2];
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for elevation, heading, common_roll)
float rXY[NUM_CAMS][2]; // XY pairs of the lenses in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
float woi_tops[NUM_CAMS]; // used to calculate scanline timing
};
#define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc *geometry_correction,
float *rByRDist);
@@ -39,61 +39,61 @@
// Avoiding includes in jcuda, all source files will be merged
#pragma once
#ifndef JCUDA
#define TEST_LWIR 1
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 16 // now maximal number of cameras
//#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
//#define TEXTURE_THREADS 32 //
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - tile number, lower bits - pair number
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - tile number, lower bits - pair number
// only the lower bit will be used to request correlations; the correlation mask will be common for the whole scene
//#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
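// Packing sketch for entries of gpu_corr_indices (an assumed convention,
// consistent with the CORR_PAIRS_MASK comment above):
//   int corr_index = (tile_num << CORR_NTILE_SHIFT) | pair;    // pack
//   int pair2 = corr_index & ((1 << CORR_NTILE_SHIFT) - 1);    // unpack pair
//   int tile2 = corr_index >> CORR_NTILE_SHIFT;                // unpack tile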
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
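// Example of combining the texture-neighbor bits into a per-tile task word
// (illustrative; requests texture generation with all four neighbors):
//   int task = (1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT)
//            | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT);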
//#define TASK_TEXTURE_BIT 3 // bit to request texture calculation in the task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
//#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
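// Illustrative use of FAT_ZERO_WEIGHT (a sketch; port_weights and the
// normalization context are assumed names, not from this file):
//   float w_sum = FAT_ZERO_WEIGHT; // seed the sum so it can never be zero
//   for (int port = 0; port < num_cams; port++) w_sum += port_weights[port];
//   float scale = 1.0f / w_sum; // safe: w_sum >= FAT_ZERO_WEIGHT > 0, no NaN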
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#define RBYRDIST_LEN 5001 // for doubles (use 10001 for floats) // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles (use 0.0002 for floats) // to fit into GPU shared memory (was 0.001)
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
#define RBYRDIST_LEN 5001 // for doubles (use 10001 for floats) // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles (use 0.0002 for floats) // to fit into GPU shared memory (was 0.001)
#define TILES_PER_BLOCK_GEOM (32 / NUM_CAMS) // each tile has NUM_CAMS threads
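// Two usage sketches for the defines above (num_tiles, rDist and the host-side
// variable names are assumptions for these examples):
// (1) sizing geometry kernels - each block handles TILES_PER_BLOCK_GEOM tiles,
//     NUM_CAMS threads per tile:
//   dim3 threads_geom(num_cams, TILES_PER_BLOCK_GEOM, 1);
//   dim3 grid_geom((num_tiles + TILES_PER_BLOCK_GEOM - 1) / TILES_PER_BLOCK_GEOM, 1, 1);
// (2) linear interpolation into the rByRDist table, indexed in steps of
//     RBYRDIST_STEP and clamped to its RBYRDIST_LEN entries:
//   float fi = rDist / RBYRDIST_STEP; // fractional table index
//   int i = (int) fi;
//   if (i > (RBYRDIST_LEN - 2)) i = RBYRDIST_LEN - 2; // clamp to table range
//   float a = fi - i; // interpolation fraction
//   float rByR = rByRDist[i] * (1.0f - a) + rByRDist[i + 1] * a;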
#define DEBUG_ANY 1
#ifdef DEBUG_ANY
#ifdef DEBUG_ANY
//#define DEBUG_OOB1 1
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#endif
#undef DBG_MARK_DBG_TILE
//#undef DBG_TILE
......@@ -101,8 +101,7 @@
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
// 7
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
......@@ -118,7 +117,7 @@
#define DEBUG9 1
*/
//#define DEBUG8A 1 // generate_RBGA_host
//textures
// textures
//#define DEBUG10 1
//#define DEBUG11 1
//#define DEBUG12 1
......@@ -127,7 +126,6 @@
// geom
//#define DEBUG20 1
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
......@@ -136,10 +134,8 @@
//#define DEBUG22 1
//#define DEBUG23 1
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifdef DEBUG_ANY
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifndef JCUDA
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA