Commit 6c76931e authored by Palani Johnson

ran formatter

parent 4648cb20
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -41,147 +41,152 @@ ...@@ -41,147 +41,152 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
extern "C" __global__ void convert_direct( // called with a single block, single thread
extern "C" __global__ void convert_direct( // called with a single block, single thread // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters int num_cams, // actual number of cameras
int num_cams, // actual number of cameras int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono float** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernel_offsets, // [NUM_CAMS], float** gpu_kernels, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float** gpu_images, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS], float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16 // struct tp_task * gpu_tasks,
// struct tp_task * gpu_tasks, float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] size_t dstride, // in floats (pixels)
size_t dstride, // in floats (pixels) int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! int woi_width,
int woi_width, int woi_height,
int woi_height, int kernels_hor,
int kernels_hor, int kernels_vert,
int kernels_vert, int* gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles int* pnum_active_tiles, // indices to gpu_tasks
int * pnum_active_tiles, // indices to gpu_tasks int tilesx);
int tilesx);
extern "C" __global__ void correlate2D( extern "C" __global__ void correlate2D(
int num_cams, int num_cams,
// int * sel_pairs, // int * sel_pairs,
int sel_pairs0, int sel_pairs0,
int sel_pairs1, int sel_pairs1,
int sel_pairs2, int sel_pairs2,
int sel_pairs3, int sel_pairs3,
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
float scale0, // scale for R float scale0, // scale for R
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute, squared float fat_zero2, // here - absolute, squared
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16 float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs) // struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int* gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int* pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats size_t corr_stride, // in floats
// int corr_stride, // in floats // int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float* gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize( extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats const size_t corr_stride_td, // in floats
float * gpu_corrs_td, // correlation tiles in transform domain float* gpu_corrs_td, // correlation tiles in transform domain
float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it) float* corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats const size_t corr_stride, // in floats
float * gpu_corrs, // correlation output data (either pixel domain or transform domain) float* gpu_corrs, // correlation output data (either pixel domain or transform domain)
float fat_zero2, // here - absolute, squared float fat_zero2, // here - absolute, squared
int corr_radius); // radius of the output correlation (7 for 15x15) int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine( extern "C" __global__ void corr2D_combine(
int num_tiles, // number of tiles to process (each with num_pairs) int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same) int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair int* gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if not null: packed tile+pairs_mask (will point to the first used pair) int* gpu_combo_indices, // output if not null: packed tile+pairs_mask (will point to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles float* gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input) const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile) float* gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap( extern "C" __global__ void textures_nonoverlap(
int num_cams, // number of cameras int num_cams, // number of cameras
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row // int num_tilesx, // number of tiles in a row
// declare arrays in device code? // declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)) int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int * pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array int* pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY ! // TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction, struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction int is_lwir, // do not perform shot correction
float params[5], float params[5],
float weights[3], // scale for R,B,G float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
// combining both non-overlap and overlap (each calculated if pointer is not null ) // combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed float* gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
int linescan_order, // 0 - low-res tiles have the same order as gpu_texture_indices, 1 - in linescan order int linescan_order, // 0 - low-res tiles have the same order as gpu_texture_indices, 1 - in linescan order
float * gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed float* gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
int num_tilesx); int num_tilesx);
extern "C" extern "C" __global__ void imclt_rbg_all(
__global__ void imclt_rbg_all( int num_cams,
int num_cams, float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT] int apply_lpf,
int apply_lpf, int colors,
int colors, int woi_twidth,
int woi_twidth, int woi_theight,
int woi_theight, const size_t dstride); // in floats (pixels)
const size_t dstride); // in floats (pixels)
extern "C" __global__ void erase8x8( extern "C" __global__ void erase8x8(
float * gpu_top_left, float* gpu_top_left,
const size_t dstride); const size_t dstride);
extern "C" __global__ void imclt_rbg( extern "C" __global__ void imclt_rbg(
float * gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float* gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT float* gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf, int apply_lpf,
int mono, // defines lpf filter int mono, // defines lpf filter
int color, // defines location of clt data int color, // defines location of clt data
int v_offset, int v_offset,
int h_offset, int h_offset,
int woi_twidth, int woi_twidth,
int woi_theight, int woi_theight,
const size_t dstride); // in floats (pixels) const size_t dstride); // in floats (pixels)
extern "C" __global__ void generate_RBGA( extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used int num_cams, // number of cameras used
// Parameters to generate texture tasks // Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16 float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
// declare arrays in device code? // declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)) int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation) int* num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi int* woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1) int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation // Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY ! // TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction, struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction int is_lwir, // do not perform shot correction
float params[5], // mitigating CUDA_ERROR_INVALID_PTX float params[5], // mitigating CUDA_ERROR_INVALID_PTX
float weights[3], // scale for R,B,G float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float* gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void accumulate_correlations(
int tilesY,
int tilesX,
int pairs,
float* num_acc, // number of accumulated tiles [tilesY][tilesX][pair]
float* fcorr_td, // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
float* fcorr_td_acc); // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
...@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f; ...@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f;
__constant__ float SQRT_2 = 1.414214f; __constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f; __constant__ float SQRT1_2 = 0.707107f;
__constant__ float SQRT1_8 = 0.353553f; __constant__ float SQRT1_8 = 0.353553f;
__constant__ float COSN1[] = {0.980785f,0.831470f}; __constant__ float COSN1[] = {0.980785f, 0.831470f};
__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f}; __constant__ float COSN2[] = {0.995185f, 0.956940f, 0.881921f, 0.773010f};
__constant__ float SINN1[] = {0.195090f,0.555570f}; __constant__ float SINN1[] = {0.195090f, 0.555570f};
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f}; __constant__ float SINN2[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f};
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27}; __constant__ int imclt_indx9[16] = {0x28, 0x29, 0x2a, 0x2b, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x24, 0x25, 0x26, 0x27};
__constant__ float idct_signs[4][4][4] ={ __constant__ float idct_signs[4][4][4] = {
{ // quadrant 0, each element corresponds to 4x4 pixel output, covering altogether 16x16 {// quadrant 0, each element corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1}, {1, -1, -1, -1},
{-1, 1, 1, 1}, {-1, 1, 1, 1},
{-1, 1, 1, 1}, {-1, 1, 1, 1},
{-1, 1, 1, 1} {-1, 1, 1, 1}},
},{ // quadrant 1, each element corresponds to 4x4 pixel output, covering altogether 16x16 {// quadrant 1, each element corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1}, {1, 1, 1, -1},
{-1,-1,-1, 1}, {-1, -1, -1, 1},
{-1,-1,-1, 1}, {-1, -1, -1, 1},
{-1,-1,-1, 1} {-1, -1, -1, 1}},
},{ // quadrant 2, each element corresponds to 4x4 pixel output, covering altogether 16x16 {// quadrant 2, each element corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1}, {1, -1, -1, -1},
{ 1,-1,-1,-1}, {1, -1, -1, -1},
{ 1,-1,-1,-1}, {1, -1, -1, -1},
{-1, 1, 1, 1} {-1, 1, 1, 1}},
},{ // quadrant 3, each element corresponds to 4x4 pixel output, covering altogether 16x16 {// quadrant 3, each element corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1}, {1, 1, 1, -1},
{ 1, 1, 1,-1}, {1, 1, 1, -1},
{ 1, 1, 1,-1}, {1, 1, 1, -1},
{-1,-1,-1, 1} {-1, -1, -1, 1}}};
}}; __constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f, 0.386505f, 0.440961f, 0.478470f, 0.497592f};
0.386505f, 0.440961f, 0.478470f, 0.497592f};
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0] inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct); // not used
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV inline __device__ void dctiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used inline __device__ void dstiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dctiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg (float * x0, int inc); // used in TP inline __device__ void dct_ii8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_ii8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used inline __device__ void dst_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used inline __device__ void _dctii_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/** /**
************************************************************************** **************************************************************************
...@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t ...@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
* \return None * \return None
*/ */
#ifdef BBBB #ifdef BBBB
extern "C" extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) {
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) int dtt_mode0 = dtt_mode & 1;
{ int dtt_mode1 = (dtt_mode >> 1) & 1;
int dtt_mode0 = dtt_mode & 1;
int dtt_mode1 = (dtt_mode >>1) & 1;
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE]; __shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
...@@ -162,1185 +157,1151 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo ...@@ -162,1185 +157,1151 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
__syncthreads(); __syncthreads();
// horizontal pass // horizontal pass
if (dtt_mode > 3) { if (dtt_mode > 3) {
dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0); dttii_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else { } else {
dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0); dttiv_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} }
__syncthreads(); __syncthreads();
// vertical pass // vertical pass
if (dtt_mode > 3) { if (dtt_mode > 3) {
dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1); dttii_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else { } else {
dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1); dttiv_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} }
__syncthreads(); __syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++) for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE]; dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
} }
#endif //#ifdef BBBB #endif //#ifdef BBBB
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00 = (COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10 = (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u01 = (COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11 = -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each float u02 = (COSN2[2] * x[2] + SINN2[2] * x[5]);
{ float u12 = (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]); float u03 = (COSN2[3] * x[3] + SINN2[3] * x[4]);
float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]); float u13 = -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]); // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]); float ua00 = u00 + u03;
float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]); float ua10 = u00 - u03;
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03); float ua01 = u01 + u02;
float ua11 = u01 - u02;
float ua00= u00 + u03; float v00 = ua00 + ua01;
float ua10= u00 - u03; float v02 = ua00 - ua01;
float ua01= u01 + u02; float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float ua11= u01 - u02; float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v00= ua00 + ua01; // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11; float ub00 = u10 + u13;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11; float ub10 = u10 - u13;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13); float ub01 = u11 + u12;
float ub11 = u11 - u12;
float ub00= u10 + u13; float vb00 = ub00 + ub01;
float ub10= u10 - u13; float vb01 = ub00 - ub01;
float ub01= u11 + u12; float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float ub11= u11 - u12; float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
float vb00= ub00 + ub01; y[0] = SQRT_2 * v00; // w0[0];
float vb01= ub00 - ub01; y[1] = v01 - vb11; // w1[0];
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11; __device__ void _dttiv(float x0, float x1, float x2, float x3, float x4, float x5, float x6, float x7,
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11; float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) {
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00 = (COSN2[0] * x7 + SINN2[0] * x0);
u10 = (-SINN2[3] * x4 + COSN2[3] * x3);
u01 = (COSN2[1] * x6 + SINN2[1] * x1);
u11 = -(-SINN2[2] * x5 + COSN2[2] * x2);
y[0] = SQRT_2 * v00; // w0[0]; u02 = (COSN2[2] * x5 + SINN2[2] * x2);
y[1] = v01 - vb11; // w1[0]; u12 = (-SINN2[1] * x6 + COSN2[1] * x1);
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
__device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7, u03 = (COSN2[3] * x4 + SINN2[3] * x3);
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) u13 = -(-SINN2[0] * x7 + COSN2[0] * x0);
{ } else { // DCTIV
float u00, u01, u02, u03, u10, u11, u12, u13; u00 = (COSN2[0] * x0 + SINN2[0] * x7);
if (dst_not_dct) { // DSTIV u10 = (-SINN2[3] * x3 + COSN2[3] * x4);
u00= ( COSN2[0] * x7 + SINN2[0] * x0);
u10= (-SINN2[3] * x4 + COSN2[3] * x3);
u01= ( COSN2[1] * x6 + SINN2[1] * x1);
u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
u02= ( COSN2[2] * x5 + SINN2[2] * x2);
u12= (-SINN2[1] * x6 + COSN2[1] * x1);
u03= ( COSN2[3] * x4 + SINN2[3] * x3);
u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00= ( COSN2[0] * x0 + SINN2[0] * x7);
u10= (-SINN2[3] * x3 + COSN2[3] * x4);
u01= ( COSN2[1] * x1 + SINN2[1] * x6);
u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
u02= ( COSN2[2] * x2 + SINN2[2] * x5);
u12= (-SINN2[1] * x1 + COSN2[1] * x6);
u03= ( COSN2[3] * x3 + SINN2[3] * x4);
u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
// j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
*y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
} else {
*y1 = (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct) u01 = (COSN2[1] * x1 + SINN2[1] * x6);
{ u11 = -(-SINN2[2] * x2 + COSN2[2] * x5);
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
} else {
*x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct) u02 = (COSN2[2] * x2 + SINN2[2] * x5);
{ u12 = (-SINN2[1] * x1 + COSN2[1] * x6);
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct) u03 = (COSN2[3] * x3 + SINN2[3] * x4);
{ u13 = -(-SINN2[0] * x0 + COSN2[0] * x7);
float *x1 = x0 + inc; }
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
} else { // DCTIV
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*x1 = (vb11 - v01) * SQRT1_8; // w1[0];
*x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = -vb00 * 0.5f; // w1[3];
} else {
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct) // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
{
float sgn = 1 - 2* dst_not_dct; float ua00 = u00 + u03;
float *y0 = x; float ua10 = u00 - u03;
float *y1 = y0 + inc;
float *y2 = y1 + inc; float ua01 = u01 + u02;
float *y3 = y2 + inc; float ua11 = u01 - u02;
float *y4 = y3 + inc;
float *y5 = y4 + inc; float v00 = ua00 + ua01;
float *y6 = y5 + inc; float v02 = ua00 - ua01;
float *y7 = y6 + inc;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float *x0 = x + dst_not_dct * 7 * inc; float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// negate inc, replace
inc *= sgn; // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float *x1 = x0 + inc;
float *x2 = x1 + inc; float ub00 = u10 + u13;
float *x3 = x2 + inc; float ub10 = u10 - u13;
float *x4 = x3 + inc;
float *x5 = x4 + inc; float ub01 = u11 + u12;
float *x6 = x5 + inc; float ub11 = u11 - u12;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13; float vb00 = ub00 + ub01;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7)); float vb01 = ub00 - ub01;
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6)); float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
*y0 = v00 * 0.5f; // w0[0];
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5)); // j == 1
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6)); *y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4)); *y4 = (v02 - vb01) * SQRT1_8; // w0[2];
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7)); // j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03); if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
float ua00= u00 + u03; *y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
float ua10= u00 - u03; *y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
float ua01= u01 + u02; } else {
float ua11= u01 - u02; *y1 = (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
float v00= ua00 + ua01; *y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
float v02= ua00 - ua01; *y7 = vb00 * 0.5f; // w1[3];
}
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
*y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = sgn * vb00 * 0.5f; // w1[3];
} }
/**
 * In-place 8-point DCT-II or DST-II of a strided vector.
 *
 * The vector elements are x0[0], x0[inc], ..., x0[7 * inc]. All eight inputs
 * are read before any of them is overwritten with the result.
 *
 * The v00 term (first DCT-II output, last DST-II output) is scaled by SQRT1_8
 * here; dttii_shared_mem_nonortho scales that same term by 0.5f instead.
 *
 * @param x0          pointer to the first element (the name suggests shared
 *                    memory; this function does not depend on it)
 * @param inc         distance between consecutive elements, in floats
 * @param dst_not_dct non-zero selects DST-II, zero selects DCT-II
 */
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct) {
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00 = ((*x0) - (*x7));
        u10 = ((*x0) + (*x7));
        u01 = (-(*x1) + (*x6));
        u11 = (-(*x1) - (*x6));
        u02 = ((*x2) - (*x5));
        u12 = ((*x2) + (*x5));
        u03 = (-(*x3) + (*x4));
        u13 = (-(*x3) - (*x4));
    } else { // DCTII
        u00 = ((*x0) + (*x7));
        u10 = ((*x0) - (*x7));
        u01 = ((*x1) + (*x6));
        u11 = ((*x1) - (*x6));
        u02 = ((*x2) + (*x5));
        u12 = ((*x2) - (*x5));
        u03 = ((*x3) + (*x4));
        u13 = ((*x3) - (*x4));
    }
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    // v00 and v02 are formed directly in the output stage as w00 +/- w01.
    float w00 = u00 + u03;
    float w10 = u00 - u03;
    float w01 = (u01 + u02);
    float w11 = (u01 - u02);
    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // Inlined _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31) * 0.5f;    // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
    } else {
        *x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
        *x1 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31) * 0.5f;    // v13 * SQRT1_8; z10 * 0.5f
    }
}
/**
 * In-place 8-point DCT-II or DST-II of a strided vector, with the v00 term
 * scaled by 0.5f instead of SQRT1_8 ("no 1/sqrt(2)").
 *
 * The vector elements are x0[0], x0[inc], ..., x0[7 * inc]. All eight inputs
 * are read before any of them is overwritten with the result. Apart from the
 * scale of the v00 term (first DCT-II output, last DST-II output) the
 * arithmetic is the same as in dttii_shared_mem.
 *
 * @param x0          pointer to the first element
 * @param inc         distance between consecutive elements, in floats
 * @param dst_not_dct non-zero selects DST-II, zero selects DCT-II
 */
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct) {
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00 = ((*x0) - (*x7));
        u10 = ((*x0) + (*x7));
        u01 = (-(*x1) + (*x6));
        u11 = (-(*x1) - (*x6));
        u02 = ((*x2) - (*x5));
        u12 = ((*x2) + (*x5));
        u03 = (-(*x3) + (*x4));
        u13 = (-(*x3) - (*x4));
    } else { // DCTII
        u00 = ((*x0) + (*x7));
        u10 = ((*x0) - (*x7));
        u01 = ((*x1) + (*x6));
        u11 = ((*x1) - (*x6));
        u02 = ((*x2) + (*x5));
        u12 = ((*x2) - (*x5));
        u03 = ((*x3) + (*x4));
        u13 = ((*x3) - (*x4));
    }
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    // v00 and v02 are formed directly in the output stage as w00 +/- w01.
    float w00 = u00 + u03;
    float w10 = u00 - u03;
    float w01 = (u01 + u02);
    float w11 = (u01 - u02);
    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // Inlined _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31) * 0.5f;    // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * 0.5f;    // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
    } else {
        *x0 = (w00 + w01) * 0.5f;    // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
        *x1 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31) * 0.5f;    // v13 * SQRT1_8; z10 * 0.5f
    }
}
/**
 * In-place 8-point DCT-IV or DST-IV of a strided vector.
 *
 * The vector elements are x0[0], x0[inc], ..., x0[7 * inc]. All eight inputs
 * are read before any of them is overwritten with the result. The DST-IV
 * branch uses the same butterflies as the DCT-IV branch, but reads the inputs
 * in reversed order and negates the odd-indexed outputs.
 *
 * @param x0          pointer to the first element
 * @param inc         distance between consecutive elements, in floats
 * @param dst_not_dct non-zero selects DST-IV, zero selects DCT-IV
 */
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct) {
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) { // DSTIV: inputs taken in reversed order (x7..x0)
        u00 = (COSN2[0] * (*x7) + SINN2[0] * (*x0));
        u10 = (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
        u01 = (COSN2[1] * (*x6) + SINN2[1] * (*x1));
        u11 = -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
        u02 = (COSN2[2] * (*x5) + SINN2[2] * (*x2));
        u12 = (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
        u03 = (COSN2[3] * (*x4) + SINN2[3] * (*x3));
        u13 = -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
    } else { // DCTIV
        u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
        u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
        u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
        u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
        u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
        u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
        u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
        u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    }
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00 = u00 + u03;
    float ua10 = u00 - u03;
    float ua01 = u01 + u02;
    float ua11 = u01 - u02;
    float v00 = ua00 + ua01;
    float v02 = ua00 - ua01;
    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // Inlined _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00 = u10 + u13;
    float ub10 = u10 - u13;
    float ub01 = u11 + u12;
    float ub11 = u11 - u12;
    float vb00 = ub00 + ub01;
    float vb01 = ub00 - ub01;
    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    // Even-indexed outputs are the same for both transform types.
    *x0 = v00 * 0.5f;              // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8;  // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8;  // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8;  // w0[3];
    if (dst_not_dct) { // DSTIV: odd-indexed outputs are negated
        *x1 = (vb11 - v01) * SQRT1_8;  // w1[0];
        *x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
        *x5 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
        *x7 = -vb00 * 0.5f;            // w1[3];
    } else {
        *x1 = (v01 - vb11) * SQRT1_8;  // w1[0];
        *x3 = (v02 + vb01) * SQRT1_8;  // w1[1];
        *x5 = (v03 - vb10) * SQRT1_8;  // w1[2]; - same as y[3]
        *x7 = vb00 * 0.5f;             // w1[3];
    }
}
/**
 * In-place 8-point DCT-IV / DST-IV of a strided vector, written without any
 * branch on the transform type.
 *
 * The type is folded into arithmetic: with dst_not_dct in {0, 1} the inputs are
 * fetched forwards (DCT-IV) or backwards (DST-IV) and the odd-indexed outputs
 * are multiplied by +1 or -1. Outputs always go to x[0], x[inc], ..., x[7*inc].
 * Every input is loaded before the first output is stored.
 *
 * @param x           pointer to the first element
 * @param inc         distance between consecutive elements, in floats
 * @param dst_not_dct 1 selects DST-IV, 0 selects DCT-IV
 */
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct) {
    const float sign = 1 - 2 * dst_not_dct;            // +1 (DCT) or -1 (DST)
    const float *src = x + dst_not_dct * 7 * inc;      // first sample to read
    const int src_step = (int)(inc * sign);            // signed read stride

    // Fetch the eight samples in transform order.
    const float s0 = *src;
    const float s1 = *(src + src_step);
    const float s2 = *(src + src_step + src_step);
    const float s3 = *(src + 3 * src_step);
    const float s4 = *(src + 4 * src_step);
    const float s5 = *(src + 5 * src_step);
    const float s6 = *(src + 6 * src_step);
    const float s7 = *(src + 7 * src_step);

    // Input rotations: "a" feeds the first 4-point DCT-II, "b" the second.
    const float a0 = (COSN2[0] * s0 + SINN2[0] * s7);
    const float b0 = (-SINN2[3] * s3 + COSN2[3] * s4);
    const float a1 = (COSN2[1] * s1 + SINN2[1] * s6);
    const float b1 = -(-SINN2[2] * s2 + COSN2[2] * s5);
    const float a2 = (COSN2[2] * s2 + SINN2[2] * s5);
    const float b2 = (-SINN2[1] * s1 + COSN2[1] * s6);
    const float a3 = (COSN2[3] * s3 + SINN2[3] * s4);
    const float b3 = -(-SINN2[0] * s0 + COSN2[0] * s7);

    // First 4-point DCT-II (on a0..a3).
    const float a_sum03 = a0 + a3;
    const float a_dif03 = a0 - a3;
    const float a_sum12 = a1 + a2;
    const float a_dif12 = a1 - a2;
    const float p0 = a_sum03 + a_sum12;
    const float p2 = a_sum03 - a_sum12;
    const float p1 = COSPI_1_8_SQRT2 * a_dif03 + COSPI_3_8_SQRT2 * a_dif12;
    const float p3 = COSPI_3_8_SQRT2 * a_dif03 - COSPI_1_8_SQRT2 * a_dif12;

    // Second 4-point DCT-II (on b0..b3).
    const float b_sum03 = b0 + b3;
    const float b_dif03 = b0 - b3;
    const float b_sum12 = b1 + b2;
    const float b_dif12 = b1 - b2;
    const float q00 = b_sum03 + b_sum12;
    const float q01 = b_sum03 - b_sum12;
    const float q10 = COSPI_1_8_SQRT2 * b_dif03 + COSPI_3_8_SQRT2 * b_dif12;
    const float q11 = COSPI_3_8_SQRT2 * b_dif03 - COSPI_1_8_SQRT2 * b_dif12;

    // Even-indexed outputs do not depend on the transform type.
    float *dst = x;
    *dst = p0 * 0.5f;                    // w0[0]
    dst += inc;
    *dst = sign * (p1 - q11) * SQRT1_8;  // w1[0]
    dst += inc;
    *dst = (p1 + q11) * SQRT1_8;         // w0[1]
    dst += inc;
    *dst = sign * (p2 + q01) * SQRT1_8;  // w1[1]
    dst += inc;
    *dst = (p2 - q01) * SQRT1_8;         // w0[2]
    dst += inc;
    *dst = sign * (p3 - q10) * SQRT1_8;  // w1[2]
    dst += inc;
    *dst = (p3 + q10) * SQRT1_8;         // w0[3]
    dst += inc;
    *dst = sign * q00 * 0.5f;            // w1[3]
}
/**
 * In-place 8-point DCT-IV of a strided vector (no transform-type branch).
 *
 * The vector elements are x0[0], x0[inc], ..., x0[7 * inc]. All eight inputs
 * are read before any of them is overwritten with the result.
 *
 * @param x0  pointer to the first element
 * @param inc distance between consecutive elements, in floats
 */
inline __device__ void dctiv_nodiverg(float *x0, int inc) {
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00 = u00 + u03;
    float ua10 = u00 - u03;
    float ua01 = u01 + u02;
    float ua11 = u01 - u02;
    float v00 = ua00 + ua01;
    float v02 = ua00 - ua01;
    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // Inlined _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00 = u10 + u13;
    float ub10 = u10 - u13;
    float ub01 = u11 + u12;
    float ub11 = u11 - u12;
    float vb00 = ub00 + ub01;
    float vb01 = ub00 - ub01;
    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *x0 = v00 * 0.5f;             // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8; // w0[3];
    *x1 = (v01 - vb11) * SQRT1_8; // w1[0];
    *x3 = (v02 + vb01) * SQRT1_8; // w1[1];
    *x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
    *x7 = vb00 * 0.5f;            // w1[3];
}
/**
 * In-place 8-point DST-IV of a strided vector (no transform-type branch).
 *
 * Uses the same butterflies as dctiv_nodiverg, but walks the vector backwards:
 * the local x0 is the LAST element (x + 7 * inc) and x7 is the first. All eight
 * inputs are read before any of them is overwritten. Outputs are stored back
 * through the reversed pointers, with the odd-indexed results negated.
 *
 * @param x   pointer to the first element
 * @param inc distance between consecutive elements, in floats
 */
inline __device__ void dstiv_nodiverg(float *x, int inc) {
    float *x0 = x + 7 * inc;
    // negate inc so that x1..x7 step back towards x
    inc = -inc;
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00 = u00 + u03;
    float ua10 = u00 - u03;
    float ua01 = u01 + u02;
    float ua11 = u01 - u02;
    float v00 = ua00 + ua01;
    float v02 = ua00 - ua01;
    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // Inlined _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00 = u10 + u13;
    float ub10 = u10 - u13;
    float ub01 = u11 + u12;
    float ub11 = u11 - u12;
    float vb00 = ub00 + ub01;
    float vb01 = ub00 - ub01;
    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    // x7 is the first element in memory, x0 the last.
    *x7 = v00 * 0.5f;              // w0[0];
    *x5 = (v01 + vb11) * SQRT1_8;  // w0[1];
    *x3 = (v02 - vb01) * SQRT1_8;  // w0[2];
    *x1 = (v03 + vb10) * SQRT1_8;  // w0[3];
    *x6 = (vb11 - v01) * SQRT1_8;  // w1[0];
    *x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
    *x2 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
    *x0 = -vb00 * 0.5f;            // w1[3];
}
/**
 * Unscaled 8-point DCT-II butterfly network.
 *
 * Reads x[0..7] and writes y[0..7]; every input is read before the first
 * output is stored. No normalization is applied here (dct_ii8 multiplies the
 * result by SQRT1_8).
 *
 * @param x input, 8 floats
 * @param y output, 8 floats
 */
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    float u00 = (x[0] + x[7]);
    float u10 = (x[0] - x[7]);
    float u01 = (x[1] + x[6]);
    float u11 = (x[1] - x[6]);
    float u02 = (x[2] + x[5]);
    float u12 = (x[2] - x[5]);
    float u03 = (x[3] + x[4]);
    float u13 = (x[3] - x[4]);
    // Inlined _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00 = u00 + u03;
    float w10 = u00 - u03;
    float w01 = (u01 + u02);
    float w11 = (u01 - u02);
    float v00 = w00 + w01;
    float v02 = w00 - w01;
    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // Inlined _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
    // Inlined _dctii_nrecurs2(u00, u01, &v00, &v01);
    float z00 = w20 + w21;
    float z01 = w20 - w21;
    // Inlined _dctii_nrecurs2(u10, u11, &v10, &v11);
    float z10 = w30 + w31;
    float z11 = w30 - w31;
    float v10 = SQRT_2 * z00;
    float v11 = z01 - z11;
    float v12 = z01 + z11;
    float v13 = SQRT_2 * z10;
    // Interleave the two halves: even outputs from v0x, odd from v1x.
    y[0] = v00;
    y[1] = v10;
    y[2] = v01;
    y[3] = v11;
    y[4] = v02;
    y[5] = v12;
    y[6] = v03;
    y[7] = v13;
}
/**
 * 8-point DCT-II: runs _dctii_nrecurs8 and scales every output by SQRT1_8.
 *
 * @param x input, 8 floats
 * @param y output, 8 floats
 */
inline __device__ void dct_ii8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctii_nrecurs8(x, y);
#pragma unroll
    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8;
    }
}
/**
 * 8-point DCT-IV: runs _dctiv_nrecurs8 and scales every output by SQRT1_8.
 *
 * @param x input, 8 floats
 * @param y output, 8 floats
 */
__device__ void dct_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctiv_nrecurs8(x, y);
#pragma unroll
    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8;
    }
}
/**
 * 8-point DST-IV computed through the DCT-IV kernel.
 *
 * The input is reversed into a local copy, passed to _dctiv_nrecurs8, and the
 * result is scaled by SQRT1_8 with the sign of every odd-indexed output
 * flipped. x itself is not modified by this function.
 *
 * @param x input, 8 floats
 * @param y output, 8 floats
 */
inline __device__ void dst_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    float xr[8];
#pragma unroll
    for (int i = 0; i < 8; i++) {
        xr[i] = x[7 - i];
    }
    _dctiv_nrecurs8(xr, y);
#pragma unroll
    for (int i = 0; i < 8; i += 2) {
        y[i] *= SQRT1_8;
        y[i + 1] *= -SQRT1_8;
    }
}
//=========================== 2D functions ===============
/**
 * Unfold four quadrants of transformed correlation data into a
 * (2 * corr_radius + 1) x (2 * corr_radius + 1) tile centered on zero shift.
 *
 * Each thread with threadIdx.x = i <= corr_radius fills the two result rows at
 * distance i from the center row (a single row for i = 0); threads with a
 * larger index return immediately. All values are scaled by 0.25.
 *
 * @param corr_radius half-size of the output tile (7 gives 15x15)
 * @param qdata0      [4][DTT_SIZE][DTT_SIZE1] 4 quadrants of the clt data, rows
 *                    extended to optimize shared ports
 * @param rslt        output tile, [2 * corr_radius + 1][2 * corr_radius + 1]
 *                    (15x15 for corr_radius = 7)
 */
__device__ void corrUnfoldTile(
    int corr_radius,
    float *qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
    float *rslt)   // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
    int size2r1 = 2 * corr_radius + 1; // 15
    int crp1 = corr_radius + 1;        // 8
    /// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
    int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
    float *qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
    float *qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
    float *qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
    int i = threadIdx.x;
    if (i > corr_radius) {
        return; // not needed, only use inner
    }
    // printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
    float corr_pixscale = 0.25f;
    int i_transform_size = i * DTT_SIZE1;                  // used to address source rows which are 9 long
    int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
    /// int rslt_row_offs = i * DTT_SIZE2M1;
    int rslt_row_offs = i * size2r1;
    int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
    int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
    // Contributions of quadrants 0 and 1: final for thread i = 0, partial for the others.
    rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
    rslt[rslt_base_index_m] = rslt[rslt_base_index_p];                  // nop for i=0 incomplete, will only be used for thread i=0
    /// for (int j = 1; j < DTT_SIZE; j++) {
    for (int j = 1; j <= corr_radius; j++) {
        int rslt_base_index_pp = rslt_base_index_p + j;
        int rslt_base_index_pm = rslt_base_index_p - j;
        rslt[rslt_base_index_pp] = corr_pixscale * (qdata0[i_transform_size + j] +
                                                    qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
        rslt[rslt_base_index_pm] = corr_pixscale * (qdata0[i_transform_size + j] +
                                                    -qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
    }
    if (i == 0) {
        return; // center row has no contribution from quadrants 2 and 3
    }
    /// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
    float d = corr_pixscale * qdata2[im1_transform_size];
    rslt[rslt_base_index_p] += d;
    rslt[rslt_base_index_m] -= d;
    for (int j = 1; j <= corr_radius; j++) {
        int rslt_base_index_pp = rslt_base_index_p + j;
        int rslt_base_index_pm = rslt_base_index_p - j;
        int rslt_base_index_mp = rslt_base_index_m + j;
        int rslt_base_index_mm = rslt_base_index_m - j;
        float d2 = corr_pixscale * qdata2[im1_transform_size + j];
        float d3 = corr_pixscale * qdata3[im1_transform_size + j - 1];
        // rslt[rslt_base_index_pp], rslt[rslt_base_index_pm] are partially calculated in the cycle common with i=0;
        // the mirrored row must be derived from them before they are updated.
        rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
        rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
        rslt[rslt_base_index_pp] += d2 + d3;
        rslt[rslt_base_index_pm] += d2 - d3;
    }
}
/**
 * 2D type-II transform of four quadrants using dttii_shared_mem_nonortho.
 *
 * Each thread handles one column (first pass) and then one row (second pass)
 * of every quadrant, indexed by threadIdx.x. The vertical pass uses DST-II for
 * quadrants 2 and 3, the horizontal pass for quadrants 1 and 3. A
 * __syncthreads() follows each pass.
 *
 * @param clt_corr shared memory, [4][DTT_SIZE1][DTT_SIZE]; transformed in place
 */
__device__ void dttii_2d(
    float *clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
{
    // change to 16-32 threads?? in next iteration
    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
    for (int q = 0; q < 4; q++) {
        int is_sin = (q >> 1) & 1;
        dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x, DTT_SIZE1, is_sin); // vertical pass, thread is column
    }
    __syncthreads();
    // hor pass, corresponding to vert pass in Java
    for (int q = 0; q < 4; q++) {
        int is_sin = q & 1;
        dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1, 1, is_sin); // horizontal pass, thread is row
    }
    __syncthreads();
}
/**
 * 2D type-IV transform of one color's tile data, in place.
 *
 * Horizontal pass: each thread (threadIdx.x = row) applies DCT-IV to its row of
 * quadrant 0 and, for BAYER_GREEN only, DST-IV to its row of quadrant 1.
 * Vertical pass: each thread (threadIdx.x = column) applies DCT-IV to its
 * column of quadrant 0 and, for BAYER_GREEN only, of quadrant 1.
 * A __syncthreads() follows each pass.
 *
 * @param clt_tile tile data; rows are DTT_SIZE1 floats apart and quadrant 1
 *                 starts DTT_SIZE1 * DTT_SIZE floats after quadrant 0
 * @param color    color index; only BAYER_GREEN enables the second quadrant
 */
__device__ void dttiv_color_2d(
    float *clt_tile,
    int color) {
    dctiv_nodiverg(                           // all colors
        clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
        1);                                   // int inc);
    // __syncthreads();// worsened
    if (color == BAYER_GREEN) {
        dstiv_nodiverg(                                                 // green only
            clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
            1);                                                         // int inc);
    }
    __syncthreads(); // __syncwarp();
#ifdef DEBUG222
    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
        debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1); // only 1 quadrant for R,B and 2 - for G
    }
    __syncthreads(); // __syncwarp();
#endif
    dctiv_nodiverg(             // all colors
        clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
        DTT_SIZE1);             // int inc,
    // __syncthreads();// worsened
    if (color == BAYER_GREEN) {
        dctiv_nodiverg(                                      // green only
            clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
            DTT_SIZE1);                                      // int inc,
    }
    __syncthreads(); // __syncwarp();
}
/**
 * In-place 2D DTT-IV of all four quadrants of a monochrome CLT tile.
 *
 * Thread threadIdx.x processes row threadIdx.x in the first pass and column
 * threadIdx.x in the second pass; rows are DTT_SIZE1 floats apart and
 * quadrants are DTT_SIZE1 * DTT_SIZE floats apart.
 *
 *   quadrant   row pass (inc = 1)   column pass (inc = DTT_SIZE1)
 *      0            DCT-IV                 DCT-IV   (CC)
 *      1            DST-IV                 DCT-IV   (SC)
 *      2            DCT-IV                 DST-IV   (CS)
 *      3            DST-IV                 DST-IV   (SS)
 *
 * This function does not copy data between quadrants itself.
 * NOTE(review): presumably the caller fills all four quadrants beforehand - confirm.
 * Contains __syncthreads(), so every thread of the block has to call it.
 *
 * @param clt_tile tile data, indexed as [quadrant][row][column] with row stride DTT_SIZE1
 */
__device__ void dttiv_mono_2d(
    float *clt_tile) {
    // Pass 1: along rows
    dctiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
        1); // int inc
    dstiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
        1); // int inc
    dctiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
        1); // int inc
    dstiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
        1); // int inc
    __syncthreads(); // __syncwarp(); rows must be complete before the column pass reads them
#ifdef DEBUG222
    if ((threadIdx.x) == 0) {
        // There is no color in the mono variant: report -1 and dump all four quadrants.
        printf("\nDTT Tiles after horizontal pass, color=%d\n", -1);
        debug_print_clt1(clt_tile, -1, 0xf);
    }
    __syncthreads(); // __syncwarp();
#endif
    // Pass 2: along columns
    dctiv_nodiverg( // CC
        clt_tile + threadIdx.x,
        DTT_SIZE1); // int inc
    dctiv_nodiverg( // SC
        clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
        DTT_SIZE1); // int inc
    dstiv_nodiverg( // CS
        clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[2][0][threadIdx.x], pointer to start of column
        DTT_SIZE1);                                          // int inc
    dstiv_nodiverg( // SS
        clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[3][0][threadIdx.x], pointer to start of column
        DTT_SIZE1);                                          // int inc
    __syncthreads(); // __syncwarp();
}
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame). The output should be
// zeroed before the first pass.
//
// Thread threadIdx.x:
//   - bit 3 (thr3) selects which pair of quadrants it transforms, bits 0..2 (thr012) select the row/column;
//   - after the transforms it accumulates output column threadIdx.x of mclt_tile, 16 rows in 4 groups of 4,
//     each group using its own set of signs idct_signs[quadrant][group][column4].
// clt_tile is transformed in place. Contains __syncthreads(), so every thread of the block has to call it.
//
// Per-column start index used for the unfolding (defined elsewhere):
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
    float *clt_tile,  // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float *mclt_tile) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
    int thr3 = threadIdx.x >> 3;
    int column = threadIdx.x; // modify to use 2*8 threads, if needed.
    int thr012 = threadIdx.x & 7;
    int column4 = threadIdx.x >> 2;
    // thr3 == 0: thr012, thr3 == 1: 7 ^ thr012, i.e. 0..7,7,..0
    int wcolumn = ((thr3 << 3) - thr3) ^ thr012;
    float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);  // quadrant 1
    float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
    float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG3
    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles before IDTT\n");
        debug_print_clt1(clt_tile, -1, 0xf); // all 4 quadrants
    }
    __syncthreads(); // __syncwarp();
#endif
    // Row pass (inc = 1): dct-iv on quadrants 0 (thr3 == 0) and 2 (thr3 == 1)
    dctiv_nodiverg(
        clt_tile + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
        1);
    // Row pass (inc = 1): dst-iv on quadrants 1 (thr3 == 0) and 3 (thr3 == 1)
    dstiv_nodiverg(
        clt_tile1 + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
        1);
    __syncthreads(); // __syncwarp(); rows must be complete before the column pass reads them
    // Column pass (inc = DTT_SIZE1): dct-iv on quadrants 0 (thr3 == 0) and 1 (thr3 == 1)
    dctiv_nodiverg(
        clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of column for quadrants 0 and 1
        DTT_SIZE1);
    // Column pass (inc = DTT_SIZE1): dst-iv on quadrants 2 (thr3 == 0) and 3 (thr3 == 1)
    dstiv_nodiverg(
        clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of column for quadrants 2 and 3
        DTT_SIZE1);
    __syncthreads(); // __syncwarp();
#ifdef DEBUG3
    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles after IDTT\n");
        debug_print_clt1(clt_tile, -1, 0xf); // all 4 quadrants
    }
    __syncthreads(); // __syncwarp();
#endif
    float hw = HWINDOW2[wcolumn];         // horizontal window value for this output column
    int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
    float *rslt = mclt_tile + column;     // walks down the output column, DTT_SIZE21 floats per row
    // Output rows 0..3: source row moves forward, stays in place after the last one
#pragma unroll
    for (int i = 0; i < 4; i++) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
        d0 += d1;
        d2 += d3;
        d0 += d2;
        if (i < 3) {
            clt_offset += DTT_SIZE1;
        }
        *rslt = __fmaf_rd(w, d0, val); // w*d0 + val
        rslt += DTT_SIZE21;
    }
    // Output rows 4..7: source row moves backward, including after the last one
#pragma unroll
    for (int i = 4; i < 8; i++) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
        d0 += d1;
        d2 += d3;
        d0 += d2;
        clt_offset -= DTT_SIZE1; // unconditional: the next group starts one source row further back
        *rslt = __fmaf_rd(w, d0, val); // w*d0 + val
        rslt += DTT_SIZE21;
    }
    // Output rows 8..11: window index runs 7..4, source row moves backward, stays in place after the last one
#pragma unroll
    for (int i = 7; i >= 4; i--) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
        d0 += d1;
        d2 += d3;
        d0 += d2;
        if (i > 4) {
            clt_offset -= DTT_SIZE1;
        }
        *rslt = __fmaf_rd(w, d0, val); // w*d0 + val
        rslt += DTT_SIZE21;
    }
    // Output rows 12..15: window index runs 3..0, source row moves forward
#pragma unroll
    for (int i = 3; i >= 0; i--) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
        d0 += d1;
        d2 += d3;
        d0 += d2;
        if (i > 0) {
            clt_offset += DTT_SIZE1;
        }
        *rslt = __fmaf_rd(w, d0, val); // w*d0 + val
        rslt += DTT_SIZE21;
    }
#ifdef DEBUG3
    __syncthreads(); // __syncwarp();
    if ((threadIdx.x) == 0) {
        printf("\nMCLT Tiles after IMCLT\n");
        debug_print_mclt(mclt_tile, -1);
    }
    __syncthreads(); // __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// output tile, either adding to it (to use Read-modify-write with 4 passes over the frame; the output should then be
// zeroed before the first pass) or overwriting it, as selected by do_acc.
//
// Each thread transforms column threadIdx.x and then row threadIdx.x of all four quadrants in place, and afterwards
// produces two output columns: threadIdx.x and threadIdx.x + 8 (the thr3 loop re-uses the 16-thread code of imclt()).
// Contains __syncthreads(), so every thread of the block has to call it.
//
// Per-column start index used for the unfolding (defined elsewhere):
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt8threads(
    int do_acc,       // 1 - add to previous value, 0 - overwrite
    float *clt_tile,  // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float *mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
    int debug) {      // non-zero enables the DEBUG7 printouts (only compiled in when DEBUG7 is defined)
    float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);  // quadrant 1
    float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
    float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG7
    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
        printf("\nDTT Tiles before IDTT\n");
        debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // all 4 quadrants
    }
    __syncthreads(); // __syncwarp();
#endif
    // Column pass (inc = DTT_SIZE1): dct-iv on quadrants 0 and 1
    dctiv_nodiverg(             // quadrant 0
        clt_tile + threadIdx.x, // pointer to start of column for quadrant 0
        DTT_SIZE1);
    dctiv_nodiverg(                                          // quadrant 1
        clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of column for quadrant 1
        DTT_SIZE1);
    // Column pass (inc = DTT_SIZE1): dst-iv on quadrants 2 and 3
    dstiv_nodiverg(                                          // quadrant 2
        clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of column for quadrant 2
        DTT_SIZE1);
    dstiv_nodiverg(                                          // quadrant 3
        clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of column for quadrant 3
        DTT_SIZE1);
    __syncthreads(); // __syncwarp(); columns must be complete before the row pass reads them
    // Row pass (inc = 1): dct-iv on quadrants 0 and 2
    dctiv_nodiverg(                         // quadrant 0
        clt_tile + DTT_SIZE1 * threadIdx.x, // pointer to start of row for quadrant 0
        1);
    dctiv_nodiverg(                                                      // quadrant 2
        clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
        1);
    // Row pass (inc = 1): dst-iv on quadrants 1 and 3
    dstiv_nodiverg(                                                      // quadrant 1
        clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
        1);
    dstiv_nodiverg(                                                      // quadrant 3
        clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
        1);
    __syncthreads(); // __syncwarp();
#ifdef DEBUG7
    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
        printf("\nDTT Tiles after IDTT\n");
        debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // all 4 quadrants
    }
    __syncthreads(); // __syncwarp();
#endif
    // re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
    for (int thr3 = 0; thr3 < 2; thr3++) {
        int thr3m = (thr3 << 3);
        int column = threadIdx.x + thr3m;         // output column: threadIdx.x, then threadIdx.x + 8
        int thr012 = threadIdx.x & 7;             // == threadIdx.x
        int column4 = column >> 2;                // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
        int wcolumn = (thr3m - thr3) ^ thr012;    // 0..7,7,..0
        float hw = HWINDOW2[wcolumn];             // horizontal window value for this output column
        int clt_offset = imclt_indx9[column];     // index in each of the 4 iclt quadrants, accounting for stride=9
        float *rslt = mclt_tile + column;         // walks down the output column, DTT_SIZE21 floats per row
#ifdef DEBUG7
        if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
            printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
                   thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
            debug_print_clt1(clt_tile, -1, 0xf); // all 4 quadrants
        }
        __syncthreads(); // __syncwarp();
#endif
        // Output rows 0..3: source row moves forward, stays in place after the last one
#pragma unroll
        for (int i = 0; i < 4; i++) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
            d0 += d1;
            d2 += d3;
            d0 += d2;
            if (i < 3) {
                clt_offset += DTT_SIZE1;
            }
            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
            rslt += DTT_SIZE21;
        }
        // Output rows 4..7: source row moves backward, including after the last one
#pragma unroll
        for (int i = 4; i < 8; i++) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
            d0 += d1;
            d2 += d3;
            d0 += d2;
            clt_offset -= DTT_SIZE1; // unconditional: the next group starts one source row further back
            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
            rslt += DTT_SIZE21;
        }
        // Output rows 8..11: window index runs 7..4, source row moves backward, stays in place after the last one
#pragma unroll
        for (int i = 7; i >= 4; i--) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
            d0 += d1;
            d2 += d3;
            d0 += d2;
            if (i > 4) {
                clt_offset -= DTT_SIZE1;
            }
            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
            rslt += DTT_SIZE21;
        }
        // Output rows 12..15: window index runs 3..0, source row moves forward
#pragma unroll
        for (int i = 3; i >= 0; i--) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
            d0 += d1;
            d2 += d3;
            d0 += d2;
            if (i > 0) {
                clt_offset += DTT_SIZE1;
            }
            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
            rslt += DTT_SIZE21;
        }
    }
#ifdef DEBUG7
    __syncthreads(); // __syncwarp();
    // Print one camera (threadIdx.y) at a time so the dumps do not interleave
    for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
        if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)) {
            printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
            debug_print_mclt(
                mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE]
                -1);
        }
        __syncthreads(); // __syncwarp();
    }
    __syncthreads(); // __syncwarp();
#endif
}
//#endif //#endif
...@@ -45,57 +45,56 @@ ...@@ -45,57 +45,56 @@
* with Nvidia Nsight, driver API when calling these kernels from Java * with Nvidia Nsight, driver API when calling these kernels from Java
*/ */
// Tile size is 2^DTT_SIZE_LOG2. It is defined here only when JCUDA is not defined.
// NOTE(review): presumably the JCUDA build supplies DTT_SIZE_LOG2 from the host side - confirm.
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
#endif
#pragma once
// Derived sizes (values in comments are for DTT_SIZE_LOG2 == 3)
#define DTT_SIZE (1 << DTT_SIZE_LOG2) // 8:  DTT tile side
#define DTT_SIZE1 (DTT_SIZE + 1)      // 9:  padded row stride of an 8-wide tile
#define DTT_SIZE2 (2 * DTT_SIZE)      // 16
#define DTT_SIZE21 (DTT_SIZE2 + 1)    // 17: padded row stride of a 16-wide tile
#define DTT_SIZE4 (4 * DTT_SIZE)      // 32
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)   // 15
// Color component indices
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH + 1)

// extern __constant__ float idct_signs[4][4][4];
// extern __constant__ int imclt_indx9[16];
// extern __constant__ float HWINDOW2[];
// kernels (not used so far)
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif // #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
    int corr_radius,
    float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
    float* rslt);  // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15

// In-place 2D DTT-II of 4 quadrants; quadrants are DTT_SIZE1 * DTT_SIZE floats apart, rows DTT_SIZE1 floats apart
extern __device__ void dttii_2d(
    float* clt_corr); // shared memory, [4][DTT_SIZE][DTT_SIZE1]

// In-place 2D DTT-IV of one color: quadrant 0 always, quadrant 1 as well when color == BAYER_GREEN
extern __device__ void dttiv_color_2d(
    float* clt_tile,
    int color);

// In-place 2D DTT-IV of all 4 quadrants (CC, SC, CS, SS) of a monochrome tile
extern __device__ void dttiv_mono_2d(
    float* clt_tile);

// 16-thread inverse MCLT: transforms clt_tile in place and accumulates the windowed result into mclt_tile
extern __device__ void imclt(
    float* clt_tile,   // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float* mclt_tile); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]

// 8-thread inverse MCLT: same as imclt(), each thread produces two output columns; may overwrite instead of adding
extern __device__ void imclt8threads(
    int do_acc,       // 1 - add to previous value, 0 - overwrite
    float* clt_tile,  // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float* mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
    int debug);
...@@ -37,16 +37,15 @@ ...@@ -37,16 +37,15 @@
*/ */
// Project headers are included only when JCUDA is not defined.
// NOTE(review): presumably the JCUDA build concatenates the sources instead - confirm.
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA

// Number of floats in a flattened task for x cameras: the float count of the full struct tp_task
// less 6 floats for each of the (NUM_CAMS - x) cameras that are not used.
// The argument is parenthesized so that expressions such as get_task_size(a + b) expand correctly.
#ifndef get_task_size
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - (x)))
#endif

// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
...@@ -57,9 +56,8 @@ ...@@ -57,9 +56,8 @@
#define DBG_CAM 3 // NOTE(review): presumably the camera index selected for debug output - confirm at the use sites

// Debug printouts, defined elsewhere in this file
__device__ void printGeometryCorrection(struct gc *g, int num_cams);
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams);
/**
 * Calculate non-distorted radius from distorted using table approximation
 * @param rDist    distorted radius (NOTE(review): units/normalization not visible here - confirm)
 * @param rByRDist lookup table of RBYRDIST_LEN entries, kept in shared memory
 * @return corresponding non-distorted radius
 */
inline __device__ float getRByRDist(float rDist,
                                    float rByRDist[RBYRDIST_LEN]); // shared memory
// Templates of the elementary rotation matrices and of their derivatives.
// Index order: [matrix][row][column][{cos, sin, const}]. calc_rot_deriv() evaluates
// every element as  cos_coeff * cos(angle) + sin_coeff * sin(angle) + const.
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {
    { // 0: azimuth
        {{ 1,  0, 0}, { 0,  0, 0}, { 0, -1, 0}},
        {{ 0,  0, 0}, { 0,  0, 1}, { 0,  0, 0}},
        {{ 0,  1, 0}, { 0,  0, 0}, { 1,  0, 0}},
    },
    { // 1: tilt
        {{ 0,  0, 1}, { 0,  0, 0}, { 0,  0, 0}},
        {{ 0,  0, 0}, { 1,  0, 0}, { 0,  1, 0}},
        {{ 0,  0, 0}, { 0, -1, 0}, { 1,  0, 0}},
    },
    { // 2: roll*zoom
        {{ 1,  0, 0}, { 0,  1, 0}, { 0,  0, 0}},
        {{ 0, -1, 0}, { 1,  0, 0}, { 0,  0, 0}},
        {{ 0,  0, 0}, { 0,  0, 0}, { 0,  0, 1}},
    },
    { // 3: d_azimuth
        {{ 0, -1, 0}, { 0,  0, 0}, {-1,  0, 0}},
        {{ 0,  0, 0}, { 0,  0, 0}, { 0,  0, 0}},
        {{ 1,  0, 0}, { 0,  0, 0}, { 0, -1, 0}},
    },
    { // 4: d_tilt
        {{ 0,  0, 0}, { 0,  0, 0}, { 0,  0, 0}},
        {{ 0,  0, 0}, { 0, -1, 0}, { 1,  0, 0}},
        {{ 0,  0, 0}, {-1,  0, 0}, { 0, -1, 0}},
    },
    { // 5: d_roll
        {{ 0, -1, 0}, { 1,  0, 0}, { 0,  0, 0}},
        {{-1,  0, 0}, { 0, -1, 0}, { 0,  0, 0}},
        {{ 0,  0, 0}, { 0,  0, 0}, { 0,  0, 0}},
    },
    { // 6: d_zoom
        {{ 1,  0, 0}, { 0,  1, 0}, { 0,  0, 0}},
        {{ 0, -1, 0}, { 1,  0, 0}, { 0,  0, 0}},
        {{ 0,  0, 0}, { 0,  0, 0}, { 0,  0, 0}},
    }};

// Offsets (in floats) of the angle arrays inside corr_vector, indexed by the
// angle number used in calc_rot_deriv(): azimuth, tilt, roll, roll.
// Roll is listed twice: entry 2 gets scaled by (1 + zoom) there, entry 3 does not.
__constant__ int angles_offsets[4] = {
    offsetof(corr_vector, azimuth) / sizeof(float),
    offsetof(corr_vector, tilt) / sizeof(float),
    offsetof(corr_vector, roll) / sizeof(float),
    offsetof(corr_vector, roll) / sizeof(float)};

// Matrix multiplication schedule for calc_rot_deriv():
// [stage][threadIdx.z] = {left, right, destination} indices into its shared
// matrices[] array; a negative left index means "nothing to do in this stage".
// Indices: 5 - a_az, 6 - a_t, 7 - a_r (roll*zoom), 8 - a_daz, 9 - a_dt,
//          10 - a_dr, 11 - a_dzoom, 12..14 - tmp0..tmp2.
// The "bad"/"good" remarks are carried over from the original comments.
__constant__ int mm_seq[3][3][3] = {
    {
        {6, 5, 12},  // a_t * a_az      -> tmp0
        {7, 6, 13},  // a_r * a_t       -> tmp1
        {7, 9, 14},  // a_r * a_dt      -> tmp2
    },
    {
        {7, 12, 0},  // a_r * tmp0      -> rot    - bad
        {13, 8, 1},  // tmp1 * a_daz    -> deriv0 - good
        {14, 5, 2},  // tmp2 * a_az     -> deriv1 - good
    },
    {
        {10, 12, 3}, // a_dr * tmp0     -> deriv2 - good
        {11, 12, 4}, // a_dzoom * tmp0  -> deriv3 - good
        {-1, -1, -1} // do nothing
    }};

// Layout of the shared matrices[] array in calc_rot_deriv()
__constant__ int offset_rots = 0;     // 0      - resulting rotation matrix
__constant__ int offset_derivs = 1;   // 1..4   - derivatives, should follow the rotation matrix
__constant__ int offset_matrices = 5; // 5..11  - elementary matrices built from ROTS_TEMPLATE
__constant__ int offset_tmp = 12;     // 12..15 - intermediate products

inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams);

// Kept for reference only; the get_task_size() macro evaluates the same expression:
// inline __device__ int get_task_size_gc(int num_cams){
//     return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// }
/**
 * Read the integer stored in float slot 0 of a tile's flattened task.
 *
 * @param num_tile   index of the tile (task) in the flattened array
 * @param gpu_ftasks flattened tasks, get_task_size(num_cams) floats per tile
 * @param num_cams   number of cameras actually used (defines the task stride)
 * @return bit pattern of slot 0 reinterpreted as int (no numeric conversion)
 */
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams) {
    // The slot holds an int kept in a float array: reinterpret the bits with the
    // CUDA intrinsic instead of dereferencing a type-punned pointer.
    return __float_as_int(gpu_ftasks[get_task_size(num_cams) * num_tile]);
}
/**
 * Read the integer stored in float slot 1 of a tile's flattened task.
 * get_tiles_offsets() decodes the same slot as tileX in the low 16 bits and
 * tileY in the bits above them.
 *
 * @param num_tile   index of the tile (task) in the flattened array
 * @param gpu_ftasks flattened tasks, get_task_size(num_cams) floats per tile
 * @param num_cams   number of cameras actually used (defines the task stride)
 * @return bit pattern of slot 1 reinterpreted as int (no numeric conversion)
 */
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams) {
    // The slot holds an int kept in a float array: reinterpret the bits with the
    // CUDA intrinsic instead of dereferencing a type-punned pointer.
    return __float_as_int(gpu_ftasks[get_task_size(num_cams) * num_tile + 1]);
}
/**
 * Calculate the rotation matrix and its derivatives by azimuth, tilt, roll and zoom
 * for one camera.
 *
 * Indexing used below: blockIdx.x selects the camera; threadIdx.x is the matrix
 * column, threadIdx.y the matrix row and threadIdx.z selects which of up to three
 * matrices/products a thread works on (a 3 x 3 x 3 thread block per camera).
 *
 * @param num_cams              number of cameras actually used
 * @param gpu_correction_vector extrinsic correction vector (angles and zoom), read only
 * @param gpu_rot_deriv         output: matrices[0] - rotation, matrices[1..4] - derivatives
 */
extern "C" __global__ void calc_rot_deriv(
    int num_cams,
    struct corr_vector *gpu_correction_vector,
    trot_deriv *gpu_rot_deriv) {
    // [0]: azimuth, [1]: tilt, [2]: roll scaled by (1 + zoom), [3]: roll; second index {cos, sin}
    __shared__ float sincos[4][2];
    // [0]: rot, [1..4]: derivatives, [5..11]: elementary matrices, [12..15]: temporaries
    __shared__ float matrices[5 + 7 + 4][3][3];

    const int col = threadIdx.x;
    const int row = threadIdx.y;
    const int layer = threadIdx.z;
    const int ncam = blockIdx.x;
    const int flat_xy = threadIdx.x + threadIdx.y * blockDim.x;
    const int nangle = flat_xy >> 1; // 0: az, 1: tilt, 2: roll, 3: roll
    const int is_sin = flat_xy & 1;
    float angle;
    float zoom;

    // Stage 1: layer 0 threads fill sincos[][], one value per thread; the others skip
    if ((layer == 0) && (nangle < 4)) {
        const float *cam_angles = (float *)gpu_correction_vector + angles_offsets[nangle]; // entry of camera 0
        const int last_cam = num_cams - 1;
        if (ncam != last_cam) {
            angle = cam_angles[ncam];
            zoom = gpu_correction_vector->zoom[ncam];
        } else {
            // last camera: negated sum over all the other cameras
            angle = 0.0f;
            zoom = 0.0f;
            for (int other = 0; other < last_cam; other++) {
                angle -= cam_angles[other];
                zoom -= gpu_correction_vector->zoom[other];
            }
            if (nangle >= 2) { // both roll entries are read directly instead
                angle = cam_angles[ncam];
            }
        }
        if (!is_sin) {
            angle += M_PI / 2; // sin(a + pi/2) == cos(a)
        }
        float sc = sinf(angle);
        if (nangle == 2) {
            sc *= 1.0 + zoom;
        }
        sincos[nangle][is_sin] = sc;
    }
    __syncthreads();
#ifdef DEBUG20
    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
        printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
        printf(" Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
        printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
    }
    __syncthreads(); // __syncwarp();
#endif // DEBUG20

    // Stage 2: elementary matrices. Pass 0 builds az / tilt / roll*zoom (templates 0..2),
    // pass 1 builds their derivatives (templates 3..5); both use the angle of this layer.
    for (int pass = 0; pass < 2; pass++) {
        const int tmpl = layer + 3 * pass;
        matrices[offset_matrices + tmpl][row][col] =
            ROTS_TEMPLATE[tmpl][row][col][0] * sincos[layer][0] + // cos
            ROTS_TEMPLATE[tmpl][row][col][1] * sincos[layer][1] + // sin
            ROTS_TEMPLATE[tmpl][row][col][2];                     // const
    }
    if (layer == 0) {
        // d_zoom (template 6) uses the roll that is not scaled by zoom
        matrices[offset_matrices + 6][row][col] =
            ROTS_TEMPLATE[6][row][col][0] * sincos[3][0] + // cos
            ROTS_TEMPLATE[6][row][col][1] * sincos[3][1] + // sin
            ROTS_TEMPLATE[6][row][col][2];                 // const
    }
    __syncthreads();
#ifdef DEBUG20
    const char *matrices_names[] = {"az", "tilt", "roll*zoom", "d_daz", "d_tilt", "d_roll", "d_zoom"};
    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        for (int i = 0; i < 7; i++) {
            printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
            for (int r = 0; r < 3; r++) {
                for (int c = 0; c < 3; c++) {
                    printf("%9.6f, ", matrices[offset_matrices + i][r][c]);
                }
                printf("\n");
            }
        }
    }
    __syncthreads(); // __syncwarp();
#endif // DEBUG20

    // Stage 3: three rounds of 3x3 products as scheduled by mm_seq; each thread
    // calculates one element. The barrier is outside of the condition, so every
    // thread reaches it in every round.
    for (int stage = 0; stage < 3; stage++) {
        const int lhs = mm_seq[stage][layer][0];
        const int rhs = mm_seq[stage][layer][1];
        const int out = mm_seq[stage][layer][2];
        if (lhs >= 0) {
            matrices[out][row][col] =
                matrices[lhs][row][0] * matrices[rhs][0][col] +
                matrices[lhs][row][1] * matrices[rhs][1][col] +
                matrices[lhs][row][2] * matrices[rhs][2][col];
        }
        __syncthreads();
    }

    // Stage 4: copy results to global memory: layer z stores matrix z and,
    // while the local index stays below 5, matrix z + 3.
    int gindx = layer;
    int lindx = offset_rots + layer;
    do {
#ifdef NVRTC_BUG
        // going beyond first dimension
        gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][row][col] = matrices[lindx][row][col];
#else
        gpu_rot_deriv->matrices[gindx][ncam][row][col] = matrices[lindx][row][col];
#endif
        gindx += 3;
        lindx += 3;
    } while (lindx < 5);
    __syncthreads();
#ifdef DEBUG21
    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
    }
    __syncthreads(); // __syncwarp();
#endif // DEBUG21
    // All done - read/verify all arrays
}
/**
 * Parent kernel that launches get_tiles_offsets() from the device, deriving
 * the child grid and block dimensions from num_cams and num_tiles.
 *
 * Child launch geometry: blockDim = (num_cams, NUM_THREADS / num_cams, 1), one
 * thread per (camera, tile) pair; the grid has enough blocks to cover num_tiles.
 *
 * Nothing is launched when there are no tiles or when num_cams is outside of
 * 1..NUM_THREADS: those values would otherwise divide by zero or request an
 * empty grid. A kernel has no return value, so this case is silently skipped.
 *
 * @param uniform_grid            ==0: use provided centers (as for interscene), !=0: calculate uniform grid
 * @param num_cams                number of cameras actually used
 * @param gpu_ftasks              flattened tasks, get_task_size(num_cams) floats per tile
 * @param num_tiles               number of tiles in the task list
 * @param gpu_geometry_correction geometry correction structure
 * @param gpu_correction_vector   extrinsic correction vector
 * @param gpu_rByRDist            radius table, length should match RBYRDIST_LEN
 * @param gpu_rot_deriv           rotation matrices and derivatives (see calc_rot_deriv())
 */
extern "C" __global__ void calculate_tiles_offsets(
    int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
    float *gpu_ftasks, // flattened tasks
    //		struct tp_task     * gpu_tasks,
    int num_tiles, // number of tiles in task
    struct gc *gpu_geometry_correction,
    struct corr_vector *gpu_correction_vector,
    float *gpu_rByRDist, // length should match RBYRDIST_LEN
    trot_deriv *gpu_rot_deriv) {
    // Reject parameters that would make the launch configuration invalid:
    //   num_cams == 0            -> NUM_THREADS / num_cams divides by zero
    //   num_cams > NUM_THREADS   -> tiles_per_block_geom == 0, second division by zero
    //   num_tiles <= 0           -> grid with no blocks
    if ((num_tiles <= 0) || (num_cams <= 0) || (num_cams > NUM_THREADS)) {
        return;
    }
    int tiles_per_block_geom = NUM_THREADS / num_cams;
    dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
    dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);

    if (threadIdx.x == 0) { // the original comment says this kernel runs as a single thread
        get_tiles_offsets<<<grid_geom, threads_geom>>>(
            uniform_grid,            // int                  uniform_grid
            num_cams,                // int                  num_cams
            gpu_ftasks,              // float              * gpu_ftasks
            num_tiles,               // int                  num_tiles
            gpu_geometry_correction, // struct gc          * gpu_geometry_correction
            gpu_correction_vector,   // struct corr_vector * gpu_correction_vector
            gpu_rByRDist,            // float              * gpu_rByRDist
            gpu_rot_deriv);          // trot_deriv         * gpu_rot_deriv
    }
    // No device-side synchronization here (was commented out in the original as well).
}
/* /*
* blockDim.x = NUM_CAMS * blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM * blockDim.y = TILES_PER_BLOCK_GEOM
*/ */
extern "C" __global__ void get_tiles_offsets( extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams, int num_cams,
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction, struct gc *gpu_geometry_correction,
struct corr_vector * gpu_correction_vector, struct corr_vector *gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv) trot_deriv *gpu_rot_deriv) {
{ int task_size = get_task_size(num_cams);
int task_size = get_task_size(num_cams); int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x; int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32) __shared__ struct gc geometry_correction;
__shared__ struct gc geometry_correction; __shared__ float rByRDist[RBYRDIST_LEN];
__shared__ float rByRDist [RBYRDIST_LEN]; __shared__ struct corr_vector extrinsic_corr;
__shared__ struct corr_vector extrinsic_corr; __shared__ trot_deriv rot_deriv;
__shared__ trot_deriv rot_deriv; /// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS]; __shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller float pXY[2]; // result to be copied to task
float pXY[2]; // result to be copied to task // blockDim.y
//blockDim.y // copy data common to all threads
// copy data common to all threads {
{ int cycles_copy_gc = ((sizeof(struct gc) / sizeof(float) + dim_xy - 1) / dim_xy);
int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy); float *gcp_local = (float *)&geometry_correction;
float * gcp_local = (float *) &geometry_correction; float *gcp_global = (float *)gpu_geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction; int offset = thread_xy;
int offset = thread_xy; for (int i = 0; i < cycles_copy_gc; i++) {
for (int i = 0; i < cycles_copy_gc; i++){ if (offset < sizeof(struct gc) / sizeof(float)) {
if (offset < sizeof(struct gc)/sizeof(float)) { *(gcp_local + offset) = *(gcp_global + offset);
*(gcp_local + offset) = *(gcp_global + offset); }
} offset += dim_xy;
offset += dim_xy; }
} }
} {
{ int cycles_copy_cv = ((sizeof(struct corr_vector) / sizeof(float) + dim_xy - 1) / dim_xy);
int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy); float *cvp_local = (float *)&extrinsic_corr;
float * cvp_local = (float *) &extrinsic_corr; float *cvp_global = (float *)gpu_correction_vector;
float * cvp_global = (float *) gpu_correction_vector; int offset = thread_xy;
int offset = thread_xy; for (int i = 0; i < cycles_copy_cv; i++) {
for (int i = 0; i < cycles_copy_cv; i++){ if (offset < sizeof(struct corr_vector) / sizeof(float)) {
if (offset < sizeof(struct corr_vector)/sizeof(float)) { *(cvp_local + offset) = *(cvp_global + offset);
*(cvp_local + offset) = *(cvp_global + offset); }
} offset += dim_xy;
offset += dim_xy; }
} }
} // TODO: maybe it is better to use system memory and not read all table?
// TODO: maybe it is better to use system memory and not read all table? {
{ int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy; float *rByRDistp_local = (float *)rByRDist;
float * rByRDistp_local = (float *) rByRDist; float *rByRDistp_global = (float *)gpu_rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist; int offset = thread_xy;
int offset = thread_xy; for (int i = 0; i < cycles_copy_rbrd; i++) {
for (int i = 0; i < cycles_copy_rbrd; i++){ if (offset < RBYRDIST_LEN) {
if (offset < RBYRDIST_LEN) { *(rByRDistp_local + offset) = *(rByRDistp_global + offset);
*(rByRDistp_local + offset) = *(rByRDistp_global + offset); }
} offset += dim_xy;
offset += dim_xy; }
} }
} // copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction) {
{ int cycles_copy_rot = ((sizeof(trot_deriv) / sizeof(float)) + dim_xy - 1) / dim_xy;
int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy; float *rots_local = (float *)&rot_deriv;
float * rots_local = (float *) &rot_deriv; float *rots_global = (float *)gpu_rot_deriv; // rot_matrices;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices; int offset = thread_xy;
int offset = thread_xy; for (int i = 0; i < cycles_copy_rot; i++) {
for (int i = 0; i < cycles_copy_rot; i++){ if (offset < sizeof(trot_deriv) / sizeof(float)) {
if (offset < sizeof(trot_deriv)/sizeof(float)) { *(rots_local + offset) = *(rots_global + offset);
*(rots_local + offset) = *(rots_global + offset); }
} offset += dim_xy;
offset += dim_xy; }
} }
} __syncthreads();
__syncthreads(); int ncam = threadIdx.x;
int ncam = threadIdx.x; if (task_num >= num_tiles) {
if (task_num >= num_tiles){ return;
return; }
} int imu_exists = // todo - calculate once with rot_deriv?
int imu_exists = // todo - calculate once with rot_deriv? (extrinsic_corr.imu_rot[0] != 0.0) ||
(extrinsic_corr.imu_rot[0] != 0.0) || (extrinsic_corr.imu_rot[1] != 0.0) ||
(extrinsic_corr.imu_rot[1] != 0.0) || (extrinsic_corr.imu_rot[2] != 0.0) ||
(extrinsic_corr.imu_rot[2] != 0.0) || (extrinsic_corr.imu_move[0] != 0.0) ||
(extrinsic_corr.imu_move[0] != 0.0) || (extrinsic_corr.imu_move[1] != 0.0) ||
(extrinsic_corr.imu_move[1] != 0.0) || (extrinsic_corr.imu_move[2] != 0.0);
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21 #ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nTile = %d, camera= %d\n", task_num, ncam); printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x); printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int)blockIdx.x);
printGeometryCorrection(&geometry_correction, num_cams); printGeometryCorrection(&geometry_correction, num_cams);
printExtrinsicCorrection(&extrinsic_corr,num_cams); printExtrinsicCorrection(&extrinsic_corr, num_cams);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
// String dbg_s = corr_vector.toString(); * 1) unapply common distortion (maybe for different - master camera)
/* Starting with required tile center X, Y and nominal distortion, for each sensor port: * 2) apply disparity
* 1) unapply common distortion (maybe for different - master camera) * 3) apply rotations and zoom
* 2) apply disparity * 4) re-apply distortion
* 3) apply rotations and zoom * 5) return port center X and Y
* 4) re-apply distortion * line_time
* 5) return port center X and Y */
* line_time
*/ // common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
// common code, calculated in parallel /// float disparity = gpu_tasks[task_num].target_disparity;
/// int cxy = gpu_tasks[task_num].txy; float disparity = *(gpu_ftasks + task_size * task_num + 2);
/// float disparity = gpu_tasks[task_num].target_disparity; float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float disparity = * (gpu_ftasks + task_size * task_num + 2); float px = *(centerXY);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset; float py = *(centerXY + 1);
float px = *(centerXY); int cxy = *(int *)(gpu_ftasks + task_size * task_num + 1);
float py = *(centerXY + 1); int tileX = (cxy & 0xffff);
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1); int tileY = (cxy >> 16);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16); // if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
// if (isnan(px)) { if (uniform_grid) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23 #ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){ if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)) {
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam); printf("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX, tileY, ncam);
printf("\n"); printf("\n");
__syncthreads(); __syncthreads();
} }
#endif //#ifdef DEBUG23 #endif //#ifdef DEBUG23
px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX; px = tileX * DTT_SIZE + DTT_SIZE / 2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY; py = tileY * DTT_SIZE + DTT_SIZE / 2; // - shiftY;
*(centerXY) = px; *(centerXY) = px;
*(centerXY + 1) = py; *(centerXY + 1) = py;
} }
__syncthreads(); __syncthreads();
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth; float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight; float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY [2]; float rXY[2];
rXY[0] = geometry_correction.rXY[ncam][0]; rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1]; rXY[1] = geometry_correction.rXY[ncam][1];
float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera float rD = sqrtf(pXcd * pXcd + pYcd * pYcd) * 0.001 * geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist); float rND2R = getRByRDist(rD / geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight) float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels float pYc = pYcd * rND2R; // in pixels
float xyz [3]; // getWorldCoordinates float xyz[3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius / xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far (disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity; xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity; xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same) // next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i]; // geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera float fl_pix = geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius; float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
#ifdef DEBUG21 #ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nuniform_grid=%d\n", uniform_grid); printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam); printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity); printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY); printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py); printf("px = %f, py = %f\n", px, py);
printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1)); printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd); printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]); printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R); printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc); printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale); printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]); printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0], xyz[1], xyz[2]);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){ // non-distorted XY of the shifted location of the individual sensor
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
// -------------- Each camera calculated by its own thread ---------------- float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels
float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels float pYci0 = pYc - disparity * rXY[1]; // [ncam][1];
float pYci0 = pYc - disparity * rXY[1]; // [ncam][1]; // rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions // Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction /// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}}; /// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis) // Apply port-individual combined rotation/zoom matrix
// Apply port-individual combined rotation/zoom matrix /// Matrix rvi = rots[i].times(vi);
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
float rvi[3];
#pragma unroll #pragma unroll
for (int j = 0; j< 3; j++){ for (int j = 0; j < 3; j++) {
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix; rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
} }
// get back to the projection plane by normalizing vector // get back to the projection plane by normalizing vector
float norm_z = fl_pix/rvi[2]; float norm_z = fl_pix / rvi[2];
float pXci = rvi[0] * norm_z; float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z; float pYci = rvi[1] * norm_z;
// Re-apply distortion // Re-apply distortion
float rNDi = sqrtf(pXci*pXci + pYci*pYci); // in pixels float rNDi = sqrtf(pXci * pXci + pYci * pYci); // in pixels
float ri = rNDi* ri_scale; // relative to distortion radius float ri = rNDi * ri_scale; // relative to distortion radius
float rD2rND = 1.0; float rD2rND = 1.0;
{ {
float rri = 1.0; float rri = 1.0;
#ifdef NVRTC_BUG #ifdef NVRTC_BUG
#pragma unroll #pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){ for (int j = 0; j < RAD_COEFF_LEN; j++) {
rri *= ri; rri *= ri;
rD2rND += ((float *) &geometry_correction.distortionC)[j]*(rri - 1.0); rD2rND += ((float *)&geometry_correction.distortionC)[j] * (rri - 1.0);
} }
#else #else
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){ for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
rri *= ri; rri *= ri;
rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0); rD2rND += geometry_correction.rad_coeff[j] * (rri - 1.0);
} }
#endif #endif
} }
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient) // Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND; float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND; float pYid = pYci * rD2rND;
pXY[0] = pXid + geometry_correction.pXY0[ncam][0]; pXY[0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[1] = pYid + geometry_correction.pXY0[ncam][1]; pXY[1] = pYid + geometry_correction.pXY0[ncam][1];
// new for ERS // new for ERS
pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam]; pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam];
__syncthreads(); __syncthreads();
// Each thread re-calculate same sum // Each thread re-calculate same sum
float lines_avg = 0; float lines_avg = 0;
for (int i = 0; i < num_cams; i ++){ for (int i = 0; i < num_cams; i++) {
lines_avg += pY_offsets[threadIdx.y][i]; lines_avg += pY_offsets[threadIdx.y][i];
} }
lines_avg *= (1.0/num_cams); lines_avg *= (1.0 / num_cams);
// used when calculating derivatives, TODO: combine calculations ! // used when calculating derivatives, TODO: combine calculations !
float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg; float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
#ifdef DEBUG21 #ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0); printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]); printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci); printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
printf("rNDi = %f, ri = %f\n", rNDi, ri); printf("rNDi = %f, ri = %f\n", rNDi, ri);
printf("rD2rND = %f\n", rD2rND); printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid); printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // * printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // *
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi); float drvi_daz[3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi); float drvi_dtl[3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi); float drvi_drl[3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll #pragma unroll
for (int j = 0; j< 3; j++){ for (int j = 0; j < 3; j++) {
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix; drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix; drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix; drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
} }
float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2]; float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2]; float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2]; float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2]; float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2]; float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2]; float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
#ifdef DEBUG210 #ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]); printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]); printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]); printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth); printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt); printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll); printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
float dpXci_pYci_imu_lin[2][3]; float dpXci_pYci_imu_lin[2][3];
/* /*
double [][] add0 = { double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0}, {-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0}, {-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element??? { 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0); Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/ */
float dd1[2][2];// get top left 2x2 sub-matrix float dd1[2][2]; // get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z; dd1[0][0] = (-rot_deriv.rots[ncam][0][0] * rXY[0] - rot_deriv.rots[ncam][0][1] * rXY[1]) * norm_z;
dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z; dd1[0][1] = (rot_deriv.rots[ncam][0][0] * rXY[1] - rot_deriv.rots[ncam][0][1] * rXY[0]) * norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z; dd1[1][0] = (-rot_deriv.rots[ncam][1][0] * rXY[0] - rot_deriv.rots[ncam][1][1] * rXY[1]) * norm_z;
dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z; dd1[1][1] = (rot_deriv.rots[ncam][1][0] * rXY[1] - rot_deriv.rots[ncam][1][1] * rXY[0]) * norm_z;
#ifdef DEBUG210 #ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("dd1[0][0] = %f, dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]); printf("dd1[0][0] = %f, dd1[0][1] = %f\n", dd1[0][0], dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]); printf("dd1[1][0] = %f, dd1[1][1] = %f\n", dd1[1][0], dd1[1][1]);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation) // now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius // unity vector in the direction of radius
float c_dist = pXci/rNDi; float c_dist = pXci / rNDi;
float s_dist = pYci/rNDi; float s_dist = pYci / rNDi;
//#undef NVRTC_BUG //#undef NVRTC_BUG
float drD2rND_dri = 0.0; float drD2rND_dri = 0.0;
{ {
float rri = 1.0; float rri = 1.0;
#ifdef NVRTC_BUG #ifdef NVRTC_BUG
#pragma unroll #pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){ for (int j = 0; j < RAD_COEFF_LEN; j++) {
drD2rND_dri += ((float *) &geometry_correction.distortionC)[j] * (j+1) * rri; drD2rND_dri += ((float *)&geometry_correction.distortionC)[j] * (j + 1) * rri;
rri *= ri; rri *= ri;
} }
#else #else
#pragma unroll #pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){ for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri; drD2rND_dri += geometry_correction.rad_coeff[j] * (j + 1) * rri;
rri *= ri; rri *= ri;
} }
#endif #endif
} }
float scale_distort00 = rD2rND + ri* drD2rND_dri; float scale_distort00 = rD2rND + ri * drD2rND_dri;
float scale_distort11 = rD2rND; float scale_distort11 = rD2rND;
float scale_distortXrot2Xdd1[2][2]; float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00; scale_distortXrot2Xdd1[0][0] = (c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00; scale_distortXrot2Xdd1[0][1] = (c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11; scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11; scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0]; disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1]; disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0]; disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1]; disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
#ifdef DEBUG210 #ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]); printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n", scale_distortXrot2Xdd1[0][0], scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]); printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n", scale_distortXrot2Xdd1[1][0], scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]); printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]); printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]); printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]); printf("disp_dist[3] = %f\n", disp_dist[3]);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0]; /// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1]; /// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2]; /// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3]; /// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset float *disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams * 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory *(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1]; *(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2]; *(disp_dist_p++) = disp_dist[2];
*(disp_dist_p++) = disp_dist[3]; *(disp_dist_p++) = disp_dist[3];
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels // imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15 // float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move // float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed // ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists){ if (imu_exists) {
float ers_x = float ers_x =
dpXci_dtilt * extrinsic_corr.imu_rot[0] + dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] + dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
dpXci_droll * extrinsic_corr.imu_rot[2]; dpXci_droll * extrinsic_corr.imu_rot[2];
float ers_y = float ers_y =
dpYci_dtilt * extrinsic_corr.imu_rot[0] + dpYci_dtilt * extrinsic_corr.imu_rot[0] +
dpYci_dazimuth * extrinsic_corr.imu_rot[1] + dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
dpYci_droll * extrinsic_corr.imu_rot[2]; dpYci_droll * extrinsic_corr.imu_rot[2];
#ifdef DEBUG21 #ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("ers_x = %f, ers_y = %f\n", ers_x, ers_y); printf("ers_x = %f, ers_y = %f\n", ers_x, ers_y);
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
if (disparity >= MIN_DISPARITY){ // all threads together if (disparity >= MIN_DISPARITY) { // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius; float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity; float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]); float dwdisp_dz = (k * geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
//// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z //// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] + ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]; dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] + ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]; dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
float delta_t = (pY_offset/ (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0) float delta_t = (pY_offset / (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
pXY[0] += delta_t * ers_x * rD2rND; // added correction to pixel X pXY[0] += delta_t * ers_x * rD2rND; // added correction to pixel X
pXY[1] += delta_t * ers_y * rD2rND; // added correction to pixel Y pXY[1] += delta_t * ers_y * rD2rND; // added correction to pixel Y
#ifdef DEBUG21 #ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){ if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz); printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]); printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0], dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]); printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1], dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_x = %f, ers_y = %f\n", delta_t, ers_x, ers_y); printf("delta_t = %f, ers_x = %f, ers_y = %f\n", delta_t, ers_x, ers_y);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
} }
__syncthreads();// __syncwarp(); __syncthreads(); // __syncwarp();
#endif // DEBUG21 #endif // DEBUG21
}
} }
} // copy results to global memory pXY, disp_dist (already copied)
// copy results to global memory pXY, disp_dist (already copied) // gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][0] = pXY[0]; // gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1]; // float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset // .xy goes right after 3 commonn (tak, txy and target_disparity
// .xy goes right after 3 commonn (tak, txy and target_disparity float *tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
float * tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset *(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[0]; // global memory *(tile_xy_p++) = pXY[1]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
} }
/**
 * Fill the reverse radial-distortion lookup table.
 *
 * For table index i the distorted radius is rDist = RBYRDIST_STEP * i. The kernel
 * searches (Newton iterations) for the radius r that satisfies r * k(r) == rDist,
 * where k(r) is the polynomial built from the distortion coefficients, and stores
 * the ratio r / rDist in rByRDist[i]. Entry 0 is set to 1 / d, which is the value
 * of 1 / k(r) at r == 0.
 *
 * Each thread processes num_points consecutive table entries starting at
 * indx * num_points, re-using the solution of the previous entry as the initial
 * guess for the next one.
 *
 * If the derivative term drops below minDerivative the thread returns early and
 * the entries it has not reached yet are left unwritten.
 *
 * @param geometry_correction provides distortionC, distortionB, distortionA,
 *                            distortionA5 ... distortionA8
 * @param rByRDist            output table, indexed 0 .. RBYRDIST_LEN - 1
 */
extern "C" __global__ void calcReverseDistortionTable(
        struct gc *geometry_correction,
        float *rByRDist) {
    // Flattened thread index over blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x.
    int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Convergence threshold for |r * k - rDist|. Earlier notes kept with this code
    // record ~216 ms for 1E-20, a table error of ~7.2e-5 for 1E-4, and ~0.73 ms
    // for the value used here.
    const double delta = 1E-10;
    const double minDerivative = 0.01; // below this the mapping is treated as folding back
    const int numIterations = 1000;    // upper bound on Newton steps per table entry

    // Constant term of the polynomial: one minus the sum of all coefficients.
    double d = 1.0
            - geometry_correction->distortionA8
            - geometry_correction->distortionA7
            - geometry_correction->distortionA6
            - geometry_correction->distortionA5
            - geometry_correction->distortionA
            - geometry_correction->distortionB
            - geometry_correction->distortionC;

    double rPrev = 0.0; // solution found for the previous entry handled by this thread
    int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;

    for (int p = 0; p < num_points; p++) {
        int i = indx * num_points + p;
        if (i >= RBYRDIST_LEN) {
            return; // past the end of the table
        }
        if (i == 0) {
            // rDist == 0: store the limit value instead of dividing 0 by 0.
            rByRDist[0] = (float)(1.0 / d);
            continue;
        }
        double rDist = RBYRDIST_STEP * i;
        // First entry of this thread starts from rDist itself, later ones from
        // the previous solution.
        double r = (p == 0) ? rDist : rPrev;
        for (int iteration = 0; iteration < numIterations; iteration++) {
            // k(r), evaluated in Horner form.
            double k = (((((((
                    geometry_correction->distortionA8) * r +
                    geometry_correction->distortionA7) * r +
                    geometry_correction->distortionA6) * r +
                    geometry_correction->distortionA5) * r +
                    geometry_correction->distortionA) * r +
                    geometry_correction->distortionB) * r +
                    geometry_correction->distortionC) * r + d;
            // Derivative term used as the Newton step denominator.
            double drDistDr = (((((((
                    8 * geometry_correction->distortionA8) * r +
                    7 * geometry_correction->distortionA7) * r +
                    6 * geometry_correction->distortionA6) * r +
                    5 * geometry_correction->distortionA5) * r +
                    4 * geometry_correction->distortionA) * r +
                    3 * geometry_correction->distortionB) * r +
                    2 * geometry_correction->distortionC) * r + d;
            if (drDistDr < minDerivative) { // folds backwards !
                return;                     // too high distortion
            }
            double rD = r * k;
            if (fabs(rD - rDist) < delta) {
                break; // converged
            }
            r += (rDist - rD) / drDistDr;
        }
        rPrev = r;
        // Divide in double precision and round once; a bare "(float) r / rDist"
        // would round r to float before the division.
        rByRDist[i] = (float)(r / rDist);
    }
}
/** /**
...@@ -843,110 +844,122 @@ extern "C" __global__ void calcReverseDistortionTable( ...@@ -843,110 +844,122 @@ extern "C" __global__ void calcReverseDistortionTable(
* @return corresponding non-distorted radius * @return corresponding non-distorted radius
*/ */
inline __device__ float getRByRDist(float rDist,
                                    float rByRDist[RBYRDIST_LEN]) // caller-provided table (original note: shared memory)
{
    // Interpolates the table at position rDist / RBYRDIST_STEP with a cubic
    // (Catmull-Rom) through four neighbouring samples.
    if (rDist < 0) {
        return 0.0f; // normally should not happen
    }
    const float pos = rDist / RBYRDIST_STEP; // fractional table position

    // Segment index, clamped so that seg + 2 stays inside the table.
    int seg = (int)floorf(pos);
    seg = (seg < 0) ? 0 : seg;
    seg = (seg > (RBYRDIST_LEN - 3)) ? (RBYRDIST_LEN - 3) : seg;

    // Offset from the segment start; not limited from above, so positions
    // beyond the clamped segment use the same cubic.
    const float t = fmaxf(pos - seg, 0.0f);
    const float t2 = t * t;

    const float p1 = rByRDist[seg];
    const float p2 = rByRDist[seg + 1];
    const float p3 = rByRDist[seg + 2];
    // Sample before the segment; at the table start it is extrapolated linearly.
    float p0;
    if (seg > 0) {
        p0 = rByRDist[seg - 1];
    } else {
        p0 = 2 * p1 - p2;
    }

    // Catmull-Rom coefficients, highest power first.
    const float c3 = -0.5 * p0 + 1.5 * p1 - 1.5 * p2 + 0.5 * p3;
    const float c2 = p0 - 2.5 * p1 + 2 * p2 - 0.5 * p3;
    const float c1 = -0.5 * p0 + 0.5 * p2;
    const float c0 = p1;

    return c3 * t * t2 + c2 * t2 + c1 * t + c0;
}
/**
 * Dumps the contents of a geometry correction structure with device-side
 * printf() for debugging. The whole body is compiled out when JCUDA is defined.
 *
 * @param g        geometry correction structure to print
 * @param num_cams number of leading entries printed from each per-camera array
 */
__device__ void printGeometryCorrection(struct gc *g, int num_cams) {
#ifndef JCUDA
    printf("\nGeometry Correction\n------------------\n");
    printf("%22s: %f\n", "pixelCorrectionWidth", g->pixelCorrectionWidth);
    printf("%22s: %f\n", "pixelCorrectionHeight", g->pixelCorrectionHeight);
    printf("%22s: %f\n", "line_time", g->line_time);
    printf("%22s: %f\n", "focalLength", g->focalLength);
    printf("%22s: %f\n", "pixelSize", g->pixelSize);
    printf("%22s: %f\n", "distortionRadius", g->distortionRadius);
    printf("%22s: %f\n", "distortionC", g->distortionC);
    printf("%22s: %f\n", "distortionB", g->distortionB);
    printf("%22s: %f\n", "distortionA", g->distortionA);
    printf("%22s: %f\n", "distortionA5", g->distortionA5);
    printf("%22s: %f\n", "distortionA6", g->distortionA6);
    printf("%22s: %f\n", "distortionA7", g->distortionA7);
    printf("%22s: %f\n", "distortionA8", g->distortionA8);
    printf("%22s: %f\n", "elevation", g->elevation);
    printf("%22s: %f\n", "heading", g->heading);

    // Per-camera scalar arrays: one line each, every value followed by a comma.
    printf("%22s:", "forward");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", g->forward[ncam]);
    }
    printf("\n");
    printf("%22s:", "right");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", g->right[ncam]);
    }
    printf("\n");
    printf("%22s:", "height");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", g->height[ncam]);
    }
    printf("\n");
    printf("%22s:", "roll");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", g->roll[ncam]);
    }
    printf("\n");

    // Per-camera XY pairs: one line per camera, labelled with the camera index.
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf("%19s%2d]: %f, %f \n", "pXY0[", ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
    }

    printf("%22s: %f\n", "common_right", g->common_right);
    printf("%22s: %f\n", "common_forward", g->common_forward);
    printf("%22s: %f\n", "common_height", g->common_height);
    printf("%22s: %f\n", "common_roll", g->common_roll);

    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
    }
    printf("%22s: %f\n", "cameraRadius", g->cameraRadius);
    printf("%22s: %f\n", "disparityRadius", g->disparityRadius);

    printf("%22s:", "woi_tops");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", g->woi_tops[ncam]);
    }
    printf("\n");
#endif // ifndef JCUDA
}
/**
 * Dumps the extrinsic correction vector with device-side printf() for
 * debugging. The whole body is compiled out when JCUDA is defined.
 *
 * tilt, azimuth and zoom are printed for (num_cams - 1) entries, roll for
 * num_cams entries, matching the array sizes declared in struct corr_vector.
 *
 * @param cv       correction vector to print
 * @param num_cams number of cameras in use
 */
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams) {
#ifndef JCUDA
    printf("\nExtrinsic Correction Vector\n---------------------------\n");

    printf("%22s:", "tilt");
    for (int ncam = 0; ncam < (num_cams - 1); ncam++) {
        printf(" %f,", cv->tilt[ncam]);
    }
    printf("\n");
    printf("%22s:", "azimuth");
    for (int ncam = 0; ncam < (num_cams - 1); ncam++) {
        printf(" %f,", cv->azimuth[ncam]);
    }
    printf("\n");
    printf("%22s:", "roll");
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf(" %f,", cv->roll[ncam]);
    }
    printf("\n");
    printf("%22s:", "zoom");
    for (int ncam = 0; ncam < (num_cams - 1); ncam++) {
        printf(" %f,", cv->zoom[ncam]);
    }
    printf("\n");

    // ERS-related rates: three components each, tagged with their axis/angle letter.
    printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
    printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif // ifndef JCUDA
}
...@@ -41,147 +41,141 @@ ...@@ -41,147 +41,141 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
// When NVRTC_BUG is defined, trot_deriv and gc below are declared as plain
// structs instead of using anonymous struct/union overlays.
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
// Fallback offsetof for builds where the standard macro is not available.
#ifndef offsetof
#define offsetof(st, m) ((size_t) & (((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01      // minimal disparity to try to convert to world coordinates
/**
 * One tile-processing task. Every member is 4 bytes wide, so the structure is
 * also accessed as a flat array of floats ("flattened task"):
 *   [0] task, [1] txy, [2] target_disparity, [3..4] centerXY,
 *   then xy[NUM_CAMS][2], then disp_dist[NUM_CAMS][4].
 */
struct tp_task {
    int task;
    union {
        int txy;
        unsigned short sxy[2];
    };
    float target_disparity;
    float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
                       // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
    float xy[NUM_CAMS][2];
    float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
// Task length in floats when only x cameras are stored: each camera dropped
// from NUM_CAMS removes 6 floats (2 for xy plus 4 for disp_dist).
// The argument is parenthesized so that expressions such as (a + b) expand correctly.
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - (x)))
#define tp_task_xy_offset 5       // float index of xy[0][0] in the flattened task
#define tp_task_centerXY_offset 3 // float index of centerXY[0] in the flattened task
/**
 * Extrinsic correction vector. tilt, azimuth and zoom hold (NUM_CAMS - 1)
 * entries, roll holds NUM_CAMS entries.
 * The index ranges in the trailing comments apply to a 4-camera layout only.
 */
struct corr_vector {
    float tilt[NUM_CAMS - 1];    // 0..2
    float azimuth[NUM_CAMS - 1]; // 3..5
    float roll[NUM_CAMS];        // 6..9
    float zoom[NUM_CAMS - 1];    // 10..12
    // for ERS correction:
    float imu_rot[3];  // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
    float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..18
};
// Five sets of per-camera 3x3 matrices: rots plus the d_daz, d_tilt, d_roll and
// d_zoom sets. With NVRTC_BUG defined the type is a plain struct; otherwise it is
// a union that additionally exposes the same storage as matrices[5][NUM_CAMS][3][3].
#ifdef NVRTC_BUG
struct trot_deriv {
    float rots[NUM_CAMS][3][3];
    float d_daz[NUM_CAMS][3][3];
    float d_tilt[NUM_CAMS][3][3];
    float d_roll[NUM_CAMS][3][3];
    float d_zoom[NUM_CAMS][3][3];
};
#else
union trot_deriv {
    struct {
        float rots[NUM_CAMS][3][3];
        float d_daz[NUM_CAMS][3][3];
        float d_tilt[NUM_CAMS][3][3];
        float d_roll[NUM_CAMS][3][3];
        float d_zoom[NUM_CAMS][3][3];
    };
    float matrices[5][NUM_CAMS][3][3];
};
#endif
/**
 * Geometry correction parameters: common lens/sensor parameters followed by
 * per-camera arrays sized for NUM_CAMS cameras.
 * Without NVRTC_BUG the seven distortion coefficients are additionally
 * accessible as rad_coeff[7] through an anonymous union.
 */
struct gc {
    float pixelCorrectionWidth;  // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
    float pixelCorrectionHeight; // =1936;
    float line_time;             // duration of one scan line readout (for ERS)
    float focalLength;           // =FOCAL_LENGTH;
    float pixelSize;             // = PIXEL_SIZE; //um
    float distortionRadius;      // = DISTORTION_RADIUS; // mm - half width of the sensor
#ifndef NVRTC_BUG
    union {
        struct {
#endif
            float distortionC;  // r^2
            float distortionB;  // r^3
            float distortionA;  // r^4 (normalized to focal length or to sensor half width?)
            float distortionA5; // r^5 (normalized to focal length or to sensor half width?)
            float distortionA6; // r^6 (normalized to focal length or to sensor half width?)
            float distortionA7; // r^7 (normalized to focal length or to sensor half width?)
            float distortionA8; // r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
        };
        float rad_coeff[7];
    };
#endif
    // parameters, common for all sensors
    float elevation; // degrees, up - positive;
    float heading;   // degrees, CW (from top) - positive
    float forward[NUM_CAMS];
    float right[NUM_CAMS];
    float height[NUM_CAMS];
    float roll[NUM_CAMS]; // degrees, CW (to target) - positive
    float pXY0[NUM_CAMS][2];
    float common_right;   // mm right, camera center
    float common_forward; // mm forward (to target), camera center
    float common_height;  // mm up, camera center
    float common_roll;    // degrees CW (to target) camera as a whole
    // float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
    // float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
    // NOTE(review): the original rXY comment omitted its noun ("XY pairs of the in a
    // normal plane") - presumably the lens positions; confirm.
    float rXY[NUM_CAMS][2]; // per-camera XY pairs in a normal plane, relative to disparityRadius
    // float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
    // only used for the multi-quad systems
    float cameraRadius;       // =0; // average distance from the "mass center" of the sensors to the sensors
    float disparityRadius;    // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
    float woi_tops[NUM_CAMS]; // used to calculate scanline timing
};
#define RAD_COEFF_LEN 7 // number of distortion coefficients (distortionC .. distortionA8)
// Kernel declaration; the definition is not part of this header.
// Flattened task length follows struct tp_task: 5 + 6 floats per camera.
extern "C" __global__ void get_tiles_offsets(
    int uniform_grid, // ==0: use provided centers (as for interscene), !=0: calculate uniform grid
    int num_cams,
    // struct tp_task * gpu_tasks,
    float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
    int num_tiles,     // number of tiles in task
    struct gc *gpu_geometry_correction,
    struct corr_vector *gpu_correction_vector,
    float *gpu_rByRDist, // length should match RBYRDIST_LEN
    trot_deriv *gpu_rot_deriv);
// Kernel declaration; the definition is not part of this header.
// Flattened task length follows struct tp_task: 5 + 6 floats per camera.
extern "C" __global__ void calculate_tiles_offsets(
    int uniform_grid, // ==0: use provided centers (as for interscene), !=0: calculate uniform grid
    int num_cams,
    float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
    // struct tp_task * gpu_tasks,
    int num_tiles, // number of tiles in task
    struct gc *gpu_geometry_correction,
    struct corr_vector *gpu_correction_vector,
    float *gpu_rByRDist, // length should match RBYRDIST_LEN
    trot_deriv *gpu_rot_deriv);
// Kernel declaration; the definition is not part of this header.
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
    int num_cams,
    struct corr_vector *gpu_correction_vector,
    trot_deriv *gpu_rot_deriv);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Kernel declaration; the definition is not part of this header.
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
    struct gc *geometry_correction,
    float *rByRDist);
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -39,61 +39,61 @@ ...@@ -39,61 +39,61 @@
// Avoiding includes in jcuda, all source files will be merged // Avoiding includes in jcuda, all source files will be merged
#pragma once #pragma once
#ifndef JCUDA #ifndef JCUDA
// Compile-time configuration used when the sources are built outside JCUDA.
#define TEST_LWIR 1
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 16 // now maximal number of cameras
//#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4   // increase to 16?
//#define TEXTURE_THREADS 32 //
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
// only lower bit will be used to request correlations, correlation mask will be common for all the scene
//#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
//#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
//#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#define RBYRDIST_LEN 5001      // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004   // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32 / NUM_CAMS) // each tile has NUM_CAMS threads
#define DEBUG_ANY 1
#ifdef DEBUG_ANY #ifdef DEBUG_ANY
//#define DEBUG_OOB1 1 //#define DEBUG_OOB1 1
// Use CORR_OUT_RAD for the correlation output // Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40 //#define DBG_TILE_X 40
//#define DBG_TILE_Y 80 //#define DBG_TILE_Y 80
#if TEST_LWIR #if TEST_LWIR
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49 #define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66 #define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X) #define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else #else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49 #define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66 #define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X) #define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#endif #endif
#undef DBG_MARK_DBG_TILE #undef DBG_MARK_DBG_TILE
//#undef DBG_TILE //#undef DBG_TILE
...@@ -101,8 +101,7 @@ ...@@ -101,8 +101,7 @@
//#undef HAS_PRINTF //#undef HAS_PRINTF
#define HAS_PRINTF #define HAS_PRINTF
// 7
//7
//#define DEBUG1 1 //#define DEBUG1 1
//#define DEBUG2 1 //#define DEBUG2 1
//#define DEBUG3 1 //#define DEBUG3 1
...@@ -118,7 +117,7 @@ ...@@ -118,7 +117,7 @@
#define DEBUG9 1 #define DEBUG9 1
*/ */
//#define DEBUG8A 1 // generate_RBGA_host //#define DEBUG8A 1 // generate_RBGA_host
//textures // textures
//#define DEBUG10 1 //#define DEBUG10 1
//#define DEBUG11 1 //#define DEBUG11 1
//#define DEBUG12 1 //#define DEBUG12 1
...@@ -127,7 +126,6 @@ ...@@ -127,7 +126,6 @@
// geom // geom
//#define DEBUG20 1 //#define DEBUG20 1
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0) #if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
//#define DEBUG20 1 // Geometry Correction //#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction //#define DEBUG21 1 // Geometry Correction
...@@ -136,10 +134,8 @@ ...@@ -136,10 +134,8 @@
//#define DEBUG22 1 //#define DEBUG22 1
//#define DEBUG23 1 //#define DEBUG23 1
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0) #endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA #endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment