Commit 6c76931e authored by Palani Johnson

ran formatter

parent 4648cb20
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -41,17 +41,16 @@
#include "tp_defines.h"
#endif
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float** gpu_kernel_offsets, // [NUM_CAMS],
float** gpu_kernels, // [NUM_CAMS],
float** gpu_images, // [NUM_CAMS],
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
......@@ -59,41 +58,41 @@ extern "C" __global__ void convert_direct( // called with a single block, single
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles, // indices to gpu_tasks
int* gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int* pnum_active_tiles, // indices to gpu_tasks
int tilesx);
/**
 * Declaration of the correlate2D kernel (the definition is not part of this span).
 *
 * The per-parameter notes are the ones carried by the original declaration.
 * Pair selection is passed as four separate ints (sel_pairs0..sel_pairs3);
 * the commented-out "int * sel_pairs" line is kept from the original and
 * presumably shows the pointer form these replace - confirm against the definition.
 */
extern "C" __global__ void correlate2D(
    int num_cams,
    // int * sel_pairs,
    int sel_pairs0,
    int sel_pairs1,
    int sel_pairs2,
    int sel_pairs3,
    float** gpu_clt,       // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    int colors,            // number of colors (3/1)
    float scale0,          // scale for R
    float scale1,          // scale for B
    float scale2,          // scale for G
    float fat_zero2,       // here - absolute, squared
    float* gpu_ftasks,     // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
    // struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
    int num_tiles,         // number of tiles in task
    int tilesx,            // number of tile rows (NOTE(review): the name suggests tiles per row - confirm)
    int* gpu_corr_indices, // packed tile+pair
    int* pnum_corr_tiles,  // pointer to a number of correlation tiles to process
    size_t corr_stride,    // in floats
    // int corr_stride, // in floats
    int corr_radius,       // radius of the output correlation (7 for 15x15)
    float* gpu_corrs);     // correlation output data
/**
 * Declaration of the corr2D_normalize kernel (the definition is not part of this span).
 *
 * Takes transform-domain correlation tiles (gpu_corrs_td, stride corr_stride_td)
 * and an output buffer (gpu_corrs, stride corr_stride); both strides are in floats.
 * corr_weights may be null according to the original parameter note.
 */
extern "C" __global__ void corr2D_normalize(
    int num_corr_tiles,          // number of correlation tiles to process
    const size_t corr_stride_td, // in floats
    float* gpu_corrs_td,         // correlation tiles in transform domain
    float* corr_weights,         // null or per-tile weight (fat_zero2 will be divided by it)
    const size_t corr_stride,    // in floats
    float* gpu_corrs,            // correlation output data (either pixel domain or transform domain)
    float fat_zero2,             // here - absolute, squared
    int corr_radius);            // radius of the output correlation (7 for 15x15)
......@@ -102,42 +101,41 @@ extern "C" __global__ void corr2D_combine(
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int * gpu_corr_indices, // packed tile+pair
int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
int* gpu_corr_indices, // packed tile+pair
int* gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
const size_t corr_stride, // (in floats) stride for the input TD correlations
float * gpu_corrs, // input correlation tiles
float* gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float * gpu_corrs_combo); // combined correlation output (one per tile)
float* gpu_corrs_combo); // combined correlation output (one per tile)
/**
 * Declaration of the textures_nonoverlap kernel (the definition is not part of this span).
 *
 * The per-parameter notes are the ones carried by the original declaration.
 * Per those notes, texture_stride, gpu_texture_tiles and gpu_diff_rgb_combo
 * may each be 0 when the corresponding output is not needed.
 */
extern "C" __global__ void textures_nonoverlap(
    int num_cams,              // number of cameras
    float* gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats
    // struct tp_task * gpu_tasks,
    int num_tiles,             // number of tiles in task list
    // int num_tilesx, // number of tiles in a row
    // declare arrays in device code?
    int* gpu_texture_indices,  // packed tile + bits (now only (1 << 7))
    int* pnum_texture_tiles,   // returns total number of elements in gpu_texture_indices array
    float** gpu_clt,           // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    // TODO: use geometry_correction rXY !
    struct gc* gpu_geometry_correction,
    int colors,                // number of colors (3/1)
    int is_lwir,               // do not perform shot correction
    float params[5],
    float weights[3],          // scale for R,B,G
    int dust_remove,           // Do not reduce average weight when only one image differs much from the average
    // combining both non-overlap and overlap (each calculated if pointer is not null )
    size_t texture_stride,     // in floats (now 256*4 = 1024) // may be 0 if not needed
    float* gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
    int linescan_order,        // 0 low-res tiles have the same order, as gpu_texture_indices, 1 - in linescan order
    float* gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
    int num_tilesx);
extern "C"
__global__ void imclt_rbg_all(
extern "C" __global__ void imclt_rbg_all(
int num_cams,
float ** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
int colors,
int woi_twidth,
......@@ -145,12 +143,12 @@ __global__ void imclt_rbg_all(
const size_t dstride); // in floats (pixels)
/**
 * Declaration of the erase8x8 kernel (the definition is not part of this span).
 *
 * gpu_top_left - pointer the kernel works from; NOTE(review): the name suggests
 *                the top-left element of the 8x8 area to erase - confirm.
 * dstride      - NOTE(review): other prototypes in this file describe dstride as
 *                "in floats (pixels)"; presumably the same unit here - confirm.
 */
extern "C" __global__ void erase8x8(
    float* gpu_top_left,
    const size_t dstride);
extern "C" __global__ void imclt_rbg(
float * gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
float* gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float* gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
......@@ -163,19 +161,19 @@ extern "C" __global__ void imclt_rbg(
extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int* gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int* num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int* woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float params[5], // mitigating CUDA_ERROR_INVALID_PTX
......@@ -183,5 +181,12 @@ extern "C" __global__ void generate_RBGA(
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
float* gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
// Declaration of the accumulate_correlations kernel (the definition is not part of this span).
// The three buffers are indexed as [tilesY][tilesX][pair] (num_acc) or
// [tilesY][tilesX][pair][256] (fcorr_td, fcorr_td_acc) according to the notes below.
extern "C" __global__ void accumulate_correlations(
int tilesY,
int tilesX,
int pairs,
float* num_acc, // number of accumulated tiles [tilesY][tilesX][pair]
float* fcorr_td, // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
float* fcorr_td_acc); // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs; NOTE(review): same note as fcorr_td - presumably this is the accumulation buffer, confirm
......@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f;
// Scalar constants: the values match sqrt(2), 1/sqrt(2) and 1/sqrt(8) to six decimals.
__constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f;
__constant__ float SQRT1_8 = 0.353553f;

// Rotation coefficients used by the DTT helpers.
// COSN1/SINN1 values match cos/sin of (2k+1)*pi/16 for k = 0..1;
// COSN2/SINN2 values match cos/sin of (2k+1)*pi/32 for k = 0..3.
__constant__ float COSN1[] = {0.980785f, 0.831470f};
__constant__ float COSN2[] = {0.995185f, 0.956940f, 0.881921f, 0.773010f};
__constant__ float SINN1[] = {0.195090f, 0.555570f};
__constant__ float SINN2[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f};

// Index table with 16 entries in the range 0x24..0x2b.
// NOTE(review): how the entries are used is not visible in this span.
__constant__ int imclt_indx9[16] = {0x28, 0x29, 0x2a, 0x2b, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x24, 0x25, 0x26, 0x27};

// Sign table indexed [quadrant][row][column]; every entry is +1 or -1.
__constant__ float idct_signs[4][4][4] = {
    {// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
     {1, -1, -1, -1},
     {-1, 1, 1, 1},
     {-1, 1, 1, 1},
     {-1, 1, 1, 1}},
    {// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
     {1, 1, 1, -1},
     {-1, -1, -1, 1},
     {-1, -1, -1, 1},
     {-1, -1, -1, 1}},
    {// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
     {1, -1, -1, -1},
     {1, -1, -1, -1},
     {1, -1, -1, -1},
     {-1, 1, 1, 1}},
    {// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
     {1, 1, 1, -1},
     {1, 1, 1, -1},
     {1, 1, 1, -1},
     {-1, -1, -1, 1}}};

// Eight window coefficients; the values match 0.5 * sin((2k+1)*pi/32) for k = 0..7.
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
                                 0.386505f, 0.440961f, 0.478470f, 0.497592f};
// Forward declarations of the device-side DTT helpers.
// Helpers taking (x0, inc, ...) address eight values x0, x0 + inc, ..., x0 + 7 * inc
// (see the pointer set-up at the top of each definition). dst_not_dct selects the
// DST branch when non-zero and the DCT branch otherwise in the definitions that branch on it.
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct);          // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct);          // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct);             // not used
inline __device__ void dctiv_nodiverg(float *x0, int inc);                             // used in TP
inline __device__ void dstiv_nodiverg(float *x0, int inc);                             // used in TP

// Array-to-array variants: x and y each point to an 8-element array.
inline __device__ void dct_ii8(float x[8], float y[8]);         // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8(float x[8], float y[8]);         // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8(float x[8], float y[8]);         // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
......@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
* \return None
*/
#ifdef BBBB
extern "C"
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
{
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) {
int dtt_mode0 = dtt_mode & 1;
int dtt_mode1 = (dtt_mode >>1) & 1;
int dtt_mode1 = (dtt_mode >> 1) & 1;
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
......@@ -162,17 +157,17 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
__syncthreads();
// horizontal pass
if (dtt_mode > 3) {
dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
dttii_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else {
dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
dttiv_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
}
__syncthreads();
// vertical pass
if (dtt_mode > 3) {
dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
dttii_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else {
dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
dttiv_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
}
__syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++)
......@@ -180,50 +175,47 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
}
#endif //#ifdef BBBB
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u00 = (COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10 = (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float u01 = (COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11 = -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]);
float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u02 = (COSN2[2] * x[2] + SINN2[2] * x[5]);
float u12 = (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
float u03 = (COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13 = -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
y[0] = SQRT_2 * v00; // w0[0];
y[1] = v01 - vb11; // w1[0];
......@@ -238,63 +230,62 @@ inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to
y[7] = SQRT_2 * vb00; // w1[3];
}
__device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
{
__device__ void _dttiv(float x0, float x1, float x2, float x3, float x4, float x5, float x6, float x7,
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) {
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00= ( COSN2[0] * x7 + SINN2[0] * x0);
u10= (-SINN2[3] * x4 + COSN2[3] * x3);
u00 = (COSN2[0] * x7 + SINN2[0] * x0);
u10 = (-SINN2[3] * x4 + COSN2[3] * x3);
u01= ( COSN2[1] * x6 + SINN2[1] * x1);
u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
u01 = (COSN2[1] * x6 + SINN2[1] * x1);
u11 = -(-SINN2[2] * x5 + COSN2[2] * x2);
u02= ( COSN2[2] * x5 + SINN2[2] * x2);
u12= (-SINN2[1] * x6 + COSN2[1] * x1);
u02 = (COSN2[2] * x5 + SINN2[2] * x2);
u12 = (-SINN2[1] * x6 + COSN2[1] * x1);
u03= ( COSN2[3] * x4 + SINN2[3] * x3);
u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
u03 = (COSN2[3] * x4 + SINN2[3] * x3);
u13 = -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00= ( COSN2[0] * x0 + SINN2[0] * x7);
u10= (-SINN2[3] * x3 + COSN2[3] * x4);
u00 = (COSN2[0] * x0 + SINN2[0] * x7);
u10 = (-SINN2[3] * x3 + COSN2[3] * x4);
u01= ( COSN2[1] * x1 + SINN2[1] * x6);
u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
u01 = (COSN2[1] * x1 + SINN2[1] * x6);
u11 = -(-SINN2[2] * x2 + COSN2[2] * x5);
u02= ( COSN2[2] * x2 + SINN2[2] * x5);
u12= (-SINN2[1] * x1 + COSN2[1] * x6);
u02 = (COSN2[2] * x2 + SINN2[2] * x5);
u12 = (-SINN2[1] * x1 + COSN2[1] * x6);
u03= ( COSN2[3] * x3 + SINN2[3] * x4);
u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
u03 = (COSN2[3] * x3 + SINN2[3] * x4);
u13 = -(-SINN2[0] * x0 + COSN2[0] * x7);
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
......@@ -316,8 +307,7 @@ __device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,
}
}
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
{
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
......@@ -328,52 +318,52 @@ inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
......@@ -395,12 +385,11 @@ inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
......@@ -411,52 +400,52 @@ inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_n
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
......@@ -478,12 +467,11 @@ inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_n
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
......@@ -493,59 +481,58 @@ inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
u00 = (COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10 = (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
u01 = (COSN2[1] * (*x6) + SINN2[1] * (*x1));
u11 = -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
u02 = (COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12 = (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
u03 = (COSN2[3] * (*x4) + SINN2[3] * (*x3));
u13 = -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
} else { // DCTIV
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
......@@ -564,9 +551,8 @@ inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
}
}
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
{
float sgn = 1 - 2* dst_not_dct;
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct) {
float sgn = 1 - 2 * dst_not_dct;
float *y0 = x;
float *y1 = y0 + inc;
float *y2 = y1 + inc;
......@@ -587,46 +573,45 @@ inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
float ua00= u00 + u03;
float ua10= u00 - u03;
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float ub00= u10 + u13;
float ub10= u10 - u13;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
......@@ -638,8 +623,7 @@ inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
*y7 = sgn * vb00 * 0.5f; // w1[3];
}
inline __device__ void dctiv_nodiverg(float * x0, int inc)
{
inline __device__ void dctiv_nodiverg(float *x0, int inc) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
......@@ -648,46 +632,45 @@ inline __device__ void dctiv_nodiverg(float * x0, int inc)
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
float ua00= u00 + u03;
float ua10= u00 - u03;
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float ub00= u10 + u13;
float ub10= u10 - u13;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
......@@ -699,8 +682,7 @@ inline __device__ void dctiv_nodiverg(float * x0, int inc)
*x7 = vb00 * 0.5f; // w1[3];
}
inline __device__ void dstiv_nodiverg(float * x, int inc)
{
inline __device__ void dstiv_nodiverg(float *x, int inc) {
float *x0 = x + 7 * inc;
// negate inc, replace
inc = -inc;
......@@ -712,46 +694,45 @@ inline __device__ void dstiv_nodiverg(float * x, int inc)
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
float ua00= u00 + u03;
float ua10= u00 - u03;
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua01= u01 + u02;
float ua11= u01 - u02;
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float ub00= u10 + u13;
float ub10= u10 - u13;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub01= u11 + u12;
float ub11= u11 - u12;
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x7 = v00 * 0.5f; // w0[0];
*x5 = (v01 + vb11) * SQRT1_8; // w0[1];
......@@ -764,49 +745,47 @@ inline __device__ void dstiv_nodiverg(float * x, int inc)
*x0 = -vb00 * 0.5f; // w1[3];
}
inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00= (x[0] + x[7]);
float u10= (x[0] - x[7]);
float u00 = (x[0] + x[7]);
float u10 = (x[0] - x[7]);
float u01= (x[1] + x[6]);
float u11= (x[1] - x[6]);
float u01 = (x[1] + x[6]);
float u11 = (x[1] - x[6]);
float u02= (x[2] + x[5]);
float u12= (x[2] - x[5]);
float u02 = (x[2] + x[5]);
float u12 = (x[2] - x[5]);
float u03= (x[3] + x[4]);
float u13= (x[3] - x[4]);
float u03 = (x[3] + x[4]);
float u13 = (x[3] - x[4]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v00= w00 + w01;
float v02= w00 - w01;
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
float v00 = w00 + w01;
float v02 = w00 - w01;
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
// _dctii_nrecurs2(u00, u01, &v00, &v01);
float z00= w20 + w21;
float z01= w20 - w21;
// _dctii_nrecurs2(u00, u01, &v00, &v01);
float z00 = w20 + w21;
float z01 = w20 - w21;
// _dctii_nrecurs2(u10, u11, &v10, &v11);
float z10= w30 + w31;
float z11= w30 - w31;
// _dctii_nrecurs2(u10, u11, &v10, &v11);
float z10 = w30 + w31;
float z11 = w30 - w31;
float v10 = SQRT_2 * z00;
float v11 = z01 - z11;
......@@ -827,85 +806,80 @@ inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to
y[7] = v13;
}
// Scaled 8-point transform: calls _dctii_nrecurs8() to transform x into y,
// then multiplies every element of y by SQRT1_8.
//   x - input,  8 elements
//   y - output, 8 elements (overwritten)
inline __device__ void dct_ii8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctii_nrecurs8(x, y);
#pragma unroll
    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8; // apply the common scale factor after the unscaled recursion
    }
}
// Scaled 8-point transform: calls _dctiv_nrecurs8() to transform x into y,
// then multiplies every element of y by SQRT1_8.
//   x - input,  8 elements
//   y - output, 8 elements (overwritten)
__device__ void dct_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctiv_nrecurs8(x, y);
#pragma unroll
    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8; // apply the common scale factor after the unscaled recursion
    }
}
// 8-point DST-IV built on top of _dctiv_nrecurs8():
//   1. reverse the input order (xr[i] = x[7 - i]),
//   2. run _dctiv_nrecurs8() on the reversed copy,
//   3. scale by SQRT1_8, negating every odd-indexed output.
//   x - input,  8 elements (not modified)
//   y - output, 8 elements (overwritten)
inline __device__ void dst_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
    float xr[8];
#pragma unroll
    for (int i = 0; i < 8; i++) {
        xr[i] = x[7 - i];
    }
    _dctiv_nrecurs8(xr, y);
#pragma unroll
    for (int i = 0; i < 8; i += 2) {
        y[i] *= SQRT1_8;      // even outputs keep their sign
        y[i + 1] *= -SQRT1_8; // odd outputs are negated (applied exactly once)
    }
}
//=========================== 2D functions ===============
__device__ void corrUnfoldTile(
int corr_radius,
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
float *qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float *rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int size2r1 = 2 * corr_radius + 1; // 15
int crp1 = corr_radius + 1; //8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int crp1 = corr_radius + 1; // 8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
float *qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float *qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float *qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
if (i > corr_radius) {
return; // not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float corr_pixscale = 0.25f;
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
/// int rslt_row_offs = i * DTT_SIZE2M1;
/// int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_row_offs = i * size2r1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
/// for (int j = 1; j < DTT_SIZE; j++) {
/// for (int j = 1; j < DTT_SIZE; j++) {
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
rslt[rslt_base_index_pp] = corr_pixscale * (
qdata0[i_transform_size + j] +
qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (
qdata0[i_transform_size + j] +
-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pp] = corr_pixscale * (qdata0[i_transform_size + j] +
qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (qdata0[i_transform_size + j] +
-qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
}
if (i == 0) {
return;
}
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float d = corr_pixscale * qdata2[im1_transform_size];
rslt[rslt_base_index_p] += d;
rslt[rslt_base_index_m] -= d;
......@@ -915,8 +889,8 @@ __device__ void corrUnfoldTile(
int rslt_base_index_mp = rslt_base_index_m + j;
int rslt_base_index_mm = rslt_base_index_m - j;
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
float d3 = corr_pixscale * qdata3[im1_transform_size + j - 1];
// rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
rslt[rslt_base_index_pp] += d2 + d3;
......@@ -925,85 +899,81 @@ __device__ void corrUnfoldTile(
}
// In-place two-pass transform of the four quadrants stored in clt_corr.
//
// Each thread (indexed by threadIdx.x) handles one column per quadrant in the
// first pass (stride DTT_SIZE1) and one row per quadrant in the second pass
// (stride 1), calling dttii_shared_mem_nonortho() once per quadrant and pass.
// The third argument of that helper is is_sin:
//   first pass  - set for quadrants 2 and 3 ((q >> 1) & 1)
//   second pass - set for quadrants 1 and 3 (q & 1)
// __syncthreads() separates the passes and ends the function, so all threads
// of the block must call it.
//
//   clt_corr - tile data, updated in place
__device__ void dttii_2d(
    float *clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
{
    // change to 16-32 threads?? in next iteration
    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
    for (int q = 0; q < 4; q++) {
        int is_sin = (q >> 1) & 1;
        dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x, DTT_SIZE1, is_sin); // vertical pass, thread is column
    }
    __syncthreads();
    // hor pass, corresponding to vert pass in Java
    for (int q = 0; q < 4; q++) {
        int is_sin = q & 1;
        dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1, 1, is_sin); // horizontal pass, thread is row
    }
    __syncthreads();
}
// In-place two-pass DTT-IV of one color component of a tile.
//
// Horizontal pass (stride 1): the thread's row (threadIdx.x) of the first
// quadrant gets dctiv_nodiverg(); for BAYER_GREEN the same row of the second
// quadrant (offset DTT_SIZE1 * DTT_SIZE) additionally gets dstiv_nodiverg().
// Vertical pass (stride DTT_SIZE1): the thread's column of the first quadrant
// gets dctiv_nodiverg(); for BAYER_GREEN the second quadrant's column does too.
// __syncthreads() follows each pass, so all threads of the block must call
// this function.
//
//   clt_tile - tile data, updated in place; rows are DTT_SIZE1 floats apart
//   color    - color index; only BAYER_GREEN enables the second quadrant
__device__ void dttiv_color_2d(
    float *clt_tile,
    int color) {
    dctiv_nodiverg( // all colors
        clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
        1); // int inc);
    // __syncthreads();// worsened
    if (color == BAYER_GREEN) {
        dstiv_nodiverg( // green only
            clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
            1); // int inc);
    }
    __syncthreads(); // __syncwarp();
#ifdef DEBUG222
    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
        debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1); // only 1 quadrant for R,B and 2 - for G
    }
    __syncthreads(); // __syncwarp();
#endif
    dctiv_nodiverg( // all colors
        clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
        DTT_SIZE1); // int inc,
    // __syncthreads();// worsened
    if (color == BAYER_GREEN) {
        dctiv_nodiverg( // green only
            clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
            DTT_SIZE1); // int inc,
    }
    __syncthreads(); // __syncwarp();
}
__device__ void dttiv_mono_2d(
float * clt_tile)
{
float *clt_tile) {
// Copy 0-> 1
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
1); // int inc);
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
1); //int inc);
__syncthreads();// __syncwarp();
1); // int inc);
__syncthreads(); // __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
dctiv_nodiverg( // CC
......@@ -1018,47 +988,45 @@ __device__ void dttiv_mono_2d(
dstiv_nodiverg( // SS
clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
}
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be
// zeroed before the first pass.
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int thr3 = threadIdx.x >> 3;
int column = threadIdx.x; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7;
int column4 = threadIdx.x >> 2;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float * clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int wcolumn = ((thr3 << 3) - thr3) ^ thr012; // 0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG3
if ((threadIdx.x) == 0){
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
clt_tile + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
clt_tile1 + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
......@@ -1067,101 +1035,99 @@ __device__ void imclt(
dstiv_nodiverg(
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#ifdef DEBUG3
if ((threadIdx.x) == 0){
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float * rslt = mclt_tile + column;
float *rslt = mclt_tile + column;
#pragma unroll
for (int i = 0; i < 4; i++){
for (int i = 0; i < 4; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i < 3){
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val = __fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val = __fmaf_rd(w, d0, val); // w*d0 + val
*rslt = val;
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++){
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
// if (i < 7){
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// }
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--){
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 4){
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--){
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 0){
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#ifdef DEBUG3
__syncthreads();// __syncwarp();
if ((threadIdx.x) == 0){
__syncthreads(); // __syncwarp();
if ((threadIdx.x) == 0) {
printf("\nMCLT Tiles after IMCLT\n");
debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to 16x16,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be
// zeroed before the first pass.
......@@ -1169,24 +1135,23 @@ __device__ void imclt(
__device__ void imclt8threads(
int do_acc, // 1 - add to previous value, 0 - overwrite
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug)
{
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float * clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug) {
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
......@@ -1203,7 +1168,7 @@ __device__ void imclt8threads(
dstiv_nodiverg( // quadrant 3
clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
DTT_SIZE1);
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg( // quadrant 0
clt_tile + DTT_SIZE1 * threadIdx.x, // pointer to start of row for quadrant 0
......@@ -1218,37 +1183,37 @@ __device__ void imclt8threads(
dstiv_nodiverg( // quadrant 3
clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
1);
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
// re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
for (int thr3 = 0; thr3 < 2; thr3++){
for (int thr3 = 0; thr3 < 2; thr3++) {
int thr3m = (thr3 << 3);
int column = threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7; // == threadIdx.x
int column4 = column >> 2; // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
int wcolumn = (thr3m - thr3) ^ thr012; //0..7,7,..0
int wcolumn = (thr3m - thr3) ^ thr012; // 0..7,7,..0
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float * rslt = mclt_tile + column;
float *rslt = mclt_tile + column;
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
#pragma unroll
for (int i = 0; i < 4; i++){
for (int i = 0; i < 4; i++) {
float val = *rslt;
// facc
float w = HWINDOW2[i] * hw;
......@@ -1256,91 +1221,87 @@ __device__ void imclt8threads(
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i < 3){
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// val =__fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = val;
*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val do_acc - common for all thereads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++){
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val do_acc - common for all thereads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--){
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 4){
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val do_acc - common for all thereads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--){
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 0){
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val do_acc - common for all thereads
rslt += DTT_SIZE21;
}
}
#ifdef DEBUG7
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)){
if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)) {
printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif
}
//#endif
......@@ -64,17 +64,16 @@
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//extern __constant__ float idct_signs[4][4][4];
//extern __constant__ int imclt_indx9[16];
//extern __constant__ float HWINDOW2[];
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH + 1)
// extern __constant__ float idct_signs[4][4][4];
// extern __constant__ int imclt_indx9[16];
// extern __constant__ float HWINDOW2[];
// kernels (not used so far)
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #if 0
#endif // #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
......@@ -83,19 +82,19 @@ extern __device__ void corrUnfoldTile(
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
// Device-side 2D transform entry points. Array shapes quoted below are the
// ones given by the original author; the implementations live elsewhere.

// In-place 2D transform of a correlation tile (DTT-II, judging by the name -
// NOTE(review): confirm against the implementation).
extern __device__ void dttii_2d(
    float* clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]

// Per-color variant; `color` selects the color component being processed.
extern __device__ void dttiv_color_2d(
    float* clt_tile,
    int color);

// Monochrome variant of the above (no color argument).
extern __device__ void dttiv_mono_2d(
    float* clt_tile);

// Inverse MCLT of one tile: reads clt_tile, writes mclt_tile.
extern __device__ void imclt(
    float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float* mclt_tile);

// Inverse MCLT variant that can accumulate into the destination tile.
extern __device__ void imclt8threads(
    int do_acc,       // 1 - add to previous value, 0 - overwrite
    float* clt_tile,  // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
    float* mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
    int debug);
......@@ -37,16 +37,15 @@
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#ifndef get_task_size
// Number of floats in one flattened task record for `x` cameras: the full
// struct tp_task (laid out for NUM_CAMS cameras) minus 6 floats for every
// camera that is not present. The argument is parenthesized so that
// expressions such as get_task_size(a + b) expand correctly.
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - (x)))
#endif
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
......@@ -57,9 +56,8 @@
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc * g, int num_cams);
__device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
__device__ void printGeometryCorrection(struct gc *g, int num_cams);
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams);
/**
* Calculate non-distorted radius from distorted using table approximation
......@@ -67,114 +65,124 @@ __device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
                                    float rByRDist[RBYRDIST_LEN]); // shared memory

// Templates for the 7 elementary 3x3 matrices built by calc_rot_deriv().
// Each matrix element is a {cos, sin, const} coefficient triple; the actual
// element is  c[0] * cos(angle) + c[1] * sin(angle) + c[2].
// Index 0..2: azimuth, tilt, roll*zoom; index 3..6: their derivatives.
__constant__ float ROTS_TEMPLATE[7][3][3][3] = { // ...{cos,sin,const}...
    {
        // azimuth
        {{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
        {{0, 0, 0}, {0, 0, 1}, {0, 0, 0}},
        {{0, 1, 0}, {0, 0, 0}, {1, 0, 0}},
    },
    {
        // tilt
        {{0, 0, 1}, {0, 0, 0}, {0, 0, 0}},
        {{0, 0, 0}, {1, 0, 0}, {0, 1, 0}},
        {{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
    },
    {
        // roll*zoom
        {{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
        {{0, 0, 0}, {0, 0, 0}, {0, 0, 1}},
    },
    {
        // d_azimuth
        {{0, -1, 0}, {0, 0, 0}, {-1, 0, 0}},
        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
        {{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
    },
    {
        // d_tilt
        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
        {{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
        {{0, 0, 0}, {-1, 0, 0}, {0, -1, 0}},
    },
    {
        // d_roll
        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
        {{-1, 0, 0}, {0, -1, 0}, {0, 0, 0}},
        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
    },
    {
        // d_zoom
        {{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
    }};

// Offsets (in floats) of the per-camera angle arrays inside corr_vector.
// Entry 3 intentionally repeats `roll`: calc_rot_deriv() scales only entry 2
// by (1 + zoom), so entry 3 yields the plain roll sine/cosine.
__constant__ int angles_offsets[4] = {
    offsetof(corr_vector, azimuth) / sizeof(float),
    offsetof(corr_vector, tilt) / sizeof(float),
    offsetof(corr_vector, roll) / sizeof(float),
    offsetof(corr_vector, roll) / sizeof(float)};

// Matrix-multiplication schedule: mm_seq[step][threadIdx.z] = {left, right, dst},
// all indices into the shared `matrices` array of calc_rot_deriv().
// A negative `left` index means "no operation for this thread in this step".
__constant__ int mm_seq[3][3][3] = {
    {
        {6, 5, 12}, // a_t * a_z -> tmp0
        {7, 6, 13}, // a_r * a_t -> tmp1
        {7, 9, 14}, // a_r * a_dt -> tmp2
    },
    {
        {7, 12, 0}, // a_r * tmp0 -> rot - bad
        {13, 8, 1}, // tmp1 * a_daz -> deriv0 - good
        {14, 5, 2}, // tmp2 * a_az -> deriv1 - good
    },
    {
        {10, 12, 3}, // a_dr * tmp0 -> deriv2 - good
        {11, 12, 4}, // a_dzoom * tmp0 -> deriv3 - good
        {-1, -1, -1} // do nothing
    }};

// Layout of the shared `matrices` array in calc_rot_deriv().
__constant__ int offset_rots = 0;     // 0
__constant__ int offset_derivs = 1;   // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12;     // 12..15
//inline __device__ int get_task_size_gc(int num_cams);
inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams);
// inline __device__ int get_task_size_gc(int num_cams);
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams);
//inline __device__ int get_task_size_gc(int num_cams){
// inline __device__ int get_task_size_gc(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
//}
// }
/**
 * Read word 0 of a tile's flattened task record, reinterpreted as an int.
 *
 * @param num_tile   index of the task record in gpu_ftasks
 * @param gpu_ftasks flattened task array, get_task_size(num_cams) floats per record
 * @param num_cams   number of cameras actually used (determines the record stride)
 * @return the raw 32-bit value stored in the first word of the record
 *         (presumably the `task` field of struct tp_task - confirm against its definition)
 */
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams) {
    return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile);
}
/**
 * Read word 1 of a tile's flattened task record, reinterpreted as an int.
 *
 * @param num_tile   index of the task record in gpu_ftasks
 * @param gpu_ftasks flattened task array, get_task_size(num_cams) floats per record
 * @param num_cams   number of cameras actually used (determines the record stride)
 * @return the raw 32-bit value stored in the second word of the record
 *         (get_tiles_offsets() decodes the same word as tileX in the low
 *         16 bits and tileY in the high 16 bits)
 */
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams) {
    return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
}
/**
 * Calculate rotation matrices and their derivatives by azimuth, tilt, roll and zoom.
 * Expects num_cams blocks (one per camera, selected by blockIdx.x), each of 3 x 3 x 3 threads.
 */
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv)
{
__shared__ float sincos [4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 +4][3][3];
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv) {
__shared__ float sincos[4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 + 4][3][3];
float angle;
float zoom;
int ncam = blockIdx.x; // threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1; // 0: az, 1: tilt, 2: roll, 3:roll
int is_sin = nangle1 & 1;
if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
/// if (ncam == (NUM_CAMS-1)){ // for the whole block
if (ncam == (num_cams-1)){ // for the whole block
if ((threadIdx.z == 0) && (nangle < 4)) { // others just idle here
float *gangles = (float *)gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
/// if (ncam == (NUM_CAMS-1)){ // for the whole block
if (ncam == (num_cams - 1)) { // for the whole block
angle = 0.0;
zoom = 0.0;
/// for (int n = 0; n < (NUM_CAMS-1); n++){
for (int n = 0; n < (num_cams-1); n++){
/// for (int n = 0; n < (NUM_CAMS-1); n++){
for (int n = 0; n < (num_cams - 1); n++) {
angle -= *(gangles + n);
zoom -= gpu_correction_vector->zoom[n];
}
if (nangle >= 2){ // diverging for roll (last two)
if (nangle >= 2) { // diverging for roll (last two)
angle = *(gangles + ncam);
}
......@@ -182,72 +190,68 @@ extern "C" __global__ void calc_rot_deriv(
angle = *(gangles + ncam);
zoom = gpu_correction_vector->zoom[ncam];
}
if (!is_sin){
angle += M_PI/2;
if (!is_sin) {
angle += M_PI / 2;
}
float sc = sinf(angle);
if (nangle ==2) {
if (nangle == 2) {
sc *= 1.0 + zoom;
}
sincos[nangle][is_sin]= sc;
sincos[nangle][is_sin] = sc;
}
__syncthreads();
#ifdef DEBUG20
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
printf( " Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf( " Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf( " Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf(" Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices+threadIdx.z; // 0..2
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices + threadIdx.z; // 0..2
int const_index = threadIdx.z; // 0..2
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
axis += 3; // skip index == 3
const_index +=3;
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
if (threadIdx.z == 0){
if (threadIdx.z == 0) {
axis += 3;
const_index +=3;
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20
const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
const char *matrices_names[] = {"az", "tilt", "roll*zoom", "d_daz", "d_tilt", "d_roll", "d_zoom"};
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
for (int i = 0; i < 7; i++) {
printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
for (int row = 0; row < 3; row++){
for (int col = 0; col < 3; col++){
printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
for (int row = 0; row < 3; row++) {
for (int col = 0; col < 3; col++) {
printf("%9.6f, ", matrices[offset_matrices + i][row][col]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG20
/*
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
......@@ -261,20 +265,20 @@ extern "C" __global__ void calc_rot_deriv(
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tnmp0 -> deriv3
}};
*/
for (int i = 0; i < 3; i++){
*/
for (int i = 0; i < 3; i++) {
int srcl = mm_seq[i][threadIdx.z][0];
int srcr = mm_seq[i][threadIdx.z][1];
int dst = mm_seq[i][threadIdx.z][2];
if (srcl >= 0){
if (srcl >= 0) {
matrices[dst][threadIdx.y][threadIdx.x] =
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x] +
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x] +
matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
}
__syncthreads();
}
// copy results to global memory
// copy results to global memory
int gindx = threadIdx.z;
int lindx = offset_rots + threadIdx.z;
#ifdef NVRTC_BUG
......@@ -283,8 +287,8 @@ extern "C" __global__ void calc_rot_deriv(
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
gindx +=3;
lindx+=3;
gindx += 3;
lindx += 3;
if (lindx < 5) {
#ifdef NVRTC_BUG
// going beyond first dimension
......@@ -295,54 +299,49 @@ extern "C" __global__ void calc_rot_deriv(
}
__syncthreads();
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG20
// All done - read/verify all arrays
// All done - read/verify all arrays
}
/**
 * Launcher kernel: computes the launch geometry for get_tiles_offsets() and
 * starts it as a child kernel. Intended to be run with a single thread; only
 * threadIdx.x == 0 performs the launch.
 *
 * Each child block handles NUM_THREADS / num_cams tiles with num_cams threads
 * per tile. Nothing is launched when the arguments cannot produce a valid
 * launch configuration (num_cams outside 1..NUM_THREADS, or no tiles), which
 * also avoids an integer division by zero.
 *
 * All pointer arguments are passed through unchanged to get_tiles_offsets().
 */
extern "C" __global__ void calculate_tiles_offsets(
    int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
    float *gpu_ftasks, // flattened tasks, get_task_size(num_cams) floats per tile
    // struct tp_task * gpu_tasks,
    int num_tiles, // number of tiles in task
    struct gc *gpu_geometry_correction,
    struct corr_vector *gpu_correction_vector,
    float *gpu_rByRDist, // length should match RBYRDIST_LEN
    trot_deriv *gpu_rot_deriv) {
    // Reject arguments that would make tiles_per_block_geom zero (or divide
    // by zero) or would request an empty grid.
    if ((num_cams <= 0) || (num_cams > NUM_THREADS) || (num_tiles <= 0)) {
        return;
    }
    /// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
    /// dim3 grid_geom ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
    int tiles_per_block_geom = NUM_THREADS / num_cams;
    dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
    // Round up so that a partially filled last block still gets launched.
    dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);
    if (threadIdx.x == 0) { // always 1
        get_tiles_offsets<<<grid_geom, threads_geom>>>(
            uniform_grid,            // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
            num_cams,                // int num_cams,
            gpu_ftasks,              // float * gpu_ftasks, // flattened tasks
            num_tiles,               // int num_tiles, // number of tiles in task list
            gpu_geometry_correction, // struct gc * gpu_geometry_correction,
            gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
            gpu_rByRDist,            // float * gpu_rByRDist) // length should match RBYRDIST_LEN
            gpu_rot_deriv);          // union trot_deriv * gpu_rot_deriv);
    }
}
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
......@@ -351,46 +350,45 @@ extern "C" __global__ void calculate_tiles_offsets(
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv)
{
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv) {
int task_size = get_task_size(num_cams);
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
__shared__ struct gc geometry_correction;
__shared__ float rByRDist [RBYRDIST_LEN];
__shared__ float rByRDist[RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ trot_deriv rot_deriv;
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
float pXY[2]; // result to be copied to task
//blockDim.y
// blockDim.y
// copy data common to all threads
{
int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy);
float * gcp_local = (float *) &geometry_correction;
float * gcp_global = (float *) gpu_geometry_correction;
int cycles_copy_gc = ((sizeof(struct gc) / sizeof(float) + dim_xy - 1) / dim_xy);
float *gcp_local = (float *)&geometry_correction;
float *gcp_global = (float *)gpu_geometry_correction;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_gc; i++){
if (offset < sizeof(struct gc)/sizeof(float)) {
for (int i = 0; i < cycles_copy_gc; i++) {
if (offset < sizeof(struct gc) / sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
}
offset += dim_xy;
}
}
{
int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy);
float * cvp_local = (float *) &extrinsic_corr;
float * cvp_global = (float *) gpu_correction_vector;
int cycles_copy_cv = ((sizeof(struct corr_vector) / sizeof(float) + dim_xy - 1) / dim_xy);
float *cvp_local = (float *)&extrinsic_corr;
float *cvp_global = (float *)gpu_correction_vector;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_cv; i++){
if (offset < sizeof(struct corr_vector)/sizeof(float)) {
for (int i = 0; i < cycles_copy_cv; i++) {
if (offset < sizeof(struct corr_vector) / sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
}
offset += dim_xy;
......@@ -399,10 +397,10 @@ extern "C" __global__ void get_tiles_offsets(
// TODO: maybe it is better to use system memory and not read all table?
{
int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
float * rByRDistp_local = (float *) rByRDist;
float * rByRDistp_global = (float *) gpu_rByRDist;
float *rByRDistp_local = (float *)rByRDist;
float *rByRDistp_global = (float *)gpu_rByRDist;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rbrd; i++){
for (int i = 0; i < cycles_copy_rbrd; i++) {
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
......@@ -411,12 +409,12 @@ extern "C" __global__ void get_tiles_offsets(
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy;
float * rots_local = (float *) &rot_deriv;
float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
int cycles_copy_rot = ((sizeof(trot_deriv) / sizeof(float)) + dim_xy - 1) / dim_xy;
float *rots_local = (float *)&rot_deriv;
float *rots_global = (float *)gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rot; i++){
if (offset < sizeof(trot_deriv)/sizeof(float)) {
for (int i = 0; i < cycles_copy_rot; i++) {
if (offset < sizeof(trot_deriv) / sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += dim_xy;
......@@ -424,7 +422,7 @@ extern "C" __global__ void get_tiles_offsets(
}
__syncthreads();
int ncam = threadIdx.x;
if (task_num >= num_tiles){
if (task_num >= num_tiles) {
return;
}
int imu_exists = // todo - calculate once with rot_deriv?
......@@ -436,17 +434,15 @@ extern "C" __global__ void get_tiles_offsets(
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int)blockIdx.x);
printGeometryCorrection(&geometry_correction, num_cams);
printExtrinsicCorrection(&extrinsic_corr,num_cams);
printExtrinsicCorrection(&extrinsic_corr, num_cams);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
......@@ -458,28 +454,28 @@ extern "C" __global__ void get_tiles_offsets(
*/
// common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
float disparity = * (gpu_ftasks + task_size * task_num + 2);
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
float disparity = *(gpu_ftasks + task_size * task_num + 2);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float px = *(centerXY);
float py = *(centerXY + 1);
int cxy = *(int *) (gpu_ftasks + task_size * task_num + 1);
int cxy = *(int *)(gpu_ftasks + task_size * task_num + 1);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
printf ("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)) {
printf("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX, tileY, ncam);
printf("\n");
__syncthreads();
}
#endif //#ifdef DEBUG23
px = tileX * DTT_SIZE + DTT_SIZE/2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE/2; // - shiftY;
px = tileX * DTT_SIZE + DTT_SIZE / 2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE / 2; // - shiftY;
*(centerXY) = px;
*(centerXY + 1) = py;
}
......@@ -488,28 +484,27 @@ extern "C" __global__ void get_tiles_offsets(
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY [2];
float rXY[2];
rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1];
float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
float rD = sqrtf(pXcd * pXcd + pYcd * pYcd) * 0.001 * geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R = getRByRDist(rD / geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz [3]; // getWorldCoordinates
float xyz[3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float fl_pix = geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
......@@ -521,12 +516,11 @@ extern "C" __global__ void get_tiles_offsets(
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0], xyz[1], xyz[2]);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
// non-distorted XY of the shifted location of the individual sensor
......@@ -542,30 +536,30 @@ extern "C" __global__ void get_tiles_offsets(
float rvi[3];
#pragma unroll
for (int j = 0; j< 3; j++){
for (int j = 0; j < 3; j++) {
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix/rvi[2];
float norm_z = fl_pix / rvi[2];
float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z;
// Re-apply distortion
float rNDi = sqrtf(pXci*pXci + pYci*pYci); // in pixels
float ri = rNDi* ri_scale; // relative to distortion radius
float rNDi = sqrtf(pXci * pXci + pYci * pYci); // in pixels
float ri = rNDi * ri_scale; // relative to distortion radius
float rD2rND = 1.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){
for (int j = 0; j < RAD_COEFF_LEN; j++) {
rri *= ri;
rD2rND += ((float *) &geometry_correction.distortionC)[j]*(rri - 1.0);
rD2rND += ((float *)&geometry_correction.distortionC)[j] * (rri - 1.0);
}
#else
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
rri *= ri;
rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
rD2rND += geometry_correction.rad_coeff[j] * (rri - 1.0);
}
#endif
}
......@@ -579,14 +573,14 @@ extern "C" __global__ void get_tiles_offsets(
__syncthreads();
// Each thread re-calculate same sum
float lines_avg = 0;
for (int i = 0; i < num_cams; i ++){
for (int i = 0; i < num_cams; i++) {
lines_avg += pY_offsets[threadIdx.y][i];
}
lines_avg *= (1.0/num_cams);
lines_avg *= (1.0 / num_cams);
// used when calculating derivatives, TODO: combine calculations !
float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
......@@ -596,15 +590,15 @@ extern "C" __global__ void get_tiles_offsets(
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // *
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
float drvi_daz[3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl[3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl[3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for (int j = 0; j< 3; j++){
for (int j = 0; j < 3; j++) {
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
......@@ -618,7 +612,7 @@ extern "C" __global__ void get_tiles_offsets(
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
......@@ -627,7 +621,7 @@ extern "C" __global__ void get_tiles_offsets(
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
......@@ -641,47 +635,47 @@ extern "C" __global__ void get_tiles_offsets(
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/
float dd1[2][2];// get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
float dd1[2][2]; // get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0] * rXY[0] - rot_deriv.rots[ncam][0][1] * rXY[1]) * norm_z;
dd1[0][1] = (rot_deriv.rots[ncam][0][0] * rXY[1] - rot_deriv.rots[ncam][0][1] * rXY[0]) * norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0] * rXY[0] - rot_deriv.rots[ncam][1][1] * rXY[1]) * norm_z;
dd1[1][1] = (rot_deriv.rots[ncam][1][0] * rXY[1] - rot_deriv.rots[ncam][1][1] * rXY[0]) * norm_z;
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("dd1[0][0] = %f, dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("dd1[0][0] = %f, dd1[0][1] = %f\n", dd1[0][0], dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n", dd1[1][0], dd1[1][1]);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float c_dist = pXci/rNDi;
float s_dist = pYci/rNDi;
float c_dist = pXci / rNDi;
float s_dist = pYci / rNDi;
//#undef NVRTC_BUG
float drD2rND_dri = 0.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++){
drD2rND_dri += ((float *) &geometry_correction.distortionC)[j] * (j+1) * rri;
for (int j = 0; j < RAD_COEFF_LEN; j++) {
drD2rND_dri += ((float *)&geometry_correction.distortionC)[j] * (j + 1) * rri;
rri *= ri;
}
#else
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
drD2rND_dri += geometry_correction.rad_coeff[j] * (j + 1) * rri;
rri *= ri;
}
#endif
}
float scale_distort00 = rD2rND + ri* drD2rND_dri;
float scale_distort00 = rD2rND + ri * drD2rND_dri;
float scale_distort11 = rD2rND;
float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[0][0] = (c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = (c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
......@@ -691,21 +685,21 @@ extern "C" __global__ void get_tiles_offsets(
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n", scale_distortXrot2Xdd1[0][0], scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n", scale_distortXrot2Xdd1[1][0], scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float * disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float *disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams * 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
......@@ -715,7 +709,7 @@ extern "C" __global__ void get_tiles_offsets(
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists){
if (imu_exists) {
float ers_x =
dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
......@@ -726,114 +720,121 @@ extern "C" __global__ void get_tiles_offsets(
dpYci_droll * extrinsic_corr.imu_rot[2];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("ers_x = %f, ers_y = %f\n", ers_x, ers_y);
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
if (disparity >= MIN_DISPARITY){ // all threads together
if (disparity >= MIN_DISPARITY) { // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
//// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
//// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
float delta_t = (pY_offset/ (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
float delta_t = (pY_offset / (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
pXY[0] += delta_t * ers_x * rD2rND; // added correction to pixel X
pXY[1] += delta_t * ers_y * rD2rND; // added correction to pixel Y
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)){
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0], dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1], dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_x = %f, ers_y = %f\n", delta_t, ers_x, ers_y);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads();// __syncwarp();
__syncthreads(); // __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist (already copied)
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy starts at tp_task_xy_offset, right after the common fields (task, txy, target_disparity, ...)
float * tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
float *tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
/**
 * Build the reverse radial-distortion table rByRDist[].
 *
 * Forward model (distorted radius as a function of the undistorted radius r):
 *     rD(r) = r * k(r)
 *     k(r)  = d + C*r + B*r^2 + A*r^3 + A5*r^4 + A6*r^5 + A7*r^6 + A8*r^7
 *     d     = 1 - (A8 + A7 + A6 + A5 + A + B + C)
 * For every table index i > 0 the kernel solves rD(r) = RBYRDIST_STEP * i for r
 * with Newton's method and stores r / rDist. Entry 0 is the r -> 0 limit, 1/d.
 *
 * Work split: threads are numbered by a flat index built from blockIdx.x and
 * threadIdx.{z,y,x}; each thread handles num_points consecutive entries.
 * num_points is sized for CALC_REVERSE_TABLE_BLOCK_THREADS threads in total
 * across the launch.
 *
 * Early exit: if d(rD)/dr falls below minDerivative (the distortion curve is
 * about to fold back) the thread returns immediately; the entry being solved
 * and the rest of that thread's entries are left unwritten.
 *
 * @param geometry_correction  supplies distortionC .. distortionA8
 * @param rByRDist             output table; indices 0 .. RBYRDIST_LEN-1 are written
 */
extern "C" __global__ void calcReverseDistortionTable(
        struct gc *geometry_correction,
        float *rByRDist) {
    const int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;

    // Convergence tolerance on |rD - rDist|. Timing notes kept from the
    // original comments: 1E-10 -> 0.73 ms, 1E-20 -> ~216 ms.
    const double delta = 1E-10;
    const double minDerivative = 0.01;
    const int numIterations = 1000;

    // Read the coefficients once. They stay float so that products such as
    // 7 * dA7 are evaluated exactly as in the original expressions.
    const float dA8 = geometry_correction->distortionA8;
    const float dA7 = geometry_correction->distortionA7;
    const float dA6 = geometry_correction->distortionA6;
    const float dA5 = geometry_correction->distortionA5;
    const float dA = geometry_correction->distortionA;
    const float dB = geometry_correction->distortionB;
    const float dC = geometry_correction->distortionC;
    const double d = 1.0 - dA8 - dA7 - dA6 - dA5 - dA - dB - dC;

    double rPrev = 0.0; // previous solution, reused as the next initial guess
    const int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
    for (int p = 0; p < num_points; p++) {
        const int i = indx * num_points + p;
        if (i >= RBYRDIST_LEN) {
            return; // past the end of the table: nothing left for this thread
        }
        if (i == 0) {
            rByRDist[0] = (float)(1.0 / d);
            continue;
        }
        const double rDist = RBYRDIST_STEP * i;
        // First entry of a thread starts from rDist itself, later ones from
        // the neighbouring solution.
        double r = (p == 0) ? rDist : rPrev;
        for (int iteration = 0; iteration < numIterations; iteration++) {
            // k(r), Horner form
            const double k = (((((((dA8) * r + dA7) * r + dA6) * r + dA5) * r + dA) * r + dB) * r + dC) * r + d;
            // d(r * k(r)) / dr, Horner form
            const double drDistDr = (((((((8 * dA8) * r + 7 * dA7) * r + 6 * dA6) * r + 5 * dA5) * r + 4 * dA) * r + 3 * dB) * r + 2 * dC) * r + d;
            if (drDistDr < minDerivative) { // folds backwards !
                return; // too high distortion
            }
            const double rD = r * k;
            if (fabs(rD - rDist) < delta) {
                break;
            }
            r += (rDist - rD) / drDistDr; // Newton step
        }
        rPrev = r;
        // Divide in double, then narrow once. The original `(float)r / rDist`
        // truncated r to float before the division.
        rByRDist[i] = (float)(r / rDist);
    }
}
......@@ -843,14 +844,14 @@ extern "C" __global__ void calcReverseDistortionTable(
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]) //shared memory
float rByRDist[RBYRDIST_LEN]) // shared memory
{
if (rDist < 0) {
return 0.0f; // normally should not happen
}
float findex = rDist/RBYRDIST_STEP;
int index= (int) floorf(findex);
if (index < 0){
float findex = rDist / RBYRDIST_STEP;
int index = (int)floorf(findex);
if (index < 0) {
index = 0;
}
if (index > (RBYRDIST_LEN - 3)) {
......@@ -858,95 +859,107 @@ inline __device__ float getRByRDist(float rDist,
}
float mu = fmaxf(findex - index, 0.0f);
float mu2 = mu * mu;
float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
float y0 = (index > 0) ? rByRDist[index - 1] : (2 * rByRDist[index] - rByRDist[index + 1]);
// use Catmull-Rom
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index+1] - 0.5 * rByRDist[index+2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index+1];
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index + 1] + 0.5 * rByRDist[index + 2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index + 1] - 0.5 * rByRDist[index + 2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index + 1];
float a3 = rByRDist[index];
float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
float result = a0 * mu * mu2 + a1 * mu2 + a2 * mu + a3;
return result;
}
// Debug helper: prints the fields of *g with printf. Scalar fields are printed
// one per line; per-camera arrays (forward, right, height, roll, pXY0, rXY,
// woi_tops) are printed for the first num_cams entries only.
// The whole body is compiled out when JCUDA is defined.
// NOTE(review): this span carries both the pre-formatter and the post-formatter
// rendering of the function (diff view); only one copy belongs in the real source.
__device__ void printGeometryCorrection(struct gc * g, int num_cams){
__device__ void printGeometryCorrection(struct gc *g, int num_cams) {
#ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
// scalar parameters (pre-formatter rendering)
printf("%22s: %f\n","pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n","line_time", g->line_time);
printf("%22s: %f\n","focalLength", g->focalLength);
printf("%22s: %f\n","pixelSize", g->pixelSize);
printf("%22s: %f\n","distortionRadius",g->distortionRadius);
printf("%22s: %f\n","distortionC", g->distortionC);
printf("%22s: %f\n","distortionB", g->distortionB);
printf("%22s: %f\n","distortionA", g->distortionA);
printf("%22s: %f\n","distortionA5",g->distortionA5);
printf("%22s: %f\n","distortionA6",g->distortionA6);
printf("%22s: %f\n","distortionA7",g->distortionA7);
printf("%22s: %f\n","distortionA8",g->distortionA8);
printf("%22s: %f\n","elevation", g->elevation);
printf("%22s: %f\n","heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
// per-camera arrays, one line per field, num_cams values each (pre-formatter rendering)
printf("%22s:","forward"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]); printf("\n");
printf("%22s:","right"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right [ncam]); printf("\n");
printf("%22s:","height"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height [ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll [ncam]); printf("\n");
// scalar parameters (post-formatter rendering)
printf("%22s: %f\n", "pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n", "pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n", "line_time", g->line_time);
printf("%22s: %f\n", "focalLength", g->focalLength);
printf("%22s: %f\n", "pixelSize", g->pixelSize);
printf("%22s: %f\n", "distortionRadius", g->distortionRadius);
printf("%22s: %f\n", "distortionC", g->distortionC);
printf("%22s: %f\n", "distortionB", g->distortionB);
printf("%22s: %f\n", "distortionA", g->distortionA);
printf("%22s: %f\n", "distortionA5", g->distortionA5);
printf("%22s: %f\n", "distortionA6", g->distortionA6);
printf("%22s: %f\n", "distortionA7", g->distortionA7);
printf("%22s: %f\n", "distortionA8", g->distortionA8);
printf("%22s: %f\n", "elevation", g->elevation);
printf("%22s: %f\n", "heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
// per-camera arrays, one line per field, num_cams values each (post-formatter rendering)
printf("%22s:", "forward");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]);
printf("\n");
printf("%22s:", "right");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right[ncam]);
printf("\n");
printf("%22s:", "height");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll[ncam]);
printf("\n");
// pXY0: one "pXY0[nn]: x, y" line per camera; %19s + %2d + "]" keeps the 22-column label width
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "pXY0[",ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
printf("%19s%2d]: %f, %f \n", "pXY0[", ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
}
// whole-camera offsets and roll
printf("%22s: %f\n","common_right", g->common_right);
printf("%22s: %f\n","common_forward", g->common_forward);
printf("%22s: %f\n","common_height", g->common_height);
printf("%22s: %f\n","common_roll", g->common_roll);
printf("%22s: %f\n", "common_right", g->common_right);
printf("%22s: %f\n", "common_forward", g->common_forward);
printf("%22s: %f\n", "common_height", g->common_height);
printf("%22s: %f\n", "common_roll", g->common_roll);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
// rXY: one "rXY[nn]: x, y" line per camera
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
}
printf("%22s: %f\n","cameraRadius", g->cameraRadius);
printf("%22s: %f\n","disparityRadius", g->disparityRadius);
printf("%22s: %f\n", "cameraRadius", g->cameraRadius);
printf("%22s: %f\n", "disparityRadius", g->disparityRadius);
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:","woi_tops"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]); printf("\n");
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:", "woi_tops");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]);
printf("\n");
#endif //ifndef JCUDA
#endif // ifndef JCUDA
}
// Debug helper: prints the extrinsic correction vector *cv with printf.
// tilt, azimuth and zoom are printed for (num_cams - 1) entries, roll for
// num_cams entries, followed by the three imu_rot and three imu_move values.
// The whole body is compiled out when JCUDA is defined.
// NOTE(review): this span carries both the pre-formatter and the post-formatter
// rendering of the function (diff view); only one copy belongs in the real source.
__device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams)
{
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams) {
#ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
// per-camera arrays (pre-formatter rendering); note the (num_cams-1) bound on all but roll
printf("%22s:","tilt"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->tilt[ncam]); printf("\n");
printf("%22s:","azimuth"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->azimuth[ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]); printf("\n");
printf("%22s:","zoom"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->zoom[ncam]); printf("\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
// per-camera arrays (post-formatter rendering)
printf("%22s:", "tilt");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->tilt[ncam]);
printf("\n");
printf("%22s:", "azimuth");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->azimuth[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]);
printf("\n");
printf("%22s:", "zoom");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->zoom[ncam]);
printf("\n");
// ERS terms: the (t)/(a)/(r) and (x)/(y)/(z) suffixes label the three components
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif //ifndef JCUDA
#endif // ifndef JCUDA
}
......@@ -41,18 +41,16 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
#ifndef offsetof
#define offsetof(st, m) \
((size_t)&(((st *)0)->m))
((size_t) & (((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
......@@ -68,37 +66,37 @@ struct tp_task {
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
// Extrinsic correction vector, laid out as consecutive floats.
// The index ranges in the trailing comments are flat float offsets and hold
// for NUM_CAMS == 4 (3 + 3 + 4 + 3 + 3 + 3 = 19 floats); they shift for other
// camera counts.
// NOTE(review): this span carries both the pre-formatter and the post-formatter
// rendering of the first four members and of imu_rot (diff view).
struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5
float roll [NUM_CAMS]; // 6..9
float zoom [NUM_CAMS-1]; // 10..12
struct corr_vector {
float tilt[NUM_CAMS - 1]; // 0..2
float azimuth[NUM_CAMS - 1]; // 3..5
float roll[NUM_CAMS]; // 6..9
float zoom[NUM_CAMS - 1]; // 10..12
// for ERS correction:
float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_rot[3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..18 (three elements; was mislabelled 16..19)
};
// trot_deriv: five sets of per-camera 3x3 float matrices (rots, d_daz, d_tilt,
// d_roll, d_zoom), each indexed [camera][row][column].
// With NVRTC_BUG defined it is a plain struct; otherwise it is a union that
// additionally exposes the same storage as matrices[5][NUM_CAMS][3][3].
// NOTE(review): this span carries both the pre-formatter and the post-formatter
// rendering of each declaration (diff view).
#ifdef NVRTC_BUG
struct trot_deriv{
float rots [NUM_CAMS][3][3];
float d_daz [NUM_CAMS][3][3];
float d_tilt [NUM_CAMS][3][3];
float d_roll [NUM_CAMS][3][3];
float d_zoom [NUM_CAMS][3][3];
struct trot_deriv {
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
#else
union trot_deriv{
union trot_deriv {
// named view: one member per matrix set
struct {
float rots [NUM_CAMS][3][3];
float d_daz [NUM_CAMS][3][3];
float d_tilt [NUM_CAMS][3][3];
float d_roll [NUM_CAMS][3][3];
float d_zoom [NUM_CAMS][3][3];
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
// indexed view over the same five sets, in the member order above
float matrices [5][NUM_CAMS][3][3];
float matrices[5][NUM_CAMS][3][3];
};
#endif
......@@ -116,72 +114,68 @@ struct gc {
float distortionC; // r^2
float distortionB; // r^3
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
float distortionA5; // r^5 (normalized to focal length or to sensor half width?)
float distortionA6; // r^6 (normalized to focal length or to sensor half width?)
float distortionA7; // r^7 (normalized to focal length or to sensor half width?)
float distortionA8; // r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
};
float rad_coeff [7];
float rad_coeff[7];
};
#endif
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
float forward [NUM_CAMS];
float right [NUM_CAMS];
float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0 [NUM_CAMS][2];
float forward[NUM_CAMS];
float right[NUM_CAMS];
float height[NUM_CAMS];
float roll[NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0[NUM_CAMS][2];
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY[NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
float woi_tops [NUM_CAMS]; // used to calculate scanline timing
float woi_tops[NUM_CAMS]; // used to calculate scanline timing
};
#define RAD_COEFF_LEN 7
// Kernel prototypes for the geometry-correction stage.
// NOTE(review): this span carries both the pre-formatter and the post-formatter
// rendering of most parameter lines (diff view).
// Task records in gpu_ftasks are flat float arrays of
// tp_task_xy_offset + 6 * num_cams floats (2 for xy plus 4 for disp_dist per camera).
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv);
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
// Same parameter list as get_tiles_offsets().
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction,
struct corr_vector * gpu_correction_vector,
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv);
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv);
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv);
// Total thread count assumed by calcReverseDistortionTable() when it splits the table
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist);
struct gc *geometry_correction,
float *rByRDist);
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -77,7 +77,7 @@
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
#define TILES_PER_BLOCK_GEOM (32 / NUM_CAMS) // each tile has NUM_CAMS threads
#define DEBUG_ANY 1
......@@ -87,13 +87,13 @@
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#endif
#undef DBG_MARK_DBG_TILE
//#undef DBG_TILE
......@@ -101,8 +101,7 @@
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
// 7
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
......@@ -118,7 +117,7 @@
#define DEBUG9 1
*/
//#define DEBUG8A 1 // generate_RBGA_host
//textures
// textures
//#define DEBUG10 1
//#define DEBUG11 1
//#define DEBUG12 1
......@@ -127,7 +126,6 @@
// geom
//#define DEBUG20 1
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
......@@ -140,6 +138,4 @@
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment