Commit f20a0723 authored by Andrey Filippov

implemented and tested 2D phase correlation

parent 6b065851
......@@ -41,17 +41,24 @@
#pragma once
#include "dtt8x8.cuh"
#define THREADSX (DTT_SIZE)
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_PAIR_SHIFT 8 // 8 lower bits - number of a pair, other bits tile number
#define TASK_CORR_BITS 4
#define CORR_OUT_RAD 7
#endif
//#define IMCLT14
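// Sketch (not part of this commit): the packing implied by CORR_PAIR_SHIFT.
// The low CORR_PAIR_SHIFT bits of a gpu_corr_indices entry hold the pair
// number, the remaining bits hold the tile number. Helper names below are
// hypothetical, for illustration only.
static inline int pack_corr_index (int tile, int pair) { return (tile << CORR_PAIR_SHIFT) | pair; }
static inline int unpack_corr_pair (int indx)          { return indx & ((1 << CORR_PAIR_SHIFT) - 1); }
static inline int unpack_corr_tile (int indx)          { return indx >> CORR_PAIR_SHIFT; }
// Example: tile 1000, pair 3 packs to (1000 << 8) | 3 == 256003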
......@@ -106,6 +113,11 @@
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
// Use CORR_OUT_RAD for the correlation output
#define BAYER_RED 0
#define BAYER_BLUE 1
......@@ -117,15 +129,16 @@
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
#define DBG_TILE_X 174
#define DBG_TILE_Y 118
#define DBG_TILE_X 40
#define DBG_TILE_Y 80
//#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
//#define DEBUG4 1
//#define DEBUG5 1
#define DEBUG6 1
//56494
// struct tp_task
//#define TASK_SIZE 12
......@@ -311,6 +324,24 @@ __constant__ float lpf_data[3][64]={
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
}};
__constant__ float lpf_corr[64]={ // modify if needed
1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
0.87041007f, 0.75761368f, 0.57398049f, 0.37851747f, 0.21734206f, 0.10895863f, 0.04888546f, 0.02374977f,
0.65943687f, 0.57398049f, 0.43485698f, 0.28677101f, 0.16466189f, 0.08254883f, 0.03703642f, 0.01799322f,
0.43487258f, 0.37851747f, 0.28677101f, 0.18911416f, 0.10858801f, 0.05443770f, 0.02442406f, 0.01186582f,
0.24970076f, 0.21734206f, 0.16466189f, 0.10858801f, 0.06235047f, 0.03125774f, 0.01402412f, 0.00681327f,
0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
};
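// Camera pairs used for 2D phase correlation. Assuming the usual quad-camera
// numbering (0: top-left, 1: top-right, 2: bottom-left, 3: bottom-right),
// these are the two horizontal pairs, the two vertical pairs, and the two
// diagonals (layout assumption, not stated in this commit).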
__constant__ int pairs[6][2]={
{0, 1},
{2, 3},
{0, 2},
{1, 3},
{0, 3},
{2, 1}};
//#endif
__device__ void convertCorrectTile(
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
......@@ -333,43 +364,306 @@ __device__ void convertCorrectTile(
float window_hor_sin [2*DTT_SIZE],
float window_vert_cos [2*DTT_SIZE]);
__device__ void debug_print_lpf(
float * lpf_tile);
__device__ void debug_print_clt1(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color,
int mask);
__device__ void debug_print_mclt(
float * mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color);
__device__ void debug_print_corr_15x15(
float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
const int color);
// Fractional pixel shift (phase rotation), horizontal. In-place.
__device__ void shiftTileHor(
__device__ void shiftTileHor( // implemented, used
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float residual_shift );
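// (Sketch of the math, not from this commit) In the CLT frequency domain a
// fractional shift dx becomes a phase rotation: for column k the cosine/sine
// quadrant pairs are mixed by an angle proportional to dx*(k + 0.5)/DTT_SIZE,
// so sub-pixel shifts need no pixel-domain resampling.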
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__ void shiftTileVert(
__device__ void shiftTileVert( // implemented, used
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float residual_shift );
__device__ void convolveTiles(
__device__ void convolveTiles( // implemented, used
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* kernel); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
__device__ void imclt(
__device__ void correlateAccumulateTiles(
float scale, // scale correlation
float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
float* corr_tile); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__device__ void resetCorrelation(
float* corr_tile); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero); // fat zero is absolute, scale it outside
__device__ void corrUnfoldTile(
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__device__ void imclt( // implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt(
__device__ void imclt( // implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt_plane(
__device__ void imclt_plane( // not implemented, not used
int color,
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void convert_correct_tiles(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
__global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// int tilesX, // make it variable
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data
{
/// int thr3 = threadIdx.x >> 3; // now zero?
/// int column = threadIdx.x; // modify to use 2 * 8 threads, if needed.
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
if (corr_num >= num_corr_tiles){
return; // nothing to do
}
// get number of pair and number of tile
#define ALLTILES 1
#ifdef ALLTILES
int corr_pair = corr_num % NUM_PAIRS;
int tile_num = corr_num / NUM_PAIRS;
#else
int corr_pair = gpu_corr_indices[corr_num];
int tile_num = corr_pair >> CORR_PAIR_SHIFT;
#endif
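// With ALLTILES defined, every (tile, pair) combination is processed in
// linescan order and gpu_corr_indices is ignored; otherwise each packed
// entry of gpu_corr_indices supplies both the tile and the pair number.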
corr_pair &= ((1 << CORR_PAIR_SHIFT) - 1);
if (corr_pair >= NUM_PAIRS){
return; // BUG - should not happen
}
int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
__syncthreads();// __syncwarp();
__shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr);
for (int color = 0; color < colors; color++){
// copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
float * clt_tile2 = ((float *) clt_tiles2) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * gpu_tile1 = ((float *) gpu_clt[cam1]) + offs;
float * gpu_tile2 = ((float *) gpu_clt[cam2]) + offs;
float * clt_tile1i = clt_tile1 + threadIdx.x;
float * clt_tile2i = clt_tile2 + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= *gpu_tile1;
*clt_tile2i= *gpu_tile2;
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA1\n",tile_num, corr_pair,color);
debug_print_clt1(clt_tile1, color, 0xf); //
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles(
scales[color], // float scale, // scale correlation
clt_tile1, // float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
clt_tile2, // float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D, color = %d CORRELATION\n", color);
debug_print_clt1(clt_corr, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF\n");
debug_print_lpf(lpf_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf);
clt += DTT_SIZE1;
lpf += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// now new part - need to transform with DCT-II and make 15x15
/*
// quadrant 0 dct_ii hor, dct_ii vert,
// quadrant 1 dct_ii hor, dst_ii vert,
// quadrant 2 dst_ii hor, dct_ii vert,
// quadrant 3 dst_ii hor, dst_ii vert,
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
// hor pass
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
// int is_sin = q & 1;
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER VERTICAL (HORIZONTAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// vert pass
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
// int is_sin = (q >> 1) & 1;
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
corrUnfoldTile(
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after UNFOLD\n");
debug_print_corr_15x15(mclt_corr, -1);
}
__syncthreads();// __syncwarp();
#endif
#endif
// copy 15x15 tile to main memory
int corr_tile_offset = corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_tile_offset;
//CORR_THREADS_PER_TILE
// int offs = threadIdx.x;
#pragma unroll
for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs];
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after copy to main memory\n");
// debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
extern "C"
__global__ void convert_correct_tiles(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
{
// struct CltExtra* gpu_kernel_offsets = (struct CltExtra*) vgpu_kernel_offsets;
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
......@@ -379,8 +673,6 @@ __global__ void convert_correct_tiles(
__shared__ struct tp_task tt [TILES_PER_BLOCK];
// Copy task data to shared memory
tt[tile_in_block].task = gpu_task -> task;
// tt[tile_in_block].tx = gpu_task -> tx;
// tt[tile_in_block].ty = gpu_task -> ty;
tt[tile_in_block].txy = gpu_task -> txy;
int thread0 = threadIdx.x & 1;
int thread12 = threadIdx.x >>1;
......@@ -426,7 +718,6 @@ __global__ void convert_correct_tiles(
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
// tt[tile_in_block].tx | (tt[tile_in_block].ty <<16), // const int txy,
tt[tile_in_block].txy, // const int txy,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
......@@ -556,6 +847,191 @@ __device__ void convolveTiles(
}
}
__device__ void correlateAccumulateTiles(
float scale, // scale correlation
float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
{
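// Each CLT tile holds 4 DTT quadrants (CC, SC, CS, SS) forming a 2D
// half-complex spectrum. The k = 0..3 sign pattern below computes, per
// frequency sample, the product of one tile with the complex conjugate
// of the other (the phase-correlation numerator), scaled and accumulated
// into corr_tile. (Descriptive note added here; conjugation order assumed.)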
int joffs = threadIdx.x * DTT_SIZE1;
float * clt_tile2_j; // = clt_tile2 + joffs; // ==&clt_tile2[0][j][0]
float * clt_tile1_j0 = clt_tile1 + joffs; // ==&clt_tile[0][j][0]
float * clt_tile1_j1 = clt_tile1_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * clt_tile1_j2 = clt_tile1_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * clt_tile1_j3 = clt_tile1_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
float * corr_tile_j0 = corr_tile + joffs; // ==&clt_tile[0][j][0]
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
//#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
// k=0
clt_tile2_j = clt_tile2 + joffs + i;
float clt2 = *(clt_tile2_j);
float r0 = *(clt_tile1_j0) * clt2;
float r1 = -*(clt_tile1_j1) * clt2;
float r2 = -*(clt_tile1_j2) * clt2;
float r3 = *(clt_tile1_j3) * clt2;
// k = 1
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j1) * clt2;
r1 += *(clt_tile1_j0) * clt2;
r2 -= *(clt_tile1_j3) * clt2;
r3 -= *(clt_tile1_j2) * clt2;
// k=2
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j2) * clt2;
r1 -= *(clt_tile1_j3) * clt2;
r2 += *(clt_tile1_j0) * clt2;
r3 -= *(clt_tile1_j1) * clt2;
// k=3
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j3) * clt2;
r1 += *(clt_tile1_j2) * clt2;
r2 += *(clt_tile1_j1) * clt2;
r3 += *(clt_tile1_j0) * clt2;
*(corr_tile_j0) += scale * r0;
*(corr_tile_j1) += scale * r1;
*(corr_tile_j2) += scale * r2;
*(corr_tile_j3) += scale * r3;
clt_tile1_j0 ++;
clt_tile1_j1 ++;
clt_tile1_j2 ++;
clt_tile1_j3 ++;
corr_tile_j0 ++;
corr_tile_j1 ++;
corr_tile_j2 ++;
corr_tile_j3 ++;
}
}
__device__ void resetCorrelation(
float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
{
int joffs = threadIdx.x * DTT_SIZE1;
float * corr_tile_j0 = corr_tile + joffs; // k = 0
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // k = 1
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // k = 2
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // k = 3
//#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
*(corr_tile_j0) = 0;
*(corr_tile_j1) = 0;
*(corr_tile_j2) = 0;
*(corr_tile_j3) = 0;
corr_tile_j0 ++;
corr_tile_j1 ++;
corr_tile_j2 ++;
corr_tile_j3 ++;
}
}
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero ) // fat zero is absolute, scale it outside
{
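// Phase correlation normalization: each frequency sample is divided by its
// own magnitude, regularized by fat_zero:
// scale = 1/sqrt(fat_zero^2 + |sample|^2),
// so near-zero spectral components are attenuated instead of amplified.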
int joffs = threadIdx.x * DTT_SIZE1;
float * clt_tile_j0 = clt_tile + joffs; // ==&clt_tile[0][j][0]
float * clt_tile_j1 = clt_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * clt_tile_j2 = clt_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * clt_tile_j3 = clt_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++) {
float s2 = fat_zero * fat_zero +
*(clt_tile_j0) * *(clt_tile_j0) +
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3);
float scale = rsqrtf(s2); // 1.0/sqrt(s2)
*(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale;
*(clt_tile_j3) *= scale;
clt_tile_j0 ++; // =DTT_SIZE1;
clt_tile_j1 ++; // =DTT_SIZE1;
clt_tile_j2 ++; // =DTT_SIZE1;
clt_tile_j3 ++; // =DTT_SIZE1;
}
}
/*
Converted from DttRad2.java:443
public double [] corr_unfold_tile(
double [][] qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
int transform_size
)
*/
__device__ void corrUnfoldTile(
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
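// After the DCT-II/DST-II passes the 4 quadrants hold the even/even,
// even/odd, odd/even and odd/odd symmetric parts of the correlation.
// Unfolding combines them with +/- signs into the full 15x15
// (DTT_SIZE2M1 x DTT_SIZE2M1) pixel-domain tile centered at (7,7);
// thread i fills row (7+i) and mirrors into row (7-i).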
const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE;
float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
float corr_pixscale = 0.25f;
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
for (int j = 1; j < DTT_SIZE; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
/// int rslt_base_index_mp = rslt_base_index_m + j;
/// int rslt_base_index_mm = rslt_base_index_m - j;
rslt[rslt_base_index_pp] = corr_pixscale * (
qdata0[i_transform_size + j] +
qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (
qdata0[i_transform_size + j] +
-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
}
if (i == 0) {
return;
}
/// int im1 = i-1;
im1_transform_size = i_transform_size - DTT_SIZE1;
float d = corr_pixscale * qdata2[im1_transform_size];
rslt[rslt_base_index_p] += d;
rslt[rslt_base_index_m] -= d;
for (int j = 1; j < DTT_SIZE; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
int rslt_base_index_mp = rslt_base_index_m + j;
int rslt_base_index_mm = rslt_base_index_m - j;
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
//rslt[rslt_base_index_pp], rslt[rslt_base_index_pm] are partially calculated in the cycle common with i=0
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
rslt[rslt_base_index_pp] += d2 + d3;
rslt[rslt_base_index_pm] += d2 - d3;
}
}
__device__ void debug_print_lpf(
float * lpf_tile)
{
for (int dbg_row = 0; dbg_row < DTT_SIZE; dbg_row++){
for (int dbg_col = 0; dbg_col < DTT_SIZE; dbg_col++){
printf ("%10.5f ", lpf_tile[dbg_row * DTT_SIZE + dbg_col]);
}
printf("\n");
}
}
__device__ void debug_print_clt1(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color,
......@@ -591,6 +1067,23 @@ __device__ void debug_print_mclt(
printf("\n");
}
__device__ void debug_print_corr_15x15(
float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
const int color)
{
if (color >= 0) printf("----------- Color = %d -----------\n",color);
for (int dbg_row = 0; dbg_row < DTT_SIZE2M1; dbg_row++){
for (int dbg_col = 0; dbg_col < DTT_SIZE2M1; dbg_col++){
printf ("%10.5f ", mclt_tile[dbg_row * DTT_SIZE2M1 + dbg_col]);
}
printf("\n");
}
printf("\n");
}
__device__ void convertCorrectTile(
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
float * gpu_kernels, // [tileY][tileX][color]
......@@ -1361,7 +1854,7 @@ __device__ void imclt_plane(
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Shuld be zeroed before the
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
......
......@@ -212,39 +212,40 @@ int main(int argc, char **argv)
// CLT testing
const char* kernel_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3_transposed.kernel"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
const char* kernel_offs_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3_transposed.kernel_offsets"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
const char* image_files[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.bayer"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
const char* ports_offs_xy_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.portsxy"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.clt"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.rbg"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // R, B, G
......@@ -272,10 +273,12 @@ int main(int argc, char **argv)
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
......@@ -288,6 +291,10 @@ int main(int argc, char **argv)
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
// float * gpu_corr_indices;
int * gpu_corr_indices;
int num_corrs;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
......@@ -300,6 +307,8 @@ int main(int argc, char **argv)
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
float lpf_rbg[3][64];
for (int ncol = 0; ncol < 3; ncol++) {
......@@ -339,9 +348,12 @@ int main(int argc, char **argv)
IMG_WIDTH + DTT_SIZE, // int width,
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
#endif
}
// allocate one 2D correlation output tile (15x15 floats) per row; the number of rows is the number of tiles times the number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
......@@ -361,8 +373,24 @@ int main(int argc, char **argv)
ports_offs_xy_file[ncam]); // char * path) // file path
}
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG_TILE
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
......@@ -375,28 +403,34 @@ int main(int argc, char **argv)
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
int tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#else
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 1;
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#endif
#endif
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_PAIR_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (float *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
// will need to pass num_corrs too
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
......@@ -440,9 +474,11 @@ int main(int argc, char **argv)
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// gpu_corrs, // float * gpu_corrs, // [][15x15] - padded
dstride/sizeof(float), // size_t dstride, // for gpu_images
// dstride_corr/sizeof(float), //size_t dstride_corr, // in floats: padded correlation size
tp_task_size, // int num_tiles) // number of tiles in task
7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
......@@ -584,6 +620,82 @@ int main(int argc, char **argv)
free(cpu_corr_image);
#endif
#ifndef NOCORR
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_file);
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
#endif
free(cpu_corr);
#endif // ifndef NOCORR
#ifdef SAVE_CLT
free(cpu_clt);
#endif
......@@ -605,5 +717,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
exit(0);
}
......@@ -84,6 +84,7 @@ __constant__ float SINN1[] = {0.195090f,0.555570f};
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f};
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale y[0] (y[7]) by 1/sqrt(2)
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used
......@@ -376,6 +377,88 @@ inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
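// Usage in correlate2D (see above): called with inc == DTT_SIZE1 for the
// vertical pass (one thread per column) and inc == 1 for the horizontal
// pass (one thread per row); dst_not_dct selects DST-II for the quadrants
// with odd symmetry along the corresponding axis.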
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
......