/** ** ** dtt8x8.h ** ** Copyright (C) 2018 Elphel, Inc. ** ** -----------------------------------------------------------------------------** ** ** dtt8x8.cuh is free software: you can redistribute it and/or modify ** it under the terms of the GNU General Public License as published by ** the Free Software Foundation, either version 3 of the License, or ** (at your option) any later version. ** ** This program is distributed in the hope that it will be useful, ** but WITHOUT ANY WARRANTY; without even the implied warranty of ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ** GNU General Public License for more details. ** ** You should have received a copy of the GNU General Public License ** along with this program. If not, see <http://www.gnu.org/licenses/>. ** ** Additional permission under GNU GPL version 3 section 7 ** ** If you modify this Program, or any covered work, by linking or ** combining it with NVIDIA Corporation's CUDA libraries from the ** NVIDIA CUDA Toolkit (or a modified version of those libraries), ** containing parts covered by the terms of NVIDIA CUDA Toolkit ** EULA, the licensors of this Program grant you additional ** permission to convey the resulting work. ** -----------------------------------------------------------------------------** */ /** ************************************************************************** * \file dtt8x8.h * \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8) * in GPU * This file contains building blocks for the 16x16 stride 8 COmplex Lapped Transform (CLT) * implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation * results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented * here it is used to convert convolution kernels and LPF to the frequency domain - done in * software. * * This file is cpompatible with both runtime and driver API, runtime is used for development * with Nvidia Nsight, driver API when calling these kernels from Java */ #ifndef JCUDA #define DTT_SIZE_LOG2 3 #endif #pragma once #define DTT_SIZE (1 << DTT_SIZE_LOG2) #define DTT_SIZE1 (DTT_SIZE + 1) #define DTT_SIZE2 (2 * DTT_SIZE) #define DTT_SIZE21 (DTT_SIZE2 + 1) #define DTT_SIZE4 (4 * DTT_SIZE) #define DTT_SIZE2M1 (DTT_SIZE2 - 1) #define BAYER_RED 0 #define BAYER_BLUE 1 #define BAYER_GREEN 2 // assuming GR/BG as now #define BAYER_RED_ROW 0 #define BAYER_RED_COL 1 #define DTTTEST_BLOCK_WIDTH 32 #define DTTTEST_BLOCK_HEIGHT 16 #define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1) //extern __constant__ float idct_signs[4][4][4]; //extern __constant__ int imclt_indx9[16]; //extern __constant__ float HWINDOW2[]; // kernels (not used so far) #if 0 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode); #endif// #if 0 //=========================== 2D functions =============== extern __device__ void corrUnfoldTile( int corr_radius, float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15 extern __device__ void dttii_2d( float * clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE] extern __device__ void dttiv_color_2d( float * clt_tile, int color); extern __device__ void imclt( float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9] float * mclt_tile ); extern __device__ void imclt8threads( int do_acc, // 1 - add to previous value, 0 - overwrite float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9] float * mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17] int debug);