prepared for separate compilation, for now merged

3d5ddc28 · Andrey Filippov · 39e75987 · 3d5ddc28 · 3d5ddc28 · 3d5ddc28
Commit 3d5ddc28 authored Apr 07, 2020 by Andrey Filippov
6 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/dtt8x8.cuh
+++ b/src/main/resources/kernels/dtt8x8.cuh
--- a/src/main/resources/kernels/dtt8x8.h
+++ b/src/main/resources/kernels/dtt8x8.h
+/**
+ **
+ ** dtt8x8.h
+ **
+ ** Copyright (C) 2018 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  dtt8x8.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+/**
+**************************************************************************
+* \file dtt8x8.h
+* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
+*        in GPU
+* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
+* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
+* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
+* here; it is used to convert convolution kernels and LPF to the frequency domain, which is
+* done in software.
+*
+* This file is compatible with both the runtime and the driver API: the runtime API is used for
+* development with Nvidia Nsight, the driver API when calling these kernels from Java.
+*/
+#ifndef JCUDA // the default below applies only when JCUDA is not defined
+#define DTT_SIZE_LOG2                 3 // log2 of DTT_SIZE
+#endif
+#pragma once
+#define DTT_SIZE                     (1 << DTT_SIZE_LOG2) // 8 when DTT_SIZE_LOG2 is 3
+#define DTT_SIZE1        (DTT_SIZE + 1) // 9 by default; row length used in the [4][DTT_SIZE][DTT_SIZE1] tile layouts
+#define DTT_SIZE2        (2 * DTT_SIZE) // 16 by default
+#define DTT_SIZE21       (DTT_SIZE2 + 1) // 17 by default
+#define DTT_SIZE4        (4 * DTT_SIZE) // 32 by default
+#define DTT_SIZE2M1      (DTT_SIZE2 - 1) // 15 by default
+#define BAYER_RED   0
+#define BAYER_BLUE  1
+#define BAYER_GREEN 2
+// assuming the GR/BG Bayer layout (as currently used): red is at row 0, column 1
+#define BAYER_RED_ROW 0
+#define BAYER_RED_COL 1
+#define DTTTEST_BLOCK_WIDTH          32
+#define DTTTEST_BLOCK_HEIGHT         16
+#define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1) // 33: block width plus one
+//extern __constant__ float idct_signs[4][4][4];
+//extern __constant__ int imclt_indx9[16];
+//extern __constant__ float HWINDOW2[];
+inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct); // does not scale y[0] (y[7]) by 1/sqrt[0] - NOTE(review): "1/sqrt[0]" looks like a typo (1/sqrt(2)?), confirm against the implementation
+inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dttiv_nodiverg  (float * x,   int inc, int dst_not_dct);   // not used
+inline __device__ void dctiv_nodiverg  (float * x0,  int inc);                    // used in TP
+inline __device__ void dstiv_nodiverg  (float * x0,  int inc);                    // used in TP
+inline __device__ void dct_ii8         ( float x[8], float y[8]); // x and y each point to an 8-element array // not used
+inline __device__ void dct_iv8         ( float x[8], float y[8]); // x and y each point to an 8-element array // not used
+inline __device__ void dst_iv8         ( float x[8], float y[8]); // x and y each point to an 8-element array // not used
+inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x and y each point to an 8-element array // not used
+inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x and y each point to an 8-element array // not used
+// kernels (not used so far); the declaration below is compiled only when BBBB is defined
+#ifdef BBBB
+extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
+#endif// #ifdef BBBB
+//=========================== 2D functions ===============
+extern __device__ void corrUnfoldTile(
+		int corr_radius,
+		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], 4 quadrants of the clt data, rows extended to optimize shared ports
+		float* rslt);  //   [DTT_SIZE2M1][DTT_SIZE2M1], 15x15 with the default DTT_SIZE
+extern __device__ void dttii_2d(
+		float * clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]
+extern __device__ void dttiv_color_2d(
+		float * clt_tile,
+		int color); // NOTE(review): presumably one of BAYER_RED/BAYER_BLUE/BAYER_GREEN - confirm
+extern __device__ void imclt(
+		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], +1 to alternate column ports [4][8][9]
+		float * mclt_tile ); // NOTE(review): layout not stated here; presumably as in imclt8threads - confirm
+extern __device__ void imclt8threads(
+		int     do_acc,     // 1 - add to previous value, 0 - overwrite
+		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], +1 to alternate column ports [4][8][9]
+		float * mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], +1 to alternate column ports [16][17]
+		int     debug);
--- a/src/main/resources/kernels/test_tp.cu
+++ b/src/main/resources/kernels/test_tp.cu
--- a/src/main/resources/kernels/tp_defines.h
+++ b/src/main/resources/kernels/tp_defines.h
+/**
+ **
+ ** tp_defines.h
+ **
+ ** Copyright (C) 2020 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  tp_defines.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+/**
+**************************************************************************
+* \file tp_defines.h
+* \brief Defines for running in C++ environment, replaced when called from Java
+*/
+// Avoiding includes in jcuda, all source files will be merged
+#pragma once
+#ifndef JCUDA // everything up to the matching #endif is skipped when JCUDA is defined
+#define THREADSX         (DTT_SIZE) // DTT_SIZE itself is not defined in this file
+#define NUM_CAMS                  4
+#define NUM_PAIRS                 6
+#define NUM_COLORS                3
+#define IMG_WIDTH              2592
+#define IMG_HEIGHT             1936
+#define KERNELS_HOR             164
+#define KERNELS_VERT            123
+#define KERNELS_LSTEP             4
+#define THREADS_PER_TILE          8
+#define TILES_PER_BLOCK           4
+#define CORR_THREADS_PER_TILE     8
+#define CORR_TILES_PER_BLOCK      4
+#define TEXTURE_THREADS_PER_TILE  8
+#define TEXTURE_TILES_PER_BLOCK   1
+#define IMCLT_THREADS_PER_TILE   16
+#define IMCLT_TILES_PER_BLOCK     4
+#define CORR_NTILE_SHIFT          8 // higher bits - number of a pair, other bits tile number
+#define CORR_PAIRS_MASK        0x3f// lower bits used to address correlation pair for the selected tile
+#define CORR_TEXTURE_BIT          7 // bit 7 used to request texture for the tile
+#define TASK_CORR_BITS            4
+#define TASK_TEXTURE_N_BIT        0 // Texture with North neighbor
+#define TASK_TEXTURE_E_BIT        1 // Texture with East  neighbor
+#define TASK_TEXTURE_S_BIT        2 // Texture with South neighbor
+#define TASK_TEXTURE_W_BIT        3 // Texture with West  neighbor
+#define TASK_TEXTURE_BIT          3 // bit to request texture calculation in the task field of struct tp_task; NOTE(review): same bit index as TASK_TEXTURE_W_BIT - confirm this is intended
+#define LIST_TEXTURE_BIT          7 // bit to request texture calculation; same value as CORR_TEXTURE_BIT
+#define CORR_OUT_RAD              4
+#define FAT_ZERO_WEIGHT           0.0001 // add to port weights to avoid nan; NOTE(review): no 'f' suffix, so this is a double literal - confirm this is intended
+#define THREADS_DYNAMIC_BITS      5 // threads in block for CDP creation of the texture list
+//#undef HAS_PRINTF
+#define HAS_PRINTF
+//7
+//#define DEBUG1 1
+//#define DEBUG2 1
+//#define DEBUG3 1
+//#define DEBUG4 1
+//#define DEBUG5 1
+//#define DEBUG6 1
+/*
+#define DEBUG7 1
+#define DEBUG8 1
+#define DEBUG9 1
+*/
+#define DEBUG10 1
+#define DEBUG11 1
+#define DEBUG12 1
+//#define USE_textures_gen
+#define DEBUG_OOB1 1
+#endif //#ifndef JCUDA