cleanup

06b62a05 · Andrey Filippov · 2bed5def · 06b62a05 · 06b62a05 · 06b62a05
Commit 06b62a05 authored Apr 07, 2020 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 225 deletions

TileProcessor.cuh src/TileProcessor.cuh +1 -221

dtt8x8.cu src/dtt8x8.cu +4 -1

dtt8x8.h src/dtt8x8.h +5 -3

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -40,8 +40,8 @@
 #pragma once
 #ifndef JCUDA
 #include "tp_defines.h"
-#endif // #ifndef JCUDA
 #include "dtt8x8.h"
+#endif // #ifndef JCUDA
 #define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
@@ -330,29 +330,6 @@ printAlphaFade(8)
 """
-__constant__ float idct_signs[4][4][4] ={
-		{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1,-1,-1,-1},
-				{-1, 1, 1, 1},
-				{-1, 1, 1, 1},
-				{-1, 1, 1, 1}
-		},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1, 1, 1,-1},
-				{-1,-1,-1, 1},
-				{-1,-1,-1, 1},
-				{-1,-1,-1, 1}
-		},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1,-1,-1,-1},
-				{ 1,-1,-1,-1},
-				{ 1,-1,-1,-1},
-				{-1, 1, 1, 1}
-		},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1, 1, 1,-1},
-				{ 1, 1, 1,-1},
-				{ 1, 1, 1,-1},
-				{-1,-1,-1, 1}
-		}};
 */
@@ -3259,203 +3236,6 @@ __device__ void convertCorrectTile(
 //#endif
 }
-#ifdef NOICLT1
-extern "C"
-__global__ void test_imclt(
-		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		int             ncam) // just for debug print
-// Initially - no output, will add later
-{
-//	dim3 t = threadIdx;
-	int tile_in_block = threadIdx.y;
-	int tile_num = blockIdx.x * IMCLT_TILES_PER_BLOCK + tile_in_block;
-	if (tile_num >= 1) return; // just testing with a single tile
-	int thr3 =    threadIdx.x >> 3;
-	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
-//	int thr012 =  threadIdx.x & 7;
-	// Read clt tile to
-    __shared__ float clt_tiles        [IMCLT_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
-    __shared__ float mclt_tiles       [IMCLT_TILES_PER_BLOCK][DTT_SIZE2][DTT_SIZE21];
-    // Read clt tile from device memory
-    for (int color = 0; color < NUM_COLORS; color++) {
-    	float * clt_tile = ((float *) clt_tiles) +  tile_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
-    	float * gpu_tile = ((float *) gpu_clt) +  ((DBG_TILE_Y * TILESX + DBG_TILE_X) * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
-#ifdef DEBUG3
-    	if ((threadIdx.x) == 0){
-    		printf("\n\n\n================== gpu_tile = 0x%lx, clt_tile = 0x%lx, COLOR=%d, ncam = %d  ======================\n",gpu_tile,clt_tile,color,ncam);
-    	}
-#endif
-    	clt_tile += column + thr3; // first 2 rows
-    	gpu_tile += column;  // first 2 rows
-#pragma unroll
-    	for (int i = 0; i < DTT_SIZE2; i++){
-    		*clt_tile= *gpu_tile;
-    		clt_tile += (2 * DTT_SIZE1);
-    		gpu_tile += (2 * DTT_SIZE);
-    	}
-    	// reset mclt tile to zero
-    	float * mclt_tile = ((float*) mclt_tiles) +  tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column;
-#pragma unroll
-    	for (int i = 0; i < DTT_SIZE2; i++){
-    		*mclt_tile= 0.0f;
-    		mclt_tile += DTT_SIZE21;
-    	}
-    	__syncthreads();// __syncwarp();
-    	imclt(
-    			((float*) clt_tiles) +  tile_in_block * (4 * DTT_SIZE * DTT_SIZE1), // float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-				((float*) mclt_tiles) +  tile_in_block * (DTT_SIZE2 * DTT_SIZE21)); // float * mclt_tile )
-    	__syncthreads();// __syncwarp();
-    }
-}
-#endif // NOICLT1
-#ifdef BBBB
-//
-// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
-// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
-// first pass
-//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
-__device__ void imclt(
-		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-		float * mclt_tile ) //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
-{
-	int thr3 =    threadIdx.x >> 3;
-	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
-	int thr012 =  threadIdx.x & 7;
-	int column4 = threadIdx.x >> 2;
-//	int wcolumn =column ^ (7 * thr3); //0..7,7,..0
-//	int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
-	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
-	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
-#ifdef DEBUG3
-    if ((threadIdx.x) == 0){
-        printf("\nDTT Tiles before IDTT\n");
-    	debug_print_clt1(clt_tile, -1,  0xf); // only 1 quadrant for R,B and 2 - for G
-    }
-     __syncthreads();// __syncwarp();
-#endif
-	// perform horizontal dct-iv on quadrants 0 and 1
-    dctiv_nodiverg(
-    		clt_tile +  DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
-			1);
-	// perform horizontal dst-iv on quadrants 2 and 3
-    dstiv_nodiverg( // all colors
-    		clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
-			1);
-    __syncthreads();// __syncwarp();
-	// perform vertical   dct-iv on quadrants 0 and 2
-    dctiv_nodiverg(
-    		clt_tile +  thr012 + (DTT_SIZE1 *   DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
-			DTT_SIZE1);
-	// perform vertical   dst-iv on quadrants 1 and 3
-    dstiv_nodiverg(
-    		clt_tile2 + thr012 + (DTT_SIZE1 *   DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
-			DTT_SIZE1);
-    __syncthreads();// __syncwarp();
-#ifdef DEBUG3
-    if ((threadIdx.x) == 0){
-        printf("\nDTT Tiles after IDTT\n");
-    	debug_print_clt1(clt_tile, -1,  0xf); // only 1 quadrant for R,B and 2 - for G
-    }
-     __syncthreads();// __syncwarp();
-#endif
-    float hw = HWINDOW2[wcolumn];
-    int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
-    float * rslt = mclt_tile + column;
-#pragma unroll
-    for (int i = 0; i < 4; i++){
-    	float val = *rslt;
-    	float w = HWINDOW2[i] * hw;
-    	float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
-    	float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
-    	float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
-    	float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
-    	d2+=d3;
-    	d0+= d2;
-    	if (i < 3){
-    		clt_offset +=  DTT_SIZE1;
-    	}
-//    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    	val = __fmaf_rd(w,d0,val); // w*d0 + val
-    	*rslt = val;
-    	rslt += DTT_SIZE21;
-    }
-#pragma unroll
-    for (int i = 4; i < 8; i++){
-    	float val = *rslt;
-    	float w = HWINDOW2[i] * hw;
-    	float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
-    	float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
-    	float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
-    	float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
-    	d2+=d3;
-    	d0+= d2;
-//    	if (i < 7){
-   		clt_offset -=  DTT_SIZE1;
-//    	}
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    	rslt += DTT_SIZE21;
-    }
-#pragma unroll
-    for (int i = 7; i >= 4; i--){
-    	float val = *rslt;
-    	float w = HWINDOW2[i] * hw;
-    	float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
-    	float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
-    	float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
-    	float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
-    	d2+=d3;
-    	d0+= d2;
-    	if (i > 4){
-    		clt_offset -=  DTT_SIZE1;
-    	}
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    	rslt += DTT_SIZE21;
-    }
-#pragma unroll
-    for (int i = 3; i >= 0; i--){
-    	float val = *rslt;
-    	float w = HWINDOW2[i] * hw;
-    	float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
-    	float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
-    	float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
-    	float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
-    	d2+=d3;
-    	d0+= d2;
-    	if (i > 0){
-    		clt_offset +=  DTT_SIZE1;
-    	}
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    	rslt += DTT_SIZE21;
-    }
-#ifdef DEBUG3
-    __syncthreads();// __syncwarp();
-    if ((threadIdx.x) == 0){
-        printf("\nMCLT Tiles after IMCLT\n");
-    	debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
-    }
-    __syncthreads();// __syncwarp();
-#endif
-}
-#endif

--- a/src/dtt8x8.cu
+++ b/src/dtt8x8.cu
@@ -44,7 +44,9 @@
 * This file is cpompatible with both runtime and driver API, runtime is used for development
 * with Nvidia Nsight, driver API when calling these kernels from Java
 */
+#ifndef JCUDA
 #include "dtt8x8.h"
+#endif
 //#define CUDART_INF_F            __int_as_float(0x7f800000)
 /*
@@ -124,7 +126,7 @@ __constant__ float HWINDOW2[] =  {0.049009f, 0.145142f, 0.235698f, 0.317197f,
 *
 * \return None
 */
+#ifdef BBBB
 extern "C"
 __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
 {
@@ -163,6 +165,7 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
    for (unsigned int i = 0; i < DTT_SIZE; i++)
        dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
 }
+#endif //#ifdef BBBB

--- a/src/dtt8x8.h
+++ b/src/dtt8x8.h
@@ -66,9 +66,9 @@
 #define DTTTEST_BLOCK_HEIGHT         16
 #define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1)
-extern __constant__ float idct_signs[4][4][4];
+//extern __constant__ float idct_signs[4][4][4];
-extern __constant__ int imclt_indx9[16];
+//extern __constant__ int imclt_indx9[16];
-extern __constant__ float HWINDOW2[];
+//extern __constant__ float HWINDOW2[];
 inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
 inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
@@ -84,7 +84,9 @@ inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point t
 inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
 // kernels (not used so far)
+#ifdef BBBB
 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
+#endif// #ifdef BBBB
 //=========================== 2D functions ===============
 extern __device__ void corrUnfoldTile(