Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
06b62a05
Commit
06b62a05
authored
Apr 07, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
cleanup
parent
2bed5def
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
10 additions
and
225 deletions
+10
-225
TileProcessor.cuh
src/TileProcessor.cuh
+1
-221
dtt8x8.cu
src/dtt8x8.cu
+4
-1
dtt8x8.h
src/dtt8x8.h
+5
-3
No files found.
src/TileProcessor.cuh
View file @
06b62a05
...
...
@@ -40,8 +40,8 @@
#pragma once
#ifndef JCUDA
#include "tp_defines.h"
#endif // #ifndef JCUDA
#include "dtt8x8.h"
#endif // #ifndef JCUDA
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
...
...
@@ -330,29 +330,6 @@ printAlphaFade(8)
"""
__constant__ float idct_signs[4][4][4] ={
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{-1, 1, 1, 1},
{-1, 1, 1, 1},
{-1, 1, 1, 1}
},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{-1,-1,-1, 1},
{-1,-1,-1, 1},
{-1,-1,-1, 1}
},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{-1, 1, 1, 1}
},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{-1,-1,-1, 1}
}};
*/
...
...
@@ -3259,203 +3236,6 @@ __device__ void convertCorrectTile(
//#endif
}
#ifdef NOICLT1
extern "C"
__global__ void test_imclt(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int ncam) // just for debug print
// Initially - no output, will add later
{
// dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int tile_num = blockIdx.x * IMCLT_TILES_PER_BLOCK + tile_in_block;
if (tile_num >= 1) return; // just testing with a single tile
int thr3 = threadIdx.x >> 3;
int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// Read clt tile to
__shared__ float clt_tiles [IMCLT_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mclt_tiles [IMCLT_TILES_PER_BLOCK][DTT_SIZE2][DTT_SIZE21];
// Read clt tile from device memory
for (int color = 0; color < NUM_COLORS; color++) {
float * clt_tile = ((float *) clt_tiles) + tile_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * gpu_tile = ((float *) gpu_clt) + ((DBG_TILE_Y * TILESX + DBG_TILE_X) * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
#ifdef DEBUG3
if ((threadIdx.x) == 0){
printf("\n\n\n================== gpu_tile = 0x%lx, clt_tile = 0x%lx, COLOR=%d, ncam = %d ======================\n",gpu_tile,clt_tile,color,ncam);
}
#endif
clt_tile += column + thr3; // first 2 rows
gpu_tile += column; // first 2 rows
#pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){
*clt_tile= *gpu_tile;
clt_tile += (2 * DTT_SIZE1);
gpu_tile += (2 * DTT_SIZE);
}
// reset mclt tile to zero
float * mclt_tile = ((float*) mclt_tiles) + tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column;
#pragma unroll
for (int i = 0; i < DTT_SIZE2; i++){
*mclt_tile= 0.0f;
mclt_tile += DTT_SIZE21;
}
__syncthreads();// __syncwarp();
imclt(
((float*) clt_tiles) + tile_in_block * (4 * DTT_SIZE * DTT_SIZE1), // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
((float*) mclt_tiles) + tile_in_block * (DTT_SIZE2 * DTT_SIZE21)); // float * mclt_tile )
__syncthreads();// __syncwarp();
}
}
#endif // NOICLT1
#ifdef BBBB
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int thr3 = threadIdx.x >> 3;
int column = threadIdx.x; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7;
int column4 = threadIdx.x >> 2;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float * clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG3
if ((threadIdx.x) == 0){
printf("\nDTT Tiles before IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
__syncthreads();// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
DTT_SIZE1);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg(
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
__syncthreads();// __syncwarp();
#ifdef DEBUG3
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
#endif
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float * rslt = mclt_tile + column;
#pragma unroll
for (int i = 0; i < 4; i++){
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i < 3){
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = val;
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++){
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--){
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 4){
clt_offset -= DTT_SIZE1;
}
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--){
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0+=d1;
d2+=d3;
d0+= d2;
if (i > 0){
clt_offset += DTT_SIZE1;
}
*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
rslt += DTT_SIZE21;
}
#ifdef DEBUG3
__syncthreads();// __syncwarp();
if ((threadIdx.x) == 0){
printf("\nMCLT Tiles after IMCLT\n");
debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads();// __syncwarp();
#endif
}
#endif
...
...
src/dtt8x8.cu
View file @
06b62a05
...
...
@@ -44,7 +44,9 @@
* This file is cpompatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#include "dtt8x8.h"
#endif
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
...
...
@@ -124,7 +126,7 @@ __constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
*
* \return None
*/
#ifdef BBBB
extern "C"
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
{
...
...
@@ -163,6 +165,7 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
#endif //#ifdef BBBB
...
...
src/dtt8x8.h
View file @
06b62a05
...
...
@@ -66,9 +66,9 @@
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
extern
__constant__
float
idct_signs
[
4
][
4
][
4
];
extern
__constant__
int
imclt_indx9
[
16
];
extern
__constant__
float
HWINDOW2
[];
//
extern __constant__ float idct_signs[4][4][4];
//
extern __constant__ int imclt_indx9[16];
//
extern __constant__ float HWINDOW2[];
inline
__device__
void
dttii_shared_mem_nonortho
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// does not scale by y[0] (y[7]) by 1/sqrt[0]
inline
__device__
void
dttii_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// used in GPU_DTT24_DRV
...
...
@@ -84,7 +84,9 @@ inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point t
inline
__device__
void
_dctiv_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
// kernels (not used so far)
#ifdef BBBB
extern
"C"
__global__
void
GPU_DTT24_DRV
(
float
*
dst
,
float
*
src
,
int
src_stride
,
int
dtt_mode
);
#endif// #ifdef BBBB
//=========================== 2D functions ===============
extern
__device__
void
corrUnfoldTile
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment