Elphel / imagej-elphel / Commits / 4c45ca5b

Commit 4c45ca5b authored Sep 29, 2018 by Andrey Filippov

initial Tile Processor code

Parent: 5ca93ffd
Changes: 1 changed file with 857 additions and 0 deletions (+857 -0)

src/main/resources/TileProcessor.cuh    0 -> 100644    +857 -0
/**
**
** TileProcessor.cuh
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** TileProcessor.cuh is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file TileProcessor.cuh
* \brief Top level of the Tile Processor for frequency domain
*/
#pragma once
#include "dtt8x8.cuh"
// Using 1 tile per block with 32 threads per tile.
// some subtasks use 8 threads per 2d DTT
//#define TILES_PER_BLOCK 1
#define THREADS_PER_TILE 32
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
//#define KERNELS_STEP 16
#define KERNELS_LSTEP 4
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define IMAGE_TILE_SIDE 18
//#define KERNEL_OFFSETS 8
#define KERNELS_STEP  (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
// increase row length by 1 so vertical passes will use different ports
#define THREADSX (DTT_SIZE)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DBG_TILE (174*324 +118)
#define DEBUG1 1
//56494
// struct tp_task
//#define TASK_SIZE 12
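// A quick sanity check of the derived constants, assuming DTT_SIZE == 8 (from dtt8x8.cuh):
//   TILESX = IMG_WIDTH  / DTT_SIZE = 2592 / 8 = 324
//   TILESY = IMG_HEIGHT / DTT_SIZE = 1936 / 8 = 242
//   DBG_TILE = 174*324 + 118 = 56494 (tile row 174, column 118) - the 56494 noted above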
struct tp_task {
    long  task;
    short ty;
    short tx;
    float xy [NUM_CAMS][2];
};
struct CltExtra {
    float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
    float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
    float center_x; // actual center X (use to find derivatives)
    float center_y; // actual center Y (use to find derivatives)
    float dxc_dx;   // add this to data_x per each pixel X-shift relative to the kernel center location
    float dxc_dy;   // same per each Y-shift pixel
    float dyc_dx;
    float dyc_dy;
};
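// How these fields are combined (a sketch of their use in convertCorrectTile() below, where
// kdx, kdy are the pixel offsets of the requested center from the selected kernel grid node):
//   px = centerX - DTT_SIZE - (data_x + dxc_dx * kdx + dxc_dy * kdy); // fractional left corner
//   py = centerY - DTT_SIZE - (data_y + dyc_dx * kdx + dyc_dy * kdy); // fractional top corner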
/*
 Python code to generate constant coefficients:

import math

def setup_hwindow(n=8, l=4):
    hwindow = [math.sin(math.pi*((1.0+2*i)/(4*n))) for i in range(2*n)]
    print("__constant__ float HWINDOW[] = {", end="") #
    for i in range (n):
        print("%ff"%(hwindow[i]), end ="")
        if i == (n-1):
            print("};")
        elif ((i + 1) % l) == 0:
            print(",")
            print(" ", end ="")
        else:
            print(", ",end="")

def get_fold_rindices(n=8):
    n1 = n>>1;
    rind = [0] * (2 * n) # reverse indices
    rcs =  [0] * (2 * n) # reverse signs for cosine term
    rss =  [0] * (2 * n) # reverse signs for sine term
    for x in range (n1):
        ri0 = n + n1 - x - 1
        ri1 = n + n1 + x
        ri2 = x
        ri3 = n - x - 1
        rind[ri0] = x
        rind[ri1] = x
        rind[ri2] = x + n1
        rind[ri3] = x + n1
        rcs[ri0] = -1
        rss[ri0] =  1
        rcs[ri1] = -1
        rss[ri1] = -1
        rcs[ri2] =  1
        rss[ri2] =  1
        rcs[ri3] = -1
        rss[ri3] =  1
    rind0 = []
    rind1 = []
    # generate start indices for the first 2 bayer rows
    for a in rind:
        rind0.append(a+rind[0]*n)
        rind1.append(a+rind[1]*n)
    # column increments for odd/even bayer rows
    inc_even = []
    inc_odd = []
    for i in range (n-1):
        inc_even.append(rind[2*i+2]-rind[2*i])
        inc_odd.append (rind[2*i+3]-rind[2*i+1])
    inc_even.reverse()
    inc_odd.reverse()
    # combine increments into int data
    inc_e = 0
    inc_o = 0
    for d in inc_even:
        inc_e = ((inc_e) << 4) | (d & 0xf)
    for d in inc_odd:
        inc_o = ((inc_o) << 4) | (d & 0xf)
    print("__constant__ int fold_indx2[2][%d] = {{"%(2*n), end="") #
    for d in rind0[:-1]:
        print('0x%2x,'%(d), end="")
    print('0x%2x},'%(rind0[-1]))
    print(" {", end="") #
    for d in rind1[:-1]:
        print('0x%2x,'%(d), end="")
    print('0x%2x}};'%(rind1[-1]))
    print("__constant__ int fold_inc[]= {0x%08x, 0x%08x};"%(inc_e, inc_o))
*/
//#define DTTTEST_BLOCK_WIDTH 32
//#define DTTTEST_BLOCK_HEIGHT 16
//#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//#define DTT_SIZE 8
/*
int OffsThreadInRow = threadIdx.y * DTT_SIZE + threadIdx.x;
int OffsThreadInCol = threadIdx.z * DTT_SIZE;
src += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
dst += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
float *bl_ptr = block + OffsThreadInCol * DTTTEST_BLK_STRIDE + OffsThreadInRow;
*
// GPU memory pointers
float * gpu_kernels [NUM_CAMS];
float * gpu_kernel_offsets [NUM_CAMS];
float * gpu_images [NUM_CAMS];
float * gpu_tasks;
size_t dstride;
*/
__constant__ float HWINDOW[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f,
                                0.773010f, 0.881921f, 0.956940f, 0.995185f};
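// The table above is the output of setup_hwindow(): a half-sine analysis window,
//   HWINDOW[i] = sin(pi * (2*i + 1) / (4 * DTT_SIZE)), i = 0..7,
// e.g. HWINDOW[0] = sin(pi/32) = 0.098017f and HWINDOW[7] = sin(15*pi/32) = 0.995185f.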
// Offsets in 8x8 DCT_CC/DST_SC tile for the first 2 lines of the 16x16 bayer image
__constant__ int fold_indx2[2][16] = {
        {0x24, 0x25, 0x26, 0x27, 0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x20, 0x21, 0x22, 0x23},
        {0x2c, 0x2d, 0x2e, 0x2f, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28, 0x28, 0x29, 0x2a, 0x2b}};
// Increments of the offsets in the 8x8 tile when going down, jumping two lines (same Bayer). Each 4 bits have to be <<3,
// added to the current index, and the result should be AND-ed with 0x3f. inc_e is for even rows (0,2,...), while inc_o is for odd ones (1,3,...).
__constant__ int fold_inc[] = {0x02feee12, 0x021eeef2};
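// Worked example of the stepping used in the fold loops below (a sketch following the update
//   dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f; dtt_offset_inc >>= 4;
// for an even row starting at column 0):
//   dtt_offset = fold_indx2[0][0] = 0x24, dtt_offset_inc = fold_inc[0] = 0x02feee12
//   step 1: nibble 0x2 -> (0x24 + (0x2 << 3)) & 0x3f = 0x34
//   step 2: nibble 0x1 -> (0x34 + (0x1 << 3)) & 0x3f = 0x3c
//   step 3: nibble 0xe -> (0x3c + (0xe << 3)) & 0x3f = 0x2c
// Before each store, an 8x8 offset o is converted to the padded 9-float rows as o + (o >> 3),
// e.g. 0x24 (row 4, column 4) -> 36 + 4 = 40 = 4 * DTT_SIZE1 + 4.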
// index table for convolutions
__constant__ int zi[4][4] = {{ 0, -1, -2,  3},
                             { 1,  0, -3, -2},
                             { 2, -3,  0, -1},
                             { 3,  2,  1,  0}};
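// Row q of zi[][] lists, for output quadrant q, which input quadrant is multiplied by kernel
// quadrant k; a negative entry negates the product. Expanded (as used in convolveTiles() below),
// this is the sign pattern of a complex-style multiplication of the 4 DTT quadrants:
//   r0 = d0*k0 - d1*k1 - d2*k2 + d3*k3
//   r1 = d1*k0 + d0*k1 - d3*k2 - d2*k3
//   r2 = d2*k0 - d3*k1 + d0*k2 - d1*k3
//   r3 = d3*k0 + d2*k1 + d1*k2 + d0*k3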
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
//#define BAYER_BLUE_ROW (1 - BAYER_RED_ROW)
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
__device__ void convertCorrectTile(
        struct CltExtra * gpu_kernel_offsets,
        float           * gpu_kernels,
        float           * gpu_images,
//      struct tp_task  * tt,
        float             centerX,
        float             centerY,
        size_t            dstride, // in floats (pixels)
        float clt_tile    [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float clt_kernels [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
//      float bayer_tiles [IMAGE_TILE_SIDE][IMAGE_TILE_SIDE],
        int   int_topleft    [NUM_COLORS][2],
        float residual_shift [NUM_COLORS][2]);
// Fractional pixel shift (phase rotation), horizontal. In-place.
__device__ void shiftTileHor(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float residual_shift);
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__ void shiftTileVert(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float residual_shift);
__device__ void convolveTiles(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1],  // 4 quadrants of the clt data, rows extended to optimize shared ports
        float kernel   [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
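// Note on the DTT_SIZE1 (9-float) row stride used in all the shared tiles above: with plain
// 8-float rows, the 8 elements of a tile column land in only 4 of the 32 shared-memory banks
// (two-way conflicts on every vertical pass); padding each row by one float spreads a column
// over 8 different banks. This is what the "+1 to alternate column ports" comments refer to
// (assuming the usual 32-bank, 4-byte-word shared memory layout).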
extern "C" __global__ void tileProcessor(
        struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS],
        float           ** gpu_kernels,        // [NUM_CAMS],
        float           ** gpu_images,         // [NUM_CAMS],
        struct tp_task   * gpu_tasks,
        size_t             dstride,            // in floats (pixels)
        int                num_tiles)          // number of tiles in task
{
//  struct CltExtra * dbg_ce_h0= &gpu_kernel_offsets[0][14328];
//  struct CltExtra * dbg_ce_h1= &gpu_kernel_offsets[0][14328 + (164*123)];
//  struct CltExtra * dbg_ce_h2= &gpu_kernel_offsets[0][14328 + 2* (164*123)];
    int task_num = blockIdx.x; // * TILES_PER_BLOCK + threadIdx.y;
    if (task_num >= num_tiles) return; // nothing to do
    struct tp_task * gpu_task = &gpu_tasks[task_num];
    if (!gpu_task->task) return; // NOP tile
    __shared__ struct tp_task tt; // [TILES_PER_BLOCK];
    // Copy task data to shared memory
    int nc = (threadIdx.x >> 1) + (threadIdx.y << 2) - 1;
    if (nc < 0) {
        tt.task = gpu_task->task;
        tt.tx   = gpu_task->tx;
        tt.ty   = gpu_task->ty;
    } else {
        if (nc < NUM_CAMS) {
            tt.xy[nc][0] = gpu_task->xy[nc][0];
            tt.xy[nc][1] = gpu_task->xy[nc][1];
        }
    }
    if (NUM_CAMS > 31){ // unlikely
        nc += 32;
        while (nc < NUM_CAMS){
            tt.xy[nc][0] = gpu_task->xy[nc][0];
            tt.xy[nc][1] = gpu_task->xy[nc][1];
            nc += 32;
        }
    }
    __syncthreads();
    // Set memory for the CLT result (per tile, per camera, per color, per clt, per row, per column).
    // clt_tile[][0] - before rotation, [][0][0] - R:DCT/DCT, [][0][1] - B:DCT/DCT, [][0][2] - G:DCT/DCT, [][0][3] - G:DST/DCT,
    // clt_tile[][1], clt_tile[][2], and clt_tile[][3] - after rotation, 4 quadrants each
    // changed, above is wrong now
    __shared__ float clt_tile   [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // sharing for cameras as they are corrected one after another
    __shared__ float clt_kernels          [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // +1 to alternate column ports
    __shared__ int   int_topleft          [NUM_COLORS][2];
    __shared__ float residual_shift       [NUM_COLORS][2];
//  __shared__ float window_hor_cos  [NUM_COLORS][2*DTT_SIZE];
//  __shared__ float window_hor_sin  [NUM_COLORS][2*DTT_SIZE];
//  __shared__ float window_vert_cos [NUM_COLORS][2*DTT_SIZE];
    // process each camera in series
    for (int ncam = 0; ncam < NUM_CAMS; ncam++){
        convertCorrectTile(
                gpu_kernel_offsets[ncam], // float * gpu_kernel_offsets,
                gpu_kernels[ncam],        // float * gpu_kernels,
                gpu_images[ncam],         // float * gpu_images,
//              &tt[threadIdx.y],         // struct tp_task * tt,
                tt.xy[ncam][0],           // float centerX,
                tt.xy[ncam][1],           // float centerY,
                dstride,                  // size_t dstride, // in floats (pixels)
                clt_tile[ncam],           // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
                clt_kernels,              // float clt_tile [NUM_COLORS][4][DTT_SIZE][DTT_SIZE],
                int_topleft,              // int int_topleft [NUM_COLORS][2],
                residual_shift);          // float frac_topleft [NUM_COLORS][2],
    }
}
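/*
 A minimal host-side launch sketch (not part of this file; gpu_* and num_tiles stand for the
 host-side copies of the kernel arguments, assuming one tile per block and 8 x 4 = 32 threads):

    dim3 threads(DTT_SIZE, THREADS_PER_TILE / DTT_SIZE, 1); // 8 x 4 = 32 threads per tile
    dim3 grid(num_tiles, 1, 1);                             // task_num = blockIdx.x
    tileProcessor<<<grid, threads>>>(
            gpu_kernel_offsets, // struct CltExtra ** [NUM_CAMS]
            gpu_kernels,        // float ** [NUM_CAMS]
            gpu_images,         // float ** [NUM_CAMS]
            gpu_tasks,          // struct tp_task *
            dstride,            // image row stride in floats
            num_tiles);         // number of tiles in task
*/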
// Fractional pixel shift (phase rotation), horizontal. In-place. Uses 8 threads (.x)
__device__ void shiftTileHor(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float residual_shift)
{
    int j = threadIdx.x;
    float x = residual_shift * ((j << 1) + 1) * (0.5f / DTT_SIZE);
    float ch = cospif(x);
    float sh = sinpif(x);
#pragma unroll
    for (int i = 0; i < DTT_SIZE; i++) {
        float t = clt_tile[0][i][j] * ch - clt_tile[1][i][j] * sh;
        clt_tile[1][i][j] = clt_tile[0][i][j] * sh + clt_tile[1][i][j] * ch;
        clt_tile[0][i][j] = t;
        t =       clt_tile[2][i][j] * ch - clt_tile[3][i][j] * sh;
        clt_tile[3][i][j] = clt_tile[2][i][j] * sh + clt_tile[3][i][j] * ch;
        clt_tile[2][i][j] = t;
    }
}
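// The loop above rotates the (CC, SC) and (CS, SS) quadrant pairs of column j by the angle
// phi_j = pi * residual_shift * (2*j + 1) / (2 * DTT_SIZE), i.e. the 2x2 rotation
//   new0 = old0 * cos(phi_j) - old1 * sin(phi_j)
//   new1 = old0 * sin(phi_j) + old1 * cos(phi_j)
// which performs the fractional-pixel shift as a phase rotation in the transform domain.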
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__ void shiftTileVert(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float residual_shift)
{
    int j = threadIdx.x;
    float x = residual_shift * ((j << 1) + 1) * (0.5f / DTT_SIZE);
    float ch = cospif(x);
    float sh = sinpif(x);
#pragma unroll
    for (int i = 0; i < DTT_SIZE; i++) {
        float t = clt_tile[0][j][i] * ch - clt_tile[1][j][i] * sh;
        clt_tile[1][j][i] = clt_tile[0][j][i] * sh + clt_tile[1][j][i] * ch;
        clt_tile[0][j][i] = t;
        t =       clt_tile[2][j][i] * ch - clt_tile[3][j][i] * sh;
        clt_tile[3][j][i] = clt_tile[2][j][i] * sh + clt_tile[3][j][i] * ch;
        clt_tile[2][j][i] = t;
    }
}
// Convolve the CLT data with the CLT kernel: per-element multiplication of the 4 DTT quadrants
// with the sign/index pattern from the zi[][] table. In-place.
__device__ void convolveTiles(
        float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
        float kernel   [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
{
    int j = threadIdx.x;
    for (int i = 0; i < DTT_SIZE; i++){
        float r0 = 0;
        float r1 = 0;
        float r2 = 0;
        float r3 = 0;
        for (int k = 0; k < 4; k++){
            // accumulate each output quadrant separately; a negative zi entry negates the product
            if (zi[0][k] < 0) r0 -= clt_tile[-zi[0][k]][j][i] * kernel[k][j][i];
            else              r0 += clt_tile[ zi[0][k]][j][i] * kernel[k][j][i];
            if (zi[1][k] < 0) r1 -= clt_tile[-zi[1][k]][j][i] * kernel[k][j][i];
            else              r1 += clt_tile[ zi[1][k]][j][i] * kernel[k][j][i];
            if (zi[2][k] < 0) r2 -= clt_tile[-zi[2][k]][j][i] * kernel[k][j][i];
            else              r2 += clt_tile[ zi[2][k]][j][i] * kernel[k][j][i];
            if (zi[3][k] < 0) r3 -= clt_tile[-zi[3][k]][j][i] * kernel[k][j][i];
            else              r3 += clt_tile[ zi[3][k]][j][i] * kernel[k][j][i];
        }
        clt_tile[0][j][i] = r0;
        clt_tile[1][j][i] = r1;
        clt_tile[2][j][i] = r2;
        clt_tile[3][j][i] = r3;
    }
}
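// Usage (see convertCorrectTile() below): each color row of the thread block calls
// convolveTiles(clt_tile[threadIdx.y], clt_kernels[threadIdx.y]), so the 8 threads
// (j = threadIdx.x) each process one row of all 4 quadrants, iterating over the 8 columns.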
__device__ void debug_print_clt(
        float clt_tile [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        int mask)
{
    for (int dbg_color = 0; dbg_color < NUM_COLORS; dbg_color++){
        printf("----------- Color = %d -----------\n", dbg_color);
        for (int dbg_quadrant = 0; dbg_quadrant < 4; dbg_quadrant++){
            printf("----------- Quadrant (c(h)-c(v), s-c, c-s, s-s) = %d -----------\n", dbg_quadrant);
            if ((mask >> (dbg_color * 4 + dbg_quadrant)) & 1) {
                for (int dbg_row = 0; dbg_row < DTT_SIZE; dbg_row++){
                    for (int dbg_col = 0; dbg_col < DTT_SIZE; dbg_col++){
                        printf("%10.5f ", clt_tile[dbg_color][dbg_quadrant][dbg_row][dbg_col]);
                    }
                    printf("\n");
                }
            }
            printf("\n");
        }
    }
}
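// Mask bit (color * 4 + quadrant) enables printing of that quadrant. For example, the 0x311
// used after the fold stage below prints quadrant 0 of red (bit 0), quadrant 0 of blue (bit 4)
// and quadrants 0 and 1 of green (bits 8 and 9) - exactly the buffers filled at that point.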
// Uses 32 threads
__device__ void convertCorrectTile(
        struct CltExtra * gpu_kernel_offsets,
        float           * gpu_kernels,
        float           * gpu_images,
//      struct tp_task  * tt,
        float             centerX,
        float             centerY,
        size_t            dstride, // in floats (pixels)
        // clt_tile[0] - before rotation, [0][0] - R:DCT/DCT, [0][1] - B:DCT/DCT, [0][2] - G:DCT/DCT, [0][3] - G:DST/DCT,
        // clt_tile[1], clt_tile[2], and clt_tile[3] - after rotation, 4 quadrants each
        // changed, above is wrong now
        float clt_tile    [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        float clt_kernels [NUM_COLORS][4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
        int   int_topleft    [NUM_COLORS][2],
        float residual_shift [NUM_COLORS][2])
{
//  struct CltExtra * dbg_ce0= &gpu_kernel_offsets[14328];
//  struct CltExtra * dbg_ce1= &gpu_kernel_offsets[14328 + (164*123)];
//  struct CltExtra * dbg_ce2= &gpu_kernel_offsets[14328 + 2* (164*123)];
    __shared__ float window_hor_cos  [NUM_COLORS][2*DTT_SIZE];
    __shared__ float window_hor_sin  [NUM_COLORS][2*DTT_SIZE];
    __shared__ float window_vert_cos [NUM_COLORS][2*DTT_SIZE];
//  __shared__ float rot_hvcs [NUM_COLORS][4][DTT_SIZE1]; // rotation cosine/sines: CH,SH,CV,SV for each color
    // get correct kernel tile, then use 2 threads per kernel and image
    int ktileX, ktileY;
    int kernel_index; // common for all colors
    float kdx, kdy;
    switch (threadIdx.x){
    case 0:
        ktileX = min(KERNELS_HOR-1, max(0, (((int) lrintf(centerX)) + (1 << (KERNELS_LSTEP-1)) >> KERNELS_LSTEP) + 1));
//      kdx = centerX - (ktileX -1 +0.5) * KERNELS_STEP; // difference in pixel
        kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP-1)); // difference in pixel
        break;
    case 1:
        ktileY = min(KERNELS_VERT-1, max(0, (((int) lrintf(centerY)) + (1 << (KERNELS_LSTEP-1)) >> KERNELS_LSTEP) + 1)); // clamp Y to the kernel grid height
        kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP-1)); // difference in pixel
        break;
    }
    __syncthreads();
    // thread0 gets ktileY from thread 1
    ktileY = __shfl_sync(
            0x00000001,        // unsigned mask,
            ktileY,            // T var,
            1,                 // int srcLane,
            THREADS_PER_TILE); // int width=warpSize);
    switch (threadIdx.x){
    case 0:
        kernel_index = ktileX + ktileY * KERNELS_HOR;
        break;
    }
    __syncthreads();
    // broadcast kernel_index
    kernel_index = __shfl_sync(
            0xffffffff,        // unsigned mask,
            kernel_index,      // T var,
            0,                 // int srcLane,
            THREADS_PER_TILE); // int width=warpSize);
    __syncthreads(); // is it needed?
    kdx = __shfl_sync(
            0xffffffff,        // unsigned mask,
            kdx,               // T var,
            0,                 // int srcLane,
            THREADS_PER_TILE); // int width=warpSize);
    __syncthreads(); // is it needed?
    kdy = __shfl_sync(
            0xffffffff,        // unsigned mask,
            kdy,               // T var,
            1,                 // int srcLane,
            THREADS_PER_TILE); // int width=warpSize);
    __syncthreads(); // is it needed?
    int color = threadIdx.y;
    float px, py;
//  int dbg_y = threadIdx.y;
//  int dbg_x = threadIdx.x;
    if (color < 3){ // 3*8 threads cooperating on this
//      kernel_index += color * (KERNELS_HOR * KERNELS_VERT);
//      float * kernel_src = &gpu_kernels[ kernel_index * (DTT_SIZE * DTT_SIZE * 4)];
        float * kernel_src = &gpu_kernels[(kernel_index + color * (KERNELS_HOR * KERNELS_VERT)) * (DTT_SIZE * DTT_SIZE * 4)];
        float * kernelp = (float *) clt_kernels[color];
        kernel_src += threadIdx.x; // lsb;
        kernelp    += threadIdx.x; // lsb;
        for (int j = 0; j < DTT_SIZE; j++){ // all 4 components, 8 rows: 4 row-copies per iteration
            // shared memory kernels use DTT_SIZE1 (same as image data)
            *kernelp = *kernel_src;
            kernelp += DTT_SIZE1;
            kernel_src += THREADSX;
            *kernelp = *kernel_src;
            kernelp += DTT_SIZE1;
            kernel_src += THREADSX;
            *kernelp = *kernel_src;
            kernelp += DTT_SIZE1;
            kernel_src += THREADSX;
            *kernelp = *kernel_src;
            kernelp += DTT_SIZE1;
            kernel_src += THREADSX;
        }
    }
    else { // if (color < 3){ : calculate offsets and copy bayer image (with individual shifts)
        // calculate offsets and prepare windows
        int bayer_color = min((NUM_COLORS-1), threadIdx.x >> 1);
        int bayer_g2 = threadIdx.x >= (NUM_COLORS << 1); // second pass of green
        int lsb = threadIdx.x & 1;
        int kernel_full_index = kernel_index + bayer_color * (KERNELS_HOR * KERNELS_VERT);
//      struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_index + bayer_color*(KERNELS_HOR * KERNELS_VERT)];
        struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
        // both threads will calculate the same x,y components - don't know how to sync just them, not the other, kernel-copying threads
        if (bayer_g2){ // threads 30,31
            if (lsb){
                px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy); // fractional left corner Warp Illegal Address
                int itlx = (int) floorf(px + 0.5f);
                int_topleft[bayer_color][0] = itlx;
                float shift_hor = px - itlx;
                residual_shift[bayer_color][0] = shift_hor;
                float x = shift_hor * (1.0f / 16);
                float ahc = cospif(x);
                float ahs = sinpif(x);
                int i1 = DTT_SIZE;
                int i = 0;
                // embed sign for cosine and sine branches into window coefficients
                for (; i < (DTT_SIZE / 2); i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_hor_sin[bayer_color][i]  = HWINDOW[i ] * ahc + HWINDOW[ri] * ahs; // bayer_color== 2
                    window_hor_sin[bayer_color][i1] = HWINDOW[ri] * ahc - HWINDOW[i ] * ahs;
                    i1++;
                }
                // embed sign for cosine and sine branches into window coefficients
                for (; i < DTT_SIZE; i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_hor_sin[bayer_color][i]  = HWINDOW[i ] * ahc + HWINDOW[ri] * ahs;
                    window_hor_sin[bayer_color][i1] = HWINDOW[i ] * ahs - HWINDOW[ri] * ahc;
                    i1++;
                }
            }
        }
        else { // if (bayer_g2){ : threads 24..29
            if (lsb){
                px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy); // fractional left corner
                int itlx = (int) floorf(px + 0.5f);
                int_topleft[bayer_color][0] = itlx;
                float shift_hor = px - itlx;
                residual_shift[bayer_color][0] = shift_hor;
                float x = shift_hor * (1.0f / 16);
                float ahc = cospif(x);
                float ahs = sinpif(x);
                int i1 = DTT_SIZE;
                int i = 0;
                // embed sign for cosine and sine branches into window coefficients
                for (; i < (DTT_SIZE / 2); i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_hor_cos[bayer_color][i]  =  HWINDOW[i ] * ahc + HWINDOW[ri] * ahs;
                    window_hor_cos[bayer_color][i1] =  HWINDOW[i ] * ahs - HWINDOW[ri] * ahc;
                    i1++;
                }
                // embed sign for cosine and sine branches into window coefficients
                for (; i < DTT_SIZE; i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_hor_cos[bayer_color][i]  = -HWINDOW[i ] * ahc - HWINDOW[ri] * ahs;
                    window_hor_cos[bayer_color][i1] =  HWINDOW[i ] * ahs - HWINDOW[ri] * ahc;
                    i1++;
                }
            }
            else { // if (lsb){
                py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy); // fractional top corner
                int itly = (int) floorf(py + 0.5f);
                int_topleft[bayer_color][1] = itly;
                float shift_vert = py - itly;
                residual_shift[bayer_color][1] = shift_vert;
                float x = shift_vert * (1.0f / 16);
                float avc = cospif(x);
                float avs = sinpif(x);
                int i1 = DTT_SIZE;
                // embed sign for cosine branch only into window coefficients (for R,B only CC is needed, for G - CC and SC)
                int i = 0;
                for (; i < DTT_SIZE / 2; i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_vert_cos[bayer_color][i]    =  HWINDOW[i ] * avc + HWINDOW[ri] * avs;
                    window_vert_cos[bayer_color][i1++] =  HWINDOW[i ] * avs - HWINDOW[ri] * avc;
                }
                for (; i < DTT_SIZE; i++){
                    int ri = (DTT_SIZE - 1) - i;
                    window_vert_cos[bayer_color][i]    = -(HWINDOW[i ] * avc + HWINDOW[ri] * avs);
                    window_vert_cos[bayer_color][i1++] =  HWINDOW[i ] * avs - HWINDOW[ri] * avc;
                }
            }
        }
    } // if (color < 3) else
    __syncthreads();
    // threads 0..23 loaded 3 color kernels, threads 24..27 prepared hor and vert windows for R and B, threads 28..31 - for G
    // prepare, fold and write data to DTT buffers
    int dstride2 = dstride << 1; // in floats (pixels)
    int bayer_color = min((NUM_COLORS-1), threadIdx.y);
    if (bayer_color < BAYER_GREEN){ // process R and B (2 * 8 threads), threads 0..15
        // Find correct column and start row for each of the 8 participating threads
        int col_tl = int_topleft[bayer_color][0]; // + (threadIdx.x << 1);
        int row_tl = int_topleft[bayer_color][1];
        int local_col = ((col_tl & 1) ^ BAYER_RED_COL ^ bayer_color) + (threadIdx.x << 1);
        int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ bayer_color);
        float * image_p = gpu_images + dstride * (row_tl + local_row) + col_tl + local_col;
        float hwind_cos = window_hor_cos[bayer_color][local_col];
        int dtt_offset =     fold_indx2[local_row][local_col];
        int dtt_offset_inc = fold_inc[local_row];
        float * dtt_buf = (float *) clt_tile[bayer_color][0];
#pragma unroll
        for (int i = 0; i < 8; i++) {
//          dtt_buf[dtt_offset] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row];
            int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
            float dbg_pix = (*image_p);
            dtt_buf[dtt_offset1] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row];
            dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
            dtt_offset_inc >>= 4;
            local_row += 2;
            image_p += dstride2;
        }
    }
    else { // process green color, threads 16..31
        // no need to sync here
        // process green color - temporarily use two buffers instead of one, then - reduce
        int ipass = threadIdx.y & 1;
        // Find correct column and start row for each of the 8 participating threads
        int col_tl = int_topleft[BAYER_GREEN][0]; // + (threadIdx.x << 1);
        int row_tl = int_topleft[BAYER_GREEN][1];
        int local_col = ((col_tl & 1) ^ (BAYER_RED_COL ^ 1) ^ ipass) + (threadIdx.x << 1); // green red row: invert column from red
        int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ ipass); // use red row
        int dbg_image_offset = dstride * (row_tl + local_row) + col_tl + local_col;
        float * image_p = gpu_images + dstride * (row_tl + local_row) + col_tl + local_col;
        float dbg_pix1 = gpu_images[dbg_image_offset];
        float * dbg_pix2_p = gpu_images + dbg_image_offset;
        float dbg_pix2 = *dbg_pix2_p;
        float hwind_cos = window_hor_cos[BAYER_GREEN][local_col];
        float hwind_sin = window_hor_sin[BAYER_GREEN][local_col];
        int dtt_offset =     fold_indx2[local_row][local_col];
        int dtt_offset_inc = fold_inc[local_row];
        float * dct_buf = (float *) clt_tile[BAYER_GREEN][ ipass << 1     ]; // use 2 buffers, second - borrowing from rotated DTT
        float * dst_buf = (float *) clt_tile[BAYER_GREEN][(ipass << 1) + 1];
#pragma unroll
        for (int i = 0; i < 8; i++) {
            float d = (*image_p) * window_vert_cos[BAYER_GREEN][local_row];
            float dbg_pix = (*image_p);
            int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows
            dct_buf[dtt_offset1] += d * hwind_cos;
            dst_buf[dtt_offset1] += d * hwind_sin;
            dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
            dtt_offset_inc >>= 4;
            local_row += 2;
            image_p += dstride2;
        }
    }
    __syncthreads();
    // reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
    // Uses all 32 threads.
    float * dtt_buf  = ((float *) clt_tile[BAYER_GREEN][0][threadIdx.y]) + threadIdx.x;
    float * dtt_buf1 = ((float *) clt_tile[BAYER_GREEN][2][threadIdx.y]) + threadIdx.x;
    (*dtt_buf) += (*dtt_buf1);
    dtt_buf  += (4 * DTT_SIZE1);
    dtt_buf1 += (4 * DTT_SIZE1);
    (*dtt_buf) += (*dtt_buf1);

    dtt_buf  = ((float *) clt_tile[BAYER_GREEN][1][threadIdx.y]) + threadIdx.x;
    dtt_buf1 = ((float *) clt_tile[BAYER_GREEN][3][threadIdx.y]) + threadIdx.x;
    (*dtt_buf) += (*dtt_buf1);
    dtt_buf  += (4 * DTT_SIZE1);
    dtt_buf1 += (4 * DTT_SIZE1);
    (*dtt_buf) += (*dtt_buf1);
    __syncthreads();
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nFOLDED DTT Tiles");
        debug_print_clt(clt_tile, 0x311); // only 1 quadrant for R,B and 2 - for G
    }
    __syncthreads();
#endif
    // Run DCT-IV/DCT-IV for all colors, DST-IV/DCT-IV for green only
    if (threadIdx.y < NUM_COLORS) { // run DCT-IV for all colors
        // horizontal pass
        dttiv_shared_mem(
                clt_tile[0][threadIdx.y][threadIdx.x], // pointer to start of row
                1,  // int inc,
                0); // int dst_not_dct)
        // vertical pass
    } else { // if (threadIdx.y < NUM_COLORS) { : run DST-IV for green only
        dttiv_shared_mem(
                clt_tile[0][NUM_COLORS][threadIdx.x], // pointer to start of row
                1,  // int inc,
                1); // int dst_not_dct)
    }
    __syncthreads();
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after horizontal pass");
        debug_print_clt(clt_tile, 0x311); // only 1 quadrant for R,B and 2 - for G
    }
    __syncthreads();
#endif
    // vertical pass - common for all 4 (DCT/DCT of RGB, and DST/DCT of G)
    dttiv_shared_mem(
            &clt_tile[0][threadIdx.y][0][threadIdx.x], // pointer to start of column
            DTT_SIZE1, // int inc,
            0);        // int dst_not_dct)
    __syncthreads();
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after vertical pass");
        debug_print_clt(clt_tile, 0x311); // only 1 quadrant for R,B and 2 - for G
    }
    __syncthreads();
#endif
    // Replicate DTT, so non-bayer can still use same in-place rotation code
    float *src, *dst;
    int negate, dst_inc;

    switch (threadIdx.y) {
    case 0: // Red CC -> SC
        negate = (int_topleft[BAYER_RED][0] & 1) ^ BAYER_RED_COL; // 1 - invert
        src = &clt_tile[BAYER_RED][0][0][threadIdx.x];
        dst = &clt_tile[BAYER_RED][1][0][threadIdx.x ^ 7];
        dst_inc = DTT_SIZE1;
        break;
    case 1: // Blue CC -> SC
        negate = (int_topleft[BAYER_BLUE][0] & 1) ^ (BAYER_RED_COL ^ 1); // 1 - invert
        src = &clt_tile[BAYER_BLUE][0][0][threadIdx.x];
        dst = &clt_tile[BAYER_BLUE][1][0][threadIdx.x ^ 7];
        dst_inc = DTT_SIZE1;
        break;
    case 2: // Green CC -> SS
        negate = (int_topleft[BAYER_GREEN][0] & 1) ^ (int_topleft[2][1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW); // 1 - invert
        src = &clt_tile[BAYER_GREEN][0][0][threadIdx.x];
        dst = &clt_tile[BAYER_GREEN][3][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    case 3: // Green SC -> CS
        negate = (int_topleft[BAYER_GREEN][0] & 1) ^ (int_topleft[2][1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW); // 1 - invert
        src = &clt_tile[BAYER_GREEN][1][0][threadIdx.x];
        dst = &clt_tile[BAYER_GREEN][2][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    }
    if (negate){
#pragma unroll
        for (int i = 0; i < DTT_SIZE; i++){
            *dst = -(*src); // copy the computed quadrant into the replicated one, negated
            src += DTT_SIZE1;
            dst += dst_inc;
        }
    } else {
#pragma unroll
        for (int i = 0; i < DTT_SIZE; i++){
            *dst = (*src);
            src += DTT_SIZE1;
            dst += dst_inc;
        }
    }
    __syncthreads();
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after first replicating");
        debug_print_clt(clt_tile, 0xf33);
    }
    __syncthreads();
#endif
    switch (threadIdx.y) {
    case 0: // Red CC -> CS
        negate = (int_topleft[BAYER_RED][1] & 1) ^ BAYER_RED_ROW; // 1 - invert
        src = &clt_tile[BAYER_RED][0][0][threadIdx.x];
        dst = &clt_tile[BAYER_RED][2][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    case 1: // Red SC -> SS
        negate = (int_topleft[BAYER_RED][1] & 1) ^ BAYER_RED_ROW; // 1 - invert
        src = &clt_tile[BAYER_RED][1][0][threadIdx.x];
        dst = &clt_tile[BAYER_RED][3][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    case 2: // Blue CC -> CS
        negate = (int_topleft[BAYER_BLUE][1] & 1) ^ (BAYER_RED_ROW ^ 1); // 1 - invert
        src = &clt_tile[BAYER_BLUE][0][0][threadIdx.x];
        dst = &clt_tile[BAYER_BLUE][2][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    case 3: // Blue SC -> SS
        negate = (int_topleft[BAYER_BLUE][1] & 1) ^ (BAYER_RED_ROW ^ 1); // 1 - invert
        src = &clt_tile[BAYER_BLUE][1][0][threadIdx.x];
        dst = &clt_tile[BAYER_BLUE][3][7][threadIdx.x ^ 7];
        dst_inc = -DTT_SIZE1;
        break;
    }
    if (negate){
#pragma unroll
        for (int i = 0; i < DTT_SIZE; i++){
            *dst = -(*src);
            src += DTT_SIZE1;
            dst += dst_inc;
        }
    } else {
#pragma unroll
        for (int i = 0; i < DTT_SIZE; i++){
            *dst = (*src);
            src += DTT_SIZE1;
            dst += dst_inc;
        }
    }
    __syncthreads();
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after second replicating");
        debug_print_clt(clt_tile, 0xfff);
    }
    __syncthreads();
#endif
    if (threadIdx.y < NUM_COLORS) {
        // convolve first, then rotate to match Java and make it easier to verify
        convolveTiles(
                clt_tile[threadIdx.y],     // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
                clt_kernels[threadIdx.y]); // float kernel   [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
        __syncthreads();
    }
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after convolution");
        debug_print_clt(clt_tile, 0xfff);
    }
    __syncthreads();
#endif
    if (threadIdx.y < NUM_COLORS) {
        // rotate phases: first horizontal, then vertical
        shiftTileHor(
                clt_tile[threadIdx.y],           // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
                residual_shift[threadIdx.y][0]); // float residual_shift);
        __syncthreads();
    }
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after horizontal shift");
        debug_print_clt(clt_tile, 0xfff);
    }
    __syncthreads();
#endif
    if (threadIdx.y < NUM_COLORS) {
        shiftTileVert(
                clt_tile[threadIdx.y],           // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
                residual_shift[threadIdx.y][1]); // float residual_shift);
        __syncthreads();
    }
#ifdef DEBUG1
    if ((threadIdx.x + threadIdx.y) == 0){
        printf("\nDTT Tiles after vertical shift");
        debug_print_clt(clt_tile, 0xfff);
        printf("\nDTT All done");
    }
    __syncthreads();
#endif
}