Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
ddc33b02
Commit
ddc33b02
authored
Oct 05, 2018
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tested inverse mclt tile
parent
d399f9d7
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
330 additions
and
306 deletions
+330
-306
TileProcessor.cuh
src/main/resources/TileProcessor.cuh
+330
-306
No files found.
src/main/resources/TileProcessor.cuh
View file @
ddc33b02
...
...
@@ -70,12 +70,19 @@
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define IMAGE_TILE_SIDE 18
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
// increase row length by 1 so vertical passes will use different ports
#define THREADSX (DTT_SIZE)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define BAYER_RED 0
#define BAYER_BLUE 1
...
...
@@ -90,10 +97,10 @@
#define DBG_TILE_X 174
#define DBG_TILE_Y 118
//
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
//#define DEBUG1 1
//#define DEBUG2 1
//#undef DEBUG2
#define DEBUG3 1
//56494
// struct tp_task
//#define TASK_SIZE 12
...
...
@@ -120,7 +127,7 @@ def setup_hwindow(n=8, l=4):
print("__constant__ float HWINDOW[] = {", end="") #
for i in range (n):
print("%ff"%(hwindow[i]), end ="")
if i == (
2*
n-1):
if i == (n-1):
print("};")
elif ((i + 1) % l) == 0:
print(",")
...
...
@@ -128,6 +135,19 @@ def setup_hwindow(n=8, l=4):
else:
print(", ",end="")
def setup_hwindow2(n=8, l=4):
hwindow = [0.5*math.sin(math.pi*((1.0+2*i)/(4*n))) for i in range(2*n)]
print("__constant__ float HWINDOW2[] = {", end="") #
for i in range (n):
print("%ff"%(hwindow[i]), end ="")
if i == (n-1):
print("};")
elif ((i + 1) % l) == 0:
print(",")
print(" ", end ="")
else:
print(", ",end="")
def get_fold_rindices(n=8):
n1 = n>>1;
rind = [0] * (2 * n) # reverse indices
...
...
@@ -181,12 +201,23 @@ def get_fold_rindices(n=8):
print('0x%2x}};'%(rind1[-1]))
print("__constant__ int fold_inc[]= {0x%08x, 0x%08x};"%(inc_e, inc_o))
def set_imclt_sa(stride=9):
sa8 =[0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x14,0x0c,0x04,0x04,0x0c,0x14,0x1c]
sa8s = [d // 8 + (d % 8) * stride for d in sa8]
print("__constant__ int imclt_indx9[16] = {", end="") #
for d in sa8s[:-1]:
print('0x%02x,'%(d), end="")
print('0x%2x};'%(sa8s[-1]))
*/
__constant__
float
HWINDOW
[]
=
{
0.098017
f
,
0.290285
f
,
0.471397
f
,
0.634393
f
,
0.773010
f
,
0.881921
f
,
0.956940
f
,
0.995185
f
};
__constant__
float
HWINDOW2
[]
=
{
0.049009
f
,
0.145142
f
,
0.235698
f
,
0.317197
f
,
0.386505
f
,
0.440961
f
,
0.478470
f
,
0.497592
f
};
// Offsets in 8x8 DCT_CC/DST_SC tile for the first 2 lines of the 16x16 bayer image
__constant__
int
fold_indx2
[
2
][
16
]
=
{{
0x24
,
0x25
,
0x26
,
0x27
,
0x27
,
0x26
,
0x25
,
0x24
,
0x23
,
0x22
,
0x21
,
0x20
,
0x20
,
0x21
,
0x22
,
0x23
},
{
0x2c
,
0x2d
,
0x2e
,
0x2f
,
0x2f
,
0x2e
,
0x2d
,
0x2c
,
0x2b
,
0x2a
,
0x29
,
0x28
,
0x28
,
0x29
,
0x2a
,
0x2b
}};
...
...
@@ -195,21 +226,34 @@ __constant__ int fold_indx2[2][16] = {{0x24,0x25,0x26,0x27,0x27,0x26,0x25,0x24,0
// addd to the current index and result should be AND-ed with 0x3f. inc_e is for even rows (0,2, ...) while inc_o - for odd ones (1,3,)
__constant__
int
fold_inc
[]
=
{
0x02feee12
,
0x021eeef2
};
// index table for convolutions
__constant__
int
zi
[
4
][
4
]
=
{{
0
,
-
1
,
-
2
,
3
},
{
1
,
0
,
-
3
,
-
2
},
{
2
,
-
3
,
0
,
-
1
},
{
3
,
2
,
1
,
0
}};
__constant__
int
za
[
4
][
4
]
=
{{
0
,
1
,
2
,
3
},
{
1
,
0
,
3
,
2
},
{
2
,
3
,
0
,
1
},
{
3
,
2
,
1
,
0
}};
__constant__
int
zs
[
4
][
4
]
=
{{
0
,
-
1
,
-
1
,
1
},
{
1
,
0
,
-
1
,
-
1
},
{
1
,
-
1
,
0
,
-
1
},
{
1
,
1
,
1
,
0
}};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__constant__
int
imclt_indx9
[
16
]
=
{
0x28
,
0x29
,
0x2a
,
0x2b
,
0x2b
,
0x2a
,
0x29
,
0x28
,
0x27
,
0x26
,
0x25
,
0x24
,
0x24
,
0x25
,
0x26
,
0x27
};
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
__constant__
float
idct_signs
[
4
][
4
][
4
]
=
{
{
// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
}
},{
// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
}
}};
__device__
void
convertCorrectTile
(
...
...
@@ -220,13 +264,10 @@ __device__ void convertCorrectTile(
const
int
color
,
const
float
centerX
,
const
float
centerY
,
const
short
tx
,
const
short
ty
,
const
size_t
dstride
,
// in floats (pixels)
// clt_tile[0] - before rotation, [0][0] - R:DCT/DCT, [0][1] - B:DCT/DCT, [0][2] - G:DCT/DCT, [0][3] - G:DST/DCT,
// clt_tile[1], clt_tile[2], and clt_tile[3] - after rotation, 4 quadrants each
// changed, above is wrong now
// float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
// float clt_kernels [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_kernels
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
int
int_topleft
[
2
],
float
residual_shift
[
2
],
...
...
@@ -238,26 +279,19 @@ __device__ void convertCorrectTile(
__device__
void
shiftTileHor
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
residual_shift
);
__device__
void
shiftTileHor1
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports
float
residual_shift
);
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__
void
shiftTileVert
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
residual_shift
);
__device__
void
shiftTileVert1
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports
float
residual_shift
);
__device__
void
convolveTiles
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
kernel
);
// [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
__device__
void
convolveTiles0
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// 4 quadrants of the clt data, rows extended to optimize shared ports
float
kernel
[
4
][
DTT_SIZE
][
DTT_SIZE1
]);
// 4 quadrants of the CLT kernel (DTT3 converted)
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
);
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
);
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
extern
"C"
__global__
void
tileProcessor
(
...
...
@@ -305,27 +339,15 @@ __global__ void tileProcessor(
}
__syncthreads
();
// __syncwarp();
// set memory for CLT result (per tile, per camera, per color, per clt, per row, per column
// clt_tile[][0] - before rotation, [][0][0] - R:DCT/DCT, [][0][1] - B:DCT/DCT, [][0][2] - G:DCT/DCT, [][0][3] - G:DST/DCT,
// clt_tile[][1], clt_tile[][2], and clt_tile[][3] - after rotation, 4 quadrants each
// changed, above is wrong now
/// __shared__ float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1];
__shared__
float
clt_tile
[
TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
// sharing shared memory for cameras as they are corrected one after another
// TODO: evaluate total shared memory usage, maybe this sharing is not needed
__shared__
float
clt_kernels
[
TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
// +1 to alternate column ports
__shared__
int
int_topleft
[
TILES_PER_BLOCK
][
2
];
__shared__
float
residual_shift
[
TILES_PER_BLOCK
][
2
];
__shared__
float
window_hor_cos
[
TILES_PER_BLOCK
][
2
*
DTT_SIZE
];
__shared__
float
window_hor_sin
[
TILES_PER_BLOCK
][
2
*
DTT_SIZE
];
__shared__
float
window_vert_cos
[
TILES_PER_BLOCK
][
2
*
DTT_SIZE
];
//IMAGE_TILE_SIDE
// process each camera in series
// process each camera,l each color in series (to reduce shared memory)
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
){
for
(
int
color
=
0
;
color
<
NUM_COLORS
;
color
++
){
convertCorrectTile
(
...
...
@@ -336,6 +358,8 @@ __global__ void tileProcessor(
color
,
// const int color,
tt
[
tile_in_block
].
xy
[
ncam
][
0
],
// const float centerX,
tt
[
tile_in_block
].
xy
[
ncam
][
1
],
// const float centerY,
tt
[
tile_in_block
].
tx
,
// const short tx,
tt
[
tile_in_block
].
ty
,
// const short ty,
dstride
,
// size_t dstride, // in floats (pixels)
(
float
*
)(
clt_tile
[
tile_in_block
]),
// float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
(
float
*
)(
clt_kernels
[
tile_in_block
]),
// float clt_tile [NUM_COLORS][4][DTT_SIZE][DTT_SIZE],
...
...
@@ -464,234 +488,12 @@ __device__ void convolveTiles(
}
}
__device__
void
shiftTileHor2
(
float
*
fclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
residual_shift
)
{
float
(
*
clt_tile
)
[
4
][
DTT_SIZE
][
DTT_SIZE1
]
=
(
float
(
*
)[
4
][
DTT_SIZE
][
DTT_SIZE1
])
fclt_tile
;
int
j
=
threadIdx
.
x
;
float
x
=
residual_shift
*
((
j
<<
1
)
+
1
)
*
(
0.5
f
/
DTT_SIZE
);
float
ch
=
cospif
(
x
);
float
sh
=
sinpif
(
x
);
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
t
=
(
*
clt_tile
)[
0
][
i
][
j
]
*
ch
-
(
*
clt_tile
)[
1
][
i
][
j
]
*
sh
;
(
*
clt_tile
)[
1
][
i
][
j
]
=
(
*
clt_tile
)[
0
][
i
][
j
]
*
sh
+
(
*
clt_tile
)[
1
][
i
][
j
]
*
ch
;
(
*
clt_tile
)[
0
][
i
][
j
]
=
t
;
t
=
(
*
clt_tile
)[
2
][
i
][
j
]
*
ch
-
(
*
clt_tile
)[
3
][
i
][
j
]
*
sh
;
(
*
clt_tile
)[
3
][
i
][
j
]
=
(
*
clt_tile
)[
2
][
i
][
j
]
*
sh
+
(
*
clt_tile
)[
3
][
i
][
j
]
*
ch
;
(
*
clt_tile
)[
2
][
i
][
j
]
=
t
;
}
}
__device__
void
shiftTileHor1
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports
float
residual_shift
)
{
int
j
=
threadIdx
.
x
;
float
x
=
residual_shift
*
((
j
<<
1
)
+
1
)
*
(
0.5
f
/
DTT_SIZE
);
float
ch
=
cospif
(
x
);
float
sh
=
sinpif
(
x
);
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
t
=
clt_tile
[
0
][
i
][
j
]
*
ch
-
clt_tile
[
1
][
i
][
j
]
*
sh
;
clt_tile
[
1
][
i
][
j
]
=
clt_tile
[
0
][
i
][
j
]
*
sh
+
clt_tile
[
1
][
i
][
j
]
*
ch
;
clt_tile
[
0
][
i
][
j
]
=
t
;
t
=
clt_tile
[
2
][
i
][
j
]
*
ch
-
clt_tile
[
3
][
i
][
j
]
*
sh
;
clt_tile
[
3
][
i
][
j
]
=
clt_tile
[
2
][
i
][
j
]
*
sh
+
clt_tile
[
3
][
i
][
j
]
*
ch
;
clt_tile
[
2
][
i
][
j
]
=
t
;
}
}
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__
void
shiftTileVert0
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports
float
residual_shift
)
{
int
j
=
threadIdx
.
x
;
float
x
=
residual_shift
*
((
j
<<
1
)
+
1
)
*
(
0.5
f
/
DTT_SIZE
);
float
ch
=
cospif
(
x
);
float
sh
=
sinpif
(
x
);
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
t
=
clt_tile
[
0
][
j
][
i
]
*
ch
-
clt_tile
[
1
][
j
][
i
]
*
sh
;
clt_tile
[
1
][
j
][
i
]
=
clt_tile
[
0
][
j
][
i
]
*
sh
+
clt_tile
[
1
][
j
][
i
]
*
ch
;
clt_tile
[
0
][
j
][
i
]
=
t
;
t
=
clt_tile
[
2
][
j
][
i
]
*
ch
-
clt_tile
[
3
][
j
][
i
]
*
sh
;
clt_tile
[
3
][
j
][
i
]
=
clt_tile
[
2
][
j
][
i
]
*
sh
+
clt_tile
[
3
][
j
][
i
]
*
ch
;
clt_tile
[
2
][
j
][
i
]
=
t
;
}
}
__device__
void
shiftTileVert1
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports
float
residual_shift
)
{
int
j
=
threadIdx
.
x
;
float
x
=
residual_shift
*
((
j
<<
1
)
+
1
)
*
(
0.5
f
/
DTT_SIZE
);
float
ch
=
cospif
(
x
);
float
sh
=
sinpif
(
x
);
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
t
=
clt_tile
[
0
][
j
][
i
]
*
ch
-
clt_tile
[
2
][
j
][
i
]
*
sh
;
clt_tile
[
2
][
j
][
i
]
=
clt_tile
[
0
][
j
][
i
]
*
sh
+
clt_tile
[
2
][
j
][
i
]
*
ch
;
clt_tile
[
0
][
j
][
i
]
=
t
;
t
=
clt_tile
[
1
][
j
][
i
]
*
ch
-
clt_tile
[
3
][
j
][
i
]
*
sh
;
clt_tile
[
3
][
j
][
i
]
=
clt_tile
[
1
][
j
][
i
]
*
sh
+
clt_tile
[
3
][
j
][
i
]
*
ch
;
clt_tile
[
1
][
j
][
i
]
=
t
;
}
}
__device__
void
shiftTileVert2
(
float
*
fclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
residual_shift
)
{
float
(
*
clt_tile
)
[
4
][
DTT_SIZE
][
DTT_SIZE1
]
=
(
float
(
*
)[
4
][
DTT_SIZE
][
DTT_SIZE1
])
fclt_tile
;
int
j
=
threadIdx
.
x
;
float
x
=
residual_shift
*
((
j
<<
1
)
+
1
)
*
(
0.5
f
/
DTT_SIZE
);
float
ch
=
cospif
(
x
);
float
sh
=
sinpif
(
x
);
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
t
=
(
*
clt_tile
)[
0
][
j
][
i
]
*
ch
-
(
*
clt_tile
)[
2
][
j
][
i
]
*
sh
;
(
*
clt_tile
)[
2
][
j
][
i
]
=
(
*
clt_tile
)[
0
][
j
][
i
]
*
sh
+
(
*
clt_tile
)[
2
][
j
][
i
]
*
ch
;
(
*
clt_tile
)[
0
][
j
][
i
]
=
t
;
t
=
(
*
clt_tile
)[
1
][
j
][
i
]
*
ch
-
(
*
clt_tile
)[
3
][
j
][
i
]
*
sh
;
(
*
clt_tile
)[
3
][
j
][
i
]
=
(
*
clt_tile
)[
1
][
j
][
i
]
*
sh
+
(
*
clt_tile
)[
3
][
j
][
i
]
*
ch
;
(
*
clt_tile
)[
1
][
j
][
i
]
=
t
;
}
}
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__
void
convolveTiles1
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// 4 quadrants of the clt data, rows extended to optimize shared ports
float
kernel
[
4
][
DTT_SIZE
][
DTT_SIZE1
])
// 4 quadrants of the CLT kernel (DTT3 converted)
{
int
j
=
threadIdx
.
x
;
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
float
r0
=
0
;
float
r1
=
0
;
float
r2
=
0
;
float
r3
=
0
;
for
(
int
k
=
0
;
k
<
4
;
k
++
){
r0
+=
zs
[
0
][
k
]
*
clt_tile
[
za
[
0
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
r1
+=
zs
[
1
][
k
]
*
clt_tile
[
za
[
1
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
r2
+=
zs
[
2
][
k
]
*
clt_tile
[
za
[
2
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
r3
+=
zs
[
3
][
k
]
*
clt_tile
[
za
[
3
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
}
clt_tile
[
0
][
j
][
i
]
=
r0
;
clt_tile
[
1
][
j
][
i
]
=
r1
;
clt_tile
[
2
][
j
][
i
]
=
r2
;
clt_tile
[
3
][
j
][
i
]
=
r3
;
}
}
__device__
void
convolveTiles0
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// 4 quadrants of the clt data, rows extended to optimize shared ports
float
kernel
[
4
][
DTT_SIZE
][
DTT_SIZE1
])
// 4 quadrants of the CLT kernel (DTT3 converted)
{
int
j
=
threadIdx
.
x
;
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
float
r0
=
0
;
float
r1
=
0
;
float
r2
=
0
;
float
r3
=
0
;
for
(
int
k
=
0
;
k
<
4
;
k
++
){
if
(
zi
[
0
][
k
]
<
0
)
r0
-=
clt_tile
[
-
zi
[
0
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
else
r0
+=
clt_tile
[
zi
[
0
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
if
(
zi
[
1
][
k
]
<
0
)
r1
-=
clt_tile
[
-
zi
[
1
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
else
r1
+=
clt_tile
[
zi
[
1
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
if
(
zi
[
2
][
k
]
<
0
)
r2
-=
clt_tile
[
-
zi
[
2
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
else
r2
+=
clt_tile
[
zi
[
2
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
if
(
zi
[
3
][
k
]
<
0
)
r3
-=
clt_tile
[
-
zi
[
3
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
else
r3
+=
clt_tile
[
zi
[
3
][
k
]][
j
][
i
]
*
kernel
[
k
][
j
][
i
];
}
clt_tile
[
0
][
j
][
i
]
=
r0
;
clt_tile
[
1
][
j
][
i
]
=
r1
;
clt_tile
[
2
][
j
][
i
]
=
r2
;
clt_tile
[
3
][
j
][
i
]
=
r3
;
}
}
__device__
void
convolveTiles2
(
float
*
fclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
fkernel
)
// [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
{
float
(
*
clt_tile
)
[
4
][
DTT_SIZE
][
DTT_SIZE1
]
=
(
float
(
*
)[
4
][
DTT_SIZE
][
DTT_SIZE1
])
fclt_tile
;
float
(
*
kernel
)
[
4
][
DTT_SIZE
][
DTT_SIZE1
]
=
(
float
(
*
)[
4
][
DTT_SIZE
][
DTT_SIZE1
])
fkernel
;
int
j
=
threadIdx
.
x
;
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
float
r0
=
0
;
float
r1
=
0
;
float
r2
=
0
;
float
r3
=
0
;
for
(
int
k
=
0
;
k
<
4
;
k
++
){
if
(
zi
[
0
][
k
]
<
0
)
r0
-=
(
*
clt_tile
)[
-
zi
[
0
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
else
r0
+=
(
*
clt_tile
)[
zi
[
0
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
if
(
zi
[
1
][
k
]
<
0
)
r1
-=
(
*
clt_tile
)[
-
zi
[
1
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
else
r1
+=
(
*
clt_tile
)[
zi
[
1
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
if
(
zi
[
2
][
k
]
<
0
)
r2
-=
(
*
clt_tile
)[
-
zi
[
2
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
else
r2
+=
(
*
clt_tile
)[
zi
[
2
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
if
(
zi
[
3
][
k
]
<
0
)
r3
-=
(
*
clt_tile
)[
-
zi
[
3
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
else
r3
+=
(
*
clt_tile
)[
zi
[
3
][
k
]][
j
][
i
]
*
(
*
kernel
)[
k
][
j
][
i
];
}
(
*
clt_tile
)[
0
][
j
][
i
]
=
r0
;
(
*
clt_tile
)[
1
][
j
][
i
]
=
r1
;
(
*
clt_tile
)[
2
][
j
][
i
]
=
r2
;
(
*
clt_tile
)[
3
][
j
][
i
]
=
r3
;
}
}
__device__
void
debug_print_clt
(
float
clt_tile
[
4
][
DTT_SIZE
][
DTT_SIZE1
],
// +1 to alternate column ports)
const
int
color
,
int
mask
)
{
printf
(
"----------- Color = %d -----------
\n
"
,
color
);
for
(
int
dbg_quadrant
=
0
;
dbg_quadrant
<
4
;
dbg_quadrant
++
){
printf
(
"----------- Quadrant (c(h)-c(v), s-c, c-s, s-s) = %d -----------
\n
"
,
dbg_quadrant
);
if
((
mask
>>
dbg_quadrant
)
&
1
)
{
for
(
int
dbg_row
=
0
;
dbg_row
<
DTT_SIZE
;
dbg_row
++
){
for
(
int
dbg_col
=
0
;
dbg_col
<
DTT_SIZE
;
dbg_col
++
){
printf
(
"%10.5f "
,
clt_tile
[
dbg_quadrant
][
dbg_row
][
dbg_col
]);
}
printf
(
"
\n
"
);
}
}
printf
(
"
\n
"
);
}
}
__device__
void
debug_print_clt1
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const
int
color
,
int
mask
)
{
printf
(
"----------- Color = %d -----------
\n
"
,
color
);
if
(
color
>=
0
)
printf
(
"----------- Color = %d -----------
\n
"
,
color
);
for
(
int
dbg_quadrant
=
0
;
dbg_quadrant
<
4
;
dbg_quadrant
++
){
printf
(
"----------- Quadrant (c(h)-c(v), s-c, c-s, s-s) = %d -----------
\n
"
,
dbg_quadrant
);
if
((
mask
>>
dbg_quadrant
)
&
1
)
{
...
...
@@ -706,9 +508,21 @@ __device__ void debug_print_clt1(
}
}
__device__
void
debug_print_mclt
(
float
*
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const
int
color
)
{
if
(
color
>=
0
)
printf
(
"----------- Color = %d -----------
\n
"
,
color
);
for
(
int
dbg_row
=
0
;
dbg_row
<
DTT_SIZE2
;
dbg_row
++
){
for
(
int
dbg_col
=
0
;
dbg_col
<
DTT_SIZE2
;
dbg_col
++
){
printf
(
"%10.5f "
,
mclt_tile
[
dbg_row
*
DTT_SIZE21
+
dbg_col
]);
}
printf
(
"
\n
"
);
}
printf
(
"
\n
"
);
}
// Uses 32 threads
__device__
void
convertCorrectTile
(
struct
CltExtra
*
gpu_kernel_offsets
,
// [tileY][tileX][color]
float
*
gpu_kernels
,
// [tileY][tileX][color]
...
...
@@ -717,13 +531,10 @@ __device__ void convertCorrectTile(
const
int
color
,
const
float
centerX
,
const
float
centerY
,
const
short
tx
,
const
short
ty
,
const
size_t
dstride
,
// in floats (pixels)
// clt_tile[0] - before rotation, [0][0] - R:DCT/DCT, [0][1] - B:DCT/DCT, [0][2] - G:DCT/DCT, [0][3] - G:DST/DCT,
// clt_tile[1], clt_tile[2], and clt_tile[3] - after rotation, 4 quadrants each
// changed, above is wrong now
// float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
// float clt_kernels [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_kernels
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
int
int_topleft
[
2
],
float
residual_shift
[
2
],
...
...
@@ -732,10 +543,6 @@ __device__ void convertCorrectTile(
float
window_vert_cos
[
2
*
DTT_SIZE
])
{
/// __shared__ float window_hor_cos [NUM_COLORS][2*DTT_SIZE];
/// __shared__ float window_hor_sin [NUM_COLORS][2*DTT_SIZE];
/// __shared__ float window_vert_cos [NUM_COLORS][2*DTT_SIZE];
// get correct kernel tile, then use 2 threads per kernel and image
int
ktileX
,
ktileY
;
int
kernel_index
;
// common for all coors
...
...
@@ -781,21 +588,9 @@ __device__ void convertCorrectTile(
*
kernelp
=
*
kernel_src
;
kernelp
+=
DTT_SIZE1
;
kernel_src
+=
THREADSX
;
/*
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
*/
}
// Calculate offsets and prepare windows (all colors):
// int kernel_full_index = kernel_index + color;
struct
CltExtra
*
clt_extra
=
&
gpu_kernel_offsets
[
kernel_full_index
];
px
=
centerX
-
DTT_SIZE
-
(
clt_extra
->
data_x
+
clt_extra
->
dxc_dx
*
kdx
+
clt_extra
->
dxc_dy
*
kdy
)
;
// fractional left corner
...
...
@@ -884,9 +679,6 @@ __device__ void convertCorrectTile(
for
(
int
gpass
=
0
;
gpass
<
(
color0
+
1
);
gpass
++
)
{
// Only once for R, B, twice - for G
int
col_tl
=
int_topleft
[
0
];
// + (threadIdx.x << 1);
int
row_tl
=
int_topleft
[
1
];
// int local_col = ((col_tl & 1) ^ BAYER_RED_COL ^ color0) + (threadIdx.x << 1);
// int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ color0);
// for red, blue and green, pass 0
int
local_col
=
((
col_tl
&
1
)
^
(
BAYER_RED_COL
^
color0
^
color1
^
gpass
))
+
(
threadIdx
.
x
<<
1
);
// green red row: invert column from red
int
local_row
=
((
row_tl
&
1
)
^
BAYER_RED_ROW
^
gpass
);
// use red row
...
...
@@ -895,8 +687,6 @@ __device__ void convertCorrectTile(
int
dtt_offset
=
fold_indx2
[
local_row
][
local_col
];
int
dtt_offset_inc
=
fold_inc
[
local_row
];
// float *dct_buf = (float *) clt_tile[ gpass << 1];
// float *dst_buf = (float *) clt_tile[(gpass << 1)+1]; // **** only used for green
float
*
dct_buf
=
clt_tile
+
((
gpass
<<
1
)
*
(
DTT_SIZE
*
DTT_SIZE1
));
float
*
dst_buf
=
clt_tile
+
(((
gpass
<<
1
)
+
1
)
*
(
DTT_SIZE
*
DTT_SIZE1
));
// **** only used for green
...
...
@@ -904,10 +694,9 @@ __device__ void convertCorrectTile(
float
*
image_p
=
gpu_images
+
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
// float d = (*image_p) * window_vert_cos[local_row]; //warp illegal address (0,2,1)
float
d
=
(
*
image_p
);
d
*=
window_vert_cos
[
local_row
];
//warp illegal address (0,2,1)
float
d
=
(
*
image_p
)
*
window_vert_cos
[
local_row
];
//warp illegal address (0,2,1)
// float d = (*image_p);
// d *= window_vert_cos[local_row]; //warp illegal address (0,2,1)
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows (DTT_SIZE1)
dct_buf
[
dtt_offset1
]
=
d
*
hwind_cos
;
dst_buf
[
dtt_offset1
]
=
d
*
hwind_sin
;
// **** only used for green
...
...
@@ -972,7 +761,6 @@ __device__ void convertCorrectTile(
#endif
dctiv_nodiverg
(
// all colors
// clt_tile[0][threadIdx.x], // pointer to start of row
clt_tile
+
(
DTT_SIZE1
*
threadIdx
.
x
),
// [0][threadIdx.x], // pointer to start of row
1
);
//int inc);
if
(
color
==
BAYER_GREEN
){
...
...
@@ -1008,7 +796,6 @@ __device__ void convertCorrectTile(
__syncthreads
();
// __syncwarp();
#endif
// Replicate DTT, so non-bayer can still use same in-place rotation code
float
*
src
,
*
dst
;
int
negate
;
// , dst_inc;
...
...
@@ -1086,7 +873,6 @@ __device__ void convertCorrectTile(
__syncthreads
();
// __syncwarp();
#endif
// rotate phases: first horizontal, then vertical
shiftTileHor
(
clt_tile
,
// float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
...
...
@@ -1115,8 +901,246 @@ __device__ void convertCorrectTile(
#endif
#ifdef DBG_TILE
#ifdef DEBUG1
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after vertical shift, color = %d
\n
"
,
color
);
debug_print_clt1
(
clt_tile
,
color
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
printf
(
"
\n
DTT All done
\n
"
);
}
__syncthreads
();
// __syncwarp();
#endif
#endif
int
offset_src
=
threadIdx
.
x
;
int
offset_dst
=
((
ty
*
TILESX
+
tx
)
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
// gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float
*
clt_src
=
clt_tile
+
offset_src
;
// threadIdx.x;
float
*
clt_dst
=
gpu_clt
+
offset_dst
;
// ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE1) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
#ifdef DBG_TILE
#ifdef DEBUG1
if
((
threadIdx
.
x
)
==
0
){
printf
(
"clt_src = 0x%lx
\n
"
,
clt_src
);
printf
(
"clt_dst = 0x%lx
\n
"
,
clt_dst
);
}
#endif
#endif
#pragma unroll
for
(
int
j
=
0
;
j
<
DTT_SIZE
*
4
;
j
++
){
// all 4 components, 8 rows
// shared memory tiles use DTT_SIZE1
*
clt_dst
=
*
clt_src
;
clt_src
+=
DTT_SIZE1
;
clt_dst
+=
DTT_SIZE
;
}
__syncthreads
();
// __syncwarp();
// just for testing perform imclt, save result to clt_kernels
}
extern
"C"
__global__
void
test_imclt
(
float
*
gpu_clt
)
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// Initially - no output, will add later
{
// dim3 t = threadIdx;
int
tile_in_block
=
threadIdx
.
y
;
int
tile_num
=
blockIdx
.
x
*
IMCLT_TILES_PER_BLOCK
+
tile_in_block
;
if
(
tile_num
>=
1
)
return
;
// just testing with a single tile
int
thr3
=
threadIdx
.
x
>>
3
;
int
column
=
threadIdx
.
x
;
// modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// Read clt tile to
__shared__
float
clt_tiles
[
IMCLT_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
mclt_tiles
[
IMCLT_TILES_PER_BLOCK
][
DTT_SIZE2
][
DTT_SIZE21
];
// Read clt tile from device memory
for
(
int
color
=
0
;
color
<
NUM_COLORS
;
color
++
)
{
float
*
clt_tile
=
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
gpu_tile
=
((
float
*
)
gpu_clt
)
+
((
DBG_TILE_Y
*
TILESX
+
DBG_TILE_X
)
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
);
// top left quadrant0
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"gpu_tile = 0x%lx
\n
"
,
gpu_tile
);
printf
(
"clt_tile = 0x%lx
\n
"
,
clt_tile
);
}
#endif
clt_tile
+=
column
+
thr3
;
// first 2 rows
gpu_tile
+=
column
;
// first 2 rows
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE2
;
i
++
){
*
clt_tile
=
*
gpu_tile
;
clt_tile
+=
(
2
*
DTT_SIZE1
);
gpu_tile
+=
(
2
*
DTT_SIZE
);
}
// reset mclt tile to zero
float
*
mclt_tile
=
((
float
*
)
mclt_tiles
)
+
tile_in_block
*
(
DTT_SIZE2
*
DTT_SIZE21
)
+
column
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE2
;
i
++
){
*
mclt_tile
=
0.0
f
;
mclt_tile
+=
DTT_SIZE21
;
}
__syncthreads
();
// __syncwarp();
imclt
(
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
),
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
((
float
*
)
mclt_tiles
)
+
tile_in_block
*
(
DTT_SIZE2
*
DTT_SIZE21
));
// float * mclt_tile )
__syncthreads
();
// __syncwarp();
}
}
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Shuld be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
)
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int
thr3
=
threadIdx
.
x
>>
3
;
int
column
=
threadIdx
.
x
;
// modify to use 2*8 threads, if needed.
int
thr012
=
threadIdx
.
x
&
7
;
int
column4
=
threadIdx
.
x
>>
2
;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int
wcolumn
=
((
thr3
<<
3
)
-
thr3
)
^
thr012
;
//0..7,7,..0
float
*
clt_tile1
=
clt_tile
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile2
=
clt_tile1
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile3
=
clt_tile2
+
(
DTT_SIZE1
*
DTT_SIZE
);
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles before IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg
(
// clt_tile + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 1
clt_tile
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 0 and 2
1
);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg
(
// all colors
// clt_tile2 + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 2 and 3
clt_tile1
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 1 and 3
1
);
__syncthreads
();
// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg
(
// clt_tile + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 2
clt_tile
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 0 and 1
DTT_SIZE1
);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg
(
// clt_tile1 + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 1 and 3
clt_tile2
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 2 and 3
DTT_SIZE1
);
__syncthreads
();
// __syncwarp();
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
float
hw
=
HWINDOW2
[
wcolumn
];
int
clt_offset
=
imclt_indx9
[
column
];
// index in each of the 4 iclt quadrants, accounting for stride=9
float
*
rslt
=
mclt_tile
+
column
;
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
0
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
0
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
0
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
0
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
<
3
){
clt_offset
+=
DTT_SIZE1
;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
*
rslt
=
val
;
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
4
;
i
<
8
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
1
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
1
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
1
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
1
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
// if (i < 7){
clt_offset
-=
DTT_SIZE1
;
// }
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
7
;
i
>=
4
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
2
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
2
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
2
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
2
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
4
){
clt_offset
-=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
3
;
i
>=
0
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
3
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
3
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
3
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
3
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
0
){
clt_offset
+=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#ifdef DEBUG3
__syncthreads
();
// __syncwarp();
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_mclt
(
mclt_tile
,
-
1
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment