Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
c264a349
Commit
c264a349
authored
Sep 19, 2018
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
GPU kernel source code
parent
3d083c29
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
546 additions
and
0 deletions
+546
-0
dtt8x8.cuh
src/main/resources/dtt8x8.cuh
+546
-0
No files found.
src/main/resources/dtt8x8.cuh
0 → 100644
View file @
c264a349
/**
**
** dtt8x8.cuh
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cuh is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
**
*/
/**
**************************************************************************
* \file dtt8x8.cuh
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
* in GPU
* This file contains building blocks for the 16x16 stride 8 COmplex Lapped Transform (CLT)
* imlementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* softwaer.
*
* This file is cpompatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#pragma once
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
#define DTT_SIZE 8
/*
Python code to generate constant coefficients:
def dct_constants():
COSPI_1_8_SQRT2 = math.cos(math.pi/8)*math.sqrt(2.0)
COSPI_3_8_SQRT2 = math.cos(3*math.pi/8)*math.sqrt(2.0)
SQRT_2 = math.sqrt(2.0)
SQRT1_2 = 1/math.sqrt(2.0)
SQRT1_8 = 1/math.sqrt(8.0)
CN = [[math.cos((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
SN = [[math.sin((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
print("__constant__ float COSPI_1_8_SQRT2 = %ff;"%(COSPI_1_8_SQRT2))
print("__constant__ float COSPI_3_8_SQRT2 = %ff;"%(COSPI_3_8_SQRT2))
print("__constant__ float SQRT_2 = %ff;"% (SQRT_2))
print("__constant__ float SQRT1_2 = %ff;"% (SQRT1_2))
print("__constant__ float SQRT1_8 = %ff;"% (SQRT1_8))
print("__constant__ float COSN1[] = {%ff,%ff};"% (CN[0][0],CN[0][1]))
print("__constant__ float COSN2[] = {%ff,%ff,%ff,%ff};"% (CN[1][0],CN[1][1],CN[1][2],CN[1][3]))
print("__constant__ float SINN1[] = {%ff,%ff};"% (SN[0][0],SN[0][1]))
print("__constant__ float SINN2[] = {%ff,%ff,%ff,%ff};"% (SN[1][0],SN[1][1],SN[1][2],SN[1][3]))
*/
__constant__
float
COSPI_1_8_SQRT2
=
1.306563
f
;
__constant__
float
COSPI_3_8_SQRT2
=
0.541196
f
;
__constant__
float
SQRT_2
=
1.414214
f
;
__constant__
float
SQRT1_2
=
0.707107
f
;
__constant__
float
SQRT1_8
=
0.353553
f
;
__constant__
float
COSN1
[]
=
{
0.980785
f
,
0.831470
f
};
__constant__
float
COSN2
[]
=
{
0.995185
f
,
0.956940
f
,
0.881921
f
,
0.773010
f
};
__constant__
float
SINN1
[]
=
{
0.195090
f
,
0.555570
f
};
__constant__
float
SINN2
[]
=
{
0.098017
f
,
0.290285
f
,
0.471397
f
,
0.634393
f
};
inline
__device__
void
dttii_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
inline
__device__
void
dttiv_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each
inline
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each
inline
__device__
void
dst_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each
inline
__device__
void
_dctiv_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual converions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
* \param src_stride [IN] - Source image stride
* \param mode [IN] - DTT mode:
* 0 - horizontal DCT-IV followed by vertical DCT-IV
* 1 - horizontal DST-IV followed by vertical DCT-IV
* 2 - horizontal DCT-IV followed by vertical DST-IV
* 3 - horizontal DST-IV followed by vertical DST-IV
* 4 - horizontal DCT-II followed by vertical DCT-II
* 5 - horizontal DST-II followed by vertical DCT-II
* 6 - horizontal DCT-II followed by vertical DST-II
* 7 - horizontal DST-II followed by vertical DST-II
*
* \return None
*/
extern
"C"
__global__
void
GPU_DTT24_DRV
(
float
*
dst
,
float
*
src
,
int
src_stride
,
int
dtt_mode
)
{
int
dtt_mode0
=
dtt_mode
&
1
;
int
dtt_mode1
=
(
dtt_mode
>>
1
)
&
1
;
__shared__
float
block
[
DTTTEST_BLOCK_HEIGHT
*
DTTTEST_BLK_STRIDE
];
int
OffsThreadInRow
=
threadIdx
.
y
*
DTT_SIZE
+
threadIdx
.
x
;
int
OffsThreadInCol
=
threadIdx
.
z
*
DTT_SIZE
;
src
+=
((
blockIdx
.
y
*
DTTTEST_BLOCK_HEIGHT
+
OffsThreadInCol
)
*
src_stride
)
+
blockIdx
.
x
*
DTTTEST_BLOCK_WIDTH
+
OffsThreadInRow
;
dst
+=
((
blockIdx
.
y
*
DTTTEST_BLOCK_HEIGHT
+
OffsThreadInCol
)
*
src_stride
)
+
blockIdx
.
x
*
DTTTEST_BLOCK_WIDTH
+
OffsThreadInRow
;
float
*
bl_ptr
=
block
+
OffsThreadInCol
*
DTTTEST_BLK_STRIDE
+
OffsThreadInRow
;
#pragma unroll
for
(
unsigned
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
bl_ptr
[
i
*
DTTTEST_BLK_STRIDE
]
=
src
[
i
*
src_stride
];
__syncthreads
();
// horizontal pass
if
(
dtt_mode
>
3
)
{
dttii_shared_mem
(
block
+
(
OffsThreadInCol
+
threadIdx
.
x
)
*
DTTTEST_BLK_STRIDE
+
OffsThreadInRow
-
threadIdx
.
x
,
1
,
dtt_mode0
);
}
else
{
dttiv_shared_mem
(
block
+
(
OffsThreadInCol
+
threadIdx
.
x
)
*
DTTTEST_BLK_STRIDE
+
OffsThreadInRow
-
threadIdx
.
x
,
1
,
dtt_mode0
);
}
__syncthreads
();
// vertical pass
if
(
dtt_mode
>
3
)
{
dttii_shared_mem
(
bl_ptr
,
DTTTEST_BLK_STRIDE
,
dtt_mode1
);
}
else
{
dttiv_shared_mem
(
bl_ptr
,
DTTTEST_BLK_STRIDE
,
dtt_mode1
);
}
__syncthreads
();
for
(
unsigned
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
dst
[
i
*
src_stride
]
=
bl_ptr
[
i
*
DTTTEST_BLK_STRIDE
];
}
inline
__device__
void
_dctiv_nrecurs8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
float
u00
=
(
COSN2
[
0
]
*
x
[
0
]
+
SINN2
[
0
]
*
x
[
7
]);
float
u10
=
(
-
SINN2
[
3
]
*
x
[
3
]
+
COSN2
[
3
]
*
x
[
4
]);
float
u01
=
(
COSN2
[
1
]
*
x
[
1
]
+
SINN2
[
1
]
*
x
[
6
]);
float
u11
=
-
(
-
SINN2
[
2
]
*
x
[
2
]
+
COSN2
[
2
]
*
x
[
5
]);
float
u02
=
(
COSN2
[
2
]
*
x
[
2
]
+
SINN2
[
2
]
*
x
[
5
]);
float
u12
=
(
-
SINN2
[
1
]
*
x
[
1
]
+
COSN2
[
1
]
*
x
[
6
]);
float
u03
=
(
COSN2
[
3
]
*
x
[
3
]
+
SINN2
[
3
]
*
x
[
4
]);
float
u13
=
-
(
-
SINN2
[
0
]
*
x
[
0
]
+
COSN2
[
0
]
*
x
[
7
]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float
ua00
=
u00
+
u03
;
float
ua10
=
u00
-
u03
;
float
ua01
=
u01
+
u02
;
float
ua11
=
u01
-
u02
;
float
v00
=
ua00
+
ua01
;
float
v02
=
ua00
-
ua01
;
float
v01
=
COSPI_1_8_SQRT2
*
ua10
+
COSPI_3_8_SQRT2
*
ua11
;
float
v03
=
COSPI_3_8_SQRT2
*
ua10
-
COSPI_1_8_SQRT2
*
ua11
;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float
ub00
=
u10
+
u13
;
float
ub10
=
u10
-
u13
;
float
ub01
=
u11
+
u12
;
float
ub11
=
u11
-
u12
;
float
vb00
=
ub00
+
ub01
;
float
vb01
=
ub00
-
ub01
;
float
vb10
=
COSPI_1_8_SQRT2
*
ub10
+
COSPI_3_8_SQRT2
*
ub11
;
float
vb11
=
COSPI_3_8_SQRT2
*
ub10
-
COSPI_1_8_SQRT2
*
ub11
;
y
[
0
]
=
SQRT_2
*
v00
;
// w0[0];
y
[
1
]
=
v01
-
vb11
;
// w1[0];
// j == 1
y
[
2
]
=
v01
+
vb11
;
// w0[1];
y
[
3
]
=
v02
+
vb01
;
// w1[1];
// j == 2
y
[
4
]
=
v02
-
vb01
;
// w0[2];
y
[
5
]
=
v03
-
vb10
;
// w1[2]; - same as y[3]
// j == 3
y
[
6
]
=
v03
+
vb10
;
// w0[3];
y
[
7
]
=
SQRT_2
*
vb00
;
// w1[3];
}
inline
__device__
void
_dttiv
(
float
x0
,
float
x1
,
float
x2
,
float
x3
,
float
x4
,
float
x5
,
float
x6
,
float
x7
,
float
*
y0
,
float
*
y1
,
float
*
y2
,
float
*
y3
,
float
*
y4
,
float
*
y5
,
float
*
y6
,
float
*
y7
,
int
dst_not_dct
)
{
float
u00
,
u01
,
u02
,
u03
,
u10
,
u11
,
u12
,
u13
;
if
(
dst_not_dct
)
{
// DSTIV
u00
=
(
COSN2
[
0
]
*
x7
+
SINN2
[
0
]
*
x0
);
u10
=
(
-
SINN2
[
3
]
*
x4
+
COSN2
[
3
]
*
x3
);
u01
=
(
COSN2
[
1
]
*
x6
+
SINN2
[
1
]
*
x1
);
u11
=
-
(
-
SINN2
[
2
]
*
x5
+
COSN2
[
2
]
*
x2
);
u02
=
(
COSN2
[
2
]
*
x5
+
SINN2
[
2
]
*
x2
);
u12
=
(
-
SINN2
[
1
]
*
x6
+
COSN2
[
1
]
*
x1
);
u03
=
(
COSN2
[
3
]
*
x4
+
SINN2
[
3
]
*
x3
);
u13
=
-
(
-
SINN2
[
0
]
*
x7
+
COSN2
[
0
]
*
x0
);
}
else
{
// DCTIV
u00
=
(
COSN2
[
0
]
*
x0
+
SINN2
[
0
]
*
x7
);
u10
=
(
-
SINN2
[
3
]
*
x3
+
COSN2
[
3
]
*
x4
);
u01
=
(
COSN2
[
1
]
*
x1
+
SINN2
[
1
]
*
x6
);
u11
=
-
(
-
SINN2
[
2
]
*
x2
+
COSN2
[
2
]
*
x5
);
u02
=
(
COSN2
[
2
]
*
x2
+
SINN2
[
2
]
*
x5
);
u12
=
(
-
SINN2
[
1
]
*
x1
+
COSN2
[
1
]
*
x6
);
u03
=
(
COSN2
[
3
]
*
x3
+
SINN2
[
3
]
*
x4
);
u13
=
-
(
-
SINN2
[
0
]
*
x0
+
COSN2
[
0
]
*
x7
);
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float
ua00
=
u00
+
u03
;
float
ua10
=
u00
-
u03
;
float
ua01
=
u01
+
u02
;
float
ua11
=
u01
-
u02
;
float
v00
=
ua00
+
ua01
;
float
v02
=
ua00
-
ua01
;
float
v01
=
COSPI_1_8_SQRT2
*
ua10
+
COSPI_3_8_SQRT2
*
ua11
;
float
v03
=
COSPI_3_8_SQRT2
*
ua10
-
COSPI_1_8_SQRT2
*
ua11
;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float
ub00
=
u10
+
u13
;
float
ub10
=
u10
-
u13
;
float
ub01
=
u11
+
u12
;
float
ub11
=
u11
-
u12
;
float
vb00
=
ub00
+
ub01
;
float
vb01
=
ub00
-
ub01
;
float
vb10
=
COSPI_1_8_SQRT2
*
ub10
+
COSPI_3_8_SQRT2
*
ub11
;
float
vb11
=
COSPI_3_8_SQRT2
*
ub10
-
COSPI_1_8_SQRT2
*
ub11
;
*
y0
=
v00
*
0.5
f
;
// w0[0];
// j == 1
*
y2
=
(
v01
+
vb11
)
*
SQRT1_8
;
// w0[1];
// j == 2
*
y4
=
(
v02
-
vb01
)
*
SQRT1_8
;
// w0[2];
// j == 3
*
y6
=
(
v03
+
vb10
)
*
SQRT1_8
;
// w0[3];
if
(
dst_not_dct
)
{
// DSTIV
*
y1
=
(
vb11
-
v01
)
*
SQRT1_8
;
// w1[0];
*
y3
=
-
(
v02
+
vb01
)
*
SQRT1_8
;
// w1[1];
*
y5
=
(
vb10
-
v03
)
*
SQRT1_8
;
// w1[2]; - same as y[3]
*
y7
=
-
vb00
*
0.5
f
;
// w1[3];
}
else
{
*
y1
=
(
v01
-
vb11
)
*
SQRT1_8
;
// w1[0];
*
y3
=
(
v02
+
vb01
)
*
SQRT1_8
;
// w1[1];
*
y5
=
(
v03
-
vb10
)
*
SQRT1_8
;
// w1[2]; - same as y[3]
*
y7
=
vb00
*
0.5
f
;
// w1[3];
}
}
inline
__device__
void
dttii_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
)
{
float
*
x1
=
x0
+
inc
;
float
*
x2
=
x1
+
inc
;
float
*
x3
=
x2
+
inc
;
float
*
x4
=
x3
+
inc
;
float
*
x5
=
x4
+
inc
;
float
*
x6
=
x5
+
inc
;
float
*
x7
=
x6
+
inc
;
float
u00
,
u01
,
u02
,
u03
,
u10
,
u11
,
u12
,
u13
;
if
(
dst_not_dct
)
{
// DSTII
// invert odd input samples
u00
=
(
(
*
x0
)
-
(
*
x7
));
u10
=
(
(
*
x0
)
+
(
*
x7
));
u01
=
(
-
(
*
x1
)
+
(
*
x6
));
u11
=
(
-
(
*
x1
)
-
(
*
x6
));
u02
=
(
(
*
x2
)
-
(
*
x5
));
u12
=
(
(
*
x2
)
+
(
*
x5
));
u03
=
(
-
(
*
x3
)
+
(
*
x4
));
u13
=
(
-
(
*
x3
)
-
(
*
x4
));
}
else
{
// DCTII
u00
=
(
(
*
x0
)
+
(
*
x7
));
u10
=
(
(
*
x0
)
-
(
*
x7
));
u01
=
(
(
*
x1
)
+
(
*
x6
));
u11
=
(
(
*
x1
)
-
(
*
x6
));
u02
=
(
(
*
x2
)
+
(
*
x5
));
u12
=
(
(
*
x2
)
-
(
*
x5
));
u03
=
(
(
*
x3
)
+
(
*
x4
));
u13
=
(
(
*
x3
)
-
(
*
x4
));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float
w00
=
u00
+
u03
;
float
w10
=
u00
-
u03
;
float
w01
=
(
u01
+
u02
);
float
w11
=
(
u01
-
u02
);
float
v01
=
COSPI_1_8_SQRT2
*
w10
+
COSPI_3_8_SQRT2
*
w11
;
float
v03
=
COSPI_3_8_SQRT2
*
w10
-
COSPI_1_8_SQRT2
*
w11
;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float
w20
=
(
COSN1
[
0
]
*
u10
+
SINN1
[
0
]
*
u13
);
float
w30
=
(
-
SINN1
[
1
]
*
u11
+
COSN1
[
1
]
*
u12
);
float
w21
=
(
COSN1
[
1
]
*
u11
+
SINN1
[
1
]
*
u12
);
float
w31
=
-
(
-
SINN1
[
0
]
*
u10
+
COSN1
[
0
]
*
u13
);
float
v11
=
w20
-
w21
-
w30
+
w31
;
float
v12
=
w20
-
w21
+
w30
-
w31
;
if
(
dst_not_dct
)
{
// DSTII
// Invert output sequence
*
x0
=
(
w30
+
w31
)
*
0.5
f
;
// v13 * SQRT1_8; z10 * 0.5f
*
x1
=
v03
*
SQRT1_8
;
*
x2
=
v12
*
SQRT1_8
;
*
x3
=
(
w00
-
w01
)
*
SQRT1_8
;
// v02 * SQRT1_8
*
x4
=
v11
*
SQRT1_8
;
*
x5
=
v01
*
SQRT1_8
;
*
x6
=
(
w20
+
w21
)
*
0.5
f
;
// v10 * SQRT1_8; z00 * 0.5f;
*
x7
=
(
w00
+
w01
)
*
SQRT1_8
;
// v00 * SQRT1_8
}
else
{
*
x0
=
(
w00
+
w01
)
*
SQRT1_8
;
// v00 * SQRT1_8
*
x1
=
(
w20
+
w21
)
*
0.5
f
;
// v10 * SQRT1_8; z00 * 0.5f;
*
x2
=
v01
*
SQRT1_8
;
*
x3
=
v11
*
SQRT1_8
;
*
x4
=
(
w00
-
w01
)
*
SQRT1_8
;
// v02 * SQRT1_8
*
x5
=
v12
*
SQRT1_8
;
*
x6
=
v03
*
SQRT1_8
;
*
x7
=
(
w30
+
w31
)
*
0.5
f
;
// v13 * SQRT1_8; z10 * 0.5f
}
}
inline
__device__
void
dttiv_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
)
{
float
*
x1
=
x0
+
inc
;
float
*
x2
=
x1
+
inc
;
float
*
x3
=
x2
+
inc
;
float
*
x4
=
x3
+
inc
;
float
*
x5
=
x4
+
inc
;
float
*
x6
=
x5
+
inc
;
float
*
x7
=
x6
+
inc
;
float
u00
,
u01
,
u02
,
u03
,
u10
,
u11
,
u12
,
u13
;
if
(
dst_not_dct
)
{
// DSTIV
u00
=
(
COSN2
[
0
]
*
(
*
x7
)
+
SINN2
[
0
]
*
(
*
x0
));
u10
=
(
-
SINN2
[
3
]
*
(
*
x4
)
+
COSN2
[
3
]
*
(
*
x3
));
u01
=
(
COSN2
[
1
]
*
(
*
x6
)
+
SINN2
[
1
]
*
(
*
x1
));
u11
=
-
(
-
SINN2
[
2
]
*
(
*
x5
)
+
COSN2
[
2
]
*
(
*
x2
));
u02
=
(
COSN2
[
2
]
*
(
*
x5
)
+
SINN2
[
2
]
*
(
*
x2
));
u12
=
(
-
SINN2
[
1
]
*
(
*
x6
)
+
COSN2
[
1
]
*
(
*
x1
));
u03
=
(
COSN2
[
3
]
*
(
*
x4
)
+
SINN2
[
3
]
*
(
*
x3
));
u13
=
-
(
-
SINN2
[
0
]
*
(
*
x7
)
+
COSN2
[
0
]
*
(
*
x0
));
}
else
{
// DCTIV
u00
=
(
COSN2
[
0
]
*
(
*
x0
)
+
SINN2
[
0
]
*
(
*
x7
));
u10
=
(
-
SINN2
[
3
]
*
(
*
x3
)
+
COSN2
[
3
]
*
(
*
x4
));
u01
=
(
COSN2
[
1
]
*
(
*
x1
)
+
SINN2
[
1
]
*
(
*
x6
));
u11
=
-
(
-
SINN2
[
2
]
*
(
*
x2
)
+
COSN2
[
2
]
*
(
*
x5
));
u02
=
(
COSN2
[
2
]
*
(
*
x2
)
+
SINN2
[
2
]
*
(
*
x5
));
u12
=
(
-
SINN2
[
1
]
*
(
*
x1
)
+
COSN2
[
1
]
*
(
*
x6
));
u03
=
(
COSN2
[
3
]
*
(
*
x3
)
+
SINN2
[
3
]
*
(
*
x4
));
u13
=
-
(
-
SINN2
[
0
]
*
(
*
x0
)
+
COSN2
[
0
]
*
(
*
x7
));
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float
ua00
=
u00
+
u03
;
float
ua10
=
u00
-
u03
;
float
ua01
=
u01
+
u02
;
float
ua11
=
u01
-
u02
;
float
v00
=
ua00
+
ua01
;
float
v02
=
ua00
-
ua01
;
float
v01
=
COSPI_1_8_SQRT2
*
ua10
+
COSPI_3_8_SQRT2
*
ua11
;
float
v03
=
COSPI_3_8_SQRT2
*
ua10
-
COSPI_1_8_SQRT2
*
ua11
;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float
ub00
=
u10
+
u13
;
float
ub10
=
u10
-
u13
;
float
ub01
=
u11
+
u12
;
float
ub11
=
u11
-
u12
;
float
vb00
=
ub00
+
ub01
;
float
vb01
=
ub00
-
ub01
;
float
vb10
=
COSPI_1_8_SQRT2
*
ub10
+
COSPI_3_8_SQRT2
*
ub11
;
float
vb11
=
COSPI_3_8_SQRT2
*
ub10
-
COSPI_1_8_SQRT2
*
ub11
;
*
x0
=
v00
*
0.5
f
;
// w0[0];
*
x2
=
(
v01
+
vb11
)
*
SQRT1_8
;
// w0[1];
*
x4
=
(
v02
-
vb01
)
*
SQRT1_8
;
// w0[2];
*
x6
=
(
v03
+
vb10
)
*
SQRT1_8
;
// w0[3];
if
(
dst_not_dct
)
{
// DSTIV
*
x1
=
(
vb11
-
v01
)
*
SQRT1_8
;
// w1[0];
*
x3
=
-
(
v02
+
vb01
)
*
SQRT1_8
;
// w1[1];
*
x5
=
(
vb10
-
v03
)
*
SQRT1_8
;
// w1[2]; - same as y[3]
*
x7
=
-
vb00
*
0.5
f
;
// w1[3];
}
else
{
*
x1
=
(
v01
-
vb11
)
*
SQRT1_8
;
// w1[0];
*
x3
=
(
v02
+
vb01
)
*
SQRT1_8
;
// w1[1];
*
x5
=
(
v03
-
vb10
)
*
SQRT1_8
;
// w1[2]; - same as y[3]
*
x7
=
vb00
*
0.5
f
;
// w1[3];
}
}
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
float
u00
=
(
x
[
0
]
+
x
[
7
]);
float
u10
=
(
x
[
0
]
-
x
[
7
]);
float
u01
=
(
x
[
1
]
+
x
[
6
]);
float
u11
=
(
x
[
1
]
-
x
[
6
]);
float
u02
=
(
x
[
2
]
+
x
[
5
]);
float
u12
=
(
x
[
2
]
-
x
[
5
]);
float
u03
=
(
x
[
3
]
+
x
[
4
]);
float
u13
=
(
x
[
3
]
-
x
[
4
]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float
w00
=
u00
+
u03
;
float
w10
=
u00
-
u03
;
float
w01
=
(
u01
+
u02
);
float
w11
=
(
u01
-
u02
);
float
v00
=
w00
+
w01
;
float
v02
=
w00
-
w01
;
float
v01
=
COSPI_1_8_SQRT2
*
w10
+
COSPI_3_8_SQRT2
*
w11
;
float
v03
=
COSPI_3_8_SQRT2
*
w10
-
COSPI_1_8_SQRT2
*
w11
;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float
w20
=
(
COSN1
[
0
]
*
u10
+
SINN1
[
0
]
*
u13
);
float
w30
=
(
-
SINN1
[
1
]
*
u11
+
COSN1
[
1
]
*
u12
);
float
w21
=
(
COSN1
[
1
]
*
u11
+
SINN1
[
1
]
*
u12
);
float
w31
=
-
(
-
SINN1
[
0
]
*
u10
+
COSN1
[
0
]
*
u13
);
// _dctii_nrecurs2(u00, u01, &v00, &v01);
float
z00
=
w20
+
w21
;
float
z01
=
w20
-
w21
;
// _dctii_nrecurs2(u10, u11, &v10, &v11);
float
z10
=
w30
+
w31
;
float
z11
=
w30
-
w31
;
float
v10
=
SQRT_2
*
z00
;
float
v11
=
z01
-
z11
;
float
v12
=
z01
+
z11
;
float
v13
=
SQRT_2
*
z10
;
y
[
0
]
=
v00
;
y
[
1
]
=
v10
;
y
[
2
]
=
v01
;
y
[
3
]
=
v11
;
y
[
4
]
=
v02
;
y
[
5
]
=
v12
;
y
[
6
]
=
v03
;
y
[
7
]
=
v13
;
}
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
_dctii_nrecurs8
(
x
,
y
);
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
y
[
i
]
*=
SQRT1_8
;
}
}
inline
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
_dctiv_nrecurs8
(
x
,
y
);
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
y
[
i
]
*=
SQRT1_8
;
}
}
inline
__device__
void
dst_iv8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
float
xr
[
8
];
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
){
xr
[
i
]
=
x
[
7
-
i
];
}
_dctiv_nrecurs8
(
xr
,
y
);
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
+=
2
){
y
[
i
]
*=
SQRT1_8
;
y
[
i
+
1
]
*=
-
SQRT1_8
;
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment