Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
3c3ebfa9
Commit
3c3ebfa9
authored
Apr 07, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
preparing for separate compilation in java
parent
0253bab4
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1347 additions
and
2485 deletions
+1347
-2485
TileProcessor.cuh
src/TileProcessor.cuh
+211
-689
dtt8x8.cu
src/dtt8x8.cu
+1136
-950
dtt8x8.cuh
src/dtt8x8.cuh
+0
-846
No files found.
src/TileProcessor.cuh
View file @
3c3ebfa9
...
@@ -37,9 +37,8 @@
...
@@ -37,9 +37,8 @@
*/
*/
// Avoiding includes in jcuda, all source files will be merged
// Avoiding includes in jcuda, all source files will be merged
#ifndef JCUDA
#pragma once
#pragma once
#i
nclude "dtt8x8.cuh"
#i
fndef JCUDA
#define THREADSX (DTT_SIZE)
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_PAIRS 6
...
@@ -93,10 +92,9 @@
...
@@ -93,10 +92,9 @@
#define DEBUG_OOB1 1
#define DEBUG_OOB1 1
#endif //#ifndef JCUDA
#endif //#ifndef JCUDA
#include "dtt8x8.h"
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
//#define IMCLT14
//#define NOICLT 1
//#define NOICLT 1
//#define TEST_IMCLT
//#define TEST_IMCLT
...
@@ -148,25 +146,10 @@
...
@@ -148,25 +146,10 @@
// Make TILESYA >= TILESX and a multiple of 4
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
// increase row length by 1 so vertical passes will use different ports
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
//#define DTT_SIZE22 (DTT_SIZE2 + 2)
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
// Use CORR_OUT_RAD for the correlation output
// Use CORR_OUT_RAD for the correlation output
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
//#define BAYER_BLUE_ROW (1 - BAYER_RED_ROW)
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
//#define DBG_TILE_X 40
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_X 161 // 49
...
@@ -312,11 +295,14 @@ def set_imclt_sa(stride=9):
...
@@ -312,11 +295,14 @@ def set_imclt_sa(stride=9):
print('0x%02x,'%(d), end="")
print('0x%02x,'%(d), end="")
print('0x%2x};'%(sa8s[-1]))
print('0x%2x};'%(sa8s[-1]))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np # math
import numpy as np # math
def printAlphaFade(transform_size):
def printAlphaFade(transform_size):
ts2 = 2 * transform_size
ts2 = 2 * transform_size
ts2m1 = ts2-1
ts2m1 = ts2-1
alphaFade = np.zeros(shape=(16,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaFade = np.zeros(shape=(9,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaIndex = np.zeros(shape=(16,), dtype=int)
fade1d = np.zeros(shape=(16,), dtype=float) # double [] fade1d = new double [ts2];
fade1d = np.zeros(shape=(16,), dtype=float) # double [] fade1d = new double [ts2];
for i in range (ts2):
for i in range (ts2):
fade1d[i] = 0.5 * (1.0 - np.cos(np.pi * (i +0.5) /ts2))
fade1d[i] = 0.5 * (1.0 - np.cos(np.pi * (i +0.5) /ts2))
...
@@ -326,27 +312,49 @@ def printAlphaFade(transform_size):
...
@@ -326,27 +312,49 @@ def printAlphaFade(transform_size):
for m in range (16):
for m in range (16):
# if m == 0:
# if m == 0:
# alphaFade[m][indx] = 0
# alphaFade[m][indx] = 0
elif m == 1: # 0
if m == 1: # 0
alphaFade[m][indx] = fade1d[ts2m1 - i]
alphaIndex[m] = 1
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - i]
elif m == 2:
elif m == 2:
alphaFade[m][indx] = fade1d[j]
alphaIndex[m] = 2
alphaFade[alphaIndex[m]][indx] = fade1d[j]
elif m == 4:
elif m == 4:
alphaFade[m][indx] = fade1d[i]
alphaIndex[m] = 3
alphaFade[alphaIndex[m]][indx] = fade1d[i]
elif m == 8:
elif m == 8:
alphaFade[m][indx] = fade1d[ts2m1 - j]
alphaIndex[m] = 4
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - j]
elif m == 3:
elif m == 3:
alphaFade[m][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
alphaIndex[m] = 5
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
elif m == 6:
elif m == 6:
alphaFade[m][indx] = (fade1d[i],fade1d[j])[j > i]
alphaIndex[m] = 6
alphaFade[alphaIndex[m]][indx] = (fade1d[i],fade1d[j])[j > i]
elif m == 9:
elif m == 9:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
alphaIndex[m] = 7
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
elif m == 12:
elif m == 12:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
alphaIndex[m] = 8
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
else:
else:
alphaFade[m][indx] = 1.0
alphaIndex[m] = 0
alphaFade[alphaIndex[m]][indx] = 1.0
floats_in_line=8
floats_in_line=8
print("__constant__
float alphaFade[16][%d] = {"%(ts2*ts2)
)
print("__constant__
int alphaIndex[16] = {"
)
for m in range (16):
for m in range (16):
if ((m % floats_in_line) == 0):
print("\n ",end="")
else:
print(" ",end="")
print("%d"%(alphaIndex[m]), end ="")
if (m < (16-1)):
print(",",end="")
print("};")
print("__constant__ float alphaFade[9][%d] = {"%(ts2*ts2))
for m in range (9):
for i in range (ts2 * ts2):
for i in range (ts2 * ts2):
if ((i % floats_in_line) == 0):
if ((i % floats_in_line) == 0):
print(" ",end="")
print(" ",end="")
...
@@ -362,20 +370,45 @@ def printAlphaFade(transform_size):
...
@@ -362,20 +370,45 @@ def printAlphaFade(transform_size):
print(",")
print(",")
else:
else:
print(", ",end="")
print(", ",end="")
if (m ==
15
):
if (m ==
(9-1)
):
print("};")
print("};")
else:
else:
print(",")
print(",")
printAlphaFade(8)
"""
__constant__ float idct_signs[4][4][4] ={
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{-1, 1, 1, 1},
{-1, 1, 1, 1},
{-1, 1, 1, 1}
},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{-1,-1,-1, 1},
{-1,-1,-1, 1},
{-1,-1,-1, 1}
},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{-1, 1, 1, 1}
},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{-1,-1,-1, 1}
}};
*/
*/
__constant__ float HWINDOW[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f,
__constant__ float HWINDOW[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f,
0.773010f, 0.881921f, 0.956940f, 0.995185f};
0.773010f, 0.881921f, 0.956940f, 0.995185f};
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
__constant__ float HWINDOW_SQ[] = {0.009607f, 0.084265f, 0.222215f, 0.402455f,
__constant__ float HWINDOW_SQ[] = {0.009607f, 0.084265f, 0.222215f, 0.402455f,
0.597545f, 0.777785f, 0.915735f, 0.990393f};
0.597545f, 0.777785f, 0.915735f, 0.990393f};
...
@@ -392,9 +425,12 @@ __constant__ int fold_inc[]= {0x02feee12, 0x021eeef2};
...
@@ -392,9 +425,12 @@ __constant__ int fold_inc[]= {0x02feee12, 0x021eeef2};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
#ifdef BBBB
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
__constant__ float idct_signs[4][4][4] ={
__constant__ float idct_signs[4][4][4] ={
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
...
@@ -418,6 +454,7 @@ __constant__ float idct_signs[4][4][4] ={
...
@@ -418,6 +454,7 @@ __constant__ float idct_signs[4][4][4] ={
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{-1,-1,-1, 1}
{-1,-1,-1, 1}
}};
}};
#endif
// LPF for sigma 0.9 each color (modify through cudaMemcpyToSymbol() or similar in Driver API
// LPF for sigma 0.9 each color (modify through cudaMemcpyToSymbol() or similar in Driver API
//#ifndef NOICLT
//#ifndef NOICLT
__constant__ float lpf_data[4][64]={
__constant__ float lpf_data[4][64]={
...
@@ -489,39 +526,40 @@ __constant__ int pairs[6][2]={
...
@@ -489,39 +526,40 @@ __constant__ int pairs[6][2]={
{0, 3},
{0, 3},
{2, 1}};
{2, 1}};
__constant__ float alphaFade[16][256] = {
__constant__ int alphaIndex[16] = {0, 1, 2, 5, 3, 0, 6, 0, 4, 7, 0, 0, 8, 0, 0, 0};
{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
__constant__ float alphaFade[9][256] = {
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
...
@@ -586,38 +624,6 @@ __constant__ float alphaFade[16][256] = {
...
@@ -586,38 +624,6 @@ __constant__ float alphaFade[16][256] = {
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
{0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
{0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f,
0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f,
...
@@ -650,102 +656,6 @@ __constant__ float alphaFade[16][256] = {
...
@@ -650,102 +656,6 @@ __constant__ float alphaFade[16][256] = {
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
...
@@ -780,228 +690,132 @@ __constant__ float alphaFade[16][256] = {
...
@@ -780,228 +690,132 @@ __constant__ float alphaFade[16][256] = {
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.9
97592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961
f,
0.9
78470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592
f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.9
97592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505
f,
0.9
40961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592
f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197
f,
0.
886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698
f,
0.
817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.64514
2f,
0.
735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.99759
2f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.64514
2f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.99759
2f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698
f,
0.
645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f}};
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f}};
//#endif
//#endif
__device__ void convertCorrectTile(
__device__ void convertCorrectTile(
...
@@ -1065,16 +879,10 @@ __device__ void resetCorrelation(
...
@@ -1065,16 +879,10 @@ __device__ void resetCorrelation(
__device__ void normalizeTileAmplitude(
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero); // fat zero is absolute, scale it outside
float fat_zero); // fat zero is absolute, scale it outside
__device__ void corrUnfoldTile(
int corr_radius,
//__device__ void imclt( // for 16 threads implemented, used // why is it twice?
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
//__device__ void imclt( // implemented, used // why is it twice?
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
// float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
// float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt( // for 16 threads implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt8threads(// for 8 threads
__device__ void imclt8threads(// for 8 threads
int do_acc, // 1 - add to previous value, 0 - overwrite
int do_acc, // 1 - add to previous value, 0 - overwrite
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
...
@@ -1356,14 +1164,8 @@ __global__ void correlate2D(
...
@@ -1356,14 +1164,8 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
#endif
#endif
dttii_2d(clt_corr);
/*
/*
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
for (int q = 0; q < 4; q++){
...
@@ -1371,15 +1173,6 @@ Java code:
...
@@ -1371,15 +1173,6 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
}
__syncthreads();
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER VERTICAL (HORIZONTAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// hor pass, corresponding to vert pass in Java
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
for (int q = 0; q < 4; q++){
...
@@ -1387,6 +1180,9 @@ Java code:
...
@@ -1387,6 +1180,9 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
}
__syncthreads();
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
...
@@ -2498,7 +2294,8 @@ __global__ void textures_accumulate(
...
@@ -2498,7 +2294,8 @@ __global__ void textures_accumulate(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG12
#endif // DEBUG12
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
int alpha_mode = alphaIndex[tile_code];
if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
for (int pass = 0; pass < 8; pass ++) {
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
...
@@ -2509,12 +2306,12 @@ __global__ void textures_accumulate(
...
@@ -2509,12 +2306,12 @@ __global__ void textures_accumulate(
if (colors == 3){
if (colors == 3){
#pragma unroll
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
tile_c
ode][gi]; // reduce [tile_code] by LUT
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
alpha_m
ode][gi]; // reduce [tile_code] by LUT
}
}
} else { // assuming colors = 1
} else { // assuming colors = 1
#pragma unroll
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
tile_c
ode][gi]; // reduce [tile_code] by LUT
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
alpha_m
ode][gi]; // reduce [tile_code] by LUT
}
}
}
}
}
}
...
@@ -2940,72 +2737,6 @@ __device__ void normalizeTileAmplitude(
...
@@ -2940,72 +2737,6 @@ __device__ void normalizeTileAmplitude(
clt_tile_j3 ++; // =DTT_SIZE1;
clt_tile_j3 ++; // =DTT_SIZE1;
}
}
}
}
/*
Converted from DttRad2.java:443
public double [] corr_unfold_tile(
double [][] qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
int transform_size
)
*/
/**
 * Unfold the four DTT quadrants of a correlation tile (pixel domain, after the
 * inverse transform) into a single centered square correlation result.
 * Converted from DttRad2.java:443 corr_unfold_tile().
 *
 * Thread mapping: thread i (threadIdx.x) fills output rows (center + i) and
 * (center - i); threads with threadIdx.x > corr_radius return immediately.
 * Only quadrants 0/1 contribute in the first loop (complete for i == 0);
 * quadrant 2/3 terms for i > 0 are added after the early return below.
 * NOTE(review): assumes this runs with at least (corr_radius + 1) threads in x
 * and that no cross-thread synchronization is needed (each thread writes
 * disjoint rows) - verify against callers.
 *
 * @param corr_radius radius of the output square: output is
 *                    (2*corr_radius+1) x (2*corr_radius+1), 15x15 max
 * @param qdata0      [4][DTT_SIZE][DTT_SIZE1] four quadrants of the clt data,
 *                    rows extended to DTT_SIZE1 to optimize shared memory ports
 * @param rslt        output array [2*corr_radius+1][2*corr_radius+1]
 */
__device__ void corrUnfoldTile(
		int corr_radius,
		float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
		float* rslt)   // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
	int size2r1 = 2 * corr_radius + 1; // output side length (15 for radius 7)
	int crp1 = corr_radius + 1;        // 8 for radius 7
///	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
	int rslt_base_index = size2r1 * crp1 - crp1; // linear offset of the center pixel
	// the four quadrants are stored consecutively, each DTT_SIZE rows of DTT_SIZE1
	float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
	float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
	float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
	int i = threadIdx.x;
	if (i > corr_radius) {
		return; // not needed, only use inner threads
	}
//	printf("\ncorrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
	float corr_pixscale = 0.25f; // each output pixel combines up to 4 quadrant terms
	int i_transform_size = i * DTT_SIZE1;                   // used to address source rows which are 9 long
	int im1_transform_size = i_transform_size - DTT_SIZE1;  // negative for i = 0, use only after divergence
///	int rslt_row_offs = i * DTT_SIZE2M1;
	int rslt_row_offs = i * size2r1;
	int rslt_base_index_p = rslt_base_index + rslt_row_offs; // start of row center + i
	int rslt_base_index_m = rslt_base_index - rslt_row_offs; // start of row center - i
	rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
	rslt[rslt_base_index_m] = rslt[rslt_base_index_p];                  // nop for i=0; incomplete, will only be used for thread i=0
///	for (int j = 1; j < DTT_SIZE; j++) {
	for (int j = 1; j <= corr_radius; j++) {
		int rslt_base_index_pp = rslt_base_index_p + j;
		int rslt_base_index_pm = rslt_base_index_p - j;
		// quadrant 0/1 (sum / difference) terms for columns center +/- j
		rslt[rslt_base_index_pp] = corr_pixscale * (
				qdata0[i_transform_size + j] +
				qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
		rslt[rslt_base_index_pm] = corr_pixscale * (
				qdata0[i_transform_size + j] +
				-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
	}
	if (i == 0) {
		return; // center row complete: quadrants 2/3 contribute nothing for i == 0
	}
///	im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
	// add quadrant 2 (and below quadrant 3) terms, using source row i-1
	float d = corr_pixscale * qdata2[im1_transform_size];
	rslt[rslt_base_index_p] += d;
	rslt[rslt_base_index_m] -= d;
	for (int j = 1; j <= corr_radius; j++) {
		int rslt_base_index_pp = rslt_base_index_p + j;
		int rslt_base_index_pm = rslt_base_index_p - j;
		int rslt_base_index_mp = rslt_base_index_m + j;
		int rslt_base_index_mm = rslt_base_index_m - j;
		float d2 = corr_pixscale * qdata2[im1_transform_size + j];
		float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
		// rslt[rslt_base_index_pp], rslt[rslt_base_index_pm] were partially calculated in the loop common with i=0;
		// the mirrored row (center - i) is derived from them before they are completed in place
		rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
		rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
		rslt[rslt_base_index_pp] += d2 + d3;
		rslt[rslt_base_index_pm] += d2 - d3;
	}
}
__device__ void debug_print_lpf(
__device__ void debug_print_lpf(
float * lpf_tile)
float * lpf_tile)
...
@@ -3362,28 +3093,6 @@ __device__ void convertCorrectTile(
...
@@ -3362,28 +3093,6 @@ __device__ void convertCorrectTile(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
/*
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
// float *dtt_buf = ((float *) clt_tile[0]) + threadIdx.x;
// float *dtt_buf1 = ((float *) clt_tile[2]) + threadIdx.x;
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
dtt_buf = clt_tile + (DTT_SIZE1 * DTT_SIZE) + threadIdx.x; // ((float *) clt_tile[1]) + threadIdx.x;
dtt_buf1 = dtt_buf + (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[3]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
__syncthreads();// __syncwarp();
}
*/
if (color == BAYER_GREEN) {
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf = clt_tile + threadIdx.x;
...
@@ -3404,21 +3113,16 @@ __device__ void convertCorrectTile(
...
@@ -3404,21 +3113,16 @@ __device__ void convertCorrectTile(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
dttiv_color_2d(
clt_tile,
color);
/*
dctiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile + __umul24(threadIdx.x,DTT_SIZE1), // [0][threadIdx.x], // pointer to start of row
#else
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
#endif
1); //int inc);
1); //int inc);
if (color == BAYER_GREEN){
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
dstiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
clt_tile + __umul24(threadIdx.x + DTT_SIZE, DTT_SIZE1), // clt_tile[1][threadIdx.x], // pointer to start of row
#else
clt_tile + DTT_SIZE1 * (threadIdx.x + DTT_SIZE), // clt_tile[1][threadIdx.x], // pointer to start of row
#endif
1); //int inc);
1); //int inc);
}
}
...
@@ -3435,12 +3139,12 @@ __device__ void convertCorrectTile(
...
@@ -3435,12 +3139,12 @@ __device__ void convertCorrectTile(
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
DTT_SIZE1); // int inc,
if (color == BAYER_GREEN){
if (color == BAYER_GREEN){
// dstiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
DTT_SIZE1); // int inc,
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
*/
#ifdef DEBUG2
#ifdef DEBUG2
if ((threadIdx.x) == 0){
if ((threadIdx.x) == 0){
...
@@ -3692,6 +3396,7 @@ __global__ void test_imclt(
...
@@ -3692,6 +3396,7 @@ __global__ void test_imclt(
}
}
#endif // NOICLT1
#endif // NOICLT1
#ifdef BBBB
//
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
...
@@ -3721,23 +3426,19 @@ __device__ void imclt(
...
@@ -3721,23 +3426,19 @@ __device__ void imclt(
// perform horizontal dct-iv on quadrants 0 and 1
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
dctiv_nodiverg(
// clt_tile + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 1
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
1);
// perform horizontal dst-iv on quadrants 2 and 3
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
dstiv_nodiverg( // all colors
// clt_tile2 + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 2 and 3
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
1);
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
dctiv_nodiverg(
// clt_tile + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 2
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
DTT_SIZE1);
DTT_SIZE1);
// perform vertical dst-iv on quadrants 1 and 3
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg(
dstiv_nodiverg(
// clt_tile1 + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 1 and 3
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
DTT_SIZE1);
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
...
@@ -3833,188 +3534,9 @@ __device__ void imclt(
...
@@ -3833,188 +3534,9 @@ __device__ void imclt(
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
}
}
//#endif
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
/**
 * Inverse MCLT of one tile using 8 threads (threadIdx.x = 0..7), each thread
 * processing two of the 16 output columns. Performs the 2D inverse DTT-IV on
 * the four 8x8 quadrants (horizontal DCT-IV on quadrants 0/1, DST-IV on 2/3;
 * vertical DCT-IV on 0/2, DST-IV on 1/3), then unfolds them through the
 * half window (HWINDOW2) and the per-quadrant sign LUT (idct_signs) into a
 * 16x16 pixel tile, either accumulating into or overwriting mclt_tile.
 *
 * @param do_acc    1 - add to previous value in mclt_tile, 0 - overwrite
 * @param clt_tile  [4][DTT_SIZE][DTT_SIZE1] transform-domain tile; rows
 *                  padded to DTT_SIZE1 (+1) to alternate shared column ports;
 *                  transformed in place
 * @param mclt_tile [2*DTT_SIZE][DTT_SIZE21] output 16x16 pixel tile,
 *                  rows padded (+1) to alternate column ports [16][17]
 * @param debug     non-zero enables DEBUG7 printouts from thread (0,0)
 */
__device__ void imclt8threads(
		int do_acc,     // 1 - add to previous value, 0 - overwrite
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile,  //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
		int debug)
{
	// leftovers from the 16-thread variant, kept for reference:
//	int thr3 =    threadIdx.x >> 3;
//	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
//	int thr012 =  threadIdx.x & 7;
//	int column4 = threadIdx.x >> 2;
//	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
	// quadrants are stored consecutively, each DTT_SIZE rows of DTT_SIZE1 floats
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE);
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// perform horizontal dct-iv on quadrants 0 and 1 (each thread transforms one column, stride DTT_SIZE1)
	dctiv_nodiverg( // quadrant 0
			clt_tile + threadIdx.x,                              // pointer to start of row for quadrant 0
			DTT_SIZE1);
	dctiv_nodiverg( // quadrant 1
			clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			DTT_SIZE1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // quadrant 2
			clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			DTT_SIZE1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
	// perform vertical dct-iv on quadrants 0 and 2 (each thread transforms one row, stride 1)
	dctiv_nodiverg( // quadrant 0
			clt_tile + DTT_SIZE1 * threadIdx.x,                              // pointer to start of row for quadrant 0
			1);
	dctiv_nodiverg( // quadrant 2
			clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg( // quadrant 1
			clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// re-using 16-thread code (thr3 was bit 3 of threadIdx.x): loop makes each
	// of the 8 threads handle output columns (threadIdx.x) and (threadIdx.x + 8)
	for (int thr3 = 0; thr3 < 2; thr3++){
		int thr3m =   (thr3 << 3);
		int column =  threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
		int thr012 =  threadIdx.x & 7;     // == threadIdx.x
		int column4 = column >> 2;         // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
		int wcolumn = (thr3m - thr3) ^ thr012; // 0..7,7..0 - mirrored window index
		float hw = HWINDOW2[wcolumn];          // horizontal half-window weight for this column
		int clt_offset = imclt_indx9[column];  // index in each of the 4 iclt quadrants, accounting for stride=9
		float * rslt = mclt_tile + column;
#ifdef DEBUG7
		if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
			printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
					thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
			debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
		}
		__syncthreads();// __syncwarp();
#endif
#endif // NOTE(review): this second #endif appears unbalanced (only one #ifdef DEBUG7 opened above) - possibly a diff/scrape artifact; verify against the repository source
		// output rows 0..3: combine the 4 quadrants with quadrant-0-row signs, advancing the source row
#pragma unroll
		for (int i = 0; i < 4; i++){
			float val = *rslt;
			float w = HWINDOW2[i] * hw; // combined vertical * horizontal window weight
			float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i < 3){
				clt_offset +=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
//			val =__fmaf_rd(w,d0,val); // w*d0 + val
//			*rslt = val;
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 4..7: sign row 1, source row retreats each iteration
#pragma unroll
		for (int i = 4; i < 8; i++){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
//			if (i < 7){
			clt_offset -=  DTT_SIZE1;
//			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 8..11: sign row 2, mirrored window index (i descends 7..4)
#pragma unroll
		for (int i = 7; i >= 4; i--){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 4){
				clt_offset -=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 12..15: sign row 3, window index descends 3..0, source row advances again
#pragma unroll
		for (int i = 3; i >= 0; i--){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 0){
				clt_offset +=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
	}
#ifdef DEBUG7
	__syncthreads();// __syncwarp();
	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
		if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)){
			printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
			debug_print_mclt(
					mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
					-1);
		}
		__syncthreads();// __syncwarp();
	}
	__syncthreads();// __syncwarp();
#endif
}
__device__ void debayer_shot(
__device__ void debayer_shot(
const int rb_mode, // 0 - green, 1 - r/b
const int rb_mode, // 0 - green, 1 - r/b
...
...
src/dtt8x8.cu
View file @
3c3ebfa9
/**
/**
**
**
** dtt8x8.cu
- CPU test code to run GPU tile processor
** dtt8x8.cu
h
**
**
** Copyright (C) 2018 Elphel, Inc.
** Copyright (C) 2018 Elphel, Inc.
**
**
** -----------------------------------------------------------------------------**
** -----------------------------------------------------------------------------**
**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** dtt8x8.cu
h
is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
** (at your option) any later version.
...
@@ -30,1069 +30,1255 @@
...
@@ -30,1069 +30,1255 @@
** -----------------------------------------------------------------------------**
** -----------------------------------------------------------------------------**
*/
*/
#include <stdio.h>
/**
#include <stdlib.h>
**************************************************************************
#include <math.h>
* \file dtt8x8.cuh
#include <cuda_runtime.h>
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
#include <helper_cuda.h>
* in GPU
#include <helper_functions.h>
* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* software.
*
* This file is compatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#include "dtt8x8.h"
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
Python code to generate constant coefficients:
def dct_constants():
COSPI_1_8_SQRT2 = math.cos(math.pi/8)*math.sqrt(2.0)
COSPI_3_8_SQRT2 = math.cos(3*math.pi/8)*math.sqrt(2.0)
SQRT_2 = math.sqrt(2.0)
SQRT1_2 = 1/math.sqrt(2.0)
SQRT1_8 = 1/math.sqrt(8.0)
CN = [[math.cos((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
SN = [[math.sin((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
print("__constant__ float COSPI_1_8_SQRT2 = %ff;"%(COSPI_1_8_SQRT2))
print("__constant__ float COSPI_3_8_SQRT2 = %ff;"%(COSPI_3_8_SQRT2))
print("__constant__ float SQRT_2 = %ff;"% (SQRT_2))
print("__constant__ float SQRT1_2 = %ff;"% (SQRT1_2))
print("__constant__ float SQRT1_8 = %ff;"% (SQRT1_8))
print("__constant__ float COSN1[] = {%ff,%ff};"% (CN[0][0],CN[0][1]))
print("__constant__ float COSN2[] = {%ff,%ff,%ff,%ff};"% (CN[1][0],CN[1][1],CN[1][2],CN[1][3]))
print("__constant__ float SINN1[] = {%ff,%ff};"% (SN[0][0],SN[0][1]))
print("__constant__ float SINN2[] = {%ff,%ff,%ff,%ff};"% (SN[1][0],SN[1][1],SN[1][2],SN[1][3]))
*/
// Values produced by the Python snippet above (dct_constants())
__constant__ float COSPI_1_8_SQRT2 = 1.306563f; // cos(pi/8)*sqrt(2)
__constant__ float COSPI_3_8_SQRT2 = 0.541196f; // cos(3*pi/8)*sqrt(2)
__constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f; // 1/sqrt(2)
__constant__ float SQRT1_8 = 0.353553f; // 1/sqrt(8)
__constant__ float COSN1[] = {0.980785f,0.831470f};                     // cos((2k+1)*pi/16), k = 0..1
__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f}; // cos((2k+1)*pi/32), k = 0..3
__constant__ float SINN1[] = {0.195090f,0.555570f};                     // sin((2k+1)*pi/16), k = 0..1
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f}; // sin((2k+1)*pi/32), k = 0..3
// Source offset inside each 8x9 (DTT_SIZE x DTT_SIZE1, stride 9) quadrant for the
// 16 output columns of the inverse CLT unfold: all in row 4 (0x24 = 4*9), columns
// mirrored 4,5,6,7,7,6,5,4 then 3,2,1,0,0,1,2,3.
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x1f,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
// Per-quadrant sign LUT for the inverse CLT unfold; each element applies to a
// 4x4 group of the 16x16 output (indexed by row group [0..3] and column4).
__constant__ float idct_signs[4][4][4] ={
		{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1,-1,-1,-1},
				{-1, 1, 1, 1},
				{-1, 1, 1, 1},
				{-1, 1, 1, 1}
		},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1, 1, 1,-1},
				{-1,-1,-1, 1},
				{-1,-1,-1, 1},
				{-1,-1,-1, 1}
		},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1,-1,-1,-1},
				{ 1,-1,-1,-1},
				{ 1,-1,-1,-1},
				{-1, 1, 1, 1}
		},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1, 1, 1,-1},
				{ 1, 1, 1,-1},
				{ 1, 1, 1,-1},
				{-1,-1,-1, 1}
		}};
// Half-amplitude analysis window: HWINDOW2[i] = 0.5*sin((2*i+1)*pi/32) (== SINN2 extended, scaled by 0.5)
__constant__ float HWINDOW2[] =   {0.049009f, 0.145142f, 0.235698f, 0.317197f,
		0.386505f, 0.440961f, 0.478470f, 0.497592f};
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
#include "dtt8x8.cuh"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual conversions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
* \param src_stride [IN] - Source image stride
* \param mode [IN] - DTT mode:
* 0 - horizontal DCT-IV followed by vertical DCT-IV
* 1 - horizontal DST-IV followed by vertical DCT-IV
* 2 - horizontal DCT-IV followed by vertical DST-IV
* 3 - horizontal DST-IV followed by vertical DST-IV
* 4 - horizontal DCT-II followed by vertical DCT-II
* 5 - horizontal DST-II followed by vertical DCT-II
* 6 - horizontal DCT-II followed by vertical DST-II
* 7 - horizontal DST-II followed by vertical DST-II
*
* \return None
*/
float * copyalloc_kernel_gpu(float * kernel_host,
extern "C"
int size, // size in floats
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
int full_size)
{
{
float *kernel_gpu;
int dtt_mode0 = dtt_mode & 1;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
int dtt_mode1 = (dtt_mode >>1) & 1;
checkCudaErrors(cudaMemcpy( // segfault
kernel_gpu,
kernel_host,
size * sizeof(float),
cudaMemcpyHostToDevice));
return kernel_gpu;
}
float * copyalloc_kernel_gpu(float * kernel_host,
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
int size)
{
return copyalloc_kernel_gpu(kernel_host,
size, // size in floats
size);
}
int OffsThreadInRow = threadIdx.y * DTT_SIZE + threadIdx.x;
int OffsThreadInCol = threadIdx.z * DTT_SIZE;
src += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
dst += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
float *bl_ptr = block + OffsThreadInCol * DTTTEST_BLK_STRIDE + OffsThreadInRow;
#pragma unroll
float * alloccopy_from_gpu(
for (unsigned int i = 0; i < DTT_SIZE; i++)
float * gpu_data,
bl_ptr[i * DTTTEST_BLK_STRIDE] = src[i * src_stride];
float * cpu_data, // if null, will allocate
int size)
__syncthreads();
{
// horizontal pass
if (!cpu_data) {
if (dtt_mode > 3) {
cpu_data = (float *)malloc(size*sizeof(float));
dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else {
dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
}
}
checkCudaErrors(cudaMemcpy( // segfault
cpu_data,
gpu_data,
size * sizeof(float),
cudaMemcpyDeviceToHost));
return cpu_data;
__syncthreads();
// vertical pass
if (dtt_mode > 3) {
dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else {
dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
}
__syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
}
float * alloc_kernel_gpu(int size) // size in floats
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
{
float *kernel_gpu;
float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
return kernel_gpu;
}
float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]);
int size) // number of entries (cameras)
float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
{
float ** gpu_pointer_to_gpu_pointers;
checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
checkCudaErrors(cudaMemcpy(
gpu_pointer_to_gpu_pointers,
gpu_pointer,
size * sizeof(float*),
cudaMemcpyHostToDevice));
return gpu_pointer_to_gpu_pointers;
}
float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
float * copyalloc_image_gpu(float * image_host,
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
size_t* dstride, // in floats !
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
checkCudaErrors(cudaMemcpy2D(
image_gpu,
*dstride, // * sizeof(float),
image_host,
width * sizeof(float), // make in 16*n?
width * sizeof(float),
height,
cudaMemcpyHostToDevice));
return image_gpu;
}
float * alloc_image_gpu(size_t* dstride, // in bytes!!
float ua00= u00 + u03;
int width,
float ua10= u00 - u03;
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
return image_gpu;
}
int readFloatsFromFile(float * data, // allocated array
float ua01= u01 + u02;
const char * path) // file path
float ua11= u01 - u02;
{
std::ifstream input(path, std::ios::binary );
float v00= ua00 + ua01;
// copies all data into buffer
float v02= ua00 - ua01;
std::vector<char> buffer((
std::istreambuf_iterator<char>(input)),
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
(std::istreambuf_iterator<char>()));
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
std::copy( buffer.begin(), buffer.end(), (char *) data);
return 0;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
}
int writeFloatsToFile(float * data, // allocated array
float ub00= u10 + u13;
int size, // length in elements
float ub10= u10 - u13;
const char * path) // file path
{
float ub01= u11 + u12;
float ub11= u11 - u12;
// std::ifstream input(path, std::ios::binary );
float vb00= ub00 + ub01;
std::ofstream ofile(path, std::ios::binary);
float vb01= ub00 - ub01;
ofile.write((char *) data, size * sizeof(float));
return 0;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
y[0] = SQRT_2 * v00; // w0[0];
y[1] = v01 - vb11; // w1[0];
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
}
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
__device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
void set_clt_lpf(
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
float * lpf, // size*size array to be filled out
float sigma,
const int dct_size)
{
{
int dct_len = dct_size * dct_size;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (sigma == 0.0f) {
if (dst_not_dct) { // DSTIV
lpf[0] = 1.0f;
u00= ( COSN2[0] * x7 + SINN2[0] * x0);
for (int i = 1; i < dct_len; i++){
u10= (-SINN2[3] * x4 + COSN2[3] * x3);
lpf[i] = 0.0;
u01= ( COSN2[1] * x6 + SINN2[1] * x1);
u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
u02= ( COSN2[2] * x5 + SINN2[2] * x2);
u12= (-SINN2[1] * x6 + COSN2[1] * x1);
u03= ( COSN2[3] * x4 + SINN2[3] * x3);
u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00= ( COSN2[0] * x0 + SINN2[0] * x7);
u10= (-SINN2[3] * x3 + COSN2[3] * x4);
u01= ( COSN2[1] * x1 + SINN2[1] * x6);
u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
u02= ( COSN2[2] * x2 + SINN2[2] * x5);
u12= (-SINN2[1] * x1 + COSN2[1] * x6);
u03= ( COSN2[3] * x3 + SINN2[3] * x4);
u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
}
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
// j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
*y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
} else {
} else {
for (int i = 0; i < dct_size; i++){
*y1 = (v01 - vb11) * SQRT1_8; // w1[0];
for (int j = 0; j < dct_size; j++){
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
lpf[i*dct_size+j] = exp(-(i*i+j*j)/(2*sigma));
*y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = vb00 * 0.5f; // w1[3];
}
}
}
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
}
// normalize
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
double sum = 0;
for (int i = 0; i < dct_size; i++){
float w00= u00 + u03;
for (int j = 0; j < dct_size; j++){
float w10= u00 - u03;
double d = lpf[i*dct_size+j];
d*=cos(M_PI*i/(2*dct_size))*cos(M_PI*j/(2*dct_size));
float w01= (u01 + u02);
if (i > 0) d*= 2.0;
float w11= (u01 - u02);
if (j > 0) d*= 2.0;
sum +=d;
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
}
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
}
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
for (int i = 0; i< dct_len; i++){
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
lpf[i] /= sum;
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
}
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
} else {
*x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
/**
float w01= (u01 + u02);
**************************************************************************
float w11= (u01 - u02);
* Program entry point
*
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
* \param argc [IN] - Number of command-line arguments
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
* \param argv [IN] - Array of command-line arguments
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
*
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
* \return Status code
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
*/
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
int main(int argc, char **argv)
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
{
//
float *x1 = x0 + inc;
// Sample initialization
float *x2 = x1 + inc;
//
float *x3 = x2 + inc;
printf("%s Starting...\n\n", argv[0]);
float *x4 = x3 + inc;
printf("sizeof(float*)=%d\n",(int)sizeof(float*));
float *x5 = x4 + inc;
float *x6 = x5 + inc;
//initialize CUDA
float *x7 = x6 + inc;
findCudaDevice(argc, (const char **)argv);
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
// CLT testing
u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
const char* kernel_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
const char* kernel_offs_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
} else { // DCTIV
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
const char* image_files[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
const char* ports_offs_xy_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
}
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
float port_offsets[NUM_CAMS][2] = {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
int keep_texture_weights = 1; // try with 0 also
int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
/*
float ua00= u00 + u03;
#define IMG_WIDTH 2592
float ua10= u00 - u03;
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_STEP 16
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNEL_OFFSETS 8
#define TILESX 324
#define TILESY 242
*/
/*
struct tp_task {
long task;
short ty;
short tx;
float xy[NUM_CAMS][2];
} ;
*/
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
// int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1)
;
float ua01= u01 + u02
;
int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1)
;
float ua11= u01 - u02
;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float)
);
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13
);
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
float ub00= u10 + u13;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
float ub10= u10 - u13;
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float ub01= u11 + u12;
float * gpu_kernels_h [NUM_CAMS];
float ub11= u11 - u12;
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS]; // never used
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
float vb00= ub00 + ub01;
int * gpu_corr_indices;
float vb01= ub00 - ub01;
float * gpu_textures;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float * gpu_textures_rbga;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_num_texture_tiles;
*x0 = v00 * 0.5f; // w0[0];
float * gpu_port_offsets;
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
int num_corrs;
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
int num_textures;
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
int num_ports = NUM_CAMS;
if (dst_not_dct) { // DSTIV
// GPU pointers to GPU pointers to memory
*x1 = (vb11 - v01) * SQRT1_8; // w1[0];
float ** gpu_kernels; // [NUM_CAMS];
*x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
*x5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
float ** gpu_images; // [NUM_CAMS];
*x7 = -vb00 * 0.5f; // w1[3];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
set_clt_lpf (
lpf_rbg[ncol], // float * lpf, // size*size array to be filled out
lpf_sigmas[ncol], // float sigma,
8); // int dct_size)
gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
} else {
} else {
gpu_lpf_h[ncol] = NULL;
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
}
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
}
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
readFloatsFromFile(
{
host_kern_buf, // float * data, // allocated array
float sgn = 1 - 2* dst_not_dct;
kernel_file[ncam]); // char * path) // file path
float *y0 = x;
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);
float *y1 = y0 + inc;
float *y2 = y1 + inc;
readFloatsFromFile(
float *y3 = y2 + inc;
host_kern_buf, // float * data, // allocated array
float *y4 = y3 + inc;
kernel_offs_file[ncam]); // char * path) // file path
float *y5 = y4 + inc;
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
float *y6 = y5 + inc;
host_kern_buf,
float *y7 = y6 + inc;
KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
float *x0 = x + dst_not_dct * 7 * inc;
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
// negate inc, replace
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE)) ;
inc *= sgn;
// allocate result images (3x height to accommodate 3 colors
float *x1 = x0 + inc;
float *x2 = x1 + inc;
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
float *x3 = x2 + inc;
//host array of pointers to GPU arrays
float *x4 = x3 + inc;
#ifndef NOICLT
float *x5 = x4 + inc;
gpu_corr_images_h[ncam] = alloc_image_gpu(
float *x6 = x5 + inc;
&dstride_rslt, // size_t* dstride, // in bytes!!
float *x7 = x6 + inc;
IMG_WIDTH + DTT_SIZE, // int width,
float u00, u01, u02, u03, u10, u11, u12, u13;
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
#endif
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
gpu_corrs = alloc_image_gpu(
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
NUM_PAIRS * TILESX * TILESY); // int height);
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
readFloatsFromFile(
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
host_kern_buf, // float * data, // allocated array
image_files[ncam]); // char * path) // file path
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
float ua00= u00 + u03;
&dstride, // size_t* dstride,
float ua10= u00 - u03;
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
float ua01= u01 + u02;
}
float ua11= u01 - u02;
//#define DBG_TILE (174*324 +118)
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
*y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = sgn * vb00 * 0.5f; // w1[3];
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void dctiv_nodiverg(float * x0, int inc)
readFloatsFromFile(
{
(float *) &tile_coords_h[ncam],
float *x1 = x0 + inc;
ports_offs_xy_file[ncam]); // char * path) // file path
float *x2 = x1 + inc;
}
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
// build TP task that processes all tiles in linescan order
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
for (int ty = 0; ty < TILESY; ty++){
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
#ifdef DBG0
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
task_data[t].task = 1;
task_data[t].txy = ((DBG_TILE + t) - 324* ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324)) << 16;
int nt = task_data[t].ty * TILESX + task_data[t].tx;
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
float ua00= u00 + u03;
task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
float ua10= u00 - u03;
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
float ua01= u01 + u02;
float ua11= u01 - u02;
// segfault in the next
float v00= ua00 + ua01;
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
float v02= ua00 - ua01;
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
// build texture_indices
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
int cm = task_data[nt].task & TASK_TEXTURE_BITS;
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2)
;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11
;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
// int keep_texture_weights = 1; // try with 0 also
float ub01= u11 + u12;
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
float ub11= u11 - u12;
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
*x0 = v00 * 0.5f; // w0[0];
int rgba_height = (TILESY+1) * DTT_SIZE;
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
int rbga_slices = texture_colors + 1; // 4/1
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
gpu_textures_rbga = alloc_image_gpu(
inline __device__ void dstiv_nodiverg(float * x, int inc)
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
{
rgba_width, // int width (floats),
float *x0 = x + 7 * inc;
rgba_height * rbga_slices); // int height);
// negate inc, replace
inc = -inc;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
gpu_images = copyalloc_pointers_gpu (gpu_images_h, NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
//create and start CUDA timer
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
float ua00= u00 + u03;
float ua10= u00 - u03;
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
float ua01= u01 + u02;
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
float ua11= u01 - u02;
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
float v00= ua00 + ua01;
const int numIterations = 1; //0;
float v02= ua00 - ua01;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
getLastCudaError("Kernel execution failed");
float ub00= u10 + u13;
checkCudaErrors(cudaDeviceSynchronize());
float ub10= u10 - u13;
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
printf("Run time =%f ms\n", avgTime);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path
#endif
}
#endif
#ifdef TEST_IMCLT
float ub01= u11 + u12;
{
float ub11= u11 - u12;
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
#ifndef NOICLT
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
// testing imclt
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
for (int v_offs = 0; v_offs < 1; v_offs++){ // temporarily for debugging
for (int h_offs = 0; h_offs < 1; h_offs++){ // temporarily for debugging
#else
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
#endif
int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
color, // int color,
v_offs, // int v_offset,
h_offs, // int h_offset,
dstride_rslt/sizeof(float)); //const size_t dstride); // in floats (pixels)
}
}
}
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerIMCLT)
;
*x7 = v00 * 0.5f; // w0[0]
;
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations
;
*x5 = (v01 + vb11) * SQRT1_8; // w0[1]
;
sdkDeleteTimer(&timerIMCLT)
;
*x3 = (v02 - vb01) * SQRT1_8; // w0[2]
;
printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT)
;
*x1 = (v03 + vb10) * SQRT1_8; // w0[3]
;
int rslt_img_size = NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
*x6 = (vb11 - v01) * SQRT1_8; // w1[0];
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
*x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x2 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x0 = -vb00 * 0.5f; // w1[3];
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
checkCudaErrors(cudaMemcpy2D( // segfault
{
cpu_corr_image,
float u00= (x[0] + x[7]);
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
float u10= (x[0] - x[7]);
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
float u01= (x[1] + x[6]);
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
float u11= (x[1] - x[6]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
#endif
}
free(cpu_corr_image
);
float u02= (x[2] + x[5]
);
#endif
float u12= (x[2] - x[5]);
float u03= (x[3] + x[4]);
float u13= (x[3] - x[4]);
#ifndef NOCORR
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// cudaProfilerStart();
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
float w00= u00 + u03;
correlate2D<<<grid_corr,threads_corr>>>(
float w10= u00 - u03;
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float w01= (u01 + u02);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
float w11= (u01 - u02);
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
float v00= w00 + w01;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float v02= w00 - w01;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
checkCudaErrors(cudaMemcpy2D(
// _dctii_nrecurs2(u00, u01, &v00, &v01);
cpu_corr,
float z00= w20 + w21;
(corr_size * corr_size) * sizeof(float),
float z01= w20 - w21;
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
// _dctii_nrecurs2(u10, u11, &v10, &v11);
printf("Writing phase correlation data to %s\n", result_corr_file);
float z10= w30 + w31;
writeFloatsToFile(
float z11= w30 - w31;
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
float v10 = SQRT_2 * z00;
result_corr_file); // const char * path) // file path
float v11 = z01 - z11;
#endif
free(cpu_corr);
float v12 = z01 + z11;
#endif // ifndef NOCORR
float v13 = SQRT_2 * z10;
y[0] = v00;
// -----------------
y[1] = v10;
#ifndef NOTEXTURES
y[2] = v01;
// cudaProfilerStart();
y[3] = v11;
// testing textures
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
y[4] = v02;
dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
y[5] = v12;
printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
y[6] = v03;
StopWatchInterface *timerTEXTURE = 0;
y[7] = v13;
sdkCreateTimer(&timerTEXTURE);
}
for (int i = i0; i < numIterations; i++)
inline __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
{
if (i == 0)
_dctii_nrecurs8(x, y);
{
#pragma unroll
checkCudaErrors(cudaDeviceSynchronize());
for (int i = 0; i < 8 ; i++) {
sdkResetTimer(&timerTEXTURE);
y[i] *= SQRT1_8;
sdkStartTimer(&timerTEXTURE);
}
}
}
// Channel0 weight = 0.294118
__device__ void dct_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
// Channel1 weight = 0.117647
{
// Channel2 weight = 0.588235
_dctiv_nrecurs8(x, y);
textures_accumulate<<<grid_texture,threads_texture>>> (
#pragma unroll
// 0, // int border_tile, // if 1 - watch for border
for (int i = 0; i < 8 ; i++) {
(int *) 0, // int * woi, // x, y, width,height
y[i] *= SQRT1_8;
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // const size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
}
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
{
float xr[8];
int rslt_texture_size = num_textures * tile_texture_size;
#pragma unroll
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
for (int i=0; i < 8;i++){
xr[i] = x[7 - i];
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
}
printf("%10.4f", *(cpu_textures + texture_offset + i)
);
_dctiv_nrecurs8(xr, y
);
if (((i + 1) % 16) == 0){
#pragma unroll
printf("\n");
for (int i=0; i < 8;i+=2){
} else {
y[i] *= SQRT1_8;
printf(" ")
;
y[i+1] *= -SQRT1_8
;
}
}
}
//=========================== 2D functions ===============
__device__ void corrUnfoldTile(
int corr_radius,
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int size2r1 = 2 * corr_radius + 1; // 15
int crp1 = corr_radius + 1; //8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
if (i > corr_radius) {
return; // not needed, only use inner
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
#endif // DEBUG9
float corr_pixscale = 0.25f;
#endif
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
free(cpu_textures);
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
#endif // ifndef NOTEXTURES
/// int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_row_offs = i * size2r1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
#define GEN_TEXTURE_LIST
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
#ifdef GEN_TEXTURE_LIST
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
dim3 grid_list (1,1,1);
/// for (int j = 1; j < DTT_SIZE; j++) {
printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
for (int j = 1; j <= corr_radius; j++) {
printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
int rslt_base_index_pp = rslt_base_index_p + j;
StopWatchInterface *timerTEXTURELIST = 0;
int rslt_base_index_pm = rslt_base_index_p - j;
sdkCreateTimer(&timerTEXTURELIST);
rslt[rslt_base_index_pp] = corr_pixscale * (
for (int i = i0; i < numIterations; i++)
qdata0[i_transform_size + j] +
{
qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
if (i == 0)
rslt[rslt_base_index_pm] = corr_pixscale * (
{
qdata0[i_transform_size + j] +
checkCudaErrors(cudaDeviceSynchronize());
-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
sdkResetTimer(&timerTEXTURELIST);
sdkStartTimer(&timerTEXTURELIST);
}
}
if (i == 0) {
prepare_texture_list<<<grid_list,threads_list>>> (
return;
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY); // int height); // <= TILESY, use for faster processing of LWIR images
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
/// cudaProfilerStop();
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
sdkStopTimer(&timerTEXTURELIST);
float d = corr_pixscale * qdata2[im1_transform_size];
float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
rslt[rslt_base_index_p] += d;
sdkDeleteTimer(&timerTEXTURELIST);
rslt[rslt_base_index_m] -= d;
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int cpu_num_texture_tiles[8];
int rslt_base_index_pm = rslt_base_index_p - j;
checkCudaErrors(cudaMemcpy(
int rslt_base_index_mp = rslt_base_index_m + j;
cpu_woi,
int rslt_base_index_mm = rslt_base_index_m - j;
gpu_woi,
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
4 * sizeof(float),
float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
cudaMemcpyDeviceToHost));
//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
checkCudaErrors(cudaMemcpy(
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
cpu_num_texture_tiles,
rslt[rslt_base_index_pp] += d2 + d3;
gpu_num_texture_tiles,
rslt[rslt_base_index_pm] += d2 - d3;
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
tp_task_size
);
for (int q = 0; q < 4; q++) {
checkCudaErrors(cudaMemcpy(
texture_indices + q * TILESX * (TILESYA >> 2),
gpu_texture_indices + q * TILESX * (TILESYA >> 2),
cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
cudaMemcpyDeviceToHost));
}
}
for (int q = 0; q < 4; q++) {
}
printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
__device__ void dttii_2d(
(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
float * clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
{
(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
// change to 16-32 threads?? in next iteration
(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
for (int q = 0; q < 4; q++){
(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
int is_sin = (q >> 1) & 1;
(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
}
#endif //GEN_TEXTURE_LIST
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
}
#ifndef NOTEXTURE_RGBA
__device__ void dttiv_color_2d(
dim3 threads_rgba(1, 1, 1);
float * clt_tile,
dim3 grid_rgba(1,1,1);
int color)
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
{
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
dctiv_nodiverg( // all colors
StopWatchInterface *timerRGBA = 0;
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
sdkCreateTimer(&timerRGBA);
1); //int inc);
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
1); //int inc);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
}
__syncthreads();// __syncwarp();
generate_RBGA<<<grid_rgba,threads_rgba>>> (
#ifdef DEBUG222
// Parameters to generate texture tasks
if ((threadIdx.x) == 0){
gpu_tasks, // struct tp_task * gpu_tasks,
printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
tp_task_size, // int num_tiles, // number of tiles in task list
debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
sdkStopTimer(&timerRGBA);
__syncthreads();// __syncwarp();
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use larger array (4 pixels each size, switch to cudaMemcpy2DFromArray()
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#endif
#ifdef DEBUG11
dctiv_nodiverg( // all colors
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
for (int chn = 0; chn < rbga_slices; chn++){
DTT_SIZE1); // int inc,
printf("\nchn = %d\n", chn);
if (color == BAYER_GREEN){
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
for (int i = 0; i < 8; i++){
DTT_SIZE1); // int inc,
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
}
#endif // DEBUG11
__syncthreads();// __syncwarp();
free(cpu_textures_rgba);
}
#endif // ifndef NOTEXTURES
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
// Inverse CLT (iMCLT) of one tile: 16-thread version.
// Takes the 4 8x8 DTT quadrants of one CLT tile, applies the inverse 2-D
// transform in place in clt_tile (DTT-IV row pass, then column pass), and
// unfolds the four 8x8 results into a 16x16 pixel tile through the separable
// half-window (HWINDOW2) and the idct_signs sign tables, accumulating into
// mclt_tile with read-modify-write. The destination must be zeroed before the
// first of the 4 overlapping passes over the frame (see comment above).
//
// Thread layout: 16 threads, threadIdx.x = 0..15, one output column each.
//
// clt_tile  - in/out: 4 quadrants, [4][DTT_SIZE][DTT_SIZE1] (row stride 9,
//             +1 padding to alternate shared-memory column ports); clobbered
//             by the in-place inverse transform.
// mclt_tile - accumulator tile, row stride DTT_SIZE21 (16x16 payload).
__device__ void imclt(
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile ) //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
	int thr3 =    threadIdx.x >> 3;  // 0 for output columns 0..7, 1 for columns 8..15
	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
	int thr012 =  threadIdx.x & 7;   // column index within the 8-wide half
	int column4 = threadIdx.x >> 2;  // 0..3 - index into the idct_signs tables
	// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
	// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0 - window index, mirrored for the right half
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE); // quadrant 1
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG3
	if ((threadIdx.x) == 0){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Row pass: each of the 16 threads transforms one 8-element row with
	// stride 1; thr3 selects which pair of quadrants this thread works on.
	// NOTE(review): the per-call comments below disagree with the headline
	// comments about which quadrants are touched (2*DTT_SIZE*thr3 selects row
	// blocks 0 and 2 of the stacked quadrants) - verify against the Java
	// reference implementation.
	// perform horizontal dct-iv on quadrants 0 and 1
	dctiv_nodiverg(
			clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
			1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // all colors
			clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
			1);
	__syncthreads();// __syncwarp();
	// Column pass over the intermediate results (stride DTT_SIZE1 = one row).
	// perform vertical dct-iv on quadrants 0 and 2
	dctiv_nodiverg(
			clt_tile +  thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
			DTT_SIZE1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg(
			clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG3
	if ((threadIdx.x) == 0){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Unfold: combine the 4 transformed quadrants with per-quadrant signs
	// (idct_signs[quadrant][row-group][column4]) weighted by the separable
	// window w = HWINDOW2[i] * HWINDOW2[wcolumn], walking down this thread's
	// output column in 4 groups of 4 rows. clt_offset zig-zags through the
	// mirrored source rows (+= / -= DTT_SIZE1 per group).
	float hw = HWINDOW2[wcolumn];
	int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
	float * rslt = mclt_tile + column;
#pragma unroll
	for (int i = 0; i < 4; i++){          // output rows 0..3
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i < 3){
			clt_offset +=  DTT_SIZE1;
		}
		// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		// NOTE(review): __fmaf_rd rounds toward -inf; __fmaf_rn (or fmaf)
		// would be the round-to-nearest choice - confirm this is intentional.
		val = __fmaf_rd(w,d0,val); // w*d0 + val
		*rslt = val;
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 4; i < 8; i++){          // output rows 4..7
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		// if (i < 7){  // decrement unconditionally - the next group starts one row back anyway
		clt_offset -=  DTT_SIZE1;
		// }
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 7; i >= 4; i--){         // output rows 8..11 (mirrored window index)
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i > 4){
			clt_offset -=  DTT_SIZE1;
		}
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 3; i >= 0; i--){         // output rows 12..15 (mirrored window index)
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i > 0){
			clt_offset +=  DTT_SIZE1;
		}
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#ifdef DEBUG3
	__syncthreads();// __syncwarp();
	if ((threadIdx.x) == 0){
		printf("\nMCLT Tiles after IMCLT\n");
		debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
// Inverse CLT (iMCLT) of one tile: 8-thread version.
// Same math as imclt() above, but each of the 8 threads produces two output
// columns: the left/right halves (thr3 = 0 and 1) are handled sequentially
// in a loop instead of by 16 threads, and the result can either overwrite
// or accumulate into mclt_tile.
//
// do_acc    - 1: add the unfolded result to the existing mclt_tile contents
//             (read-modify-write), 0: overwrite. Must be uniform across the
//             block (all threads take the same branch of the ternary).
// clt_tile  - in/out: 4 quadrants, [4][DTT_SIZE][DTT_SIZE1] (row stride 9);
//             clobbered by the in-place inverse transform.
// mclt_tile - output tile, row stride DTT_SIZE21.
// debug     - nonzero enables the DEBUG7 printouts (thread (0,0) only).
__device__ void imclt8threads(
		int     do_acc,     // 1 - add to previous value, 0 - overwrite
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile,  //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
		int debug)
{
	// Leftovers from the 16-thread version (thr3 etc. are now computed per
	// half inside the unfold loop below):
	// int thr3 = threadIdx.x >> 3;
	// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
	// int thr012 = threadIdx.x & 7;
	// int column4 = threadIdx.x >> 2;
	// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE); // quadrant 1
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// First inverse-DTT pass: each of the 8 threads handles one line of every
	// quadrant (one dctiv/dstiv call per quadrant).
	// NOTE(review): the horizontal/vertical labels here are transposed
	// relative to imclt() above (the "horizontal" calls use stride DTT_SIZE1,
	// i.e. step across rows) - verify the intended orientation.
	// perform horizontal dct-iv on quadrants 0 and 1
	dctiv_nodiverg( // quadrant 0
			clt_tile + threadIdx.x,                              // pointer to start of row for quadrant 0
			DTT_SIZE1);
	dctiv_nodiverg( // quadrant 1
			clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			DTT_SIZE1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // quadrant 2
			clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			DTT_SIZE1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
	// Second inverse-DTT pass, along the other axis (stride 1 within a row).
	// perform vertical dct-iv on quadrants 0 and 2
	dctiv_nodiverg( // quadrant 0
			clt_tile + DTT_SIZE1 * threadIdx.x,                              // pointer to start of row for quadrant 0
			1);
	dctiv_nodiverg( // quadrant 2
			clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg( // quadrant 1
			clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Unfold the 4 quadrants into the 16x16 tile, re-using the 16-thread code
	// (thr3 was bit 3 of threadIdx.x there); each thread now does both output
	// columns column = threadIdx.x and threadIdx.x + 8 sequentially.
	for (int thr3 = 0; thr3 < 2; thr3++){
		int thr3m = (thr3 << 3);
		int column =  threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
		int thr012 =  threadIdx.x & 7;     // == threadIdx.x
		int column4 = column >> 2;         //  (threadIdx.x >> 2) | (thr3 << 1) ; // different !
		int wcolumn = (thr3m - thr3) ^ thr012; //0..7,7,..0 - window index, mirrored for the right half
		float hw = HWINDOW2[wcolumn];
		int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
		float * rslt = mclt_tile + column;
#ifdef DEBUG7
		if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
			printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
					thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
			debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
		}
		__syncthreads();// __syncwarp();
#endif
		// 4 groups of 4 output rows; idct_signs[quadrant][row-group][column4]
		// provides the per-quadrant signs, w = HWINDOW2[i] * HWINDOW2[wcolumn]
		// is the separable window; clt_offset zig-zags through mirrored rows.
#pragma unroll
		for (int i = 0; i < 4; i++){          // output rows 0..3
			float val = *rslt;
			// facc
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i < 3){
				clt_offset +=  DTT_SIZE1;
			}
			// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			// val =__fmaf_rd(w,d0,val); // w*d0 + val
			// *rslt = val;
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 4; i < 8; i++){          // output rows 4..7
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			// if (i < 7){  // decrement unconditionally - the next group starts one row back anyway
			clt_offset -=  DTT_SIZE1;
			// }
			// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 7; i >= 4; i--){         // output rows 8..11 (mirrored window index)
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 4){
				clt_offset -=  DTT_SIZE1;
			}
			//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 3; i >= 0; i--){         // output rows 12..15 (mirrored window index)
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 0){
				clt_offset +=  DTT_SIZE1;
			}
			//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
	}
#ifdef DEBUG7
	__syncthreads();// __syncwarp();
	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
		if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)){
			printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
			debug_print_mclt(
					mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
					-1);
		}
		__syncthreads();// __syncwarp();
	}
	__syncthreads();// __syncwarp();
#endif
}
#ifdef SAVE_CLT
//#endif
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
}
checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
exit(0);
}
src/dtt8x8.cuh
deleted
100644 → 0
View file @
0253bab4
/**
**
** dtt8x8.cuh
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cuh is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file dtt8x8.cuh
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
* in GPU
* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* software.
*
* This file is compatible with both the runtime and driver APIs; the runtime API is used
* for development with Nvidia Nsight, the driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
Python code to generate constant coefficients:
def dct_constants():
COSPI_1_8_SQRT2 = math.cos(math.pi/8)*math.sqrt(2.0)
COSPI_3_8_SQRT2 = math.cos(3*math.pi/8)*math.sqrt(2.0)
SQRT_2 = math.sqrt(2.0)
SQRT1_2 = 1/math.sqrt(2.0)
SQRT1_8 = 1/math.sqrt(8.0)
CN = [[math.cos((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
SN = [[math.sin((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
print("__constant__ float COSPI_1_8_SQRT2 = %ff;"%(COSPI_1_8_SQRT2))
print("__constant__ float COSPI_3_8_SQRT2 = %ff;"%(COSPI_3_8_SQRT2))
print("__constant__ float SQRT_2 = %ff;"% (SQRT_2))
print("__constant__ float SQRT1_2 = %ff;"% (SQRT1_2))
print("__constant__ float SQRT1_8 = %ff;"% (SQRT1_8))
print("__constant__ float COSN1[] = {%ff,%ff};"% (CN[0][0],CN[0][1]))
print("__constant__ float COSN2[] = {%ff,%ff,%ff,%ff};"% (CN[1][0],CN[1][1],CN[1][2],CN[1][3]))
print("__constant__ float SINN1[] = {%ff,%ff};"% (SN[0][0],SN[0][1]))
print("__constant__ float SINN2[] = {%ff,%ff,%ff,%ff};"% (SN[1][0],SN[1][1],SN[1][2],SN[1][3]))
*/
// Precomputed transform constants (values generated by the Python snippet in the
// comment above). Stored in constant memory: all threads of a warp read the same
// element, so accesses are broadcast.
__constant__ float COSPI_1_8_SQRT2 = 1.306563f; // cos(pi/8)*sqrt(2)
__constant__ float COSPI_3_8_SQRT2 = 0.541196f; // cos(3*pi/8)*sqrt(2)
__constant__ float SQRT_2 = 1.414214f;          // sqrt(2)
__constant__ float SQRT1_2 = 0.707107f;         // 1/sqrt(2)
__constant__ float SQRT1_8 = 0.353553f;         // 1/sqrt(8) - orthonormal scale of the 8-point DTT
__constant__ float COSN1[] = {0.980785f,0.831470f};                     // cos((2k+1)*pi/16), k = 0..1
__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f}; // cos((2k+1)*pi/32), k = 0..3
__constant__ float SINN1[] = {0.195090f,0.555570f};                     // sin((2k+1)*pi/16), k = 0..1
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f}; // sin((2k+1)*pi/32), k = 0..3
// Forward declarations of the 8-point DTT building blocks defined below.
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used
inline __device__ void dctiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dct_ii8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual conversions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
* \param src_stride [IN] - Source image stride
* \param mode [IN] - DTT mode:
* 0 - horizontal DCT-IV followed by vertical DCT-IV
* 1 - horizontal DST-IV followed by vertical DCT-IV
* 2 - horizontal DCT-IV followed by vertical DST-IV
* 3 - horizontal DST-IV followed by vertical DST-IV
* 4 - horizontal DCT-II followed by vertical DCT-II
* 5 - horizontal DST-II followed by vertical DCT-II
* 6 - horizontal DCT-II followed by vertical DST-II
* 7 - horizontal DST-II followed by vertical DST-II
*
* \return None
*/
// Applies a separable 8x8 DTT (mode-selected DCT/DST, types II or IV) to a
// DTTTEST_BLOCK_WIDTH x DTTTEST_BLOCK_HEIGHT tile of the source image.
//
// dst        - output image (coefficients), assumed to use the SAME stride as src
//              (only one stride parameter exists)
// src        - input image of floats
// src_stride - row stride in floats for both src and dst
// dtt_mode   - bit 0 selects DST (1) vs DCT (0) for the horizontal pass,
//              bit 1 - for the vertical pass; modes 0..3 use DTT-IV, 4..7 use DTT-II
//
// NOTE(review): indexing assumes blockDim == (DTT_SIZE,
// DTTTEST_BLOCK_WIDTH/DTT_SIZE, DTTTEST_BLOCK_HEIGHT/DTT_SIZE) - confirm
// against the host launcher.
extern "C"
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
{
    int dtt_mode0 = dtt_mode & 1;        // horizontal pass: DST if set, else DCT
    int dtt_mode1 = (dtt_mode >> 1) & 1; // vertical pass:   DST if set, else DCT
    // Shared tile; rows padded by 1 (DTTTEST_BLK_STRIDE) to avoid bank conflicts
    // on column (vertical-pass) accesses.
    __shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
    int OffsThreadInRow = threadIdx.y * DTT_SIZE + threadIdx.x; // column within the tile
    int OffsThreadInCol = threadIdx.z * DTT_SIZE;               // top row of this thread's 8-row strip
    src += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
    dst += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
    float *bl_ptr = block + OffsThreadInCol * DTTTEST_BLK_STRIDE + OffsThreadInRow;
    // Load 8 rows of this thread's column into shared memory (coalesced in x).
#pragma unroll
    for (unsigned int i = 0; i < DTT_SIZE; i++)
        bl_ptr[i * DTTTEST_BLK_STRIDE] = src[i * src_stride];
    __syncthreads();
    // horizontal pass: each thread transforms one shared-memory row in place
    if (dtt_mode > 3) {
        dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    } else {
        dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    }
    __syncthreads();
    // vertical pass: each thread transforms one shared-memory column in place
    if (dtt_mode > 3) {
        dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    } else {
        dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    }
    __syncthreads();
    // Store results; #pragma unroll added for consistency with the load loop above.
#pragma unroll
    for (unsigned int i = 0; i < DTT_SIZE; i++)
        dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
// Unscaled 8-point DCT-IV butterfly: stage 1 rotates mirrored input pairs
// (x[k], x[7-k]) by the COSN2/SINN2 angles, then two 4-point DCT-II-style
// butterflies produce the even/odd output halves. Output is NOT orthonormally
// scaled - callers (e.g. dct_iv8()) multiply by SQRT1_8.
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    // Stage 1: input rotations by (2k+1)*pi/32 (see COSN2/SINN2 tables).
    float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
    float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
    float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]);
    float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
    float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]);
    float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
    float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]);
    float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Interleave the two 4-point results into the 8 outputs.
    y[0] = SQRT_2 * v00; // w0[0];
    y[1] = v01 - vb11;   // w1[0];
    // j == 1
    y[2] = v01 + vb11; // w0[1];
    y[3] = v02 + vb01; // w1[1];
    // j == 2
    y[4] = v02 - vb01; // w0[2];
    y[5] = v03 - vb10; // w1[2]; - same as y[3]
    // j == 3
    y[6] = v03 + vb10;   // w0[3];
    y[7] = SQRT_2 * vb00; // w1[3];
}
// Orthonormal 8-point DTT-IV on scalar inputs x0..x7, writing results through
// output pointers y0..y7. dst_not_dct selects DST-IV (non-zero) or DCT-IV (0);
// the DST branch reads the inputs in reversed order and flips the signs of the
// odd outputs. Results are scaled by SQRT1_8 (0.5f for the first/last terms,
// which absorb an extra sqrt(2)).
inline __device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
        float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
{
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs; DST-IV uses the reversed input order.
    if (dst_not_dct) { // DSTIV
        u00= ( COSN2[0] * x7 + SINN2[0] * x0);
        u10= (-SINN2[3] * x4 + COSN2[3] * x3);
        u01= ( COSN2[1] * x6 + SINN2[1] * x1);
        u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
        u02= ( COSN2[2] * x5 + SINN2[2] * x2);
        u12= (-SINN2[1] * x6 + COSN2[1] * x1);
        u03= ( COSN2[3] * x4 + SINN2[3] * x3);
        u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
    } else { // DCTIV
        u00= ( COSN2[0] * x0 + SINN2[0] * x7);
        u10= (-SINN2[3] * x3 + COSN2[3] * x4);
        u01= ( COSN2[1] * x1 + SINN2[1] * x6);
        u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
        u02= ( COSN2[2] * x2 + SINN2[2] * x5);
        u12= (-SINN2[1] * x1 + COSN2[1] * x6);
        u03= ( COSN2[3] * x3 + SINN2[3] * x4);
        u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
    }
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs: common to DCT-IV and DST-IV.
    *y0 = v00 * 0.5f; // w0[0];
    // j == 1
    *y2 = (v01 + vb11) * SQRT1_8; // w0[1];
    // j == 2
    *y4 = (v02 - vb01) * SQRT1_8; // w0[2];
    // j == 3
    *y6 = (v03 + vb10) * SQRT1_8; // w0[3];
    // Odd outputs: DST-IV flips their signs relative to DCT-IV.
    if (dst_not_dct) { // DSTIV
        *y1 = (vb11 - v01) * SQRT1_8;  // w1[0];
        *y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
        *y5 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
        *y7 = -vb00 * 0.5f;            // w1[3];
    } else {
        *y1 = (v01 - vb11) * SQRT1_8; // w1[0];
        *y3 = (v02 + vb01) * SQRT1_8; // w1[1];
        *y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
        *y7 = vb00 * 0.5f;            // w1[3];
    }
}
// In-place orthonormal 8-point DTT-II over 8 samples starting at x0, spaced by
// inc floats (inc == 1 for a row, inc == row stride for a column of a shared
// memory tile). dst_not_dct selects DST-II (non-zero, odd inputs negated and
// output order reversed) or DCT-II (0). Used by GPU_DTT24_DRV for modes 4..7.
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: sums/differences of mirrored pairs (x[k], x[7-k]).
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00= ( (*x0) - (*x7));
        u10= ( (*x0) + (*x7));
        u01= (-(*x1) + (*x6));
        u11= (-(*x1) - (*x6));
        u02= ( (*x2) - (*x5));
        u12= ( (*x2) + (*x5));
        u03= (-(*x3) + (*x4));
        u13= (-(*x3) - (*x4));
    } else { // DCTII
        u00= ( (*x0) + (*x7));
        u10= ( (*x0) - (*x7));
        u01= ( (*x1) + (*x6));
        u11= ( (*x1) - (*x6));
        u02= ( (*x2) + (*x5));
        u12= ( (*x2) - (*x5));
        u03= ( (*x3) + (*x4));
        u13= ( (*x3) - (*x4));
    }
    // _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    // Write results back in place; DST-II stores them in reversed order.
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
    } else {
        *x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
        *x1 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31)* 0.5f;     // v13 * SQRT1_8; z10 * 0.5f
    }
}
// Same as dttii_shared_mem() (in-place 8-point DTT-II at stride inc), except the
// DC-like coefficient (x0 for DCT-II, x7 for DST-II) is scaled by 0.5f instead
// of SQRT1_8 - i.e. it is NOT divided by the extra sqrt(2) of the orthonormal
// form (see the "no 1/sqrt(2)!" markers below).
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: sums/differences of mirrored pairs (x[k], x[7-k]).
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00= ( (*x0) - (*x7));
        u10= ( (*x0) + (*x7));
        u01= (-(*x1) + (*x6));
        u11= (-(*x1) - (*x6));
        u02= ( (*x2) - (*x5));
        u12= ( (*x2) + (*x5));
        u03= (-(*x3) + (*x4));
        u13= (-(*x3) - (*x4));
    } else { // DCTII
        u00= ( (*x0) + (*x7));
        u10= ( (*x0) - (*x7));
        u01= ( (*x1) + (*x6));
        u11= ( (*x1) - (*x6));
        u02= ( (*x2) + (*x5));
        u12= ( (*x2) - (*x5));
        u03= ( (*x3) + (*x4));
        u13= ( (*x3) - (*x4));
    }
    // _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    // Write results back in place; DST-II stores them in reversed order.
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
    } else {
        *x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
        *x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
    }
}
// In-place orthonormal 8-point DTT-IV over 8 samples starting at x0, spaced by
// inc floats. dst_not_dct selects DST-IV (non-zero, reversed input order and
// negated odd outputs) or DCT-IV (0). Used by GPU_DTT24_DRV for modes 0..3.
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs; DST-IV reads inputs reversed.
    if (dst_not_dct) { // DSTIV
        u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
        u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
        u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
        u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
        u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
        u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
        u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
        u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
    } else { // DCTIV
        u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
        u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
        u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
        u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
        u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
        u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
        u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
        u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    }
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs: common to DCT-IV and DST-IV.
    *x0 = v00 * 0.5f;             // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8; // w0[3];
    // Odd outputs: DST-IV flips their signs relative to DCT-IV.
    if (dst_not_dct) { // DSTIV
        *x1 = (vb11 - v01) * SQRT1_8;  // w1[0];
        *x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
        *x5 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
        *x7 = -vb00 * 0.5f;            // w1[3];
    } else {
        *x1 = (v01 - vb11) * SQRT1_8; // w1[0];
        *x3 = (v02 + vb01) * SQRT1_8; // w1[1];
        *x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
        *x7 = vb00 * 0.5f;            // w1[3];
    }
}
// Branch-free in-place 8-point DTT-IV at stride inc. Instead of branching on
// dst_not_dct (which would diverge within a warp when lanes process different
// transform types), it encodes the DST-IV input reversal in the read pointers
// (x0 starts at the last sample, inc negated when dst_not_dct != 0) and the
// odd-output sign flip in sgn = +/-1. Results written through y0..y7.
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
{
    float sgn = 1 - 2* dst_not_dct; // +1 for DCT-IV, -1 for DST-IV
    // Output pointers: always in forward order.
    float *y0 = x;
    float *y1 = y0 + inc;
    float *y2 = y1 + inc;
    float *y3 = y2 + inc;
    float *y4 = y3 + inc;
    float *y5 = y4 + inc;
    float *y6 = y5 + inc;
    float *y7 = y6 + inc;
    // Input pointers: reversed order for DST-IV (start at element 7, step -inc).
    float *x0 = x + dst_not_dct * 7 * inc;
    // negate inc, replace
    inc *= sgn;
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs unchanged; odd outputs multiplied by sgn (DST-IV sign flip).
    *y0 = v00 * 0.5f;             // w0[0];
    *y2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *y4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *y6 = (v03 + vb10) * SQRT1_8; // w0[3];
    *y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
    *y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
    *y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
    *y7 = sgn * vb00 * 0.5f;            // w1[3];
}
// In-place orthonormal 8-point DCT-IV over 8 samples starting at x0, spaced by
// inc floats. DCT-only specialization (no dst_not_dct branch, hence no warp
// divergence); the DST counterpart is dstiv_nodiverg(). Used by the tile
// processor ("TP" in the forward declarations).
inline __device__ void dctiv_nodiverg(float * x0, int inc)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Write results back in place, orthonormally scaled.
    *x0 = v00 * 0.5f;             // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8; // w0[3];
    *x1 = (v01 - vb11) * SQRT1_8; // w1[0];
    *x3 = (v02 + vb01) * SQRT1_8; // w1[1];
    *x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
    *x7 = vb00 * 0.5f;            // w1[3];
}
// In-place orthonormal 8-point DST-IV over 8 samples starting at x, spaced by
// inc floats. DST-only specialization (no branch, no warp divergence): the
// DST-IV input reversal is implemented by reading from the last sample with a
// negated stride, and the output sign/order flips are hard-coded in the final
// stores. Counterpart of dctiv_nodiverg(); used by the tile processor.
inline __device__ void dstiv_nodiverg(float * x, int inc)
{
    // Read pointers: reversed order (start at element 7, step -inc).
    float *x0 = x + 7 * inc;
    // negate inc, replace
    inc = -inc;
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Write back: note the x-pointers still run backwards, so *x7 is the first
    // output location; odd coefficients carry the DST-IV sign flips.
    *x7 = v00 * 0.5f;              // w0[0];
    *x5 = (v01 + vb11) * SQRT1_8;  // w0[1];
    *x3 = (v02 - vb01) * SQRT1_8;  // w0[2];
    *x1 = (v03 + vb10) * SQRT1_8;  // w0[3];
    *x6 = (vb11 - v01) * SQRT1_8;  // w1[0];
    *x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
    *x2 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
    *x0 = -vb00 * 0.5f;            // w1[3];
}
// Unscaled 8-point DCT-II butterfly: stage 1 forms sums/differences of mirrored
// input pairs, then recurses into a 4-point DCT-II (even outputs) and a 4-point
// DCT-IV-style section (odd outputs). Output is NOT orthonormally scaled -
// callers (e.g. dct_ii8()) multiply by SQRT1_8.
inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    // Stage 1: sums (u0x, even path) and differences (u1x, odd path) of (x[k], x[7-k]).
    float u00= (x[0] + x[7]);
    float u10= (x[0] - x[7]);
    float u01= (x[1] + x[6]);
    float u11= (x[1] - x[6]);
    float u02= (x[2] + x[5]);
    float u12= (x[2] - x[5]);
    float u03= (x[3] + x[4]);
    float u13= (x[3] - x[4]);
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v00= w00 + w01;
    float v02= w00 - w01;
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    // _dctii_nrecurs2(u00, u01, &v00, &v01);
    float z00= w20 + w21;
    float z01= w20 - w21;
    // _dctii_nrecurs2(u10, u11, &v10, &v11);
    float z10= w30 + w31;
    float z11= w30 - w31;
    float v10 = SQRT_2 * z00;
    float v11 = z01 - z11;
    float v12 = z01 + z11;
    float v13 = SQRT_2 * z10;
    // Interleave even (v0x) and odd (v1x) results into the output array.
    y[0] = v00;
    y[1] = v10;
    y[2] = v01;
    y[3] = v11;
    y[4] = v02;
    y[5] = v12;
    y[6] = v03;
    y[7] = v13;
}
inline __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctii_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8 ; i++) {
y[i] *= SQRT1_8;
}
}
// Orthonormal 8-point DCT-IV: runs the unscaled recursive kernel, then applies
// the 1/sqrt(8) normalization to every coefficient.
// x - input samples (8 floats), y - output coefficients (8 floats).
inline __device__ void dct_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctiv_nrecurs8(x, y);
#pragma unroll
    for (int k = 8; k > 0; k--) {
        y[k - 1] = y[k - 1] * SQRT1_8;
    }
}
// Orthonormal 8-point DST-IV, computed via the DCT-IV identity:
// reverse the input order, run the unscaled DCT-IV kernel, then normalize by
// 1/sqrt(8) while negating every odd-index coefficient.
// x - input samples (8 floats), y - output coefficients (8 floats).
inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    float rev[8]; // input in reversed order
#pragma unroll
    for (int k = 7; k >= 0; k--) {
        rev[7 - k] = x[k];
    }
    _dctiv_nrecurs8(rev, y);
#pragma unroll
    for (int k = 0; k < 8; k++) {
        y[k] *= (k & 1) ? -SQRT1_8 : SQRT1_8;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment