Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
3d5ddc28
Commit
3d5ddc28
authored
Apr 07, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
prepared for separate compilation, for now merged
parent
39e75987
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
2088 additions
and
1034 deletions
+2088
-1034
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+154
-66
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+166
-943
dtt8x8.cu
src/main/resources/kernels/dtt8x8.cu
+466
-25
dtt8x8.h
src/main/resources/kernels/dtt8x8.h
+111
-0
test_tp.cu
src/main/resources/kernels/test_tp.cu
+1097
-0
tp_defines.h
src/main/resources/kernels/tp_defines.h
+94
-0
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
3d5ddc28
...
...
@@ -31,6 +31,7 @@ import static jcuda.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABI
// Uses code by Marco Hutter - http://www.jcuda.org
import
static
jcuda
.
driver
.
CUjitInputType
.
CU_JIT_INPUT_LIBRARY
;
import
static
jcuda
.
driver
.
CUjitInputType
.
CU_JIT_INPUT_PTX
;
import
static
jcuda
.
driver
.
CUjit_option
.
CU_JIT_LOG_VERBOSE
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuCtxCreate
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuCtxSynchronize
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuDeviceGet
;
...
...
@@ -62,6 +63,7 @@ import java.io.IOException;
import
java.nio.charset.StandardCharsets
;
import
java.nio.file.Files
;
import
java.nio.file.Paths
;
import
java.util.Random
;
import
java.util.concurrent.atomic.AtomicInteger
;
import
com.elphel.imagej.tileprocessor.DttRad2
;
...
...
@@ -89,6 +91,10 @@ public class GPUTileProcessor {
String
LIBRARY_PATH
=
"/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"
;
// linux
static
String
GPU_RESOURCE_DIR
=
"kernels"
;
static
String
[]
GPU_KERNEL_FILES
=
{
"dtt8x8.cuh"
,
"TileProcessor.cuh"
};
// "*" - generated defines, first index - separately compiled unit
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu"},{"*","dtt8x8.h","TileProcessor.cuh"}};
static
String
[][]
GPU_SRC_FILES
=
{{
"*"
,
"dtt8x8.h"
,
"dtt8x8.cu"
,
"TileProcessor.cuh"
}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static
String
GPU_CONVERT_CORRECT_TILES_NAME
=
"convert_correct_tiles"
;
// name in C code
static
String
GPU_IMCLT_RBG_NAME
=
"imclt_rbg"
;
// name in C code
static
String
GPU_CORRELATE2D_NAME
=
"correlate2D"
;
// name in C code
...
...
@@ -295,39 +301,8 @@ public class GPUTileProcessor {
return
new
PointerWithAddress
(
p
).
getAddress
();
}
public
GPUTileProcessor
(
String
cuda_project_directory
)
throws
IOException
{
// From code by Marco Hutter - http://www.jcuda.org
// Enable exceptions and omit all subsequent error checks
JCudaDriver
.
setExceptionsEnabled
(
true
);
JNvrtc
.
setExceptionsEnabled
(
true
);
// Initialize the driver and create a context for the first device.
cuInit
(
0
);
//2020 - making them global
CUdevice
device
=
new
CUdevice
();
cuDeviceGet
(
device
,
0
);
CUcontext
context
=
new
CUcontext
();
cuCtxCreate
(
context
,
0
,
device
);
int
majorArray
[]
=
{
0
};
int
minorArray
[]
=
{
0
};
cuDeviceGetAttribute
(
majorArray
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
,
device
);
cuDeviceGetAttribute
(
minorArray
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
,
device
);
int
major
=
majorArray
[
0
];
int
minor
=
minorArray
[
0
];
int
capability
=
major
*
10
+
minor
;
// Obtain the CUDA source code from the CUDA file
// Get absolute path to the file in resource folder, then read it as a normal file.
// When using just Eclipse resources - it does not notice that the file
// was edited (happens frequently during kernel development).
ClassLoader
classLoader
=
getClass
().
getClassLoader
();
String
kernelSource
=
"#define JCUDA\n"
+
private
String
getTpDefines
()
{
return
"#define JCUDA\n"
+
"#define DTT_SIZE_LOG2 "
+
DTT_SIZE_LOG2
+
"\n"
+
"#define THREADSX "
+
THREADSX
+
"\n"
+
"#define NUM_CAMS "
+
NUM_CAMS
+
"\n"
+
...
...
@@ -359,7 +334,71 @@ public class GPUTileProcessor {
"#define FAT_ZERO_WEIGHT "
+
FAT_ZERO_WEIGHT
+
"\n"
+
"#define THREADS_DYNAMIC_BITS "
+
THREADS_DYNAMIC_BITS
+
"\n"
;
}
public
GPUTileProcessor
(
String
cuda_project_directory
)
throws
IOException
{
// From code by Marco Hutter - http://www.jcuda.org
// Enable exceptions and omit all subsequent error checks
JCudaDriver
.
setExceptionsEnabled
(
true
);
JNvrtc
.
setExceptionsEnabled
(
true
);
// Initialize the driver and create a context for the first device.
cuInit
(
0
);
//2020 - making them global
CUdevice
device
=
new
CUdevice
();
cuDeviceGet
(
device
,
0
);
CUcontext
context
=
new
CUcontext
();
cuCtxCreate
(
context
,
0
,
device
);
int
majorArray
[]
=
{
0
};
int
minorArray
[]
=
{
0
};
cuDeviceGetAttribute
(
majorArray
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR
,
device
);
cuDeviceGetAttribute
(
minorArray
,
CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR
,
device
);
int
major
=
majorArray
[
0
];
int
minor
=
minorArray
[
0
];
int
capability
=
major
*
10
+
minor
;
// Obtain the CUDA source code from the CUDA file
// Get absolute path to the file in resource folder, then read it as a normal file.
// When using just Eclipse resources - it does not notice that the file
// was edited (happens frequently during kernel development).
ClassLoader
classLoader
=
getClass
().
getClassLoader
();
String
[]
kernelSources
=
new
String
[
GPU_SRC_FILES
.
length
];
for
(
int
cunit
=
0
;
cunit
<
kernelSources
.
length
;
cunit
++)
{
kernelSources
[
cunit
]
=
""
;
// use StringBuffer?
for
(
String
src_file:
GPU_SRC_FILES
[
cunit
])
{
if
(
src_file
.
contentEquals
(
"*"
))
{
kernelSources
[
cunit
]
+=
getTpDefines
();
}
else
{
File
file
=
null
;
if
((
cuda_project_directory
==
null
)
||
cuda_project_directory
.
isEmpty
())
{
file
=
new
File
(
classLoader
.
getResource
(
GPU_RESOURCE_DIR
+
"/"
+
src_file
).
getFile
());
System
.
out
.
println
(
"Loading resource "
+
file
);
}
else
{
File
src_dir
=
new
File
(
cuda_project_directory
,
"src"
);
file
=
new
File
(
src_dir
.
getPath
(),
src_file
);
System
.
out
.
println
(
"Loading resource "
+
file
);
}
System
.
out
.
println
(
file
.
getAbsolutePath
());
String
cuFileName
=
file
.
getAbsolutePath
();
// /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
String
sourceFile
=
readFileAsString
(
cuFileName
);
// readResourceAsString(cuFileName);
if
(
sourceFile
==
null
)
{
String
msg
=
"Could not read the kernel source code from "
+
cuFileName
;
IJ
.
showMessage
(
"Error"
,
msg
);
new
IllegalArgumentException
(
msg
);
}
kernelSources
[
cunit
]
+=
sourceFile
;
}
}
}
/*
String kernelSource = getTpDefines();
for (String src_file:GPU_KERNEL_FILES) {
File file = null;
if ((cuda_project_directory == null) || cuda_project_directory.isEmpty()) {
...
...
@@ -379,8 +418,9 @@ public class GPUTileProcessor {
new IllegalArgumentException (msg);
}
kernelSource += sourceFile;
}
*/
// Create the kernel functions (first - just test)
String
[]
func_names
=
{
GPU_CONVERT_CORRECT_TILES_NAME
,
...
...
@@ -388,7 +428,7 @@ public class GPUTileProcessor {
GPU_CORRELATE2D_NAME
,
GPU_TEXTURES_NAME
,
GPU_RBGA_NAME
};
CUfunction
[]
functions
=
createFunctions
(
kernelSource
,
CUfunction
[]
functions
=
createFunctions
(
kernelSource
s
,
func_names
,
capability
);
// on my - 75
...
...
@@ -711,6 +751,7 @@ public class GPUTileProcessor {
double
xc
=
woi
.
x
+
rx
-
0.5
;
double
yc
=
woi
.
y
+
ry
-
0.5
;
boolean
dbg1
=
false
;
// true;
double
dbg_frac
=
0.0
;
// 0.25;
boolean
[]
mask
=
new
boolean
[
tilesX
*
tilesY
];
int
num_tiles
=
0
;
for
(
int
ty
=
woi
.
y
;
ty
<
(
woi
.
y
+
woi
.
height
);
ty
++)
{
...
...
@@ -725,6 +766,35 @@ public class GPUTileProcessor {
}
}
}
if
(
dbg_frac
>
0
)
{
Random
rnd
=
new
Random
(
0
);
int
num_final
=
(
int
)
Math
.
round
(
num_tiles
*
(
1.0
-
dbg_frac
));
while
(
num_tiles
>
num_final
)
{
int
tx
=
woi
.
x
+
rnd
.
nextInt
(
woi
.
width
);
int
ty
=
woi
.
y
+
rnd
.
nextInt
(
woi
.
height
);
int
indx
=
ty
*
tilesX
+
tx
;
if
(
mask
[
indx
])
{
mask
[
indx
]
=
false
;
num_tiles
--;
}
}
// filter out with no neighbors
for
(
int
indx
=
0
;
indx
<
mask
.
length
;
indx
++)
if
(
mask
[
indx
])
{
int
ix
=
indx
%
tilesX
;
int
iy
=
indx
/
tilesX
;
int
num_neib
=
0
;
if
((
ix
>
0
)
&&
mask
[
indx
-
1
])
num_neib
++;
if
((
ix
<
(
tilesX
-
1
))
&&
mask
[
indx
+
1
])
num_neib
++;
if
((
iy
>
0
)
&&
mask
[
indx
-
tilesX
])
num_neib
++;
if
((
iy
<
(
tilesY
-
1
))
&&
mask
[
indx
+
tilesX
])
num_neib
++;
if
(
num_neib
==
0
)
{
mask
[
indx
]
=
false
;
num_tiles
--;
}
}
//nextInt(int bound)
}
if
(
dbg1
)
{
// mask[(woi.y-1) * tilesX + (woi.x-1)] = true;
mask
[(
woi
.
y
+
woi
.
height
)
*
tilesX
+
(
woi
.
x
+
woi
.
width
)]
=
true
;
...
...
@@ -1332,53 +1402,71 @@ public class GPUTileProcessor {
// private static CUfunction [] createFunctions(
private
CUfunction
[]
createFunctions
(
String
sourceCode
,
String
[]
sourceCodeUnits
,
String
[]
kernelNames
,
int
capability
)
throws
IOException
{
CUfunction
[]
functions
=
new
CUfunction
[
kernelNames
.
length
];
byte
[][]
ptxDataUnits
=
new
byte
[
sourceCodeUnits
.
length
][];
boolean
OK
=
false
;
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram
program
=
new
nvrtcProgram
();
nvrtcCreateProgram
(
program
,
sourceCode
,
null
,
0
,
null
,
null
);
String
options
[]
=
{
"--gpu-architecture=compute_"
+
capability
};
// for (String sourceCode: sourceCodeUnits) {
for
(
int
cunit
=
0
;
cunit
<
ptxDataUnits
.
length
;
cunit
++)
{
String
sourceCode
=
sourceCodeUnits
[
cunit
];
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram
program
=
new
nvrtcProgram
();
nvrtcCreateProgram
(
program
,
sourceCode
,
null
,
0
,
null
,
null
);
String
options
[]
=
{
"--gpu-architecture=compute_"
+
capability
};
try
{
nvrtcCompileProgram
(
program
,
options
.
length
,
options
);
OK
=
true
;
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"nvrtcCompileProgram() FAILED"
);
}
// Compilation log with errors/warnings
String
programLog
[]
=
new
String
[
1
];
nvrtcGetProgramLog
(
program
,
programLog
);
String
log
=
programLog
[
0
].
trim
();
if
(!
log
.
isEmpty
())
{
System
.
err
.
println
(
"Program compilation log:\n"
+
log
);
}
if
(!
OK
)
{
throw
new
IOException
(
"Could not compile program"
);
}
try
{
nvrtcCompileProgram
(
program
,
options
.
length
,
options
);
OK
=
true
;
}
catch
(
Exception
e
)
{
System
.
out
.
println
(
"nvrtcCompileProgram() FAILED"
);
}
// Compilation log with errors/warnings
String
programLog
[]
=
new
String
[
1
];
nvrtcGetProgramLog
(
program
,
programLog
);
String
log
=
programLog
[
0
].
trim
();
if
(!
log
.
isEmpty
())
{
System
.
err
.
println
(
"Program compilation log:\n"
+
log
);
// Get the PTX code of the compiled program (not the binary)
String
[]
ptx
=
new
String
[
1
];
nvrtcGetPTX
(
program
,
ptx
);
nvrtcDestroyProgram
(
program
);
// byte[] ptxData = ptx[0].getBytes();
ptxDataUnits
[
cunit
]
=
ptx
[
0
].
getBytes
();
System
.
out
.
println
(
"ptxDataUnits["
+
cunit
+
"].length="
+
ptxDataUnits
[
cunit
].
length
);
// System.out.println( ptx[0]);
}
if
(!
OK
)
{
throw
new
IOException
(
"Could not compile program"
);
}
// Get the PTX code of the compiled program (not the binary)
String
[]
ptx
=
new
String
[
1
];
nvrtcGetPTX
(
program
,
ptx
);
nvrtcDestroyProgram
(
program
);
byte
[]
ptxData
=
ptx
[
0
].
getBytes
();
JITOptions
jitOptions
=
new
JITOptions
();
jitOptions
.
putInt
(
CU_JIT_LOG_VERBOSE
,
1
);
CUlinkState
state
=
new
CUlinkState
();
cuLinkCreate
(
jitOptions
,
state
);
cuLinkAddFile
(
state
,
CU_JIT_INPUT_LIBRARY
,
LIBRARY_PATH
,
jitOptions
);
System
.
out
.
println
(
"ptxData.length="
+
ptxData
.
length
);
// System.out.println( ptx[0]);
for
(
int
cunit
=
0
;
cunit
<
ptxDataUnits
.
length
;
cunit
++)
{
// cuLinkAddData(state, CU_JIT_INPUT_PTX, Pointer.to(ptxData), ptxData.length, "input.ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
cuLinkAddData
(
state
,
CU_JIT_INPUT_PTX
,
Pointer
.
to
(
ptxDataUnits
[
cunit
]),
ptxDataUnits
[
cunit
].
length
,
"input"
+
cunit
+
".ptx"
,
jitOptions
);
// CUDA_ERROR_INVALID_PTX
// cuLinkAddData(state, CU_JIT_INPUT_PTX, Pointer.to(ptxDataUnits[cunit]), ptxDataUnits[cunit].length, "input.ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
}
// cuLinkAddFile(state, CU_JIT_INPUT_LIBRARY, LIBRARY_PATH, jitOptions);
cuLinkAddData
(
state
,
CU_JIT_INPUT_PTX
,
Pointer
.
to
(
ptxData
),
ptxData
.
length
,
"input.ptx"
,
jitOptions
);
// CUDA_ERROR_INVALID_PTX
long
size
[]
=
{
0
};
Pointer
image
=
new
Pointer
();
cuLinkComplete
(
state
,
image
,
size
);
JCudaDriver
.
setExceptionsEnabled
(
false
);
int
cuda_result
=
cuLinkComplete
(
state
,
image
,
size
);
System
.
out
.
println
(
"cuLinkComplete() -> "
+
cuda_result
);
JCudaDriver
.
setExceptionsEnabled
(
true
);
module
=
new
CUmodule
();
cuModuleLoadDataEx
(
module
,
image
,
0
,
new
int
[
0
],
Pointer
.
to
(
new
int
[
0
]));
cuLinkDestroy
(
state
);
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
3d5ddc28
...
...
@@ -37,66 +37,14 @@
*/
// Avoiding includes in jcuda, all source files will be merged
#ifndef JCUDA
#pragma once
#include "dtt8x8.cuh"
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
//#define DEBUG4 1
//#define DEBUG5 1
//#define DEBUG6 1
/*
#define DEBUG7 1
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#define DEBUG_OOB1 1
#endif //#ifndef JCUDA
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#endif // #ifndef JCUDA
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
//#define NOICLT 1
//#define TEST_IMCLT
...
...
@@ -148,25 +96,10 @@
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
//#define DTT_SIZE22 (DTT_SIZE2 + 2)
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
// Use CORR_OUT_RAD for the correlation output
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
//#define BAYER_BLUE_ROW (1 - BAYER_RED_ROW)
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
...
...
@@ -312,11 +245,14 @@ def set_imclt_sa(stride=9):
print('0x%02x,'%(d), end="")
print('0x%2x};'%(sa8s[-1]))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np # math
def printAlphaFade(transform_size):
ts2 = 2 * transform_size
ts2m1 = ts2-1
alphaFade = np.zeros(shape=(16,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaFade = np.zeros(shape=(9,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaIndex = np.zeros(shape=(16,), dtype=int)
fade1d = np.zeros(shape=(16,), dtype=float) # double [] fade1d = new double [ts2];
for i in range (ts2):
fade1d[i] = 0.5 * (1.0 - np.cos(np.pi * (i +0.5) /ts2))
...
...
@@ -326,27 +262,49 @@ def printAlphaFade(transform_size):
for m in range (16):
# if m == 0:
# alphaFade[m][indx] = 0
elif m == 1: # 0
alphaFade[m][indx] = fade1d[ts2m1 - i]
if m == 1: # 0
alphaIndex[m] = 1
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - i]
elif m == 2:
alphaFade[m][indx] = fade1d[j]
alphaIndex[m] = 2
alphaFade[alphaIndex[m]][indx] = fade1d[j]
elif m == 4:
alphaFade[m][indx] = fade1d[i]
alphaIndex[m] = 3
alphaFade[alphaIndex[m]][indx] = fade1d[i]
elif m == 8:
alphaFade[m][indx] = fade1d[ts2m1 - j]
alphaIndex[m] = 4
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - j]
elif m == 3:
alphaFade[m][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
alphaIndex[m] = 5
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
elif m == 6:
alphaFade[m][indx] = (fade1d[i],fade1d[j])[j > i]
alphaIndex[m] = 6
alphaFade[alphaIndex[m]][indx] = (fade1d[i],fade1d[j])[j > i]
elif m == 9:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
alphaIndex[m] = 7
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
elif m == 12:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
alphaIndex[m] = 8
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
else:
alphaFade[m][indx] = 1.0
alphaIndex[m] = 0
alphaFade[alphaIndex[m]][indx] = 1.0
floats_in_line=8
print("__constant__
float alphaFade[16][%d] = {"%(ts2*ts2)
)
print("__constant__
int alphaIndex[16] = {"
)
for m in range (16):
if ((m % floats_in_line) == 0):
print("\n ",end="")
else:
print(" ",end="")
print("%d"%(alphaIndex[m]), end ="")
if (m < (16-1)):
print(",",end="")
print("};")
print("__constant__ float alphaFade[9][%d] = {"%(ts2*ts2))
for m in range (9):
for i in range (ts2 * ts2):
if ((i % floats_in_line) == 0):
print(" ",end="")
...
...
@@ -362,20 +320,22 @@ def printAlphaFade(transform_size):
print(",")
else:
print(", ",end="")
if (m ==
15
):
if (m ==
(9-1)
):
print("};")
else:
print(",")
printAlphaFade(8)
"""
*/
__constant__
float
HWINDOW
[]
=
{
0.098017
f
,
0.290285
f
,
0.471397
f
,
0.634393
f
,
0.773010
f
,
0.881921
f
,
0.956940
f
,
0.995185
f
};
__constant__
float
HWINDOW2
[]
=
{
0.049009
f
,
0.145142
f
,
0.235698
f
,
0.317197
f
,
0.386505
f
,
0.440961
f
,
0.478470
f
,
0.497592
f
};
__constant__
float
HWINDOW_SQ
[]
=
{
0.009607
f
,
0.084265
f
,
0.222215
f
,
0.402455
f
,
0.597545
f
,
0.777785
f
,
0.915735
f
,
0.990393
f
};
...
...
@@ -392,32 +352,7 @@ __constant__ int fold_inc[]= {0x02feee12, 0x021eeef2};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__constant__
int
imclt_indx9
[
16
]
=
{
0x28
,
0x29
,
0x2a
,
0x2b
,
0x2b
,
0x2a
,
0x29
,
0x28
,
0x27
,
0x26
,
0x25
,
0x24
,
0x24
,
0x25
,
0x26
,
0x27
};
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
__constant__
float
idct_signs
[
4
][
4
][
4
]
=
{
{
// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
}
},{
// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
}
}};
// LPF for sigma 0.9 each color (modify through cudaMemcpyToSymbol() or similar in Driver API
//#ifndef NOICLT
__constant__
float
lpf_data
[
4
][
64
]
=
{
...
...
@@ -489,39 +424,40 @@ __constant__ int pairs[6][2]={
{
0
,
3
},
{
2
,
1
}};
__constant__
float
alphaFade
[
16
][
256
]
=
{
{
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
,
0.000000
f
},
__constant__
int
alphaIndex
[
16
]
=
{
0
,
1
,
2
,
5
,
3
,
0
,
6
,
0
,
4
,
7
,
0
,
0
,
8
,
0
,
0
,
0
};
__constant__
float
alphaFade
[
9
][
256
]
=
{
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
...
...
@@ -586,38 +522,6 @@ __constant__ float alphaFade[16][256] = {
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.002408
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
},
{
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.059039
f
,
0.059039
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.021530
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.002408
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
},
{
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.002408
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
0.021530
f
,
...
...
@@ -650,102 +554,6 @@ __constant__ float alphaFade[16][256] = {
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
0.002408
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.021530
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.059039
f
,
0.059039
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
...
...
@@ -778,6 +586,70 @@ __constant__ float alphaFade[16][256] = {
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
},
{
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.059039
f
,
0.059039
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.021530
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.002408
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
},
{
0.002408
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.021530
f
,
0.021530
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.059039
f
,
0.059039
f
,
0.059039
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.113495
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.182803
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.264302
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.354858
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.450991
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.549009
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.645142
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.735698
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.817197
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.886505
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.940961
f
,
0.978470
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
},
{
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
...
...
@@ -807,73 +679,9 @@ __constant__ float alphaFade[16][256] = {
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.059039
f
,
0.059039
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.021530
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.021530
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
},
{
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
0.450991
f
,
0.354858
f
,
0.264302
f
,
0.182803
f
,
0.113495
f
,
0.059039
f
,
0.021530
f
,
0.002408
f
,
0.997592
f
,
0.978470
f
,
0.940961
f
,
0.886505
f
,
0.817197
f
,
0.735698
f
,
0.645142
f
,
0.549009
f
,
...
...
@@ -905,103 +713,7 @@ __constant__ float alphaFade[16][256] = {
0.997592
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.978470
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
},
{
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
,
1.000000
f
}};
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
,
0.997592
f
}};
//#endif
__device__
void
convertCorrectTile
(
...
...
@@ -1065,16 +777,10 @@ __device__ void resetCorrelation(
__device__
void
normalizeTileAmplitude
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
fat_zero
);
// fat zero is absolute, scale it outside
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
);
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
//__device__ void imclt( // implemented, used // why is it twice?
//__device__ void imclt( // for 16 threads implemented, used // why is it twice?
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
// float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__
void
imclt
(
// for 16 threads implemented, used // why is it twice?
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
);
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__
void
imclt8threads
(
// for 8 threads
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
...
...
@@ -1356,14 +1062,8 @@ __global__ void correlate2D(
__syncthreads
();
// __syncwarp();
#endif
#endif
dttii_2d
(
clt_corr
);
/*
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
...
...
@@ -1371,15 +1071,6 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
printf
(
"
\n
correlate2D AFTER VERTICAL (HORIZONTAL) PASS
\n
"
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
}
__syncthreads
();
// __syncwarp();
#endif
#endif
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
...
...
@@ -1387,6 +1078,9 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
4
)){
...
...
@@ -2498,7 +2192,8 @@ __global__ void textures_accumulate(
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
if
(
tile_code
!=
TASK_TEXTURE_BITS
){
// only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
int
alpha_mode
=
alphaIndex
[
tile_code
];
if
(
!
alpha_mode
){
// only multiply if needed, alpha_mode == 0 - keep as is.
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
...
...
@@ -2509,12 +2204,12 @@ __global__ void textures_accumulate(
if
(
colors
==
3
){
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
;
ncol
++
)
{
// 4
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
tile_c
ode
][
gi
];
// reduce [tile_code] by LUT
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
alpha_m
ode
][
gi
];
// reduce [tile_code] by LUT
}
}
else
{
// assuming colors = 1
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
;
ncol
++
)
{
// 2
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
tile_c
ode
][
gi
];
// reduce [tile_code] by LUT
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
alpha_m
ode
][
gi
];
// reduce [tile_code] by LUT
}
}
}
...
...
@@ -2940,72 +2635,6 @@ __device__ void normalizeTileAmplitude(
clt_tile_j3
++
;
// =DTT_SIZE1;
}
}
/*
Converted from DttRad2.java:443
public double [] corr_unfold_tile(
double [][] qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
int transform_size
)
*/
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
)
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int
size2r1
=
2
*
corr_radius
+
1
;
// 15
int
crp1
=
corr_radius
+
1
;
//8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int
rslt_base_index
=
size2r1
*
crp1
-
crp1
;
// offset of the center
float
*
qdata1
=
qdata0
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata2
=
qdata1
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata3
=
qdata2
+
(
DTT_SIZE
*
DTT_SIZE1
);
int
i
=
threadIdx
.
x
;
if
(
i
>
corr_radius
)
{
return
;
// not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float
corr_pixscale
=
0.25
f
;
int
i_transform_size
=
i
*
DTT_SIZE1
;
// used to address source rows which are 9 long
int
im1_transform_size
=
i_transform_size
-
DTT_SIZE1
;
// negative for i = 0, use only after divergence
/// int rslt_row_offs = i * DTT_SIZE2M1;
int
rslt_row_offs
=
i
*
size2r1
;
int
rslt_base_index_p
=
rslt_base_index
+
rslt_row_offs
;
// i * DTT_SIZE2M1;
int
rslt_base_index_m
=
rslt_base_index
-
rslt_row_offs
;
// i * DTT_SIZE2M1;
rslt
[
rslt_base_index_p
]
=
corr_pixscale
*
qdata0
[
i_transform_size
];
// incomplete, will only be used for thread i=0
rslt
[
rslt_base_index_m
]
=
rslt
[
rslt_base_index_p
];
// nop for i=0 incomplete, will only be used for thread i=0
/// for (int j = 1; j < DTT_SIZE; j++) {
for
(
int
j
=
1
;
j
<=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
rslt
[
rslt_base_index_pp
]
=
corr_pixscale
*
(
qdata0
[
i_transform_size
+
j
]
+
qdata1
[
i_transform_size
+
j
-
1
]);
// incomplete, will only be used for thread i=0
rslt
[
rslt_base_index_pm
]
=
corr_pixscale
*
(
qdata0
[
i_transform_size
+
j
]
+
-
qdata1
[
i_transform_size
+
j
-
1
]);
// incomplete, will only be used for thread i=0
}
if
(
i
==
0
)
{
return
;
}
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float
d
=
corr_pixscale
*
qdata2
[
im1_transform_size
];
rslt
[
rslt_base_index_p
]
+=
d
;
rslt
[
rslt_base_index_m
]
-=
d
;
for
(
int
j
=
1
;
j
<=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
int
rslt_base_index_mp
=
rslt_base_index_m
+
j
;
int
rslt_base_index_mm
=
rslt_base_index_m
-
j
;
float
d2
=
corr_pixscale
*
qdata2
[
im1_transform_size
+
j
];
float
d3
=
corr_pixscale
*
qdata3
[
im1_transform_size
+
j
-
1
];
//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
rslt
[
rslt_base_index_mp
]
=
rslt
[
rslt_base_index_pp
]
-
d2
-
d3
;
rslt
[
rslt_base_index_mm
]
=
rslt
[
rslt_base_index_pm
]
-
d2
+
d3
;
rslt
[
rslt_base_index_pp
]
+=
d2
+
d3
;
rslt
[
rslt_base_index_pm
]
+=
d2
-
d3
;
}
}
__device__
void
debug_print_lpf
(
float
*
lpf_tile
)
...
...
@@ -3362,28 +2991,6 @@ __device__ void convertCorrectTile(
}
__syncthreads
();
// __syncwarp();
#endif
/*
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
// float *dtt_buf = ((float *) clt_tile[0]) + threadIdx.x;
// float *dtt_buf1 = ((float *) clt_tile[2]) + threadIdx.x;
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
dtt_buf = clt_tile + (DTT_SIZE1 * DTT_SIZE) + threadIdx.x; // ((float *) clt_tile[1]) + threadIdx.x;
dtt_buf1 = dtt_buf + (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[3]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
__syncthreads();// __syncwarp();
}
*/
if
(
color
==
BAYER_GREEN
)
{
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
float
*
dtt_buf
=
clt_tile
+
threadIdx
.
x
;
...
...
@@ -3404,21 +3011,16 @@ __device__ void convertCorrectTile(
}
__syncthreads
();
// __syncwarp();
#endif
dttiv_color_2d
(
clt_tile
,
color
);
/*
dctiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile
+
__umul24
(
threadIdx
.
x
,
DTT_SIZE1
),
// [0][threadIdx.x], // pointer to start of row
#else
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
#endif
1); //int inc);
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile
+
__umul24
(
threadIdx
.
x
+
DTT_SIZE
,
DTT_SIZE1
),
// clt_tile[1][threadIdx.x], // pointer to start of row
#else
clt_tile
+
DTT_SIZE1
*
(
threadIdx
.
x
+
DTT_SIZE
),
// clt_tile[1][threadIdx.x], // pointer to start of row
#endif
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
1); //int inc);
}
...
...
@@ -3435,12 +3037,12 @@ __device__ void convertCorrectTile(
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
if (color == BAYER_GREEN){
// dstiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
}
__syncthreads();// __syncwarp();
*/
#ifdef DEBUG2
if
((
threadIdx
.
x
)
==
0
){
...
...
@@ -3634,387 +3236,8 @@ __device__ void convertCorrectTile(
//#endif
}
#ifdef NOICLT1
extern
"C"
__global__
void
test_imclt
(
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int
ncam
)
// just for debug print
// Initially - no output, will add later
{
// dim3 t = threadIdx;
int
tile_in_block
=
threadIdx
.
y
;
int
tile_num
=
blockIdx
.
x
*
IMCLT_TILES_PER_BLOCK
+
tile_in_block
;
if
(
tile_num
>=
1
)
return
;
// just testing with a single tile
int
thr3
=
threadIdx
.
x
>>
3
;
int
column
=
threadIdx
.
x
;
// modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// Read clt tile to
__shared__
float
clt_tiles
[
IMCLT_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
mclt_tiles
[
IMCLT_TILES_PER_BLOCK
][
DTT_SIZE2
][
DTT_SIZE21
];
// Read clt tile from device memory
for
(
int
color
=
0
;
color
<
NUM_COLORS
;
color
++
)
{
float
*
clt_tile
=
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
gpu_tile
=
((
float
*
)
gpu_clt
)
+
((
DBG_TILE_Y
*
TILESX
+
DBG_TILE_X
)
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
);
// top left quadrant0
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n\n\n
================== gpu_tile = 0x%lx, clt_tile = 0x%lx, COLOR=%d, ncam = %d ======================
\n
"
,
gpu_tile
,
clt_tile
,
color
,
ncam
);
}
#endif
clt_tile
+=
column
+
thr3
;
// first 2 rows
gpu_tile
+=
column
;
// first 2 rows
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE2
;
i
++
){
*
clt_tile
=
*
gpu_tile
;
clt_tile
+=
(
2
*
DTT_SIZE1
);
gpu_tile
+=
(
2
*
DTT_SIZE
);
}
// reset mclt tile to zero
float
*
mclt_tile
=
((
float
*
)
mclt_tiles
)
+
tile_in_block
*
(
DTT_SIZE2
*
DTT_SIZE21
)
+
column
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE2
;
i
++
){
*
mclt_tile
=
0.0
f
;
mclt_tile
+=
DTT_SIZE21
;
}
__syncthreads
();
// __syncwarp();
imclt
(
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
),
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
((
float
*
)
mclt_tiles
)
+
tile_in_block
*
(
DTT_SIZE2
*
DTT_SIZE21
));
// float * mclt_tile )
__syncthreads
();
// __syncwarp();
}
}
#endif // NOICLT1
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
)
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int
thr3
=
threadIdx
.
x
>>
3
;
int
column
=
threadIdx
.
x
;
// modify to use 2*8 threads, if needed.
int
thr012
=
threadIdx
.
x
&
7
;
int
column4
=
threadIdx
.
x
>>
2
;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int
wcolumn
=
((
thr3
<<
3
)
-
thr3
)
^
thr012
;
//0..7,7,..0
float
*
clt_tile1
=
clt_tile
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile2
=
clt_tile1
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile3
=
clt_tile2
+
(
DTT_SIZE1
*
DTT_SIZE
);
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles before IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg
(
// clt_tile + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 1
clt_tile
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 0 and 2
1
);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg
(
// all colors
// clt_tile2 + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 2 and 3
clt_tile1
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 1 and 3
1
);
__syncthreads
();
// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg
(
// clt_tile + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 2
clt_tile
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 0 and 1
DTT_SIZE1
);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg
(
// clt_tile1 + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 1 and 3
clt_tile2
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 2 and 3
DTT_SIZE1
);
__syncthreads
();
// __syncwarp();
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
float
hw
=
HWINDOW2
[
wcolumn
];
int
clt_offset
=
imclt_indx9
[
column
];
// index in each of the 4 iclt quadrants, accounting for stride=9
float
*
rslt
=
mclt_tile
+
column
;
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
0
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
0
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
0
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
0
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
<
3
){
clt_offset
+=
DTT_SIZE1
;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
*
rslt
=
val
;
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
4
;
i
<
8
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
1
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
1
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
1
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
1
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
// if (i < 7){
clt_offset
-=
DTT_SIZE1
;
// }
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
7
;
i
>=
4
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
2
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
2
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
2
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
2
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
4
){
clt_offset
-=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
3
;
i
>=
0
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
3
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
3
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
3
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
3
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
0
){
clt_offset
+=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#ifdef DEBUG3
__syncthreads
();
// __syncwarp();
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
MCLT Tiles after IMCLT
\n
"
);
debug_print_mclt
(
mclt_tile
,
-
1
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
}
//#endif
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__
void
imclt8threads
(
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
,
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int
debug
)
{
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float
*
clt_tile1
=
clt_tile
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile2
=
clt_tile1
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile3
=
clt_tile2
+
(
DTT_SIZE1
*
DTT_SIZE
);
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
DTT Tiles before IDTT
\n
"
);
debug_print_clt_scaled
(
clt_tile
,
-
1
,
0xf
,
0.25
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg
(
// quadrant 0
clt_tile
+
threadIdx
.
x
,
// pointer to start of row for quadrant 0
DTT_SIZE1
);
dctiv_nodiverg
(
// quadrant 1
clt_tile
+
threadIdx
.
x
+
(
1
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 1
DTT_SIZE1
);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg
(
// quadrant 2
clt_tile
+
threadIdx
.
x
+
(
2
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 2
DTT_SIZE1
);
dstiv_nodiverg
(
// quadrant 3
clt_tile
+
threadIdx
.
x
+
(
3
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 3
DTT_SIZE1
);
__syncthreads
();
// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg
(
// quadrant 0
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
,
// pointer to start of row for quadrant 0
1
);
dctiv_nodiverg
(
// quadrant 2
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
2
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 2
1
);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg
(
// quadrant 1
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
1
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 1
1
);
dstiv_nodiverg
(
// quadrant 3
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
3
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 3
1
);
__syncthreads
();
// __syncwarp();
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_clt_scaled
(
clt_tile
,
-
1
,
0xf
,
0.25
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
for
(
int
thr3
=
0
;
thr3
<
2
;
thr3
++
){
int
thr3m
=
(
thr3
<<
3
);
int
column
=
threadIdx
.
x
+
thr3m
;
// modify to use 2*8 threads, if needed.
int
thr012
=
threadIdx
.
x
&
7
;
// == threadIdx.x
int
column4
=
column
>>
2
;
// (threadIdx.x >> 2) | (thr3 << 1) ; // different !
int
wcolumn
=
(
thr3m
-
thr3
)
^
thr012
;
//0..7,7,..0
float
hw
=
HWINDOW2
[
wcolumn
];
int
clt_offset
=
imclt_indx9
[
column
];
// index in each of the 4 iclt quadrants, accounting for stride=9
float
*
rslt
=
mclt_tile
+
column
;
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
Unrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d
\n
"
,
thr3
,
thr3m
,
column
,
thr012
,
column4
,
wcolumn
,
hw
,
clt_offset
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
){
float
val
=
*
rslt
;
// facc
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
0
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
0
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
0
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
0
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
<
3
){
clt_offset
+=
DTT_SIZE1
;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// val =__fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = val;
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
4
;
i
<
8
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
1
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
1
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
1
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
1
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
// if (i < 7){
clt_offset
-=
DTT_SIZE1
;
// }
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
7
;
i
>=
4
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
2
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
2
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
2
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
2
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
4
){
clt_offset
-=
DTT_SIZE1
;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
3
;
i
>=
0
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
3
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
3
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
3
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
3
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
0
){
clt_offset
+=
DTT_SIZE1
;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
}
#ifdef DEBUG7
__syncthreads
();
// __syncwarp();
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
printf
(
"
\n
MCLT Tiles after IMCLT, cam=%d
\n
"
,
threadIdx
.
y
);
debug_print_mclt
(
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
}
__device__
void
debayer_shot
(
const
int
rb_mode
,
// 0 - green, 1 - r/b
...
...
src/main/resources/kernels/dtt8x8.cu
h
→
src/main/resources/kernels/dtt8x8.cu
View file @
3d5ddc28
...
...
@@ -45,14 +45,8 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#include "dtt8x8.h"
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
...
...
@@ -84,21 +78,33 @@ __constant__ float COSN1[] = {0.980785f,0.831470f};
__constant__
float
COSN2
[]
=
{
0.995185
f
,
0.956940
f
,
0.881921
f
,
0.773010
f
};
__constant__
float
SINN1
[]
=
{
0.195090
f
,
0.555570
f
};
__constant__
float
SINN2
[]
=
{
0.098017
f
,
0.290285
f
,
0.471397
f
,
0.634393
f
};
__constant__
int
imclt_indx9
[
16
]
=
{
0x28
,
0x29
,
0x2a
,
0x2b
,
0x2b
,
0x2a
,
0x29
,
0x28
,
0x27
,
0x26
,
0x25
,
0x24
,
0x24
,
0x25
,
0x26
,
0x27
};
__constant__
float
idct_signs
[
4
][
4
][
4
]
=
{
{
// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
},
{
-
1
,
-
1
,
-
1
,
1
}
},{
// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
1
,
-
1
,
-
1
,
-
1
},
{
-
1
,
1
,
1
,
1
}
},{
// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
1
,
1
,
1
,
-
1
},
{
-
1
,
-
1
,
-
1
,
1
}
}};
__constant__
float
HWINDOW2
[]
=
{
0.049009
f
,
0.145142
f
,
0.235698
f
,
0.317197
f
,
0.386505
f
,
0.440961
f
,
0.478470
f
,
0.497592
f
};
inline
__device__
void
dttii_shared_mem_nonortho
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// does not scale by y[0] (y[7]) by 1/sqrt[0]
inline
__device__
void
dttii_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// used in GPU_DTT24_DRV
inline
__device__
void
dttiv_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// used in GPU_DTT24_DRV
inline
__device__
void
dttiv_nodiverg
(
float
*
x
,
int
inc
,
int
dst_not_dct
);
// not used
inline
__device__
void
dctiv_nodiverg
(
float
*
x0
,
int
inc
);
// used in TP
inline
__device__
void
dstiv_nodiverg
(
float
*
x0
,
int
inc
);
// used in TP
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
dst_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
_dctiv_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
/**
**************************************************************************
...
...
@@ -120,7 +126,7 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
*
* \return None
*/
#ifdef BBBB
extern
"C"
__global__
void
GPU_DTT24_DRV
(
float
*
dst
,
float
*
src
,
int
src_stride
,
int
dtt_mode
)
{
...
...
@@ -159,6 +165,7 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
for
(
unsigned
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
dst
[
i
*
src_stride
]
=
bl_ptr
[
i
*
DTTTEST_BLK_STRIDE
];
}
#endif //#ifdef BBBB
...
...
@@ -218,7 +225,7 @@ inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to
y
[
7
]
=
SQRT_2
*
vb00
;
// w1[3];
}
inline
__device__
void
_dttiv
(
float
x0
,
float
x1
,
float
x2
,
float
x3
,
float
x4
,
float
x5
,
float
x6
,
float
x7
,
__device__
void
_dttiv
(
float
x0
,
float
x1
,
float
x2
,
float
x3
,
float
x4
,
float
x5
,
float
x6
,
float
x7
,
float
*
y0
,
float
*
y1
,
float
*
y2
,
float
*
y3
,
float
*
y4
,
float
*
y5
,
float
*
y6
,
float
*
y7
,
int
dst_not_dct
)
{
float
u00
,
u01
,
u02
,
u03
,
u10
,
u11
,
u12
,
u13
;
...
...
@@ -746,7 +753,7 @@ inline __device__ void dstiv_nodiverg(float * x, int inc)
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
float
u00
=
(
x
[
0
]
+
x
[
7
]);
float
u10
=
(
x
[
0
]
-
x
[
7
]);
...
...
@@ -807,7 +814,7 @@ inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to
y
[
7
]
=
v13
;
}
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
_dctii_nrecurs8
(
x
,
y
);
#pragma unroll
...
...
@@ -817,7 +824,7 @@ inline __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-elemen
}
inline
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
])
// x,y point to 8-element arrays each
{
_dctiv_nrecurs8
(
x
,
y
);
#pragma unroll
...
...
@@ -843,4 +850,438 @@ inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-elemen
}
//=========================== 2D functions ===============
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
)
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int
size2r1
=
2
*
corr_radius
+
1
;
// 15
int
crp1
=
corr_radius
+
1
;
//8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int
rslt_base_index
=
size2r1
*
crp1
-
crp1
;
// offset of the center
float
*
qdata1
=
qdata0
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata2
=
qdata1
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata3
=
qdata2
+
(
DTT_SIZE
*
DTT_SIZE1
);
int
i
=
threadIdx
.
x
;
if
(
i
>
corr_radius
)
{
return
;
// not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float
corr_pixscale
=
0.25
f
;
int
i_transform_size
=
i
*
DTT_SIZE1
;
// used to address source rows which are 9 long
int
im1_transform_size
=
i_transform_size
-
DTT_SIZE1
;
// negative for i = 0, use only after divergence
/// int rslt_row_offs = i * DTT_SIZE2M1;
int
rslt_row_offs
=
i
*
size2r1
;
int
rslt_base_index_p
=
rslt_base_index
+
rslt_row_offs
;
// i * DTT_SIZE2M1;
int
rslt_base_index_m
=
rslt_base_index
-
rslt_row_offs
;
// i * DTT_SIZE2M1;
rslt
[
rslt_base_index_p
]
=
corr_pixscale
*
qdata0
[
i_transform_size
];
// incomplete, will only be used for thread i=0
rslt
[
rslt_base_index_m
]
=
rslt
[
rslt_base_index_p
];
// nop for i=0 incomplete, will only be used for thread i=0
/// for (int j = 1; j < DTT_SIZE; j++) {
for
(
int
j
=
1
;
j
<=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
rslt
[
rslt_base_index_pp
]
=
corr_pixscale
*
(
qdata0
[
i_transform_size
+
j
]
+
qdata1
[
i_transform_size
+
j
-
1
]);
// incomplete, will only be used for thread i=0
rslt
[
rslt_base_index_pm
]
=
corr_pixscale
*
(
qdata0
[
i_transform_size
+
j
]
+
-
qdata1
[
i_transform_size
+
j
-
1
]);
// incomplete, will only be used for thread i=0
}
if
(
i
==
0
)
{
return
;
}
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float
d
=
corr_pixscale
*
qdata2
[
im1_transform_size
];
rslt
[
rslt_base_index_p
]
+=
d
;
rslt
[
rslt_base_index_m
]
-=
d
;
for
(
int
j
=
1
;
j
<=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
int
rslt_base_index_mp
=
rslt_base_index_m
+
j
;
int
rslt_base_index_mm
=
rslt_base_index_m
-
j
;
float
d2
=
corr_pixscale
*
qdata2
[
im1_transform_size
+
j
];
float
d3
=
corr_pixscale
*
qdata3
[
im1_transform_size
+
j
-
1
];
//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
rslt
[
rslt_base_index_mp
]
=
rslt
[
rslt_base_index_pp
]
-
d2
-
d3
;
rslt
[
rslt_base_index_mm
]
=
rslt
[
rslt_base_index_pm
]
-
d2
+
d3
;
rslt
[
rslt_base_index_pp
]
+=
d2
+
d3
;
rslt
[
rslt_base_index_pm
]
+=
d2
-
d3
;
}
}
__device__
void
dttii_2d
(
float
*
clt_corr
)
// shared memory, [4][DTT_SIZE1][DTT_SIZE]
{
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for
(
int
q
=
0
;
q
<
4
;
q
++
){
int
is_sin
=
(
q
>>
1
)
&
1
;
dttii_shared_mem_nonortho
(
clt_corr
+
q
*
(
DTT_SIZE1
*
DTT_SIZE
)
+
threadIdx
.
x
,
DTT_SIZE1
,
is_sin
);
// vertical pass, thread is column
}
__syncthreads
();
// hor pass, corresponding to vert pass in Java
for
(
int
q
=
0
;
q
<
4
;
q
++
){
int
is_sin
=
q
&
1
;
dttii_shared_mem_nonortho
(
clt_corr
+
(
q
*
DTT_SIZE
+
threadIdx
.
x
)
*
DTT_SIZE1
,
1
,
is_sin
);
// horizontal pass, tread is row
}
__syncthreads
();
}
__device__
void
dttiv_color_2d
(
float
*
clt_tile
,
int
color
)
{
dctiv_nodiverg
(
// all colors
clt_tile
+
(
DTT_SIZE1
*
threadIdx
.
x
),
// [0][threadIdx.x], // pointer to start of row
1
);
//int inc);
if
(
color
==
BAYER_GREEN
){
dstiv_nodiverg
(
// all colors
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
DTT_SIZE1
*
DTT_SIZE
,
// clt_tile[1][threadIdx.x], // pointer to start of row
1
);
//int inc);
}
__syncthreads
();
// __syncwarp();
#ifdef DEBUG222
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after horizontal pass, color=%d
\n
"
,
color
);
debug_print_clt1
(
clt_tile
,
color
,
(
color
==
BAYER_GREEN
)
?
3
:
1
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
dctiv_nodiverg
(
// all colors
clt_tile
+
threadIdx
.
x
,
// &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1
);
// int inc,
if
(
color
==
BAYER_GREEN
){
dctiv_nodiverg
(
// all colors
clt_tile
+
threadIdx
.
x
+
(
DTT_SIZE1
*
DTT_SIZE
),
// &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1
);
// int inc,
}
__syncthreads
();
// __syncwarp();
}
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
)
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int
thr3
=
threadIdx
.
x
>>
3
;
int
column
=
threadIdx
.
x
;
// modify to use 2*8 threads, if needed.
int
thr012
=
threadIdx
.
x
&
7
;
int
column4
=
threadIdx
.
x
>>
2
;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int
wcolumn
=
((
thr3
<<
3
)
-
thr3
)
^
thr012
;
//0..7,7,..0
float
*
clt_tile1
=
clt_tile
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile2
=
clt_tile1
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile3
=
clt_tile2
+
(
DTT_SIZE1
*
DTT_SIZE
);
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles before IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg
(
clt_tile
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 0 and 2
1
);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg
(
// all colors
clt_tile1
+
DTT_SIZE1
*
(
thr012
+
2
*
DTT_SIZE
*
thr3
),
// pointer to start of row for quadrants 1 and 3
1
);
__syncthreads
();
// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg
(
clt_tile
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 0 and 1
DTT_SIZE1
);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg
(
clt_tile2
+
thr012
+
(
DTT_SIZE1
*
DTT_SIZE
)
*
thr3
,
// pointer to start of row for quadrants 2 and 3
DTT_SIZE1
);
__syncthreads
();
// __syncwarp();
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
float
hw
=
HWINDOW2
[
wcolumn
];
int
clt_offset
=
imclt_indx9
[
column
];
// index in each of the 4 iclt quadrants, accounting for stride=9
float
*
rslt
=
mclt_tile
+
column
;
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
0
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
0
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
0
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
0
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
<
3
){
clt_offset
+=
DTT_SIZE1
;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
*
rslt
=
val
;
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
4
;
i
<
8
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
1
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
1
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
1
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
1
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
// if (i < 7){
clt_offset
-=
DTT_SIZE1
;
// }
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
7
;
i
>=
4
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
2
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
2
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
2
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
2
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
4
){
clt_offset
-=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
3
;
i
>=
0
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
3
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
3
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
3
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
3
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
0
){
clt_offset
+=
DTT_SIZE1
;
}
*
rslt
=
__fmaf_rd
(
w
,
d0
,
val
);
// w*d0 + val
rslt
+=
DTT_SIZE21
;
}
#ifdef DEBUG3
__syncthreads
();
// __syncwarp();
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
MCLT Tiles after IMCLT
\n
"
);
debug_print_mclt
(
mclt_tile
,
-
1
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__
void
imclt8threads
(
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
,
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int
debug
)
{
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float
*
clt_tile1
=
clt_tile
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile2
=
clt_tile1
+
(
DTT_SIZE1
*
DTT_SIZE
);
float
*
clt_tile3
=
clt_tile2
+
(
DTT_SIZE1
*
DTT_SIZE
);
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
DTT Tiles before IDTT
\n
"
);
debug_print_clt_scaled
(
clt_tile
,
-
1
,
0xf
,
0.25
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg
(
// quadrant 0
clt_tile
+
threadIdx
.
x
,
// pointer to start of row for quadrant 0
DTT_SIZE1
);
dctiv_nodiverg
(
// quadrant 1
clt_tile
+
threadIdx
.
x
+
(
1
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 1
DTT_SIZE1
);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg
(
// quadrant 2
clt_tile
+
threadIdx
.
x
+
(
2
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 2
DTT_SIZE1
);
dstiv_nodiverg
(
// quadrant 3
clt_tile
+
threadIdx
.
x
+
(
3
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 3
DTT_SIZE1
);
__syncthreads
();
// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg
(
// quadrant 0
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
,
// pointer to start of row for quadrant 0
1
);
dctiv_nodiverg
(
// quadrant 2
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
2
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 2
1
);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg
(
// quadrant 1
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
1
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 1
1
);
dstiv_nodiverg
(
// quadrant 3
clt_tile
+
DTT_SIZE1
*
threadIdx
.
x
+
(
3
*
DTT_SIZE
*
DTT_SIZE1
),
// pointer to start of row for quadrant 3
1
);
__syncthreads
();
// __syncwarp();
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
DTT Tiles after IDTT
\n
"
);
debug_print_clt_scaled
(
clt_tile
,
-
1
,
0xf
,
0.25
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
// re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
for
(
int
thr3
=
0
;
thr3
<
2
;
thr3
++
){
int
thr3m
=
(
thr3
<<
3
);
int
column
=
threadIdx
.
x
+
thr3m
;
// modify to use 2*8 threads, if needed.
int
thr012
=
threadIdx
.
x
&
7
;
// == threadIdx.x
int
column4
=
column
>>
2
;
// (threadIdx.x >> 2) | (thr3 << 1) ; // different !
int
wcolumn
=
(
thr3m
-
thr3
)
^
thr012
;
//0..7,7,..0
float
hw
=
HWINDOW2
[
wcolumn
];
int
clt_offset
=
imclt_indx9
[
column
];
// index in each of the 4 iclt quadrants, accounting for stride=9
float
*
rslt
=
mclt_tile
+
column
;
#ifdef DEBUG7
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
Unrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d
\n
"
,
thr3
,
thr3m
,
column
,
thr012
,
column4
,
wcolumn
,
hw
,
clt_offset
);
debug_print_clt1
(
clt_tile
,
-
1
,
0xf
);
// only 1 quadrant for R,B and 2 - for G
}
__syncthreads
();
// __syncwarp();
#endif
#pragma unroll
for
(
int
i
=
0
;
i
<
4
;
i
++
){
float
val
=
*
rslt
;
// facc
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
0
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
0
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
0
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
0
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
<
3
){
clt_offset
+=
DTT_SIZE1
;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// val =__fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = val;
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
4
;
i
<
8
;
i
++
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
1
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
1
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
1
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
1
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
// if (i < 7){
clt_offset
-=
DTT_SIZE1
;
// }
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
7
;
i
>=
4
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
2
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
2
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
2
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
2
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
4
){
clt_offset
-=
DTT_SIZE1
;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
#pragma unroll
for
(
int
i
=
3
;
i
>=
0
;
i
--
){
float
val
=
*
rslt
;
float
w
=
HWINDOW2
[
i
]
*
hw
;
float
d0
=
idct_signs
[
0
][
3
][
column4
]
*
(
*
(
clt_tile
+
clt_offset
));
float
d1
=
idct_signs
[
1
][
3
][
column4
]
*
(
*
(
clt_tile1
+
clt_offset
));
float
d2
=
idct_signs
[
2
][
3
][
column4
]
*
(
*
(
clt_tile2
+
clt_offset
));
float
d3
=
idct_signs
[
3
][
3
][
column4
]
*
(
*
(
clt_tile3
+
clt_offset
));
d0
+=
d1
;
d2
+=
d3
;
d0
+=
d2
;
if
(
i
>
0
){
clt_offset
+=
DTT_SIZE1
;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*
rslt
=
do_acc
?
__fmaf_rd
(
w
,
d0
,
val
)
:
w
*
d0
;
// w*d0 + val do_acc - common for all thereads
rslt
+=
DTT_SIZE21
;
}
}
#ifdef DEBUG7
__syncthreads
();
// __syncwarp();
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
(
debug
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
printf
(
"
\n
MCLT Tiles after IMCLT, cam=%d
\n
"
,
threadIdx
.
y
);
debug_print_mclt
(
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
}
//#endif
src/main/resources/kernels/dtt8x8.h
0 → 100644
View file @
3d5ddc28
/**
**
** dtt8x8.h
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cuh is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file dtt8x8.h
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
* in GPU
* This file contains building blocks for the 16x16 stride 8 COmplex Lapped Transform (CLT)
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* software.
*
* This file is cpompatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//extern __constant__ float idct_signs[4][4][4];
//extern __constant__ int imclt_indx9[16];
//extern __constant__ float HWINDOW2[];
inline
__device__
void
dttii_shared_mem_nonortho
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// does not scale by y[0] (y[7]) by 1/sqrt[0]
inline
__device__
void
dttii_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// used in GPU_DTT24_DRV
inline
__device__
void
dttiv_shared_mem
(
float
*
x0
,
int
inc
,
int
dst_not_dct
);
// used in GPU_DTT24_DRV
inline
__device__
void
dttiv_nodiverg
(
float
*
x
,
int
inc
,
int
dst_not_dct
);
// not used
inline
__device__
void
dctiv_nodiverg
(
float
*
x0
,
int
inc
);
// used in TP
inline
__device__
void
dstiv_nodiverg
(
float
*
x0
,
int
inc
);
// used in TP
inline
__device__
void
dct_ii8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
dct_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
dst_iv8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
_dctii_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
inline
__device__
void
_dctiv_nrecurs8
(
float
x
[
8
],
float
y
[
8
]);
// x,y point to 8-element arrays each // not used
// kernels (not used so far)
#ifdef BBBB
extern
"C"
__global__
void
GPU_DTT24_DRV
(
float
*
dst
,
float
*
src
,
int
src_stride
,
int
dtt_mode
);
#endif// #ifdef BBBB
//=========================== 2D functions ===============
extern
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
);
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
extern
__device__
void
dttii_2d
(
float
*
clt_corr
);
// shared memory, [4][DTT_SIZE1][DTT_SIZE]
extern
__device__
void
dttiv_color_2d
(
float
*
clt_tile
,
int
color
);
extern
__device__
void
imclt
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
);
extern
__device__
void
imclt8threads
(
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float
*
mclt_tile
,
// [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int
debug
);
src/main/resources/kernels/test_tp.cu
0 → 100644
View file @
3d5ddc28
/**
**
** dtt8x8.cu - CPU test code to run GPU tile processor
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
float
*
copyalloc_kernel_gpu
(
float
*
kernel_host
,
int
size
,
// size in floats
int
full_size
)
{
float
*
kernel_gpu
;
checkCudaErrors
(
cudaMalloc
((
void
**
)
&
kernel_gpu
,
full_size
*
sizeof
(
float
)));
checkCudaErrors
(
cudaMemcpy
(
// segfault
kernel_gpu
,
kernel_host
,
size
*
sizeof
(
float
),
cudaMemcpyHostToDevice
));
return
kernel_gpu
;
}
float
*
copyalloc_kernel_gpu
(
float
*
kernel_host
,
int
size
)
{
return
copyalloc_kernel_gpu
(
kernel_host
,
size
,
// size in floats
size
);
}
float
*
alloccopy_from_gpu
(
float
*
gpu_data
,
float
*
cpu_data
,
// if null, will allocate
int
size
)
{
if
(
!
cpu_data
)
{
cpu_data
=
(
float
*
)
malloc
(
size
*
sizeof
(
float
));
}
checkCudaErrors
(
cudaMemcpy
(
// segfault
cpu_data
,
gpu_data
,
size
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
));
return
cpu_data
;
}
float
*
alloc_kernel_gpu
(
int
size
)
// size in floats
{
float
*
kernel_gpu
;
checkCudaErrors
(
cudaMalloc
((
void
**
)
&
kernel_gpu
,
size
*
sizeof
(
float
)));
return
kernel_gpu
;
}
float
**
copyalloc_pointers_gpu
(
float
**
gpu_pointer
,
int
size
)
// number of entries (cameras)
{
float
**
gpu_pointer_to_gpu_pointers
;
checkCudaErrors
(
cudaMalloc
((
void
**
)
&
gpu_pointer_to_gpu_pointers
,
size
*
sizeof
(
float
*
)));
checkCudaErrors
(
cudaMemcpy
(
gpu_pointer_to_gpu_pointers
,
gpu_pointer
,
size
*
sizeof
(
float
*
),
cudaMemcpyHostToDevice
));
return
gpu_pointer_to_gpu_pointers
;
}
float
*
copyalloc_image_gpu
(
float
*
image_host
,
size_t
*
dstride
,
// in floats !
int
width
,
int
height
)
{
float
*
image_gpu
;
checkCudaErrors
(
cudaMallocPitch
((
void
**
)
&
image_gpu
,
dstride
,
width
*
sizeof
(
float
),
height
));
checkCudaErrors
(
cudaMemcpy2D
(
image_gpu
,
*
dstride
,
// * sizeof(float),
image_host
,
width
*
sizeof
(
float
),
// make in 16*n?
width
*
sizeof
(
float
),
height
,
cudaMemcpyHostToDevice
));
return
image_gpu
;
}
float
*
alloc_image_gpu
(
size_t
*
dstride
,
// in bytes!!
int
width
,
int
height
)
{
float
*
image_gpu
;
checkCudaErrors
(
cudaMallocPitch
((
void
**
)
&
image_gpu
,
dstride
,
width
*
sizeof
(
float
),
height
));
return
image_gpu
;
}
int
readFloatsFromFile
(
float
*
data
,
// allocated array
const
char
*
path
)
// file path
{
std
::
ifstream
input
(
path
,
std
::
ios
::
binary
);
// copies all data into buffer
std
::
vector
<
char
>
buffer
((
std
::
istreambuf_iterator
<
char
>
(
input
)),
(
std
::
istreambuf_iterator
<
char
>
()));
std
::
copy
(
buffer
.
begin
(),
buffer
.
end
(),
(
char
*
)
data
);
return
0
;
}
int
writeFloatsToFile
(
float
*
data
,
// allocated array
int
size
,
// length in elements
const
char
*
path
)
// file path
{
// std::ifstream input(path, std::ios::binary );
std
::
ofstream
ofile
(
path
,
std
::
ios
::
binary
);
ofile
.
write
((
char
*
)
data
,
size
*
sizeof
(
float
));
return
0
;
}
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
void
set_clt_lpf
(
float
*
lpf
,
// size*size array to be filled out
float
sigma
,
const
int
dct_size
)
{
int
dct_len
=
dct_size
*
dct_size
;
if
(
sigma
==
0.0
f
)
{
lpf
[
0
]
=
1.0
f
;
for
(
int
i
=
1
;
i
<
dct_len
;
i
++
){
lpf
[
i
]
=
0.0
;
}
}
else
{
for
(
int
i
=
0
;
i
<
dct_size
;
i
++
){
for
(
int
j
=
0
;
j
<
dct_size
;
j
++
){
lpf
[
i
*
dct_size
+
j
]
=
exp
(
-
(
i
*
i
+
j
*
j
)
/
(
2
*
sigma
));
}
}
// normalize
double
sum
=
0
;
for
(
int
i
=
0
;
i
<
dct_size
;
i
++
){
for
(
int
j
=
0
;
j
<
dct_size
;
j
++
){
double
d
=
lpf
[
i
*
dct_size
+
j
];
d
*=
cos
(
M_PI
*
i
/
(
2
*
dct_size
))
*
cos
(
M_PI
*
j
/
(
2
*
dct_size
));
if
(
i
>
0
)
d
*=
2.0
;
if
(
j
>
0
)
d
*=
2.0
;
sum
+=
d
;
}
}
for
(
int
i
=
0
;
i
<
dct_len
;
i
++
){
lpf
[
i
]
/=
sum
;
}
}
}
/**
**************************************************************************
* Program entry point
*
* \param argc [IN] - Number of command-line arguments
* \param argv [IN] - Array of command-line arguments
*
* \return Status code
*/
int
main
(
int
argc
,
char
**
argv
)
{
//
// Sample initialization
//
printf
(
"%s Starting...
\n\n
"
,
argv
[
0
]);
printf
(
"sizeof(float*)=%d
\n
"
,(
int
)
sizeof
(
float
*
));
//initialize CUDA
findCudaDevice
(
argc
,
(
const
char
**
)
argv
);
// CLT testing
const
char
*
kernel_file
[]
=
{
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"
};
const
char
*
kernel_offs_file
[]
=
{
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"
};
const
char
*
image_files
[]
=
{
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"
};
const
char
*
ports_offs_xy_file
[]
=
{
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"
};
const
char
*
ports_clt_file
[]
=
{
// never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"
};
const
char
*
result_rbg_file
[]
=
{
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg"
,
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"
};
const
char
*
result_corr_file
=
"/data_ssd/git/tile_processor_gpu/clt/main_corr.corr"
;
const
char
*
result_textures_file
=
"/data_ssd/git/tile_processor_gpu/clt/texture.rgba"
;
const
char
*
result_textures_rgba_file
=
"/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba"
;
// not yet used
float
lpf_sigmas
[
3
]
=
{
0.9
f
,
0.9
f
,
0.9
f
};
// G, B, G
float
port_offsets
[
NUM_CAMS
][
2
]
=
{
// used only in textures to scale differences
{
-
0.5
,
-
0.5
},
{
0.5
,
-
0.5
},
{
-
0.5
,
0.5
},
{
0.5
,
0.5
}};
int
keep_texture_weights
=
1
;
// try with 0 also
int
texture_colors
=
3
;
// result will be 3+1 RGBA (for mono - 2)
/*
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_STEP 16
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNEL_OFFSETS 8
#define TILESX 324
#define TILESY 242
*/
/*
struct tp_task {
long task;
short ty;
short tx;
float xy[NUM_CAMS][2];
} ;
*/
int
KERN_TILES
=
KERNELS_HOR
*
KERNELS_VERT
*
NUM_COLORS
;
int
KERN_SIZE
=
KERN_TILES
*
4
*
64
;
// int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
int
CORR_SIZE
=
(
2
*
CORR_OUT_RAD
+
1
)
*
(
2
*
CORR_OUT_RAD
+
1
);
float
*
host_kern_buf
=
(
float
*
)
malloc
(
KERN_SIZE
*
sizeof
(
float
));
struct
tp_task
task_data
[
TILESX
*
TILESY
];
// maximal length - each tile
int
corr_indices
[
NUM_PAIRS
*
TILESX
*
TILESY
];
// int texture_indices [TILESX*TILESY];
int
texture_indices
[
TILESX
*
TILESYA
];
int
cpu_woi
[
4
];
// host array of pointers to GPU memory
float
*
gpu_kernels_h
[
NUM_CAMS
];
struct
CltExtra
*
gpu_kernel_offsets_h
[
NUM_CAMS
];
float
*
gpu_images_h
[
NUM_CAMS
];
float
tile_coords_h
[
NUM_CAMS
][
TILESX
*
TILESY
][
2
];
float
*
gpu_clt_h
[
NUM_CAMS
];
float
*
gpu_lpf_h
[
NUM_COLORS
];
// never used
#ifndef NOICLT
float
*
gpu_corr_images_h
[
NUM_CAMS
];
#endif
float
*
gpu_corrs
;
int
*
gpu_corr_indices
;
float
*
gpu_textures
;
float
*
gpu_textures_rbga
;
int
*
gpu_texture_indices
;
int
*
gpu_woi
;
int
*
gpu_num_texture_tiles
;
float
*
gpu_port_offsets
;
int
num_corrs
;
int
num_textures
;
int
num_ports
=
NUM_CAMS
;
// GPU pointers to GPU pointers to memory
float
**
gpu_kernels
;
// [NUM_CAMS];
struct
CltExtra
**
gpu_kernel_offsets
;
// [NUM_CAMS];
float
**
gpu_images
;
// [NUM_CAMS];
float
**
gpu_clt
;
// [NUM_CAMS];
float
**
gpu_lpf
;
// [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct
tp_task
*
gpu_tasks
;
size_t
dstride
;
// in bytes !
size_t
dstride_rslt
;
// in bytes !
size_t
dstride_corr
;
// in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t
dstride_textures
;
// in bytes ! for one rgba/ya 16x16 tile
size_t
dstride_textures_rbga
;
// in bytes ! for one rgba/ya 16x16 tile
float
lpf_rbg
[
3
][
64
];
// not used
for
(
int
ncol
=
0
;
ncol
<
3
;
ncol
++
)
{
if
(
lpf_sigmas
[
ncol
]
>
0.0
)
{
set_clt_lpf
(
lpf_rbg
[
ncol
],
// float * lpf, // size*size array to be filled out
lpf_sigmas
[
ncol
],
// float sigma,
8
);
// int dct_size)
gpu_lpf_h
[
ncol
]
=
copyalloc_kernel_gpu
(
lpf_rbg
[
ncol
],
64
);
}
else
{
gpu_lpf_h
[
ncol
]
=
NULL
;
}
}
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
readFloatsFromFile
(
host_kern_buf
,
// float * data, // allocated array
kernel_file
[
ncam
]);
// char * path) // file path
gpu_kernels_h
[
ncam
]
=
copyalloc_kernel_gpu
(
host_kern_buf
,
KERN_SIZE
);
readFloatsFromFile
(
host_kern_buf
,
// float * data, // allocated array
kernel_offs_file
[
ncam
]);
// char * path) // file path
gpu_kernel_offsets_h
[
ncam
]
=
(
struct
CltExtra
*
)
copyalloc_kernel_gpu
(
host_kern_buf
,
KERN_TILES
*
(
sizeof
(
struct
CltExtra
)
/
sizeof
(
float
)));
// will get results back
gpu_clt_h
[
ncam
]
=
alloc_kernel_gpu
(
TILESY
*
TILESX
*
NUM_COLORS
*
4
*
DTT_SIZE
*
DTT_SIZE
);
printf
(
"Allocating GPU memory, 0x%x floats
\n
"
,
(
TILESY
*
TILESX
*
NUM_COLORS
*
4
*
DTT_SIZE
*
DTT_SIZE
))
;
// allocate result images (3x height to accommodate 3 colors
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
#ifndef NOICLT
gpu_corr_images_h
[
ncam
]
=
alloc_image_gpu
(
&
dstride_rslt
,
// size_t* dstride, // in bytes!!
IMG_WIDTH
+
DTT_SIZE
,
// int width,
3
*
(
IMG_HEIGHT
+
DTT_SIZE
));
// int height);
#endif
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs
=
alloc_image_gpu
(
&
dstride_corr
,
// in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE
,
// int width,
NUM_PAIRS
*
TILESX
*
TILESY
);
// int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
readFloatsFromFile
(
host_kern_buf
,
// float * data, // allocated array
image_files
[
ncam
]);
// char * path) // file path
gpu_images_h
[
ncam
]
=
copyalloc_image_gpu
(
host_kern_buf
,
// float * image_host,
&
dstride
,
// size_t* dstride,
IMG_WIDTH
,
// int width,
IMG_HEIGHT
);
// int height);
}
//#define DBG_TILE (174*324 +118)
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
readFloatsFromFile
(
(
float
*
)
&
tile_coords_h
[
ncam
],
ports_offs_xy_file
[
ncam
]);
// char * path) // file path
}
// build TP task that processes all tiles in linescan order
for
(
int
ty
=
0
;
ty
<
TILESY
;
ty
++
){
for
(
int
tx
=
0
;
tx
<
TILESX
;
tx
++
){
int
nt
=
ty
*
TILESX
+
tx
;
task_data
[
nt
].
task
=
0xf
|
(((
1
<<
NUM_PAIRS
)
-
1
)
<<
TASK_CORR_BITS
);
task_data
[
nt
].
txy
=
tx
+
(
ty
<<
16
);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
task_data
[
nt
].
xy
[
ncam
][
0
]
=
tile_coords_h
[
ncam
][
nt
][
0
];
task_data
[
nt
].
xy
[
ncam
][
1
]
=
tile_coords_h
[
ncam
][
nt
][
1
];
}
}
}
int
tp_task_size
=
sizeof
(
task_data
)
/
sizeof
(
struct
tp_task
);
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for
(
int
t
=
0
;
t
<
NUM_TEST_TILES
;
t
++
)
{
task_data
[
t
].
task
=
1
;
task_data
[
t
].
txy
=
((
DBG_TILE
+
t
)
-
324
*
((
DBG_TILE
+
t
)
/
324
))
+
(((
DBG_TILE
+
t
)
/
324
))
<<
16
;
int
nt
=
task_data
[
t
].
ty
*
TILESX
+
task_data
[
t
].
tx
;
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
task_data
[
t
].
xy
[
ncam
][
0
]
=
tile_coords_h
[
ncam
][
nt
][
0
];
task_data
[
t
].
xy
[
ncam
][
1
]
=
tile_coords_h
[
ncam
][
nt
][
1
];
}
}
tp_task_size
=
NUM_TEST_TILES
;
// sizeof(task_data)/sizeof(float);
#endif
// segfault in the next
gpu_tasks
=
(
struct
tp_task
*
)
copyalloc_kernel_gpu
((
float
*
)
&
task_data
,
tp_task_size
*
(
sizeof
(
struct
tp_task
)
/
sizeof
(
float
)));
// build corr_indices
num_corrs
=
0
;
for
(
int
ty
=
0
;
ty
<
TILESY
;
ty
++
){
for
(
int
tx
=
0
;
tx
<
TILESX
;
tx
++
){
int
nt
=
ty
*
TILESX
+
tx
;
int
cm
=
(
task_data
[
nt
].
task
>>
TASK_CORR_BITS
)
&
((
1
<<
NUM_PAIRS
)
-
1
);
if
(
cm
){
for
(
int
b
=
0
;
b
<
NUM_PAIRS
;
b
++
)
if
((
cm
&
(
1
<<
b
))
!=
0
)
{
corr_indices
[
num_corrs
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices
=
(
int
*
)
copyalloc_kernel_gpu
(
(
float
*
)
corr_indices
,
num_corrs
,
NUM_PAIRS
*
TILESX
*
TILESY
);
// build texture_indices
num_textures
=
0
;
for
(
int
ty
=
0
;
ty
<
TILESY
;
ty
++
){
for
(
int
tx
=
0
;
tx
<
TILESX
;
tx
++
){
int
nt
=
ty
*
TILESX
+
tx
;
// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
int
cm
=
task_data
[
nt
].
task
&
TASK_TEXTURE_BITS
;
if
(
cm
){
texture_indices
[
num_textures
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
(
1
<<
LIST_TEXTURE_BIT
);
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
gpu_texture_indices
=
(
int
*
)
copyalloc_kernel_gpu
(
(
float
*
)
texture_indices
,
num_textures
,
TILESX
*
TILESYA
);
// number of rows - multiple of 4
// just allocate
checkCudaErrors
(
cudaMalloc
((
void
**
)
&
gpu_woi
,
4
*
sizeof
(
float
)));
checkCudaErrors
(
cudaMalloc
((
void
**
)
&
gpu_num_texture_tiles
,
8
*
sizeof
(
float
)));
// for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets
=
(
float
*
)
copyalloc_kernel_gpu
((
float
*
)
port_offsets
,
num_ports
*
2
);
// int keep_texture_weights = 1; // try with 0 also
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
int
tile_texture_size
=
(
texture_colors
+
1
+
(
keep_texture_weights
?
(
NUM_CAMS
+
texture_colors
+
1
)
:
0
))
*
256
;
gpu_textures
=
alloc_image_gpu
(
&
dstride_textures
,
// in bytes ! for one rgba/ya 16x16 tile
tile_texture_size
,
// int width (floats),
TILESX
*
TILESY
);
// int height);
int
rgba_width
=
(
TILESX
+
1
)
*
DTT_SIZE
;
int
rgba_height
=
(
TILESY
+
1
)
*
DTT_SIZE
;
int
rbga_slices
=
texture_colors
+
1
;
// 4/1
gpu_textures_rbga
=
alloc_image_gpu
(
&
dstride_textures_rbga
,
// in bytes ! for one rgba/ya 16x16 tile
rgba_width
,
// int width (floats),
rgba_height
*
rbga_slices
);
// int height);
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels
=
copyalloc_pointers_gpu
(
gpu_kernels_h
,
NUM_CAMS
);
gpu_kernel_offsets
=
(
struct
CltExtra
**
)
copyalloc_pointers_gpu
((
float
**
)
gpu_kernel_offsets_h
,
NUM_CAMS
);
gpu_images
=
copyalloc_pointers_gpu
(
gpu_images_h
,
NUM_CAMS
);
gpu_clt
=
copyalloc_pointers_gpu
(
gpu_clt_h
,
NUM_CAMS
);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
//create and start CUDA timer
StopWatchInterface
*
timerTP
=
0
;
sdkCreateTimer
(
&
timerTP
);
dim3
threads_tp
(
THREADSX
,
TILES_PER_BLOCK
,
1
);
dim3
grid_tp
((
tp_task_size
+
TILES_PER_BLOCK
-
1
)
/
TILES_PER_BLOCK
,
1
);
printf
(
"threads_tp=(%d, %d, %d)
\n
"
,
threads_tp
.
x
,
threads_tp
.
y
,
threads_tp
.
z
);
printf
(
"grid_tp= (%d, %d, %d)
\n
"
,
grid_tp
.
x
,
grid_tp
.
y
,
grid_tp
.
z
);
#ifdef DBG_TILE
const
int
numIterations
=
1
;
//0;
const
int
i0
=
0
;
// -1;
#else
const
int
numIterations
=
10
;
// 0; //0;
const
int
i0
=
-
1
;
// 0; // -1;
#endif
cudaFuncSetCacheConfig
(
convert_correct_tiles
,
cudaFuncCachePreferShared
);
/// cudaProfilerStart();
float
**
fgpu_kernel_offsets
=
(
float
**
)
gpu_kernel_offsets
;
// [NUM_CAMS];
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerTP
);
sdkStartTimer
(
&
timerTP
);
}
convert_correct_tiles
<<<
grid_tp
,
threads_tp
>>>
(
fgpu_kernel_offsets
,
// struct CltExtra ** gpu_kernel_offsets,
gpu_kernels
,
// float ** gpu_kernels,
gpu_images
,
// float ** gpu_images,
gpu_tasks
,
// struct tp_task * gpu_tasks,
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride
/
sizeof
(
float
),
// size_t dstride, // for gpu_images
tp_task_size
,
// int num_tiles) // number of tiles in task
0
);
// 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError
(
"Kernel execution failed"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"%d
\n
"
,
i
);
}
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer
(
&
timerTP
);
float
avgTime
=
(
float
)
sdkGetTimerValue
(
&
timerTP
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerTP
);
printf
(
"Run time =%f ms
\n
"
,
avgTime
);
#ifdef SAVE_CLT
int
rslt_size
=
(
TILESY
*
TILESX
*
NUM_COLORS
*
4
*
DTT_SIZE
*
DTT_SIZE
);
float
*
cpu_clt
=
(
float
*
)
malloc
(
rslt_size
*
sizeof
(
float
));
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
checkCudaErrors
(
cudaMemcpy
(
// segfault
cpu_clt
,
gpu_clt_h
[
ncam
],
rslt_size
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
));
#ifndef DBG_TILE
printf
(
"Writing CLT data to %s
\n
"
,
ports_clt_file
[
ncam
]);
writeFloatsToFile
(
cpu_clt
,
// float * data, // allocated array
rslt_size
,
// int size, // length in elements
ports_clt_file
[
ncam
]);
// const char * path) // file path
#endif
}
#endif
#ifdef TEST_IMCLT
{
// testing imclt
dim3
threads_imclt
(
IMCLT_THREADS_PER_TILE
,
IMCLT_TILES_PER_BLOCK
,
1
);
dim3
grid_imclt
(
1
,
1
,
1
);
printf
(
"threads_imclt=(%d, %d, %d)
\n
"
,
threads_imclt
.
x
,
threads_imclt
.
y
,
threads_imclt
.
z
);
printf
(
"grid_imclt= (%d, %d, %d)
\n
"
,
grid_imclt
.
x
,
grid_imclt
.
y
,
grid_imclt
.
z
);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
test_imclt
<<<
grid_imclt
,
threads_imclt
>>>
(
gpu_clt_h
[
ncam
],
// ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ncam
);
// int ncam); // just for debug print
}
getLastCudaError
(
"Kernel execution failed"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test_imclt() DONE
\n
"
);
}
#endif
#ifndef NOICLT
// testing imclt
dim3
threads_imclt
(
IMCLT_THREADS_PER_TILE
,
IMCLT_TILES_PER_BLOCK
,
1
);
printf
(
"threads_imclt=(%d, %d, %d)
\n
"
,
threads_imclt
.
x
,
threads_imclt
.
y
,
threads_imclt
.
z
);
StopWatchInterface
*
timerIMCLT
=
0
;
sdkCreateTimer
(
&
timerIMCLT
);
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerIMCLT
);
sdkStartTimer
(
&
timerIMCLT
);
}
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
for
(
int
color
=
0
;
color
<
NUM_COLORS
;
color
++
)
{
#ifdef IMCLT14
for
(
int
v_offs
=
0
;
v_offs
<
1
;
v_offs
++
){
// temporarily for debugging
for
(
int
h_offs
=
0
;
h_offs
<
1
;
h_offs
++
){
// temporarily for debugging
#else
for
(
int
v_offs
=
0
;
v_offs
<
2
;
v_offs
++
){
for
(
int
h_offs
=
0
;
h_offs
<
2
;
h_offs
++
){
#endif
int
tilesy_half
=
(
TILESY
+
(
v_offs
^
1
))
>>
1
;
int
tilesx_half
=
(
TILESX
+
(
h_offs
^
1
))
>>
1
;
int
tiles_in_pass
=
tilesy_half
*
tilesx_half
;
dim3
grid_imclt
((
tiles_in_pass
+
IMCLT_TILES_PER_BLOCK
-
1
)
/
IMCLT_TILES_PER_BLOCK
,
1
,
1
);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg
<<<
grid_imclt
,
threads_imclt
>>>
(
gpu_clt_h
[
ncam
],
// float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h
[
ncam
],
// float * gpu_rbg, // WIDTH, 3 * HEIGHT
color
,
// int color,
v_offs
,
// int v_offset,
h_offs
,
// int h_offset,
dstride_rslt
/
sizeof
(
float
));
//const size_t dstride); // in floats (pixels)
}
}
}
}
getLastCudaError
(
"Kernel failure"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test pass: %d
\n
"
,
i
);
}
sdkStopTimer
(
&
timerIMCLT
);
float
avgTimeIMCLT
=
(
float
)
sdkGetTimerValue
(
&
timerIMCLT
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerIMCLT
);
printf
(
"Average IMCLT run time =%f ms
\n
"
,
avgTimeIMCLT
);
int
rslt_img_size
=
NUM_COLORS
*
(
IMG_HEIGHT
+
DTT_SIZE
)
*
(
IMG_WIDTH
+
DTT_SIZE
);
float
*
cpu_corr_image
=
(
float
*
)
malloc
(
rslt_img_size
*
sizeof
(
float
));
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
checkCudaErrors
(
cudaMemcpy2D
(
// segfault
cpu_corr_image
,
(
IMG_WIDTH
+
DTT_SIZE
)
*
sizeof
(
float
),
gpu_corr_images_h
[
ncam
],
dstride_rslt
,
(
IMG_WIDTH
+
DTT_SIZE
)
*
sizeof
(
float
),
3
*
(
IMG_HEIGHT
+
DTT_SIZE
),
cudaMemcpyDeviceToHost
));
#ifndef DBG_TILE
printf
(
"Writing RBG data to %s
\n
"
,
result_rbg_file
[
ncam
]);
writeFloatsToFile
(
// will have margins
cpu_corr_image
,
// float * data, // allocated array
rslt_img_size
,
// int size, // length in elements
result_rbg_file
[
ncam
]);
// const char * path) // file path
#endif
}
free
(
cpu_corr_image
);
#endif
#ifndef NOCORR
// cudaProfilerStart();
// testing corr
dim3
threads_corr
(
CORR_THREADS_PER_TILE
,
CORR_TILES_PER_BLOCK
,
1
);
printf
(
"threads_corr=(%d, %d, %d)
\n
"
,
threads_corr
.
x
,
threads_corr
.
y
,
threads_corr
.
z
);
StopWatchInterface
*
timerCORR
=
0
;
sdkCreateTimer
(
&
timerCORR
);
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerCORR
);
sdkStartTimer
(
&
timerCORR
);
}
dim3
grid_corr
((
num_corrs
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
);
correlate2D
<<<
grid_corr
,
threads_corr
>>>
(
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3
,
// int colors, // number of colors (3/1)
0.25
,
// float scale0, // scale for R
0.25
,
// float scale1, // scale for B
0.5
,
// float scale2, // scale for G
30.0
,
// float fat_zero, // here - absolute
num_corrs
,
// size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile+pair
dstride_corr
/
sizeof
(
float
),
// const size_t corr_stride, // in floats
CORR_OUT_RAD
,
// int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs
);
// float * gpu_corrs); // correlation output data
getLastCudaError
(
"Kernel failure"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test pass: %d
\n
"
,
i
);
}
sdkStopTimer
(
&
timerCORR
);
float
avgTimeCORR
=
(
float
)
sdkGetTimerValue
(
&
timerCORR
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerCORR
);
printf
(
"Average CORR run time =%f ms
\n
"
,
avgTimeCORR
);
int
corr_size
=
2
*
CORR_OUT_RAD
+
1
;
int
rslt_corr_size
=
num_corrs
*
corr_size
*
corr_size
;
float
*
cpu_corr
=
(
float
*
)
malloc
(
rslt_corr_size
*
sizeof
(
float
));
checkCudaErrors
(
cudaMemcpy2D
(
cpu_corr
,
(
corr_size
*
corr_size
)
*
sizeof
(
float
),
gpu_corrs
,
dstride_corr
,
(
corr_size
*
corr_size
)
*
sizeof
(
float
),
num_corrs
,
cudaMemcpyDeviceToHost
));
#ifndef NSAVE_CORR
printf
(
"Writing phase correlation data to %s
\n
"
,
result_corr_file
);
writeFloatsToFile
(
cpu_corr
,
// float * data, // allocated array
rslt_corr_size
,
// int size, // length in elements
result_corr_file
);
// const char * path) // file path
#endif
free
(
cpu_corr
);
#endif // ifndef NOCORR
// -----------------
#ifndef NOTEXTURES
// cudaProfilerStart();
// testing textures
dim3
threads_texture
(
TEXTURE_THREADS_PER_TILE
,
NUM_CAMS
,
1
);
// TEXTURE_TILES_PER_BLOCK, 1);
dim3
grid_texture
((
num_textures
+
TEXTURE_TILES_PER_BLOCK
-
1
)
/
TEXTURE_TILES_PER_BLOCK
,
1
,
1
);
printf
(
"threads_texture=(%d, %d, %d)
\n
"
,
threads_texture
.
x
,
threads_texture
.
y
,
threads_texture
.
z
);
printf
(
"grid_texture=(%d, %d, %d)
\n
"
,
grid_texture
.
x
,
grid_texture
.
y
,
grid_texture
.
z
);
StopWatchInterface
*
timerTEXTURE
=
0
;
sdkCreateTimer
(
&
timerTEXTURE
);
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerTEXTURE
);
sdkStartTimer
(
&
timerTEXTURE
);
}
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
textures_accumulate
<<<
grid_texture
,
threads_texture
>>>
(
// 0, // int border_tile, // if 1 - watch for border
(
int
*
)
0
,
// int * woi, // x, y, width,height
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures
,
// size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices
,
// int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets
,
// float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors
,
// int colors, // number of colors (3/1)
(
texture_colors
==
1
),
// int is_lwir, // do not perform shot correction
10.0
,
// float min_shot, // 10.0
3.0
,
// float scale_shot, // 3.0
1.5
f
,
// float diff_sigma, // pixel value/pixel change
10.0
f
,
// float diff_threshold, // pixel value/pixel change
3.0
,
// float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118
,
// float weight0, // scale for R
0.117647
,
// float weight1, // scale for B
0.588235
,
// float weight2, // scale for G
1
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights
,
// int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0
,
// const size_t texture_rbg_stride, // in floats
(
float
*
)
0
,
// float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures
/
sizeof
(
float
),
// const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures
);
// float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
getLastCudaError
(
"Kernel failure"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test pass: %d
\n
"
,
i
);
}
/// cudaProfilerStop();
sdkStopTimer
(
&
timerTEXTURE
);
float
avgTimeTEXTURES
=
(
float
)
sdkGetTimerValue
(
&
timerTEXTURE
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerTEXTURE
);
printf
(
"Average Texture run time =%f ms
\n
"
,
avgTimeTEXTURES
);
int
rslt_texture_size
=
num_textures
*
tile_texture_size
;
float
*
cpu_textures
=
(
float
*
)
malloc
(
rslt_texture_size
*
sizeof
(
float
));
checkCudaErrors
(
cudaMemcpy2D
(
cpu_textures
,
tile_texture_size
*
sizeof
(
float
),
gpu_textures
,
dstride_textures
,
tile_texture_size
*
sizeof
(
float
),
num_textures
,
cudaMemcpyDeviceToHost
));
#ifndef NSAVE_TEXTURES
printf
(
"Writing phase texture data to %s
\n
"
,
result_textures_file
);
writeFloatsToFile
(
cpu_textures
,
// float * data, // allocated array
rslt_texture_size
,
// int size, // length in elements
result_textures_file
);
// const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int
texture_offset
=
DBG_TILE
*
tile_texture_size
;
int
chn
=
0
;
for
(
int
i
=
0
;
i
<
tile_texture_size
;
i
++
){
if
((
i
%
256
)
==
0
){
printf
(
"
\n
chn = %d
\n
"
,
chn
++
);
}
printf
(
"%10.4f"
,
*
(
cpu_textures
+
texture_offset
+
i
));
if
(((
i
+
1
)
%
16
)
==
0
){
printf
(
"
\n
"
);
}
else
{
printf
(
" "
);
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
#endif // DEBUG9
#endif
free
(
cpu_textures
);
#endif // ifndef NOTEXTURES
#define GEN_TEXTURE_LIST
#ifdef GEN_TEXTURE_LIST
dim3
threads_list
(
1
,
1
,
1
);
// TEXTURE_TILES_PER_BLOCK, 1);
dim3
grid_list
(
1
,
1
,
1
);
printf
(
"threads_list=(%d, %d, %d)
\n
"
,
threads_list
.
x
,
threads_list
.
y
,
threads_list
.
z
);
printf
(
"grid_list=(%d, %d, %d)
\n
"
,
grid_list
.
x
,
grid_list
.
y
,
grid_list
.
z
);
StopWatchInterface
*
timerTEXTURELIST
=
0
;
sdkCreateTimer
(
&
timerTEXTURELIST
);
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerTEXTURELIST
);
sdkStartTimer
(
&
timerTEXTURELIST
);
}
prepare_texture_list
<<<
grid_list
,
threads_list
>>>
(
gpu_tasks
,
// struct tp_task * gpu_tasks,
tp_task_size
,
// int num_tiles, // number of tiles in task list
gpu_texture_indices
,
// int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles
,
// int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi
,
// int * woi, // x,y,width,height of the woi
TILESX
,
// int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY
);
// int height); // <= TILESY, use for faster processing of LWIR images
getLastCudaError
(
"Kernel failure"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test pass: %d
\n
"
,
i
);
}
/// cudaProfilerStop();
sdkStopTimer
(
&
timerTEXTURELIST
);
float
avgTimeTEXTURESLIST
=
(
float
)
sdkGetTimerValue
(
&
timerTEXTURELIST
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerTEXTURELIST
);
printf
(
"Average TextureList run time =%f ms
\n
"
,
avgTimeTEXTURESLIST
);
int
cpu_num_texture_tiles
[
8
];
checkCudaErrors
(
cudaMemcpy
(
cpu_woi
,
gpu_woi
,
4
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
));
printf
(
"WOI x=%d, y=%d, width=%d, height=%d
\n
"
,
cpu_woi
[
0
],
cpu_woi
[
1
],
cpu_woi
[
2
],
cpu_woi
[
3
]);
checkCudaErrors
(
cudaMemcpy
(
cpu_num_texture_tiles
,
gpu_num_texture_tiles
,
8
*
sizeof
(
float
),
// 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost
));
printf
(
"cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)
\n
"
,
cpu_num_texture_tiles
[
0
],
cpu_num_texture_tiles
[
1
],
cpu_num_texture_tiles
[
2
],
cpu_num_texture_tiles
[
3
],
cpu_num_texture_tiles
[
4
],
cpu_num_texture_tiles
[
5
],
cpu_num_texture_tiles
[
6
],
cpu_num_texture_tiles
[
7
],
cpu_num_texture_tiles
[
0
]
+
cpu_num_texture_tiles
[
1
]
+
cpu_num_texture_tiles
[
2
]
+
cpu_num_texture_tiles
[
3
]
+
cpu_num_texture_tiles
[
4
]
+
cpu_num_texture_tiles
[
5
]
+
cpu_num_texture_tiles
[
6
]
+
cpu_num_texture_tiles
[
7
],
tp_task_size
);
for
(
int
q
=
0
;
q
<
4
;
q
++
)
{
checkCudaErrors
(
cudaMemcpy
(
texture_indices
+
q
*
TILESX
*
(
TILESYA
>>
2
),
gpu_texture_indices
+
q
*
TILESX
*
(
TILESYA
>>
2
),
cpu_num_texture_tiles
[
q
]
*
sizeof
(
float
),
// change to cpu_num_texture_tiles when ready
cudaMemcpyDeviceToHost
));
}
for
(
int
q
=
0
;
q
<
4
;
q
++
)
{
printf
(
"%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x
\n
"
,
q
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
0
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
0
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
1
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
1
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
2
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
2
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
3
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
3
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
4
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
4
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
5
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
5
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
6
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
6
]
>>
8
)
%
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
7
]
>>
8
)
/
TILESX
,
(
texture_indices
[
q
*
TILESX
*
(
TILESYA
>>
2
)
+
7
]
>>
8
)
%
TILESX
);
}
#endif //GEN_TEXTURE_LIST
#ifndef NOTEXTURE_RGBA
dim3
threads_rgba
(
1
,
1
,
1
);
dim3
grid_rgba
(
1
,
1
,
1
);
printf
(
"threads_rgba=(%d, %d, %d)
\n
"
,
threads_rgba
.
x
,
threads_rgba
.
y
,
threads_rgba
.
z
);
printf
(
"grid_rgba=(%d, %d, %d)
\n
"
,
grid_rgba
.
x
,
grid_rgba
.
y
,
grid_rgba
.
z
);
StopWatchInterface
*
timerRGBA
=
0
;
sdkCreateTimer
(
&
timerRGBA
);
for
(
int
i
=
i0
;
i
<
numIterations
;
i
++
)
{
if
(
i
==
0
)
{
checkCudaErrors
(
cudaDeviceSynchronize
());
sdkResetTimer
(
&
timerRGBA
);
sdkStartTimer
(
&
timerRGBA
);
}
generate_RBGA
<<<
grid_rgba
,
threads_rgba
>>>
(
// Parameters to generate texture tasks
gpu_tasks
,
// struct tp_task * gpu_tasks,
tp_task_size
,
// int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices
,
// int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles
,
// int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi
,
// int * woi, // x,y,width,height of the woi
TILESX
,
// int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY
,
// int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets
,
// float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors
,
// int colors, // number of colors (3/1)
(
texture_colors
==
1
),
// int is_lwir, // do not perform shot correction
10.0
,
// float min_shot, // 10.0
3.0
,
// float scale_shot, // 3.0
1.5
f
,
// float diff_sigma, // pixel value/pixel change
10.0
f
,
// float diff_threshold, // pixel value/pixel change
3.0
,
// float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118
,
// float weight0, // scale for R
0.117647
,
// float weight1, // scale for B
0.588235
,
// float weight2, // scale for G
1
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
0
,
// int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga
/
sizeof
(
float
),
// const size_t texture_rbga_stride, // in floats
gpu_textures_rbga
);
// float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError
(
"Kernel failure"
);
checkCudaErrors
(
cudaDeviceSynchronize
());
printf
(
"test pass: %d
\n
"
,
i
);
}
sdkStopTimer
(
&
timerRGBA
);
float
avgTimeRGBA
=
(
float
)
sdkGetTimerValue
(
&
timerRGBA
)
/
(
float
)
numIterations
;
sdkDeleteTimer
(
&
timerRGBA
);
printf
(
"Average Texture run time =%f ms
\n
"
,
avgTimeRGBA
);
checkCudaErrors
(
cudaMemcpy
(
cpu_woi
,
gpu_woi
,
4
*
sizeof
(
float
),
cudaMemcpyDeviceToHost
));
printf
(
"WOI x=%d, y=%d, width=%d, height=%d
\n
"
,
cpu_woi
[
0
],
cpu_woi
[
1
],
cpu_woi
[
2
],
cpu_woi
[
3
]);
// temporarily use larger array (4 pixels each size, switch to cudaMemcpy2DFromArray()
int
rgba_woi_width
=
(
cpu_woi
[
2
]
+
1
)
*
DTT_SIZE
;
int
rgba_woi_height
=
(
cpu_woi
[
3
]
+
1
)
*
DTT_SIZE
;
int
rslt_rgba_size
=
rgba_woi_width
*
rgba_woi_height
*
rbga_slices
;
float
*
cpu_textures_rgba
=
(
float
*
)
malloc
(
rslt_rgba_size
*
sizeof
(
float
));
checkCudaErrors
(
cudaMemcpy2D
(
cpu_textures_rgba
,
rgba_width
*
sizeof
(
float
),
gpu_textures_rbga
,
dstride_textures_rbga
,
rgba_width
*
sizeof
(
float
),
rgba_height
*
rbga_slices
,
cudaMemcpyDeviceToHost
));
#ifndef NSAVE_TEXTURES
printf
(
"Writing RBGA texture slices to %s
\n
"
,
result_textures_rgba_file
);
writeFloatsToFile
(
cpu_textures_rgba
,
// float * data, // allocated array
rslt_rgba_size
,
// int size, // length in elements
result_textures_rgba_file
);
// const char * path) // file path
#endif
#ifdef DEBUG11
int
rgba_offset
=
(
DBG_TILE_Y
-
cpu_woi
[
1
])
*
DTT_SIZE
*
rgba_woi_width
+
(
DBG_TILE_X
-
cpu_woi
[
0
]);
for
(
int
chn
=
0
;
chn
<
rbga_slices
;
chn
++
){
printf
(
"
\n
chn = %d
\n
"
,
chn
);
int
rgba_offset_chn
=
rgba_offset
+
chn
*
rgba_woi_width
*
rgba_woi_height
;
for
(
int
i
=
0
;
i
<
8
;
i
++
){
for
(
int
j
=
0
;
j
<
8
;
j
++
){
printf
(
"%10.4f "
,
*
(
cpu_textures_rgba
+
rgba_offset_chn
+
i
*
rgba_woi_width
+
j
));
}
printf
(
"
\n
"
);
}
}
#endif // DEBUG11
free
(
cpu_textures_rgba
);
#endif // ifndef NOTEXTURES
#ifdef SAVE_CLT
free
(
cpu_clt
);
#endif
free
(
host_kern_buf
);
// TODO: move somewhere when all is done
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
checkCudaErrors
(
cudaFree
(
gpu_kernels_h
[
ncam
]));
checkCudaErrors
(
cudaFree
(
gpu_kernel_offsets_h
[
ncam
]));
checkCudaErrors
(
cudaFree
(
gpu_images_h
[
ncam
]));
checkCudaErrors
(
cudaFree
(
gpu_clt_h
[
ncam
]));
#ifndef NOICLT
checkCudaErrors
(
cudaFree
(
gpu_corr_images_h
[
ncam
]));
#endif
}
checkCudaErrors
(
cudaFree
(
gpu_tasks
));
checkCudaErrors
(
cudaFree
(
gpu_kernels
));
checkCudaErrors
(
cudaFree
(
gpu_kernel_offsets
));
checkCudaErrors
(
cudaFree
(
gpu_images
));
checkCudaErrors
(
cudaFree
(
gpu_clt
));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors
(
cudaFree
(
gpu_corrs
));
checkCudaErrors
(
cudaFree
(
gpu_corr_indices
));
checkCudaErrors
(
cudaFree
(
gpu_texture_indices
));
checkCudaErrors
(
cudaFree
(
gpu_port_offsets
));
checkCudaErrors
(
cudaFree
(
gpu_textures
));
checkCudaErrors
(
cudaFree
(
gpu_textures_rbga
));
checkCudaErrors
(
cudaFree
(
gpu_woi
));
checkCudaErrors
(
cudaFree
(
gpu_num_texture_tiles
));
exit
(
0
);
}
src/main/resources/kernels/tp_defines.h
0 → 100644
View file @
3d5ddc28
/**
**
** tp_defines.h
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** tp_defines.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file tp_defines.h
* \brief Defines for running in C++ environment, replaced when called from Java
*/
// Avoiding includes in jcuda, all source files will be merged
#pragma once
#ifndef JCUDA
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - number of a pair, other bits tile number
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
//#define DEBUG4 1
//#define DEBUG5 1
//#define DEBUG6 1
/*
#define DEBUG7 1
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#define DEBUG_OOB1 1
#endif //#ifndef JCUDA
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment