Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
1338de2c
Commit
1338de2c
authored
Oct 09, 2018
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added variable LPF through GPU constants memory
parent
72b6bdce
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
101 additions
and
302 deletions
+101
-302
GPUTileProcessor.java
src/main/java/GPUTileProcessor.java
+85
-280
TwoQuadCLT.java
src/main/java/TwoQuadCLT.java
+16
-22
No files found.
src/main/java/GPUTileProcessor.java
View file @
1338de2c
...
@@ -33,10 +33,10 @@ import static jcuda.driver.JCudaDriver.cuInit;
...
@@ -33,10 +33,10 @@ import static jcuda.driver.JCudaDriver.cuInit;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuLaunchKernel
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuLaunchKernel
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemAlloc
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemAlloc
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemAllocPitch
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemAllocPitch
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemFree
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemcpy2D
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemcpy2D
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemcpyHtoD
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuMemcpyHtoD
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuModuleGetFunction
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuModuleGetFunction
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuModuleGetGlobal
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuModuleLoadData
;
import
static
jcuda
.
driver
.
JCudaDriver
.
cuModuleLoadData
;
import
static
jcuda
.
nvrtc
.
JNvrtc
.
nvrtcCompileProgram
;
import
static
jcuda
.
nvrtc
.
JNvrtc
.
nvrtcCompileProgram
;
import
static
jcuda
.
nvrtc
.
JNvrtc
.
nvrtcCreateProgram
;
import
static
jcuda
.
nvrtc
.
JNvrtc
.
nvrtcCreateProgram
;
...
@@ -69,8 +69,8 @@ import jcuda.nvrtc.nvrtcProgram;
...
@@ -69,8 +69,8 @@ import jcuda.nvrtc.nvrtcProgram;
public
class
GPUTileProcessor
{
public
class
GPUTileProcessor
{
static
String
GPU_KERNEL_FILE
=
"dtt8x8.cuh"
;
static
String
GPU_KERNEL_FILE
=
"dtt8x8.cuh"
;
static
String
[]
GPU_KERNEL_FILES
=
{
"dtt8x8.cuh"
,
"TileProcessor.cuh"
};
static
String
[]
GPU_KERNEL_FILES
=
{
"dtt8x8.cuh"
,
"TileProcessor.cuh"
};
static
String
GPU_DTT24_NAME
=
"GPU_DTT24_DRV"
;
// this.kernelFunction = createFunction(sourceCode, "GPU_DTT24_DRV"); // "invert");
static
String
GPU_CONVERT_CORRECT_TILES_NAME
=
"convert_correct_tiles"
;
static
String
GPU_CONVERT_CORRECT_TILES_NAME
=
"convert_correct_tiles"
;
static
String
GPU_IMCLT_RBG_NAME
=
"imclt_rbg"
;
// pass some defines to gpu source code with #ifdef JCUDA
// pass some defines to gpu source code with #ifdef JCUDA
static
int
DTT_SIZE
=
8
;
static
int
DTT_SIZE
=
8
;
static
int
THREADSX
=
DTT_SIZE
;
static
int
THREADSX
=
DTT_SIZE
;
...
@@ -92,38 +92,12 @@ public class GPUTileProcessor {
...
@@ -92,38 +92,12 @@ public class GPUTileProcessor {
static
int
KERN_SIZE
=
KERN_TILES
*
4
*
64
;
static
int
KERN_SIZE
=
KERN_TILES
*
4
*
64
;
/*
extern "C"
__global__ void convert_correct_tiles(
struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
*/
static
String
GPU_IMCLT_RBG_NAME
=
"imclt_rbg"
;
/*
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int color,
int v_offset,
int h_offset,
const size_t dstride) // in floats (pixels)
*/
int
DTTTEST_BLOCK_WIDTH
=
32
;
// may be read from the source code
int
DTTTEST_BLOCK_WIDTH
=
32
;
// may be read from the source code
int
DTTTEST_BLOCK_HEIGHT
=
16
;
// may be read from the source code
int
DTTTEST_BLOCK_HEIGHT
=
16
;
// may be read from the source code
public
boolean
kernels_set
=
false
;
public
boolean
kernels_set
=
false
;
public
boolean
bayer_set
=
false
;
public
boolean
bayer_set
=
false
;
private
CUfunction
GPU_DTT24_kernel
=
null
;
private
CUfunction
GPU_CONVERT_CORRECT_TILES_kernel
=
null
;
private
CUfunction
GPU_CONVERT_CORRECT_TILES_kernel
=
null
;
private
CUfunction
GPU_IMCLT_RBG_kernel
=
null
;
private
CUfunction
GPU_IMCLT_RBG_kernel
=
null
;
// CPU arrays of pointers to GPU memory
// CPU arrays of pointers to GPU memory
...
@@ -143,18 +117,17 @@ __global__ void imclt_rbg(
...
@@ -143,18 +117,17 @@ __global__ void imclt_rbg(
private
CUdeviceptr
gpu_tasks
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_tasks
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
// private
CUmodule
module
;
// to access constants memory
// private CUdeviceptr gpu_lpf = new CUdeviceptr();
// private CUdeviceptr gpu_lpf = new CUdeviceptr();
private
int
mclt_stride
;
private
int
mclt_stride
;
private
int
imclt_stride
;
private
int
imclt_stride
;
public
int
num_task_tiles
;
public
int
num_task_tiles
;
/*
*/
public
class
TpTask
{
public
class
TpTask
{
public
int
task
;
public
int
task
;
public
float
target_disparity
;
public
float
target_disparity
;
// int txy;
public
int
ty
;
public
int
ty
;
public
int
tx
;
public
int
tx
;
public
float
[][]
xy
=
null
;
public
float
[][]
xy
=
null
;
...
@@ -167,27 +140,13 @@ __global__ void imclt_rbg(
...
@@ -167,27 +140,13 @@ __global__ void imclt_rbg(
this
.
target_disparity
=
target_disparity
;
this
.
target_disparity
=
target_disparity
;
this
.
task
=
task
;
this
.
task
=
task
;
}
}
/*
public TpTask(int main, int aux, float [] flt, int indx) {
if (main > 0) xy = new float[main][2];
if (aux > 0) xy_aux = new float[main][2];
task = Float.floatToRawIntBits(flt[indx++]);
int txy = Float.floatToRawIntBits(flt[indx++]);
tx = txy & 0xffff;
ty = txy >> 16;
for (int i = 0; i < xy.length; i++) {
xy[i][0] = flt[indx++];
xy[i][1] = flt[indx++];
}
}
*/
// convert this class to float array to match layout of the C struct
// convert this class
instance
to float array to match layout of the C struct
public
float
[]
asFloatArray
(
boolean
use_aux
)
{
public
float
[]
asFloatArray
(
boolean
use_aux
)
{
float
[]
flt
=
new
float
[
NUM_CAMS
*
2
+
2
];
float
[]
flt
=
new
float
[
NUM_CAMS
*
2
+
2
];
return
asFloatArray
(
flt
,
0
,
use_aux
);
return
asFloatArray
(
flt
,
0
,
use_aux
);
}
}
// convert this class to float array to match layout of the C struct,
// convert this class
instance
to float array to match layout of the C struct,
// fill existing float array from the specified index
// fill existing float array from the specified index
public
float
[]
asFloatArray
(
float
[]
flt
,
int
indx
,
boolean
use_aux
)
{
public
float
[]
asFloatArray
(
float
[]
flt
,
int
indx
,
boolean
use_aux
)
{
flt
[
indx
++]
=
Float
.
intBitsToFloat
(
task
);
flt
[
indx
++]
=
Float
.
intBitsToFloat
(
task
);
...
@@ -258,8 +217,6 @@ __global__ void imclt_rbg(
...
@@ -258,8 +217,6 @@ __global__ void imclt_rbg(
}
}
};
};
private
static
long
getPointerAddress
(
CUdeviceptr
p
)
private
static
long
getPointerAddress
(
CUdeviceptr
p
)
{
{
...
@@ -278,50 +235,6 @@ __global__ void imclt_rbg(
...
@@ -278,50 +235,6 @@ __global__ void imclt_rbg(
return
new
PointerWithAddress
(
p
).
getAddress
();
return
new
PointerWithAddress
(
p
).
getAddress
();
}
}
/*
*
struct tp_task {
int task;
int txy;
// short ty;
// short tx;
float xy[NUM_CAMS][2];
};
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
float center_x; // actual center X (use to find derivatives)
float center_y; // actual center X (use to find derivatives)
float dxc_dx; // add this to data_x per each pixel X-shift relative to the kernel center location
float dxc_dy; // same per each Y-shift pixel
float dyc_dx;
float dyc_dy;
};
*
*
intBitsToFloat(int)
public static int floatToRawIntBits(float value)
public static float intBitsToFloat(int bits)
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS];
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS];
*/
public
GPUTileProcessor
()
throws
IOException
public
GPUTileProcessor
()
throws
IOException
{
{
// From code by Marco Hutter - http://www.jcuda.org
// From code by Marco Hutter - http://www.jcuda.org
...
@@ -356,21 +269,7 @@ public static float intBitsToFloat(int bits)
...
@@ -356,21 +269,7 @@ public static float intBitsToFloat(int bits)
"#define TILES_PER_BLOCK "
+
TILES_PER_BLOCK
+
"\n"
+
"#define TILES_PER_BLOCK "
+
TILES_PER_BLOCK
+
"\n"
+
"#define IMCLT_THREADS_PER_TILE "
+
IMCLT_THREADS_PER_TILE
+
"\n"
+
"#define IMCLT_THREADS_PER_TILE "
+
IMCLT_THREADS_PER_TILE
+
"\n"
+
"#define IMCLT_TILES_PER_BLOCK "
+
IMCLT_TILES_PER_BLOCK
+
"\n"
;
"#define IMCLT_TILES_PER_BLOCK "
+
IMCLT_TILES_PER_BLOCK
+
"\n"
;
/*
#define THREADSX (DTT_SIZE)
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
*/
for
(
String
src_file:
GPU_KERNEL_FILES
)
{
for
(
String
src_file:
GPU_KERNEL_FILES
)
{
File
file
=
new
File
(
classLoader
.
getResource
(
src_file
).
getFile
());
File
file
=
new
File
(
classLoader
.
getResource
(
src_file
).
getFile
());
System
.
out
.
println
(
file
.
getAbsolutePath
());
System
.
out
.
println
(
file
.
getAbsolutePath
());
...
@@ -385,14 +284,10 @@ public static float intBitsToFloat(int bits)
...
@@ -385,14 +284,10 @@ public static float intBitsToFloat(int bits)
}
}
// Create the kernel functions (first - just test)
// Create the kernel functions (first - just test)
// this.GPU_DTT24_kernel = createFunction(kernelSource, GPU_DTT24_NAME);
String
[]
func_names
=
{
GPU_CONVERT_CORRECT_TILES_NAME
,
GPU_IMCLT_RBG_NAME
};
String
[]
func_names
=
{
GPU_DTT24_NAME
,
GPU_CONVERT_CORRECT_TILES_NAME
,
GPU_IMCLT_RBG_NAME
};
CUfunction
[]
functions
=
createFunctions
(
kernelSource
,
func_names
);
CUfunction
[]
functions
=
createFunctions
(
kernelSource
,
func_names
);
this
.
GPU_DTT24_kernel
=
functions
[
0
];
this
.
GPU_CONVERT_CORRECT_TILES_kernel
=
functions
[
0
];
this
.
GPU_CONVERT_CORRECT_TILES_kernel
=
functions
[
1
];
this
.
GPU_IMCLT_RBG_kernel
=
functions
[
1
];
this
.
GPU_IMCLT_RBG_kernel
=
functions
[
2
];
// this.GPU_CONVERT_CORRECT_TILES = createFunction(kernelSource, GPU_CONVERT_CORRECT_TILES_NAME);
// this.GPU_IMCLT_RBG = createFunction(kernelSource, GPU_IMCLT_RBG_NAME);
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
"Sizeof.POINTER="
+
Sizeof
.
POINTER
);
System
.
out
.
println
(
"Sizeof.POINTER="
+
Sizeof
.
POINTER
);
System
.
out
.
println
(
GPU_IMCLT_RBG_kernel
.
toString
());
System
.
out
.
println
(
GPU_IMCLT_RBG_kernel
.
toString
());
...
@@ -426,7 +321,6 @@ public static float intBitsToFloat(int bits)
...
@@ -426,7 +321,6 @@ public static float intBitsToFloat(int bits)
gpu_clt_h
[
ncam
]
=
new
CUdeviceptr
();
gpu_clt_h
[
ncam
]
=
new
CUdeviceptr
();
cuMemAlloc
(
gpu_clt_h
[
ncam
],
tilesY
*
tilesX
*
NUM_COLORS
*
4
*
DTT_SIZE
*
DTT_SIZE
*
Sizeof
.
FLOAT
);
// public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
cuMemAlloc
(
gpu_clt_h
[
ncam
],
tilesY
*
tilesX
*
NUM_COLORS
*
4
*
DTT_SIZE
*
DTT_SIZE
*
Sizeof
.
FLOAT
);
// public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
//gpu_clt_h
}
}
// now create device arrays pointers
// now create device arrays pointers
if
(
Sizeof
.
POINTER
!=
Sizeof
.
LONG
)
{
if
(
Sizeof
.
POINTER
!=
Sizeof
.
LONG
)
{
...
@@ -455,11 +349,9 @@ public static float intBitsToFloat(int bits)
...
@@ -455,11 +349,9 @@ public static float intBitsToFloat(int bits)
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_clt_l
[
ncam
]
=
getPointerAddress
(
gpu_clt_h
[
ncam
]);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_clt_l
[
ncam
]
=
getPointerAddress
(
gpu_clt_h
[
ncam
]);
cuMemcpyHtoD
(
gpu_clt
,
Pointer
.
to
(
gpu_clt_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemcpyHtoD
(
gpu_clt
,
Pointer
.
to
(
gpu_clt_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
// Set task array
// Set task array
cuMemAlloc
(
gpu_tasks
,
tilesX
*
tilesY
*
TPTASK_SIZE
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_tasks
,
tilesX
*
tilesY
*
TPTASK_SIZE
*
Sizeof
.
POINTER
);
//TPTASK_SIZE
}
}
...
@@ -551,7 +443,6 @@ public static float intBitsToFloat(int bits)
...
@@ -551,7 +443,6 @@ public static float intBitsToFloat(int bits)
copyH2D
.
Height
=
IMG_HEIGHT
;
// /4;
copyH2D
.
Height
=
IMG_HEIGHT
;
// /4;
cuMemcpy2D
(
copyH2D
);
cuMemcpy2D
(
copyH2D
);
}
}
// combines 3 bayer channels into one and transfers to GPU memory
// combines 3 bayer channels into one and transfers to GPU memory
public
void
setBayerImages
(
public
void
setBayerImages
(
double
[][][]
bayer_data
,
double
[][][]
bayer_data
,
...
@@ -571,9 +462,6 @@ public static float intBitsToFloat(int bits)
...
@@ -571,9 +462,6 @@ public static float intBitsToFloat(int bits)
bayer_set
=
true
;
bayer_set
=
true
;
}
}
// prepare tasks for full frame, same dispaity.
// prepare tasks for full frame, same dispaity.
// need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
// need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
public
TpTask
[]
setFullFrameImages
(
public
TpTask
[]
setFullFrameImages
(
...
@@ -661,6 +549,7 @@ public static float intBitsToFloat(int bits)
...
@@ -661,6 +549,7 @@ public static float intBitsToFloat(int bits)
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
kernelParameters
,
null
);
// Kernel- and extra parameters
cuCtxSynchronize
();
// remove later
}
}
public
void
execImcltRbg
()
{
public
void
execImcltRbg
()
{
...
@@ -695,11 +584,11 @@ public static float intBitsToFloat(int bits)
...
@@ -695,11 +584,11 @@ public static float intBitsToFloat(int bits)
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
kernelParameters
,
null
);
// Kernel- and extra parameters
// System.out.println("ncam = "+ncam+", color="+color+", v_offs="+v_offs+", h_offs="+h_offs);
}
}
}
}
}
}
}
}
cuCtxSynchronize
();
}
}
float
[][]
getRBG
(
int
ncam
){
float
[][]
getRBG
(
int
ncam
){
...
@@ -731,159 +620,8 @@ public static float intBitsToFloat(int bits)
...
@@ -731,159 +620,8 @@ public static float intBitsToFloat(int bits)
return
fimg
;
return
fimg
;
}
}
// private static CUfunction [] createFunctions(
private
CUfunction
[]
createFunctions
(
/*
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
}
*/
// run kernel with dttx
public
void
exec_dtt24
(
float
src_pixels
[],
float
dst_pixels
[],
int
width
,
int
dtt_mode
)
{
if
(
GPU_DTT24_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel"
);
return
;
}
int
height
=
src_pixels
.
length
/
width
;
long
width_in_bytes
=
width
*
Sizeof
.
FLOAT
;
CUdeviceptr
src_dpointer
=
new
CUdeviceptr
();
CUdeviceptr
dst_dpointer
=
new
CUdeviceptr
();
long
[]
device_stride
=
new
long
[
1
];
cuMemAllocPitch
(
src_dpointer
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
width_in_bytes
,
// long WidthInBytes,
height
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
int
pitchInElements
=
(
int
)(
device_stride
[
0
]
/
Sizeof
.
FLOAT
);
cuMemAllocPitch
(
dst_dpointer
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
width_in_bytes
,
// long WidthInBytes,
height
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
CUDA_MEMCPY2D
copyH2D
=
new
CUDA_MEMCPY2D
();
copyH2D
.
srcMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_HOST
;
copyH2D
.
srcHost
=
Pointer
.
to
(
src_pixels
);
copyH2D
.
srcPitch
=
width_in_bytes
;
copyH2D
.
dstMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_DEVICE
;
copyH2D
.
dstDevice
=
src_dpointer
;
copyH2D
.
dstPitch
=
device_stride
[
0
];
copyH2D
.
WidthInBytes
=
width_in_bytes
;
copyH2D
.
Height
=
height
;
// /4;
// for copying results to host
CUDA_MEMCPY2D
copyD2H
=
new
CUDA_MEMCPY2D
();
copyD2H
.
srcMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_DEVICE
;
copyD2H
.
srcDevice
=
dst_dpointer
;
// ((test & 1) ==0) ? src_dpointer : dst_dpointer; // copy same data
copyD2H
.
srcPitch
=
device_stride
[
0
];
copyD2H
.
dstMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_HOST
;
copyD2H
.
dstHost
=
Pointer
.
to
(
dst_pixels
);
copyD2H
.
dstPitch
=
width_in_bytes
;
copyD2H
.
WidthInBytes
=
width_in_bytes
;
copyD2H
.
Height
=
height
;
// /2;
// kernel parameters: pointer to pointers
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
dst_dpointer
),
Pointer
.
to
(
src_dpointer
),
Pointer
.
to
(
new
int
[]
{
pitchInElements
}),
Pointer
.
to
(
new
int
[]
{
dtt_mode
})
);
int
[]
GridFullWarps
=
{
width
/
DTTTEST_BLOCK_WIDTH
,
height
/
DTTTEST_BLOCK_HEIGHT
,
1
};
int
[]
ThreadsFullWarps
=
{
DTT_SIZE
,
DTTTEST_BLOCK_WIDTH
/
DTT_SIZE
,
DTTTEST_BLOCK_HEIGHT
/
DTT_SIZE
};
// Actual work starts here:
cuMemcpy2D
(
copyH2D
);
cuCtxSynchronize
();
// Call the kernel function
cuLaunchKernel
(
GPU_DTT24_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
// Copy the data from the device to the host
cuMemcpy2D
(
copyD2H
);
// clean up
cuMemFree
(
src_dpointer
);
cuMemFree
(
dst_dpointer
);
}
/**
* Create the kernel function by its name in the source code
* @param sourceCode The source code
* @param kernelName The kernel function name
* @return
* @throws IOException
*/
/*
private static CUfunction createFunction(
String sourceCode, String kernelName) throws IOException
{
boolean OK = false;
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram(
program, sourceCode, null, 0, null, null);
try {
nvrtcCompileProgram(program, 0, null);
OK = true;
} catch (Exception e) {
System.out.println("nvrtcCompileProgram() FAILED");
}
// Compilation log with errors/warnongs
String programLog[] = new String[1];
nvrtcGetProgramLog(program, programLog);
String log = programLog[0].trim();
if (!log.isEmpty())
{
System.err.println("Program compilation log:\n" + log);
}
if (!OK) {
throw new IOException("Could not compile program");
}
// Get the PTX code of the compiled program (not the binary)
String[] ptx = new String[1];
nvrtcGetPTX(program, ptx);
nvrtcDestroyProgram(program);
// Create a CUDA module from the PTX code
CUmodule module = new CUmodule();
cuModuleLoadData(module, ptx[0]);
// Find the function in the source by name, get its pointer
CUfunction function = new CUfunction();
cuModuleGetFunction(function, module, kernelName);
return function;
}
*/
private
static
CUfunction
[]
createFunctions
(
String
sourceCode
,
String
[]
kernelNames
)
throws
IOException
String
sourceCode
,
String
[]
kernelNames
)
throws
IOException
{
{
CUfunction
[]
functions
=
new
CUfunction
[
kernelNames
.
length
];
CUfunction
[]
functions
=
new
CUfunction
[
kernelNames
.
length
];
...
@@ -916,7 +654,8 @@ public static float intBitsToFloat(int bits)
...
@@ -916,7 +654,8 @@ public static float intBitsToFloat(int bits)
nvrtcDestroyProgram
(
program
);
nvrtcDestroyProgram
(
program
);
// Create a CUDA module from the PTX code
// Create a CUDA module from the PTX code
CUmodule
module
=
new
CUmodule
();
// CUmodule
module
=
new
CUmodule
();
cuModuleLoadData
(
module
,
ptx
[
0
]);
cuModuleLoadData
(
module
,
ptx
[
0
]);
for
(
int
i
=
0
;
i
<
kernelNames
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
kernelNames
.
length
;
i
++)
{
...
@@ -928,9 +667,6 @@ public static float intBitsToFloat(int bits)
...
@@ -928,9 +667,6 @@ public static float intBitsToFloat(int bits)
return
functions
;
return
functions
;
}
}
static
String
readFileAsString
(
String
path
)
static
String
readFileAsString
(
String
path
)
{
{
byte
[]
encoded
;
byte
[]
encoded
;
...
@@ -1036,6 +772,75 @@ public static float intBitsToFloat(int bits)
...
@@ -1036,6 +772,75 @@ public static float intBitsToFloat(int bits)
ImageDtt
.
startAndJoin
(
threads
);
ImageDtt
.
startAndJoin
(
threads
);
}
}
public
void
setLpfRbg
(
float
sigma_r
,
float
sigma_b
,
float
sigma_g
)
{
int
dct_size
=
DTT_SIZE
;
DttRad2
dtt
=
new
DttRad2
(
dct_size
);
double
[][]
lpf_rbg
=
{
dtt
.
dttt_iiie
(
setCltLpf
(
sigma_r
)),
dtt
.
dttt_iiie
(
setCltLpf
(
sigma_b
)),
dtt
.
dttt_iiie
(
setCltLpf
(
sigma_g
))};
int
l
=
dct_size
*
dct_size
;
float
[]
lpf_flat
=
new
float
[
3
*
l
];
for
(
int
i
=
0
;
i
<
3
;
i
++)
{
// System.arraycopy(lpf_rbg[i], 0, lpf_flat, l* i, l);
for
(
int
j
=
0
;
j
<
l
;
j
++)
{
lpf_flat
[
j
+
i
*
l
]
=
(
float
)
(
lpf_rbg
[
i
][
j
]*
2
*
dct_size
);
}
}
CUdeviceptr
constantMemoryPointer
=
new
CUdeviceptr
();
long
constantMemorySizeArray
[]
=
{
0
};
cuModuleGetGlobal
(
constantMemoryPointer
,
constantMemorySizeArray
,
module
,
"lpf_data"
);
int
constantMemorySize
=
(
int
)
constantMemorySizeArray
[
0
];
//__constant__ float lpf_data[3][64]={
System
.
out
.
println
(
"constantMemoryPointer: "
+
constantMemoryPointer
);
System
.
out
.
println
(
"constantMemorySize: "
+
constantMemorySize
);
cuMemcpyHtoD
(
constantMemoryPointer
,
Pointer
.
to
(
lpf_flat
),
constantMemorySize
);
System
.
out
.
println
();
}
public
double
[]
setCltLpf
(
double
sigma
)
{
int
dct_size
=
DTT_SIZE
;
double
[]
lpf
=
new
double
[
dct_size
*
dct_size
];
int
dct_len
=
dct_size
*
dct_size
;
if
(
sigma
==
0.0f
)
{
lpf
[
0
]
=
1.0f
;
for
(
int
i
=
1
;
i
<
dct_len
;
i
++){
lpf
[
i
]
=
0.0f
;
}
}
else
{
for
(
int
i
=
0
;
i
<
dct_size
;
i
++){
for
(
int
j
=
0
;
j
<
dct_size
;
j
++){
lpf
[
i
*
dct_size
+
j
]
=
(
float
)
Math
.
exp
(-(
i
*
i
+
j
*
j
)/(
2
*
sigma
));
}
}
// normalize
double
sum
=
0
;
for
(
int
i
=
0
;
i
<
dct_size
;
i
++){
for
(
int
j
=
0
;
j
<
dct_size
;
j
++){
double
d
=
lpf
[
i
*
dct_size
+
j
];
d
*=
Math
.
cos
(
Math
.
PI
*
i
/(
2
*
dct_size
))*
Math
.
cos
(
Math
.
PI
*
j
/(
2
*
dct_size
));
if
(
i
>
0
)
d
*=
2.0
;
if
(
j
>
0
)
d
*=
2.0
;
sum
+=
d
;
}
}
for
(
int
i
=
0
;
i
<
dct_len
;
i
++){
lpf
[
i
]
/=
sum
;
}
}
return
lpf
;
}
...
...
src/main/java/TwoQuadCLT.java
View file @
1338de2c
...
@@ -1355,27 +1355,7 @@ public class TwoQuadCLT {
...
@@ -1355,27 +1355,7 @@ public class TwoQuadCLT {
dst_bayer
[
nc
][
i
]=
nc
*
main_bayer
[
nc
].
length
+
i
;
dst_bayer
[
nc
][
i
]=
nc
*
main_bayer
[
nc
].
length
+
i
;
}
}
}
}
/*
int iwidth = imp_quad_main[0].getWidth();
String [] dbg_titles= {"src0","dst0","src1","dst1","src2","dst2","src3","dst3"};
for (int nc = 0; nc < main_bayer.length; nc++) {
gPUTileProcessor.exec_dtt24(
main_bayer[nc], // float src_pixels[],
dst_bayer[nc], // float dst_pixels[],
iwidth, // int width,
0); // int dtt_mode);
}
float [][] both_bayer = {main_bayer[0],dst_bayer[0],main_bayer[1],dst_bayer[1],main_bayer[2],dst_bayer[2],main_bayer[3],dst_bayer[3]};
(new showDoubleFloatArrays()).showArrays(
both_bayer,
iwidth,
main_bayer[0].length / iwidth,
true,
"converted",
dbg_titles);
*/
double
[][][]
port_xy_main_dbg
=
new
double
[
tilesX
*
tilesY
][][];
double
[][][]
port_xy_main_dbg
=
new
double
[
tilesX
*
tilesY
][][];
double
[][][]
port_xy_aux_dbg
=
new
double
[
tilesX
*
tilesY
][][];
double
[][][]
port_xy_aux_dbg
=
new
double
[
tilesX
*
tilesY
][][];
...
@@ -1832,6 +1812,12 @@ public class TwoQuadCLT {
...
@@ -1832,6 +1812,12 @@ public class TwoQuadCLT {
final
boolean
updateStatus
,
final
boolean
updateStatus
,
final
int
debugLevel
){
final
int
debugLevel
){
gPUTileProcessor
.
setLpfRbg
(
1.1f
,
// float sigma_r,
1.1f
,
// float sigma_b,
0.7f
);
// float sigma_g)
final
boolean
use_aux
=
false
;
// currently GPU is configured for a single quad camera
final
boolean
use_aux
=
false
;
// currently GPU is configured for a single quad camera
final
boolean
batch_mode
=
clt_parameters
.
batch_run
;
//disable any debug images
final
boolean
batch_mode
=
clt_parameters
.
batch_run
;
//disable any debug images
...
@@ -1889,9 +1875,17 @@ public class TwoQuadCLT {
...
@@ -1889,9 +1875,17 @@ public class TwoQuadCLT {
use_aux
);
// boolean use_aux)
use_aux
);
// boolean use_aux)
// All set, run kernel (correct and convert)
// All set, run kernel (correct and convert)
gPUTileProcessor
.
execConverCorrectTiles
();
int
NREPEAT
=
1
;
// 00;
System
.
out
.
println
(
"\n------------ Running GPU "
+
NREPEAT
+
" times ----------------"
);
long
startGPU
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
gPUTileProcessor
.
execConverCorrectTiles
();
// run imclt;
// run imclt;
gPUTileProcessor
.
execImcltRbg
();
long
firstGPUTime
=
(
System
.
nanoTime
()
-
startGPU
)/
NREPEAT
;
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
gPUTileProcessor
.
execImcltRbg
();
long
runGPUTime
=
(
System
.
nanoTime
()
-
startGPU
)/
NREPEAT
;
System
.
out
.
println
(
"\n------------ End of running GPU "
+
NREPEAT
+
" times ----------------"
);
System
.
out
.
println
(
"GPU run time ="
+(
runGPUTime
*
1.0
e
-
6
)+
"ms, (direct conversion: "
+(
firstGPUTime
*
1.0
e
-
6
)+
"ms, imclt: "
+
((
runGPUTime
-
firstGPUTime
)*
1.0
e
-
6
)+
"ms)"
);
float
[][][]
iclt_fimg
=
new
float
[
GPUTileProcessor
.
NUM_CAMS
][][];
float
[][][]
iclt_fimg
=
new
float
[
GPUTileProcessor
.
NUM_CAMS
][][];
for
(
int
ncam
=
0
;
ncam
<
iclt_fimg
.
length
;
ncam
++)
{
for
(
int
ncam
=
0
;
ncam
<
iclt_fimg
.
length
;
ncam
++)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment