Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
20df596a
Commit
20df596a
authored
Apr 16, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changing direct conversion to CDP, handling sparse tasks
parent
0bb31239
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
436 additions
and
344 deletions
+436
-344
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+95
-105
GeometryCorrection.java
...a/com/elphel/imagej/tileprocessor/GeometryCorrection.java
+0
-4
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+19
-13
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+215
-7
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+36
-13
geometry_correction.cu
src/main/resources/kernels/geometry_correction.cu
+65
-197
geometry_correction.h
src/main/resources/kernels/geometry_correction.h
+6
-5
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
20df596a
...
@@ -99,15 +99,14 @@ public class GPUTileProcessor {
...
@@ -99,15 +99,14 @@ public class GPUTileProcessor {
static
String
[][]
GPU_SRC_FILES
=
{{
"*"
,
"dtt8x8.h"
,
"dtt8x8.cu"
,
"geometry_correction.h"
,
"geometry_correction.cu"
,
"TileProcessor.h"
,
"TileProcessor.cuh"
}};
static
String
[][]
GPU_SRC_FILES
=
{{
"*"
,
"dtt8x8.h"
,
"dtt8x8.cu"
,
"geometry_correction.h"
,
"geometry_correction.cu"
,
"TileProcessor.h"
,
"TileProcessor.cuh"
}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static
String
GPU_CONVERT_
CORRECT_TILES_NAME
=
"convert_correct_tiles
"
;
// name in C code
static
String
GPU_CONVERT_
DIRECT_NAME
=
"convert_direct
"
;
// name in C code
static
String
GPU_IMCLT_
RBG_NAME
=
"imclt_rbg"
;
// name in C code
static
String
GPU_IMCLT_
ALL_NAME
=
"imclt_rbg_all"
;
static
String
GPU_CORRELATE2D_NAME
=
"correlate2D"
;
// name in C code
static
String
GPU_CORRELATE2D_NAME
=
"correlate2D"
;
// name in C code
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static
String
GPU_TEXTURES_NAME
=
"textures_accumulate"
;
// name in C code
static
String
GPU_TEXTURES_NAME
=
"textures_accumulate"
;
// name in C code
static
String
GPU_RBGA_NAME
=
"generate_RBGA"
;
// name in C code
static
String
GPU_RBGA_NAME
=
"generate_RBGA"
;
// name in C code
static
String
GPU_ROT_DERIV
=
"calc_rot_deriv"
;
// calculate rotation matrices and derivatives
static
String
GPU_ROT_DERIV
=
"calc_rot_deriv"
;
// calculate rotation matrices and derivatives
static
String
SET_TILES_OFFSETS
=
"get_tiles_offsets"
;
// calculate pixel offsets and disparity distortions
static
String
GPU_SET_TILES_OFFSETS
=
"get_tiles_offsets"
;
// calculate pixel offsets and disparity distortions
static
String
GPU_
IMCLT_ALL_NAME
=
"imclt_rbg_all"
;
static
String
GPU_
CALC_REVERSE_DISTORTION
=
"calcReverseDistortionTable"
;
// calculate reverse radial distortion table from gpu_geometry_correction
// pass some defines to gpu source code with #ifdef JCUDA
// pass some defines to gpu source code with #ifdef JCUDA
...
@@ -162,14 +161,14 @@ public class GPUTileProcessor {
...
@@ -162,14 +161,14 @@ public class GPUTileProcessor {
public
boolean
kernels_set
=
false
;
public
boolean
kernels_set
=
false
;
public
boolean
bayer_set
=
false
;
public
boolean
bayer_set
=
false
;
private
CUfunction
GPU_CONVERT_
CORRECT_TILES_kernel
=
null
;
private
CUfunction
GPU_CONVERT_
DIRECT_kernel
=
null
;
private
CUfunction
GPU_IMCLT_
RBG_kernel
=
null
;
private
CUfunction
GPU_IMCLT_
ALL_kernel
=
null
;
private
CUfunction
GPU_CORRELATE2D_kernel
=
null
;
private
CUfunction
GPU_CORRELATE2D_kernel
=
null
;
private
CUfunction
GPU_TEXTURES_kernel
=
null
;
private
CUfunction
GPU_TEXTURES_kernel
=
null
;
private
CUfunction
GPU_RBGA_kernel
=
null
;
private
CUfunction
GPU_RBGA_kernel
=
null
;
private
CUfunction
GPU_ROT_DERIV_kernel
=
null
;
private
CUfunction
GPU_ROT_DERIV_kernel
=
null
;
private
CUfunction
SET_TILES_OFFSETS_kernel
=
null
;
private
CUfunction
GPU_SET_TILES_OFFSETS_kernel
=
null
;
private
CUfunction
GPU_
IMCLT_ALL_kernel
=
null
;
private
CUfunction
GPU_
CALC_REVERSE_DISTORTION_kernel
=
null
;
// CPU arrays of pointers to GPU memory
// CPU arrays of pointers to GPU memory
...
@@ -202,6 +201,9 @@ public class GPUTileProcessor {
...
@@ -202,6 +201,9 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_geometry_correction
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_geometry_correction
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_rByRDist
=
new
CUdeviceptr
();
// calculated once for the camera distortion model in CPU (move to GPU?)
private
CUdeviceptr
gpu_rByRDist
=
new
CUdeviceptr
();
// calculated once for the camera distortion model in CPU (move to GPU?)
private
CUdeviceptr
gpu_active_tiles
=
new
CUdeviceptr
();
// TILESX*TILESY*sizeof(int)
private
CUdeviceptr
gpu_num_active_tiles
=
new
CUdeviceptr
();
// 1 int
CUmodule
module
;
// to access constants memory
CUmodule
module
;
// to access constants memory
private
int
mclt_stride
;
private
int
mclt_stride
;
private
int
corr_stride
;
private
int
corr_stride
;
...
@@ -464,36 +466,40 @@ public class GPUTileProcessor {
...
@@ -464,36 +466,40 @@ public class GPUTileProcessor {
// Create the kernel functions (first - just test)
// Create the kernel functions (first - just test)
String
[]
func_names
=
{
String
[]
func_names
=
{
GPU_CONVERT_CORRECT_TILES_NAME
,
// GPU_CONVERT_CORRECT_TILES_NAME,
GPU_IMCLT_RBG_NAME
,
GPU_CONVERT_DIRECT_NAME
,
GPU_IMCLT_ALL_NAME
,
GPU_CORRELATE2D_NAME
,
GPU_CORRELATE2D_NAME
,
GPU_TEXTURES_NAME
,
GPU_TEXTURES_NAME
,
GPU_RBGA_NAME
,
GPU_RBGA_NAME
,
GPU_ROT_DERIV
,
GPU_ROT_DERIV
,
SET_TILES_OFFSETS
,
GPU_
SET_TILES_OFFSETS
,
GPU_
IMCLT_ALL_NAME
GPU_
CALC_REVERSE_DISTORTION
};
};
CUfunction
[]
functions
=
createFunctions
(
kernelSources
,
CUfunction
[]
functions
=
createFunctions
(
kernelSources
,
func_names
,
func_names
,
capability
);
// on my - 75
capability
);
// on my - 75
GPU_CONVERT_CORRECT_TILES_kernel
=
functions
[
0
];
// GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
GPU_IMCLT_RBG_kernel
=
functions
[
1
];
GPU_CONVERT_DIRECT_kernel
=
functions
[
0
];
GPU_IMCLT_ALL_kernel
=
functions
[
1
];
GPU_CORRELATE2D_kernel
=
functions
[
2
];
GPU_CORRELATE2D_kernel
=
functions
[
2
];
GPU_TEXTURES_kernel
=
functions
[
3
];
GPU_TEXTURES_kernel
=
functions
[
3
];
GPU_RBGA_kernel
=
functions
[
4
];
GPU_RBGA_kernel
=
functions
[
4
];
GPU_ROT_DERIV_kernel
=
functions
[
5
];
GPU_ROT_DERIV_kernel
=
functions
[
5
];
SET_TILES_OFFSETS_kernel
=
functions
[
6
];
GPU_SET_TILES_OFFSETS_kernel
=
functions
[
6
];
GPU_
IMCLT_ALL_kernel
=
functions
[
7
];
GPU_
CALC_REVERSE_DISTORTION_kernel
=
functions
[
7
];
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
GPU_CONVERT_CORRECT_TILES_kernel
.
toString
());
// System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
System
.
out
.
println
(
GPU_IMCLT_RBG_kernel
.
toString
());
System
.
out
.
println
(
GPU_CONVERT_DIRECT_kernel
.
toString
());
System
.
out
.
println
(
GPU_IMCLT_ALL_kernel
.
toString
());
System
.
out
.
println
(
GPU_CORRELATE2D_kernel
.
toString
());
System
.
out
.
println
(
GPU_CORRELATE2D_kernel
.
toString
());
System
.
out
.
println
(
GPU_TEXTURES_kernel
.
toString
());
System
.
out
.
println
(
GPU_TEXTURES_kernel
.
toString
());
System
.
out
.
println
(
GPU_RBGA_kernel
.
toString
());
System
.
out
.
println
(
GPU_RBGA_kernel
.
toString
());
System
.
out
.
println
(
GPU_ROT_DERIV_kernel
.
toString
());
System
.
out
.
println
(
GPU_ROT_DERIV_kernel
.
toString
());
System
.
out
.
println
(
SET_TILES_OFFSETS_kernel
.
toString
());
System
.
out
.
println
(
GPU_SET_TILES_OFFSETS_kernel
.
toString
());
System
.
out
.
println
(
GPU_CALC_REVERSE_DISTORTION_kernel
.
toString
());
// Init data arrays for all kernels
// Init data arrays for all kernels
int
tilesX
=
IMG_WIDTH
/
DTT_SIZE
;
int
tilesX
=
IMG_WIDTH
/
DTT_SIZE
;
...
@@ -578,8 +584,10 @@ public class GPUTileProcessor {
...
@@ -578,8 +584,10 @@ public class GPUTileProcessor {
cuMemAlloc
(
gpu_texture_indices
,
tilesX
*
tilesYa
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_texture_indices
,
tilesX
*
tilesYa
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_port_offsets
,
NUM_CAMS
*
2
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_port_offsets
,
NUM_CAMS
*
2
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_woi
,
4
*
Sizeof
.
POINTER
);
// may be hidden in device code as a static array?
cuMemAlloc
(
gpu_woi
,
4
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_num_texture_tiles
,
8
*
Sizeof
.
POINTER
);
// may be hidden in device code as a static array?
cuMemAlloc
(
gpu_num_texture_tiles
,
8
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_active_tiles
,
tilesX
*
tilesY
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_num_active_tiles
,
1
*
Sizeof
.
FLOAT
);
cuMemAllocPitch
(
cuMemAllocPitch
(
gpu_corrs
,
// CUdeviceptr dptr,
gpu_corrs
,
// CUdeviceptr dptr,
...
@@ -611,15 +619,18 @@ public class GPUTileProcessor {
...
@@ -611,15 +619,18 @@ public class GPUTileProcessor {
}
}
public
void
setGeometryCorrection
(
GeometryCorrection
gc
)
{
public
void
setGeometryCorrection
(
GeometryCorrection
gc
,
boolean
use_java_rByRDist
)
{
// false - use newer GPU execCalcReverseDistortions
float
[]
fgc
=
gc
.
toFloatArray
();
float
[]
fgc
=
gc
.
toFloatArray
();
if
(
use_java_rByRDist
)
{
double
[]
rByRDist
=
gc
.
getRByRDist
();
double
[]
rByRDist
=
gc
.
getRByRDist
();
float
[]
fFByRDist
=
new
float
[
rByRDist
.
length
];
float
[]
fFByRDist
=
new
float
[
rByRDist
.
length
];
for
(
int
i
=
0
;
i
<
rByRDist
.
length
;
i
++)
{
for
(
int
i
=
0
;
i
<
rByRDist
.
length
;
i
++)
{
fFByRDist
[
i
]
=
(
float
)
rByRDist
[
i
];
fFByRDist
[
i
]
=
(
float
)
rByRDist
[
i
];
}
}
cuMemcpyHtoD
(
gpu_geometry_correction
,
Pointer
.
to
(
fgc
),
fgc
.
length
*
Sizeof
.
FLOAT
);
cuMemcpyHtoD
(
gpu_rByRDist
,
Pointer
.
to
(
fFByRDist
),
fFByRDist
.
length
*
Sizeof
.
FLOAT
);
cuMemcpyHtoD
(
gpu_rByRDist
,
Pointer
.
to
(
fFByRDist
),
fFByRDist
.
length
*
Sizeof
.
FLOAT
);
}
cuMemcpyHtoD
(
gpu_geometry_correction
,
Pointer
.
to
(
fgc
),
fgc
.
length
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_rot_deriv
,
5
*
NUM_CAMS
*
3
*
3
*
Sizeof
.
FLOAT
);
// NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
cuMemAlloc
(
gpu_rot_deriv
,
5
*
NUM_CAMS
*
3
*
3
*
Sizeof
.
FLOAT
);
// NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
}
}
...
@@ -1061,11 +1072,32 @@ public class GPUTileProcessor {
...
@@ -1061,11 +1072,32 @@ public class GPUTileProcessor {
kernelParameters
,
null
);
// Kernel- and extra parameters
kernelParameters
,
null
);
// Kernel- and extra parameters
cuCtxSynchronize
();
// remove later
cuCtxSynchronize
();
// remove later
}
}
public
void
execCalcReverseDistortions
()
{
if
(
GPU_CALC_REVERSE_DISTORTION_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CALC_REVERSE_DISTORTION_kernel"
);
return
;
}
// kernel parameters: pointer to pointers
int
[]
GridFullWarps
=
{
NUM_CAMS
,
1
,
1
};
// round up
int
[]
ThreadsFullWarps
=
{
3
,
3
,
3
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_geometry_correction
),
// struct gc * gpu_geometry_correction,
Pointer
.
to
(
gpu_rByRDist
));
// float * gpu_rByRDist) // length should match RBYRDIST_LEN
cuCtxSynchronize
();
// Call the kernel function
cuLaunchKernel
(
GPU_CALC_REVERSE_DISTORTION_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
cuCtxSynchronize
();
// remove later
}
public
void
execSetTilesOffsets
()
{
public
void
execSetTilesOffsets
()
{
if
(
SET_TILES_OFFSETS_kernel
==
null
)
if
(
GPU_
SET_TILES_OFFSETS_kernel
==
null
)
{
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: SET_TILES_OFFSETS_kernel"
);
IJ
.
showMessage
(
"Error"
,
"No GPU kernel:
GPU_
SET_TILES_OFFSETS_kernel"
);
return
;
return
;
}
}
// kernel parameters: pointer to pointers
// kernel parameters: pointer to pointers
...
@@ -1079,7 +1111,7 @@ public class GPUTileProcessor {
...
@@ -1079,7 +1111,7 @@ public class GPUTileProcessor {
Pointer
.
to
(
gpu_rByRDist
),
// float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer
.
to
(
gpu_rByRDist
),
// float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer
.
to
(
gpu_rot_deriv
));
// trot_deriv * gpu_rot_deriv);
Pointer
.
to
(
gpu_rot_deriv
));
// trot_deriv * gpu_rot_deriv);
cuCtxSynchronize
();
cuCtxSynchronize
();
cuLaunchKernel
(
SET_TILES_OFFSETS_kernel
,
cuLaunchKernel
(
GPU_
SET_TILES_OFFSETS_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
...
@@ -1087,15 +1119,15 @@ public class GPUTileProcessor {
...
@@ -1087,15 +1119,15 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
// remove later
cuCtxSynchronize
();
// remove later
}
}
public
void
execConver
CorrectTiles
()
{
public
void
execConver
Direct
()
{
if
(
GPU_CONVERT_
CORRECT_TILES
_kernel
==
null
)
if
(
GPU_CONVERT_
DIRECT
_kernel
==
null
)
{
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CONVERT_
CORRECT_TILES
_kernel"
);
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CONVERT_
DIRECT
_kernel"
);
return
;
return
;
}
}
// kernel parameters: pointer to pointers
// kernel parameters: pointer to pointers
int
[]
GridFullWarps
=
{
(
num_task_tiles
+
TILES_PER_BLOCK
-
1
)/
TILES_PER_BLOCK
,
1
,
1
};
// round up
int
[]
GridFullWarps
=
{
1
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
THREADSX
,
TILES_PER_BLOCK
,
1
};
int
[]
ThreadsFullWarps
=
{
1
,
1
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_kernel_offsets
),
Pointer
.
to
(
gpu_kernel_offsets
),
Pointer
.
to
(
gpu_kernels
),
Pointer
.
to
(
gpu_kernels
),
...
@@ -1109,12 +1141,14 @@ public class GPUTileProcessor {
...
@@ -1109,12 +1141,14 @@ public class GPUTileProcessor {
Pointer
.
to
(
new
int
[]
{
IMG_WIDTH
}),
// int woi_width,
Pointer
.
to
(
new
int
[]
{
IMG_WIDTH
}),
// int woi_width,
Pointer
.
to
(
new
int
[]
{
IMG_HEIGHT
}),
// int woi_height,
Pointer
.
to
(
new
int
[]
{
IMG_HEIGHT
}),
// int woi_height,
Pointer
.
to
(
new
int
[]
{
KERNELS_HOR
}),
// int kernels_hor,
Pointer
.
to
(
new
int
[]
{
KERNELS_HOR
}),
// int kernels_hor,
Pointer
.
to
(
new
int
[]
{
KERNELS_VERT
})
// int kernels_vert);
Pointer
.
to
(
new
int
[]
{
KERNELS_VERT
}),
// int kernels_vert);
Pointer
.
to
(
gpu_active_tiles
),
Pointer
.
to
(
gpu_num_active_tiles
)
);
);
cuCtxSynchronize
();
cuCtxSynchronize
();
// Call the kernel function
// Call the kernel function
cuLaunchKernel
(
GPU_CONVERT_
CORRECT_TILES
_kernel
,
cuLaunchKernel
(
GPU_CONVERT_
DIRECT
_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
...
@@ -1122,51 +1156,7 @@ public class GPUTileProcessor {
...
@@ -1122,51 +1156,7 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
// remove later
cuCtxSynchronize
();
// remove later
}
}
public
void
execImcltRbg
(
boolean
is_mono
)
{
if
(
GPU_IMCLT_RBG_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_IMCLT_RBG_kernel"
);
return
;
}
int
apply_lpf
=
1
;
int
tilesX
=
IMG_WIDTH
/
DTT_SIZE
;
int
tilesY
=
IMG_HEIGHT
/
DTT_SIZE
;
int
[]
ThreadsFullWarps
=
{
IMCLT_THREADS_PER_TILE
,
IMCLT_TILES_PER_BLOCK
,
1
};
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
{
for
(
int
color
=
0
;
color
<
NUM_COLORS
;
color
++)
{
for
(
int
v_offs
=
0
;
v_offs
<
2
;
v_offs
++){
for
(
int
h_offs
=
0
;
h_offs
<
2
;
h_offs
++){
int
tilesy_half
=
(
tilesY
+
(
v_offs
^
1
))
>>
1
;
int
tilesx_half
=
(
tilesX
+
(
h_offs
^
1
))
>>
1
;
int
tiles_in_pass
=
tilesy_half
*
tilesx_half
;
int
[]
GridFullWarps
=
{(
tiles_in_pass
+
IMCLT_TILES_PER_BLOCK
-
1
)
/
IMCLT_TILES_PER_BLOCK
,
1
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_clt_h
[
ncam
]),
Pointer
.
to
(
gpu_corr_images_h
[
ncam
]),
Pointer
.
to
(
new
int
[]
{
apply_lpf
}),
Pointer
.
to
(
new
int
[]
{
is_mono
?
1
:
NUM_COLORS
}),
// now - NUM_COLORS
Pointer
.
to
(
new
int
[]
{
color
}),
Pointer
.
to
(
new
int
[]
{
v_offs
}),
Pointer
.
to
(
new
int
[]
{
h_offs
}),
Pointer
.
to
(
new
int
[]
{
tilesX
}),
Pointer
.
to
(
new
int
[]
{
tilesY
}),
Pointer
.
to
(
new
int
[]
{
imclt_stride
})
// lpf_mask
);
cuCtxSynchronize
();
// Call the kernel function
cuLaunchKernel
(
GPU_IMCLT_RBG_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
}
}
}
}
cuCtxSynchronize
();
}
public
void
execImcltRbgAll
(
public
void
execImcltRbgAll
(
boolean
is_mono
boolean
is_mono
...
@@ -1349,7 +1339,7 @@ public class GPUTileProcessor {
...
@@ -1349,7 +1339,7 @@ public class GPUTileProcessor {
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
kernelParameters
=
Pointer
.
to
(
// Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border
// Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border
Pointer
.
to
(
gpu_texture_indices
),
// int * woi, - not used
Pointer
.
to
(
new
int
[]
{
0
}
),
// int * woi, - not used
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
new
int
[]
{
num_texture_tiles
}),
Pointer
.
to
(
new
int
[]
{
num_texture_tiles
}),
Pointer
.
to
(
gpu_texture_indices
),
Pointer
.
to
(
gpu_texture_indices
),
...
...
src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
View file @
20df596a
...
@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
...
@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double
minDerivative
=
0.01
;
double
minDerivative
=
0.01
;
int
numIterations
=
1000
;
int
numIterations
=
1000
;
double
drDistDr
=
1.0
;
double
drDistDr
=
1.0
;
// public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
// public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
// public double distortionB=0.0; // r^3
// public double distortionC=0.0; // r^2
boolean
use8
=(
this
.
distortionA8
!=
0.0
)
||
(
this
.
distortionA7
!=
0.0
)
||
(
this
.
distortionA6
!=
0.0
);
boolean
use8
=(
this
.
distortionA8
!=
0.0
)
||
(
this
.
distortionA7
!=
0.0
)
||
(
this
.
distortionA6
!=
0.0
);
double
d
=
1.0
-
this
.
distortionA8
-
this
.
distortionA7
-
this
.
distortionA6
-
this
.
distortionA5
-
this
.
distortionA
-
this
.
distortionB
-
this
.
distortionC
;
double
d
=
1.0
-
this
.
distortionA8
-
this
.
distortionA7
-
this
.
distortionA6
-
this
.
distortionA5
-
this
.
distortionA
-
this
.
distortionB
-
this
.
distortionC
;
double
rPrev
=
0.0
;
double
rPrev
=
0.0
;
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
20df596a
...
@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
...
@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
tp_tasks
);
tp_tasks
);
gPUTileProcessor
.
setTextureIndices
(
gPUTileProcessor
.
setTextureIndices
(
texture_indices
);
texture_indices
);
gPUTileProcessor
.
setGeometryCorrection
(
quadCLT_main
.
getGeometryCorrection
());
// once
gPUTileProcessor
.
setGeometryCorrection
(
quadCLT_main
.
getGeometryCorrection
(),
false
);
// boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
gPUTileProcessor
.
setExtrinsicsVector
(
quadCLT_main
.
getGeometryCorrection
().
getCorrVector
());
// for each new image
gPUTileProcessor
.
setExtrinsicsVector
(
quadCLT_main
.
getGeometryCorrection
().
getCorrVector
());
// for each new image
// TODO: calculate from the camera geometry?
// TODO: calculate from the camera geometry?
...
@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
...
@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
int
NREPEAT
=
1
;
// 00;
int
NREPEAT
=
1
;
// 00;
System
.
out
.
println
(
"\n------------ Running GPU "
+
NREPEAT
+
" times ----------------"
);
System
.
out
.
println
(
"\n------------ Running GPU "
+
NREPEAT
+
" times ----------------"
);
long
startGPU
=
System
.
nanoTime
();
long
startGPU
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execCalcReverseDistortions
();
}
long
startRotDerivs
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execRotDerivs
();
gPUTileProcessor
.
execRotDerivs
();
}
}
...
@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
...
@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
long
startDirectConvert
=
System
.
nanoTime
();
long
startDirectConvert
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execConver
CorrectTiles
();
gPUTileProcessor
.
execConver
Direct
();
}
}
// run imclt;
// run imclt;
long
startIMCLT
=
System
.
nanoTime
();
long
startIMCLT
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor
.
execImcltRbgAll
(
quadCLT_main
.
isMonochrome
());
gPUTileProcessor
.
execImcltRbgAll
(
quadCLT_main
.
isMonochrome
());
}
}
long
endImcltTime
=
System
.
nanoTime
();
long
endImcltTime
=
System
.
nanoTime
();
...
@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
...
@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
clt_parameters
.
min_agree
,
// double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters
.
min_agree
,
// double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters
.
dust_remove
);
// boolean dust_remove,
clt_parameters
.
dust_remove
);
// boolean dust_remove,
long
endTexturesRBGA
=
System
.
nanoTime
();
long
endTexturesRBGA
=
System
.
nanoTime
();
long
endGPUTime
=
System
.
nanoTime
();
long
endGPUTime
=
System
.
nanoTime
();
long
rotDerivsTime
=
(
startTasksSetup
-
startGPU
)
/
NREPEAT
;
long
calcReverseTime
=
(
startRotDerivs
-
startGPU
)
/
NREPEAT
;
long
rotDerivsTime
=
(
startTasksSetup
-
startRotDerivs
)
/
NREPEAT
;
long
tasksSetupTime
=
(
startDirectConvert
-
startTasksSetup
)
/
NREPEAT
;
long
tasksSetupTime
=
(
startDirectConvert
-
startTasksSetup
)
/
NREPEAT
;
long
firstGPUTime
=
(
startIMCLT
-
startDirectConvert
)
/
NREPEAT
;
long
firstGPUTime
=
(
startIMCLT
-
startDirectConvert
)
/
NREPEAT
;
long
runImcltTime
=
(
endImcltTime
-
startIMCLT
)
/
NREPEAT
;
long
runImcltTime
=
(
endImcltTime
-
startIMCLT
)
/
NREPEAT
;
...
@@ -2171,9 +2176,10 @@ public class TwoQuadCLT {
...
@@ -2171,9 +2176,10 @@ public class TwoQuadCLT {
long
runTexturesRBGATime
=
(
endTexturesRBGA
-
startTexturesRBGA
)
/
NREPEAT
;
long
runTexturesRBGATime
=
(
endTexturesRBGA
-
startTexturesRBGA
)
/
NREPEAT
;
long
runGPUTime
=
(
endGPUTime
-
startGPU
)
/
NREPEAT
;
long
runGPUTime
=
(
endGPUTime
-
startGPU
)
/
NREPEAT
;
// run corr2d
// run corr2d
//RotDerivs
System
.
out
.
println
(
"\n------------ End of running GPU "
+
NREPEAT
+
" times ----------------"
);
System
.
out
.
println
(
"\n------------ End of running GPU "
+
NREPEAT
+
" times ----------------"
);
System
.
out
.
println
(
"GPU run time ="
+
(
runGPUTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
"GPU run time ="
+
(
runGPUTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - calc reverse dist.: "
+(
calcReverseTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - rot/derivs: "
+(
rotDerivsTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - rot/derivs: "
+(
rotDerivsTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - tasks setup: "
+(
tasksSetupTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - tasks setup: "
+(
tasksSetupTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - direct conversion: "
+(
firstGPUTime
*
1.0
e
-
6
)+
"ms"
);
System
.
out
.
println
(
" - direct conversion: "
+(
firstGPUTime
*
1.0
e
-
6
)+
"ms"
);
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
20df596a
...
@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
...
@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
// Make TILESYA >= TILESX and a multiple of 4
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
// increase row length by 1 so vertical passes will use different ports
...
@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
...
@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP
#endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
extern
"C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
)
// indices to gpu_tasks // should be initialized to zero
{
int
num_tile
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
num_tile
>=
num_tiles
){
return
;
}
if
(
gpu_tasks
[
num_tile
].
task
!=
0
)
{
active_tiles
[
atomicAdd
(
num_active_tiles
,
1
)]
=
num_tile
;
}
}
extern
"C"
__global__
void
convert_direct
(
// called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
,
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
)
// indices to gpu_tasks
{
dim3
threads0
(
CONVERT_DIRECT_INDEXING_THREADS
,
1
,
1
);
dim3
blocks0
((
num_tiles
+
CONVERT_DIRECT_INDEXING_THREADS
-
1
)
>>
CONVERT_DIRECT_INDEXING_THREADS_LOG2
,
1
,
1
);
if
(
threadIdx
.
x
==
0
)
{
// of CONVERT_DIRECT_INDEXING_THREADS
*
pnum_active_tiles
=
0
;
index_direct
<<<
blocks0
,
threads0
>>>
(
gpu_tasks
,
// struct tp_task * gpu_tasks,
num_tiles
,
//int num_tiles, // number of tiles in task
gpu_active_tiles
,
//int * active_tiles, // pointer to the calculated number of non-zero tiles
pnum_active_tiles
);
//int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize
();
// now call actual convert_correct_tiles
dim3
threads_tp
(
THREADSX
,
TILES_PER_BLOCK
,
1
);
dim3
grid_tp
((
*
pnum_active_tiles
+
TILES_PER_BLOCK
-
1
)
/
TILES_PER_BLOCK
,
1
);
convert_correct_tiles
<<<
grid_tp
,
threads_tp
>>>
(
gpu_kernel_offsets
,
// float ** gpu_kernel_offsets, // [NUM_CAMS],
gpu_kernels
,
// float ** gpu_kernels, // [NUM_CAMS],
gpu_images
,
// float ** gpu_images, // [NUM_CAMS],
gpu_tasks
,
// struct tp_task * gpu_tasks, // array of tasks
gpu_active_tiles
,
// int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*
pnum_active_tiles
,
// int num_active_tiles, // number of tiles in task
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride
,
// size_t dstride, // in floats (pixels)
lpf_mask
,
// int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width
,
// int woi_width, // varaible to swict between EO and LWIR
woi_height
,
// int woi_height, // varaible to swict between EO and LWIR
kernels_hor
,
// int kernels_hor, // varaible to swict between EO and LWIR
kernels_vert
);
// int kernels_vert); // varaible to swict between EO and LWIR
}
}
#if 0 // trying to keep the same order
extern "C" __global__ void index_direct_init(
int num_chunks, // number of tiles in task
struct convert_direct_tmp* tmp)
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern
"C"
extern "C" __global__ void build_index_direct(
__global__
void
convert_correct_tiles
(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
...
@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
...
@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_width
,
int
woi_height
,
int
woi_height
,
...
@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
...
@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
{
{
dim3
t
=
threadIdx
;
dim3
t
=
threadIdx
;
int
tile_in_block
=
threadIdx
.
y
;
int
tile_in_block
=
threadIdx
.
y
;
int
task_num
=
blockIdx
.
x
*
TILES_PER_BLOCK
+
tile_in_block
;
// int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if
(
task_num
>=
num_tiles
)
return
;
// nothing to do
// if (task_num >= num_tiles) return; // nothing to do
int
task_indx
=
blockIdx
.
x
*
TILES_PER_BLOCK
+
tile_in_block
;
if
(
task_indx
>=
num_active_tiles
){
return
;
// nothing to do
}
int
task_num
=
gpu_active_tiles
[
task_indx
];
struct
tp_task
*
gpu_task
=
&
gpu_tasks
[
task_num
];
struct
tp_task
*
gpu_task
=
&
gpu_tasks
[
task_num
];
if
(
!
gpu_task
->
task
)
return
;
// NOP tile
if
(
!
gpu_task
->
task
)
return
;
// NOP tile
__shared__
struct
tp_task
tt
[
TILES_PER_BLOCK
];
__shared__
struct
tp_task
tt
[
TILES_PER_BLOCK
];
...
@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
...
@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
woi_height
,
// int woi_height,
woi_height
,
// int woi_height,
kernels_hor
,
// int kernels_hor,
kernels_hor
,
// int kernels_hor,
kernels_vert
);
//int kernels_vert)
kernels_vert
);
//int kernels_vert)
__syncthreads
();
__syncthreads
();
// __syncwarp();
}
}
}
}
}
}
...
@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
...
@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
int
tileX
=
tile_num
-
tileY
*
TILESX
;
int
tileX
=
tile_num
-
tileY
*
TILESX
;
int
tile_x0
=
(
tileX
-
*
(
woi
+
0
))
*
DTT_SIZE
;
// - (DTT_SIZE/2); // may be negative == -4
int
tile_x0
=
(
tileX
-
*
(
woi
+
0
))
*
DTT_SIZE
;
// - (DTT_SIZE/2); // may be negative == -4
int
tile_y0
=
(
tileY
-
*
(
woi
+
1
))
*
DTT_SIZE
;
// - (DTT_SIZE/2); // may be negative == -4
int
tile_y0
=
(
tileY
-
*
(
woi
+
1
))
*
DTT_SIZE
;
// - (DTT_SIZE/2); // may be negative == -4
int
height
=
*
(
woi
+
3
)
<<
DTT_SIZE_LOG2
;
///
int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12
#ifdef DEBUG12
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
...
...
src/main/resources/kernels/TileProcessor.h
View file @
20df596a
...
@@ -41,9 +41,14 @@
...
@@ -41,9 +41,14 @@
#include "tp_defines.h"
#include "tp_defines.h"
#endif
#endif
extern
"C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
);
// indices to gpu_tasks // should be initialized to zero
extern
"C"
extern
"C"
__global__
void
convert_direct
(
// called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
__global__
void
convert_correct_tiles
(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
...
@@ -51,6 +56,24 @@ __global__ void convert_correct_tiles(
...
@@ -51,6 +56,24 @@ __global__ void convert_correct_tiles(
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
,
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_width
,
int
woi_height
,
int
woi_height
,
...
...
src/main/resources/kernels/geometry_correction.cu
View file @
20df596a
...
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
...
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
inline
__device__
float
getRByRDist
(
float
rDist
,
inline
__device__
float
getRByRDist
(
float
rDist
,
float
rByRDist
[
RBYRDIST_LEN
]);
//shared memory
float
rByRDist
[
RBYRDIST_LEN
]);
//shared memory
__constant__
float
ROTS_TEMPLATE
[
7
][
3
][
3
][
3
]
=
{
// ...{cos,sin,const}...
__constant__
float
ROTS_TEMPLATE
[
7
][
3
][
3
][
3
]
=
{
// ...{cos,sin,const}...
{
// azimuth
{
// azimuth
{{
1
,
0
,
0
},{
0
,
0
,
0
},{
0
,
-
1
,
0
}},
{{
1
,
0
,
0
},{
0
,
0
,
0
},{
0
,
-
1
,
0
}},
...
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
...
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
{
-
1
,
-
1
,
-
1
}
// do nothing
{
-
1
,
-
1
,
-
1
}
// do nothing
}};
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__
int
offset_rots
=
0
;
//0
__constant__
int
offset_rots
=
0
;
//0
__constant__
int
offset_derivs
=
1
;
// 1..4 // should be next
__constant__
int
offset_derivs
=
1
;
// 1..4 // should be next
__constant__
int
offset_matrices
=
5
;
// 5..11
__constant__
int
offset_matrices
=
5
;
// 5..11
...
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
}
}
extern
"C"
__global__
void
calcReverseDistortionTable
(
struct
gc
*
geometry_correction
,
float
*
rByRDist
)
{
//int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int
indx
=
((
blockIdx
.
x
*
blockDim
.
z
+
threadIdx
.
z
)
*
blockDim
.
y
+
threadIdx
.
y
)
*
blockDim
.
x
+
threadIdx
.
x
;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double
delta
=
1E-10
;
// 12; // 10; // -8; 0.730000 ms
double
minDerivative
=
0.01
;
int
numIterations
=
1000
;
double
drDistDr
=
1.0
;
double
d
=
1.0
-
geometry_correction
->
distortionA8
-
geometry_correction
->
distortionA7
-
geometry_correction
->
distortionA6
-
geometry_correction
->
distortionA5
-
geometry_correction
->
distortionA
-
geometry_correction
->
distortionB
-
geometry_correction
->
distortionC
;
double
rPrev
=
0.0
;
int
num_points
=
(
RBYRDIST_LEN
+
CALC_REVERSE_TABLE_BLOCK_THREADS
-
1
)
/
CALC_REVERSE_TABLE_BLOCK_THREADS
;
for
(
int
p
=
0
;
p
<
num_points
;
p
++
){
int
i
=
indx
*
num_points
+
p
;
if
(
i
>=
RBYRDIST_LEN
){
return
;
}
if
(
i
==
0
){
rByRDist
[
0
]
=
(
float
)
1.0
/
d
;
break
;
}
double
rDist
=
RBYRDIST_STEP
*
i
;
double
r
=
(
p
==
0
)
?
rDist
:
rPrev
;
for
(
int
iteration
=
0
;
iteration
<
numIterations
;
iteration
++
){
double
k
=
(((((((
geometry_correction
->
distortionA8
)
*
r
+
geometry_correction
->
distortionA7
)
*
r
+
geometry_correction
->
distortionA6
)
*
r
+
geometry_correction
->
distortionA5
)
*
r
+
geometry_correction
->
distortionA
)
*
r
+
geometry_correction
->
distortionB
)
*
r
+
geometry_correction
->
distortionC
)
*
r
+
d
;
drDistDr
=
(((((((
8
*
geometry_correction
->
distortionA8
)
*
r
+
7
*
geometry_correction
->
distortionA7
)
*
r
+
6
*
geometry_correction
->
distortionA6
)
*
r
+
5
*
geometry_correction
->
distortionA5
)
*
r
+
4
*
geometry_correction
->
distortionA
)
*
r
+
3
*
geometry_correction
->
distortionB
)
*
r
+
2
*
geometry_correction
->
distortionC
)
*
r
+
d
;
if
(
drDistDr
<
minDerivative
)
{
// folds backwards !
return
;
// too high distortion
}
double
rD
=
r
*
k
;
if
(
fabs
(
rD
-
rDist
)
<
delta
){
break
;
}
r
+=
(
rDist
-
rD
)
/
drDistDr
;
}
rPrev
=
r
;
rByRDist
[
i
]
=
(
float
)
r
/
rDist
;
}
}
/**
/**
* Calculate non-distorted radius from distorted using table approximation
* Calculate non-distorted radius from distorted using table approximation
...
...
src/main/resources/kernels/geometry_correction.h
View file @
20df596a
...
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
...
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
float
*
gpu_rByRDist
,
// length should match RBYRDIST_LEN
trot_deriv
*
gpu_rot_deriv
);
trot_deriv
*
gpu_rot_deriv
);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
// uses NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calc_rot_deriv
(
extern
"C"
__global__
void
calc_rot_deriv
(
struct
corr_vector
*
gpu_correction_vector
,
struct
corr_vector
*
gpu_correction_vector
,
trot_deriv
*
gpu_rot_deriv
);
trot_deriv
*
gpu_rot_deriv
);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern
"C"
__global__
void
calcReverseDistortionTable
(
struct
gc
*
geometry_correction
,
float
*
rByRDist
);
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment