Elphel / imagej-elphel · Commits

Commit 4e5e1f22
authored Jul 22, 2025 by Andrey Filippov
Fixed inter-scene correlation, updated 12.6.0 kernel sources
parent e08db32b
Showing 3 changed files with 91 additions and 91 deletions
GPUTileProcessor.java   src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java        +3  -1
OpticalFlow.java        src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java   +82 -79
TileProcessor.cu        src/main/resources/kernels/12.6.0/TileProcessor.cu               +6  -11

src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -78,7 +78,8 @@ import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
    public static String  CUDA_VERSION  = JCudaVersion.get();
    public static boolean USE_CUDA12    = CUDA_VERSION.startsWith("12.");
-   public static boolean USE_DS_DP     = false;      // Use Dynamic Shared memory with Dynamic Parallelism (not implemented)
+   public static boolean USE_DS_DP     = USE_CUDA12; // false; // Use Dynamic Shared memory with Dynamic Parallelism TODO: Split for DP2/shared memory
    public static String  SOURCE_FOLDER = "";         // where the sources were loaded from - debug feature
    String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
    // Can be downloaded and twice extracted from
    // https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-cudart-dev-11-2_11.2.152-1_amd64.deb
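
The USE_DS_DP flag above and the libcudadevrt.a library path concern dynamic parallelism, i.e. kernels that launch other kernels on the device; such code has to be compiled as relocatable device code and linked against the CUDA device runtime. A minimal sketch of that pattern under CUDA 12 follows; the kernel names and launch sizes are hypothetical and not taken from the project.

// Minimal dynamic-parallelism sketch (hypothetical kernels, not from TileProcessor.cu).
// Assumed build: nvcc -rdc=true dp_sketch.cu -lcudadevrt

__global__ void child_double(int *data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2;                       // trivial per-element work
}

__global__ void parent_launch(int *data, int n)
{
    if (threadIdx.x == 0) {                        // one orchestrating thread, as in correlate2D_inter()
        dim3 threads(256, 1, 1);
        dim3 blocks((n + 255) / 256, 1, 1);        // ceiling division, same idiom as the kernel sources
        // CUDA 12 device-side launch into the tail-launch stream:
        // the child is deferred until this parent grid has finished.
        child_double<<<blocks, threads, 0, cudaStreamTailLaunch>>>(data, n);
    }
}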
@@ -321,6 +322,7 @@ public class GPUTileProcessor {
            file = new File(src_dir.getPath(), src_file);
            System.out.println("Loading resource "+file);
        }
        SOURCE_FOLDER = file.getPath();
//      System.out.println(file.getAbsolutePath());
        String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
        String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);

src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
(diff collapsed, not shown)

src/main/resources/kernels/12.6.0/TileProcessor.cu
@@ -947,11 +947,9 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
        float           * gpu_corrs)          // correlation output data
 {
    dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
-   dim3 blocks0((num_tiles * num_tiles + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2, 1, 1);
+   dim3 blocks0((num_tiles + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2, 1, 1);
    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
//  dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
    dim3 grid_corr   ((num_cams  + CORR_TILES_PER_BLOCK - 1) / CORR_TILES_PER_BLOCK, 1, 1);
    dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK - 1) / CORR_TILES_PER_BLOCK, 1, 1);
    if (threadIdx.x == 0) { // only 1 thread, 1 block
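
The grid-size expressions in the hunk above are ceiling divisions. blocks0 uses a shift, which only matches division by CONVERT_DIRECT_INDEXING_THREADS if that constant equals the power of two 1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2; grid_corr and grid_combine use plain integer division. A small sketch of the two equivalent forms, with made-up constants standing in for the kernel's macros:

// Illustration only; THREADS / THREADS_LOG2 are stand-ins for the kernel's macros.
#define THREADS      32   // assumed to be a power of two
#define THREADS_LOG2 5    // log2(THREADS)

__host__ __device__ inline unsigned ceil_div_shift(unsigned n)
{
    return (n + THREADS - 1) >> THREADS_LOG2;   // blocks0 style: shift form of ceil(n / THREADS)
}

__host__ __device__ inline unsigned ceil_div(unsigned n, unsigned per_block)
{
    return (n + per_block - 1) / per_block;     // grid_corr / grid_combine style
}

// Example: n = 100, THREADS = 32 -> both forms give 4 blocks (4 * 32 = 128 >= 100).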
@@ -959,6 +957,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
    if (num_sel_sensors > 0){ // try with null tp_tasks to use same sequence from GPU memory
        *pnum_corr_tiles = 0;
        dim3 grid_corr((num_tiles * num_sel_sensors + CORR_TILES_PER_BLOCK - 1) / CORR_TILES_PER_BLOCK, 1, 1);
        index_inter_correlate<<<blocks0, threads0, 0, cudaStreamFireAndForget>>>(
                num_cams,            // int num_cams,
                sel_sensors,         // int sel_sensors,
@@ -967,12 +966,8 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
                tilesx,              // int     width,              // number of tiles in a row
                gpu_corr_indices,    // int   * gpu_corr_indices,   // array of correlation tasks
                pnum_corr_tiles);    // int   * pnum_corr_tiles);   // pointer to the length of correlation tasks array
///     cudaDeviceSynchronize();
///     __device__ int num_corr_tiles_with_sum = (*pnum_corr_tiles);
///     int num_corr_tiles_wo_sum = num_corr_tiles_with_sum * num_sel_sensors/ (num_sel_sensors + 1); // remove sum from count
///     dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
///     dim3 grid_corr((num_corr_tiles_wo_sum + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-       correlate2D_inter_inner<<<grid_corr, threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalisations and back conversion
+       correlate2D_inter_inner<<<grid_corr, threads_corr, 0, cudaStreamTailLaunch>>>( // will only process to TD, no normalizations and back conversion
                num_cams,            // int       num_cams,
                num_sel_sensors,     // int       num_sel_sensors, // number of sensors to correlate (not counting sum of all)
                gpu_clt,             // float  ** gpu_clt,         // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
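
The launch sequence above leans on CUDA 12 device-side named streams: index_inter_correlate is issued into cudaStreamFireAndForget so it starts immediately, while correlate2D_inter_inner here (and combine_inter in the next hunk) go into cudaStreamTailLaunch, which, as I read the CUDA 12 dynamic-parallelism model, begins only after the launching grid and its fire-and-forget work complete; that ordering is what lets the commented-out cudaDeviceSynchronize() go away. A stripped-down sketch of that ordering with hypothetical kernels:

// Hypothetical orchestrator mirroring the fire-and-forget / tail-launch pattern above (not project code).
__global__ void build_index(int *indices, int *count, int n)    // assumed role: fill the index list
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) indices[i] = i;
    if (i == 0) *count = n;                        // simplified; real code would compact/filter
}

__global__ void process(const int *indices, const int *count)   // assumed role: consume the finished list
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < *count) { /* ... per-index work on indices[i] ... */ }
}

__global__ void orchestrate(int *indices, int *count, int n)
{
    if (threadIdx.x == 0) {                        // a single thread issues both launches
        dim3 t(128, 1, 1);
        dim3 b((n + 127) / 128, 1, 1);
        // Starts right away, alongside the remainder of this (tiny) parent grid.
        build_index<<<b, t, 0, cudaStreamFireAndForget>>>(indices, count, n);
        // Deferred: begins only after the parent grid and its fire-and-forget child finish,
        // so *count is already final when process() reads it.
        process<<<b, t, 0, cudaStreamTailLaunch>>>(indices, count);
    }
}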
@@ -985,7 +980,7 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
                gpu_corr_indices,    // int   * gpu_corr_indices,  // packed tile + sensor (0xff - sum)
                corr_stride,         // size_t  corr_stride,       // in floats
                gpu_corrs);          // float * gpu_corrs)         // correlation output data (either pixel domain or transform domain
///     dim3 grid_combine((num_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
        combine_inter<<<grid_combine, threads_corr, 0, cudaStreamTailLaunch>>>( // combine per-senor interscene correlations
                num_sel_sensors,     // int num_sel_sensors,       // number of sensors to correlate (not counting sum of all)
                pnum_corr_tiles,     // num_corr_tiles_with_sum,   // int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
@@ -2771,7 +2766,7 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
//__device__
    *pnum_active_tiles = 0; // already _device_
    int task_size = get_task_size(num_cams);
-   index_direct<<<blocks0, threads0, 0, cudaStreamFireAndForget>>>( // cudaStreamFireAndForget>>>(
+   index_direct<<<blocks0, threads0, 0, cudaStreamFireAndForget>>>(
            task_size,           // int     task_size,   // flattened task size in 4-byte floats
            gpu_ftasks,          // float * gpu_ftasks,  // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
            num_tiles,           // int     num_tiles,   // number of tiles in task