Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
b4d88911
Commit
b4d88911
authored
Jun 16, 2022
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Updated GPU code
parent
e156dbea
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
400 additions
and
25 deletions
+400
-25
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+381
-24
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+19
-1
No files found.
src/main/resources/kernels/TileProcessor.cuh
View file @
b4d88911
...
@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
...
@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
0.32891038
f
,
0.30456742
f
,
0.26124917
f
,
0.20786692
f
,
0.15413642
f
,
0.10818204
f
,
0.07536856
f
,
0.05845371
f
,
0.32891038
f
,
0.30456742
f
,
0.26124917
f
,
0.20786692
f
,
0.15413642
f
,
0.10818204
f
,
0.07536856
f
,
0.05845371
f
,
0.22914618
f
,
0.21218686
f
,
0.18200779
f
,
0.14481729
f
,
0.10738418
f
,
0.07536856
f
,
0.05250797
f
,
0.04072369
f
,
0.22914618
f
,
0.21218686
f
,
0.18200779
f
,
0.14481729
f
,
0.10738418
f
,
0.07536856
f
,
0.05250797
f
,
0.04072369
f
,
0.17771927
f
,
0.16456610
f
,
0.14116007
f
,
0.11231618
f
,
0.08328412
f
,
0.05845371
f
,
0.04072369
f
,
0.03158414
f
0.17771927
f
,
0.16456610
f
,
0.14116007
f
,
0.11231618
f
,
0.08328412
f
,
0.05845371
f
,
0.04072369
f
,
0.03158414
f
};
};
__constant__
float
lpf_corr
[
64
]
=
{
// modify if needed
__constant__
float
lpf_corr
[
64
]
=
{
// modify if needed
1.00000000
f
,
0.87041007
f
,
0.65943687
f
,
0.43487258
f
,
0.24970076
f
,
0.12518080
f
,
0.05616371
f
,
0.02728573
f
,
1.00000000
f
,
0.87041007
f
,
0.65943687
f
,
0.43487258
f
,
0.24970076
f
,
0.12518080
f
,
0.05616371
f
,
0.02728573
f
,
...
@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
...
@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
0.12518080
f
,
0.10895863
f
,
0.08254883
f
,
0.05443770
f
,
0.03125774
f
,
0.01567023
f
,
0.00703062
f
,
0.00341565
f
,
0.12518080
f
,
0.10895863
f
,
0.08254883
f
,
0.05443770
f
,
0.03125774
f
,
0.01567023
f
,
0.00703062
f
,
0.00341565
f
,
0.05616371
f
,
0.04888546
f
,
0.03703642
f
,
0.02442406
f
,
0.01402412
f
,
0.00703062
f
,
0.00315436
f
,
0.00153247
f
,
0.05616371
f
,
0.04888546
f
,
0.03703642
f
,
0.02442406
f
,
0.01402412
f
,
0.00703062
f
,
0.00315436
f
,
0.00153247
f
,
0.02728573
f
,
0.02374977
f
,
0.01799322
f
,
0.01186582
f
,
0.00681327
f
,
0.00341565
f
,
0.00153247
f
,
0.00074451
f
0.02728573
f
,
0.02374977
f
,
0.01799322
f
,
0.01186582
f
,
0.00681327
f
,
0.00341565
f
,
0.00153247
f
,
0.00074451
f
};
};
__constant__
float
LoG_corr
[
64
]
=
{
// modify if needed high-pass filter before correlation to fit into float range
__constant__
float
LoG_corr
[
64
]
=
{
// modify if needed high-pass filter before correlation to fit into float range
...
@@ -1041,6 +1043,15 @@ __global__ void index_correlate(
...
@@ -1041,6 +1043,15 @@ __global__ void index_correlate(
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
pnum_corr_tiles
);
// pointer to the length of correlation tasks array
int
*
pnum_corr_tiles
);
// pointer to the length of correlation tasks array
__global__
void
index_inter_correlate
(
int
num_cams
,
int
sel_sensors
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int
num_tiles
,
// number of tiles in task
int
width
,
// number of tiles in a row
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
pnum_corr_tiles
);
// pointer to the length of correlation tasks array
extern
"C"
__global__
void
create_nonoverlap_list
(
extern
"C"
__global__
void
create_nonoverlap_list
(
int
num_cams
,
int
num_cams
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
...
@@ -1068,6 +1079,27 @@ __global__ void convert_correct_tiles(
...
@@ -1068,6 +1079,27 @@ __global__ void convert_correct_tiles(
int
kernels_vert
,
//);
int
kernels_vert
,
//);
int
tilesx
);
int
tilesx
);
extern
"C"
__global__
void
combine_inter
(
// combine per-senor interscene correlations
int
num_sel_sensors
,
// number of sensors to correlate (not counting sum of all)
int
num_corr_tiles
,
// number of correlation tiles to process (here it includes sum)
int
*
gpu_corr_indices
,
// packed tile+pair
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
);
// correlation output data (either pixel domain or transform domain
extern
"C"
__global__
void
correlate2D_inter_inner
(
// will only process to TD, no normalisations and back conversion
int
num_cams
,
int
num_sel_sensors
,
// number of sensors to correlate (not counting sum of all)
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt_ref
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
int
num_corr_tiles
,
// number of correlation tiles to process (here it includes sum)
int
*
gpu_corr_indices
,
// packed tile+pair
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
);
// correlation output data (either pixel domain or transform domain
extern
"C"
__global__
void
correlate2D_inner
(
extern
"C"
__global__
void
correlate2D_inner
(
int
num_cams
,
int
num_cams
,
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...
@@ -1220,6 +1252,291 @@ extern "C" __global__ void correlate2D(
...
@@ -1220,6 +1252,291 @@ extern "C" __global__ void correlate2D(
}
}
}
}
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
* ones with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
* @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param tilesx number of tile rows
* @param gpu_corr_indices allocated array for per-tile correlation tasks (4 bytes per tile)
* @param pnum_corr_tiles allocated space for pointer to a number of number of correlation tiles to process
* @param corr_stride, stride (in floats) for correlation outputs.
* @param gpu_corrs) allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/
extern
"C"
__global__
void
correlate2D_inter
(
// only results in TD
int
num_cams
,
int
sel_sensors
,
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt_ref
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int
num_tiles
,
// number of tiles in task
int
tilesx
,
// number of tile rows
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
)
// correlation output data
{
dim3
threads0
(
CONVERT_DIRECT_INDEXING_THREADS
,
1
,
1
);
dim3
blocks0
((
num_tiles
+
CONVERT_DIRECT_INDEXING_THREADS
-
1
)
>>
CONVERT_DIRECT_INDEXING_THREADS_LOG2
,
1
,
1
);
if
(
threadIdx
.
x
==
0
)
{
// only 1 thread, 1 block
int
num_sel_sensors
=
__popc
(
sel_sensors
);
// number of non-zero bits
if
(
num_sel_sensors
>
0
){
// try with null tp_tasks to use same sequence from GPU memory
*
pnum_corr_tiles
=
0
;
index_inter_correlate
<<<
blocks0
,
threads0
>>>
(
num_cams
,
// int num_cams,
sel_sensors
,
// int sel_sensors,
gpu_ftasks
,
// float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
num_tiles
,
// int num_tiles, // number of tiles in task
tilesx
,
// int width, // number of tiles in a row
gpu_corr_indices
,
// int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles
);
// int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize
();
int
num_corr_tiles_with_sum
=
(
*
pnum_corr_tiles
);
int
num_corr_tiles_wo_sum
=
num_corr_tiles_with_sum
*
num_sel_sensors
/
(
num_sel_sensors
+
1
);
// remove sum from count
dim3
threads_corr
(
CORR_THREADS_PER_TILE
,
CORR_TILES_PER_BLOCK
,
1
);
dim3
grid_corr
((
num_corr_tiles_wo_sum
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
);
correlate2D_inter_inner
<<<
grid_corr
,
threads_corr
>>>
(
// will only process to TD, no normalisations and back conversion
num_cams
,
// int num_cams,
num_sel_sensors
,
// int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
gpu_clt
,
// float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
gpu_clt_ref
,
// float ** gpu_clt_ref, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors
,
// int colors, // number of colors (3/1)
scale0
,
// float scale0, // scale for R
scale1
,
// float scale1, // scale for B
scale2
,
// float scale2, // scale for G
num_corr_tiles_with_sum
,
// int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile + sensor (0xff - sum)
corr_stride
,
// size_t corr_stride, // in floats
gpu_corrs
);
// float * gpu_corrs) // correlation output data (either pixel domain or transform domain
dim3
grid_combine
((
num_tiles
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
);
combine_inter
<<<
grid_combine
,
threads_corr
>>>
(
// combine per-senor interscene correlations
num_sel_sensors
,
// int num_sel_sensors, // number of sensors to correlate (not counting sum of all)
num_corr_tiles_with_sum
,
// int num_corr_tiles, // number of correlation tiles to process (here it includes sum)
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile+pair NOT USED
corr_stride
,
// size_t corr_stride, // in floats
gpu_corrs
);
// float * gpu_corrs); // correlation output data (either pixel domain or transform domain
}
}
}
/**
* Used for interscene correlations (for motion vector calculation).
* Calculate sum of selected correlation (in TD) and place it after individual (per-sensor) correlations.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_task_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_sel_sensors number of sensors to correlate
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* should be calculated by correlate2D_inter_inner() leaving gaps for sums, calculated here
*
*/
extern
"C"
__global__
void
combine_inter
(
// combine per-senor interscene correlations
int
num_sel_sensors
,
// number of sensors to correlate (not counting sum of all)
int
num_corr_tiles
,
// number of correlation tiles to process (here it includes sum)
int
*
gpu_corr_indices
,
// packed tile+pair
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
)
// correlation output data (either pixel domain or transform domain
{
int
corr_in_block
=
threadIdx
.
y
;
int
itile
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK
+
corr_in_block
;
// correlation tile index
int
corr_offset
=
itile
*
(
num_sel_sensors
+
1
);
// index of the first correlation for this task;
if
(
corr_offset
>=
(
num_corr_tiles
-
num_sel_sensors
))
{
return
;
}
// __syncthreads();// __syncwarp();
__shared__
float
clt_corrs
[
CORR_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
resetCorrelation
(
clt_corr
);
__syncthreads
();
/// ***** Was not here: probably not needed
for
(
int
isens
=
0
;
isens
<
num_sel_sensors
;
isens
++
){
float
*
mem_corr
=
gpu_corrs
+
corr_stride
*
corr_offset
+
threadIdx
.
x
;
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE4
;
i
++
){
(
*
clt
)
+=
(
*
mem_corr
);
clt
+=
DTT_SIZE1
;
mem_corr
+=
DTT_SIZE
;
}
corr_offset
++
;
}
// Now corr_offset points to the sum of correlations
float
*
mem_corr
=
gpu_corrs
+
corr_stride
*
corr_offset
+
threadIdx
.
x
;
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE4
;
i
++
){
(
*
mem_corr
)
=
(
*
clt
);
clt
+=
DTT_SIZE1
;
mem_corr
+=
DTT_SIZE
;
}
}
/**
* Calculate interscene 2D phase correlation pairs from CLT representation.
* This is an inner kernel that is called from correlate2D_inter.
* Configuration
* threads: dim3 (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1)
* grids: dim3 ((number_of_corr_tiles_excluding_sums + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1)
*
* @param num_cams number of cameras
* @param num_sel_sensors number of sensors to correlate
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* @param gpu_clt_ref array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
* gpu_clt_ref and gpu_clt_ref correspond to two scenes, the reference is the first in correlation.
* @param colors number of colors used: 3 for RGB or 1 for monochrome
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param num_corr_tiles number of correlation tiles to process (here it includes sum)
* @param gpu_corr_indices packed tile+pair, similar format as intrascene (tile number << 8), low byte
* is now sensor number or 0xff (last one for each tile) for the sum of all individual
* correlations. Entries for each tile go in the same order (increasing sensor number)
* followed by the sum of all the selected correlations. Entries for different tiles
* are not ordered.
* @param corr_stride stride (in floats) for correlation outputs.
* @param gpu_corrs allocated array for the correlation output data, first num_sel_sensors for each tile
* will be calculated here leaving gaps for sums, calculated by combine_inter()
*/
extern
"C"
__global__
void
correlate2D_inter_inner
(
// will only process to TD, no normalisations and back conversion
int
num_cams
,
int
num_sel_sensors
,
// number of sensors to correlate (not counting sum of all)
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt_ref
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
int
num_corr_tiles
,
// number of correlation tiles to process (here it includes sum)
int
*
gpu_corr_indices
,
// packed tile+pair
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
)
// correlation output data (either pixel domain or transform domain
{
float
scales
[
3
]
=
{
scale0
,
scale1
,
scale2
};
int
corr_in_block
=
threadIdx
.
y
;
int
corr_num
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK
+
corr_in_block
;
// 4
int
tile_index
=
corr_num
/
num_sel_sensors
;
int
corr_offset
=
tile_index
+
corr_num
;
// added for missing sum correlation tiles.
if
(
corr_offset
>=
num_corr_tiles
){
return
;
// nothing to do
}
// get number of pair and number of tile
int
corr_sensor
=
gpu_corr_indices
[
corr_offset
];
// corr_num];
int
tile_num
=
corr_sensor
>>
CORR_NTILE_SHIFT
;
corr_sensor
&=
(
corr_sensor
&
((
1
<<
CORR_NTILE_SHIFT
)
-
1
));
if
(
corr_sensor
>=
num_cams
){
return
;
// BUG - should not happen
}
__syncthreads
();
// __syncwarp();
__shared__
float
clt_tiles1
[
CORR_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
clt_tiles2
[
CORR_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
clt_corrs
[
CORR_TILES_PER_BLOCK
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
// __shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
// float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation
(
clt_corr
);
__syncthreads
();
/// ***** Was not here: probably not needed
for
(
int
color
=
0
;
color
<
colors
;
color
++
){
// copy clt (frequency domain data)
float
*
clt_tile1
=
((
float
*
)
clt_tiles1
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
float
*
clt_tile2
=
((
float
*
)
clt_tiles2
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
int
offs
=
(
tile_num
*
colors
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
float
*
gpu_tile1
=
((
float
*
)
gpu_clt_ref
[
corr_sensor
])
+
offs
;
float
*
gpu_tile2
=
((
float
*
)
gpu_clt
[
corr_sensor
])
+
offs
;
float
*
clt_tile1i
=
clt_tile1
+
threadIdx
.
x
;
float
*
clt_tile2i
=
clt_tile2
+
threadIdx
.
x
;
#pragma unroll
#define USE_LOG
#ifdef USE_LOG
// Apply high-pass filter to correlation inputs to reduce dynamic range before multiplication
for
(
int
q
=
0
;
q
<
4
;
q
++
){
float
*
log
=
LoG_corr
+
threadIdx
.
x
;
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
// copy 32 rows (4 quadrants of 8 rows)
*
clt_tile1i
=
(
*
gpu_tile1
)
*
(
*
log
);
*
clt_tile2i
=
(
*
gpu_tile2
)
*
(
*
log
);
clt_tile1i
+=
DTT_SIZE1
;
clt_tile2i
+=
DTT_SIZE1
;
gpu_tile1
+=
DTT_SIZE
;
gpu_tile2
+=
DTT_SIZE
;
log
+=
DTT_SIZE
;
}
}
#else
for
(
int
i
=
0
;
i
<
DTT_SIZE4
;
i
++
){
// copy 32 rows (4 quadrants of 8 rows)
*
clt_tile1i
=
*
gpu_tile1
;
*
clt_tile2i
=
*
gpu_tile2
;
clt_tile1i
+=
DTT_SIZE1
;
clt_tile2i
+=
DTT_SIZE1
;
gpu_tile1
+=
DTT_SIZE
;
gpu_tile2
+=
DTT_SIZE
;
}
#endif //USE_LOG
__syncthreads
();
// each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles
(
scales
[
color
],
// float scale, // scale correlation
clt_tile1
,
// float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
clt_tile2
,
// float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
clt_corr
);
// float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads
();
if
(
color
==
1
){
// LPF only after B (nothing in mono)
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
#pragma unroll
for
(
int
q
=
0
;
q
<
4
;
q
++
){
float
*
lpf_rb
=
lpf_rb_corr
+
threadIdx
.
x
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
(
*
clt
)
*=
(
*
lpf_rb
);
clt
+=
DTT_SIZE1
;
lpf_rb
+=
DTT_SIZE
;
}
}
__syncthreads
();
// __syncwarp();
}
// if (color == 1){ // LPF only after B (nothing in mono)
}
// for (int color = 0; color < colors; color++){
__syncthreads
();
// __syncwarp();
float
*
mem_corr
=
gpu_corrs
+
corr_stride
*
corr_offset
+
threadIdx
.
x
;
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE4
;
i
++
){
(
*
mem_corr
)
=
(
*
clt
);
clt
+=
DTT_SIZE1
;
mem_corr
+=
DTT_SIZE
;
}
__syncthreads
();
// __syncwarp();
}
/**
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called
* Calculate 2D phase correlation pairs from CLT representation. This is an inner kernel that is called
* from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
* from correlate2D. If called from the CPU: <<<ceil(number_of_tiles/32),32>>>.
...
@@ -1234,9 +1551,9 @@ extern "C" __global__ void correlate2D(
...
@@ -1234,9 +1551,9 @@ extern "C" __global__ void correlate2D(
* @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process
* @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride
,
stride (in floats) for correlation outputs.
* @param corr_stride
stride (in floats) for correlation outputs.
* @param corr_radius
,
radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
* @param corr_radius
radius of the output correlation (maximal 7 for 15x15). If 0 - output Transform Domain tiles, no normalization
* @param gpu_corrs
)
allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs
allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
*/
*/
extern
"C"
__global__
void
correlate2D_inner
(
extern
"C"
__global__
void
correlate2D_inner
(
int
num_cams
,
int
num_cams
,
...
@@ -1504,11 +1821,12 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1504,11 +1821,12 @@ extern "C" __global__ void correlate2D_inner(
}
}
/**
/**
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
* and separately the two diagonals into another single one
* and separately the two diagonals into another single one
* When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
* When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
* when combining t
h
o diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
* when combining t
w
o diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
* (corresponds to a vertical flip).
* (corresponds to a vertical flip).
* Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
* Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
* This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>
* This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>
...
@@ -1695,10 +2013,10 @@ extern "C" __global__ void corr2D_combine_inner(
...
@@ -1695,10 +2013,10 @@ extern "C" __global__ void corr2D_combine_inner(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
* that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
*
*
* @param num_
tiles
number of correlation tiles to process
* @param num_
corr_tiles
number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights
null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_weights
null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
* @param fat_zero2 add this value squared to the sum of squared components before normalization (squared)
...
@@ -1708,7 +2026,7 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1708,7 +2026,7 @@ extern "C" __global__ void corr2D_normalize(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
const
size_t
corr_stride_td
,
// in floats
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per correlation tile weight (fat_zero2 will be divided by it)
float
*
corr_weights
,
// null or per correlation tile weight (fat_zero2 will be divided by it)
, length = num_corr_tile
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero2
,
// here - absolute, squared
float
fat_zero2
,
// here - absolute, squared
...
@@ -1733,14 +2051,14 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1733,14 +2051,14 @@ extern "C" __global__ void corr2D_normalize(
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
* kernel that is called from corr2D_normalize.
* kernel that is called from corr2D_normalize.
*
*
* @param num_
tiles
number of correlation tiles to process
* @param num_
corr_tiles
number of correlation tiles to process
* @param corr_stride_td
,
stride (in floats) for correlation input (transform domain).
* @param corr_stride_td
stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights
null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_weights
null or per-tile weight (fat_zero2 will be divided by it), length = num_corr_tiles
* @param corr_stride
,
stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride
stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param fat_zero2 add this value squared to the sum of squared components before normalization
* @param corr_radius
,
radius of the output correlation (maximal 7 for 15x15)
* @param corr_radius
radius of the output correlation (maximal 7 for 15x15)
*/
*/
extern
"C"
__global__
void
corr2D_normalize_inner
(
extern
"C"
__global__
void
corr2D_normalize_inner
(
...
@@ -2515,7 +2833,6 @@ extern "C" __global__ void create_nonoverlap_list(
...
@@ -2515,7 +2833,6 @@ extern "C" __global__ void create_nonoverlap_list(
* @param num_cams number of cameras <= NUM_CAMS
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param sel_pairs array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list
* @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length
* @param pnum_corr_tiles single-element integer array return generated list length
...
@@ -2564,23 +2881,60 @@ __global__ void index_correlate(
...
@@ -2564,23 +2881,60 @@ __global__ void index_correlate(
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
}
}
}
}
/*
}
int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm != 0) {
/**
int nb = __popc (cm); // number of non-zero bits
* Helper kernel for correlateInter2D() - generates dense list of correlation tasks.
int indx = atomicAdd(pnum_corr_tiles, nb);
* For interscene correlation. One correlation output for each selected sensor
int txy = gpu_tasks[num_tile].txy;
* plus a sum of them all. So for all 16 sensors selected ooutput will have 17
int tx = txy & 0xffff;
* 2D correlations (with some being the l;ast one)
int ty = txy >> 16;
* All pairs for the same tile will always be in the same order: increasing sensor numbers
// int nt = ty * TILES-X + tx;
* with sum being the last. Sum will be marked by 0xff in the LSB.
* With the quad camera each tile may generate up to 6 pairs (int array elements)
* Tiles are not ordered, but the correlation pairs for each tile are.
*
* @param num_cams number of cameras <= NUM_CAMS <32
* @param sel_sensors array of length to accommodate all pairs (4 for 16 cameras, 120 pairs).
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param gpu_corr_indices integer array to place the generated list
* @param pnum_corr_tiles single-element integer array return generated list length
*/
__global__
void
index_inter_correlate
(
int
num_cams
,
int
sel_sensors
,
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int
num_tiles
,
// number of tiles in task
int
width
,
// number of tiles in a row
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
pnum_corr_tiles
)
// pointer to the length of correlation tasks array
{
int
num_tile
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
num_tile
>=
num_tiles
){
return
;
}
// int task_size = get_task_size(num_cams);
int
task_task
=
get_task_task
(
num_tile
,
gpu_ftasks
,
num_cams
);
if
(((
task_task
>>
TASK_CORR_BITS
)
&
1
)
==
0
){
// needs correlation. Maybe just check task_task != 0?
return
;
}
int
nb
=
__popc
(
sel_sensors
);
// number of non-zero bits
if
(
nb
>
0
){
int
indx
=
atomicAdd
(
pnum_corr_tiles
,
nb
+
1
);
int
task_txy
=
get_task_txy
(
num_tile
,
gpu_ftasks
,
num_cams
);
int
tx
=
task_txy
&
0xffff
;
int
ty
=
task_txy
>>
16
;
int
nt
=
ty
*
width
+
tx
;
int
nt
=
ty
*
width
+
tx
;
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
// for (int b = 0; b < pair_list_len; b++) if ((cm & (1 << b)) != 0) {
for
(
int
b
=
0
;
b
<
num_cams
;
b
++
)
if
((
sel_sensors
&
(
1
<<
(
b
&
31
)))
!=
0
)
{
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
}
}
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
0xff
;
// will be used for sum
}
}
*/
}
}
/**
/**
* Direct MCLT transform and aberration correction with space-variant deconvolution
* Direct MCLT transform and aberration correction with space-variant deconvolution
* kernels. Results are used to output aberration-corrected images, textures and
* kernels. Results are used to output aberration-corrected images, textures and
...
@@ -3917,6 +4271,7 @@ __device__ void correlateAccumulateTiles(
...
@@ -3917,6 +4271,7 @@ __device__ void correlateAccumulateTiles(
float
*
corr_tile_j1
=
corr_tile_j0
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[1][j][0]
float
*
corr_tile_j1
=
corr_tile_j0
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[1][j][0]
float
*
corr_tile_j2
=
corr_tile_j1
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[2][j][0]
float
*
corr_tile_j2
=
corr_tile_j1
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[2][j][0]
float
*
corr_tile_j3
=
corr_tile_j2
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[3][j][0]
float
*
corr_tile_j3
=
corr_tile_j2
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[3][j][0]
// __syncthreads(); // *** TESTING ***
//#pragma unroll
//#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
// k=0
// k=0
...
@@ -3960,6 +4315,8 @@ __device__ void correlateAccumulateTiles(
...
@@ -3960,6 +4315,8 @@ __device__ void correlateAccumulateTiles(
corr_tile_j1
++
;
corr_tile_j1
++
;
corr_tile_j2
++
;
corr_tile_j2
++
;
corr_tile_j3
++
;
corr_tile_j3
++
;
// __syncthreads(); // *** TESTING ***
}
}
}
}
...
...
src/main/resources/kernels/TileProcessor.h
View file @
b4d88911
...
@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D(
...
@@ -82,11 +82,29 @@ extern "C" __global__ void correlate2D(
int
tilesx
,
// number of tile rows
int
tilesx
,
// number of tile rows
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
size_t
corr_stride
,
// in floats
size_t
corr_stride
,
// in floats
// int corr_stride, // in floats
// int corr_stride, // in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
float
*
gpu_corrs
);
// correlation output data
extern
"C"
__global__
void
correlate2D_inter
(
// only results in TD
int
num_cams
,
int
sel_sensors
,
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt_ref
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int
num_tiles
,
// number of tiles in task
int
tilesx
,
// number of tile rows
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
);
// correlation output data
extern
"C"
__global__
void
corr2D_normalize
(
extern
"C"
__global__
void
corr2D_normalize
(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
const
size_t
corr_stride_td
,
// in floats
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment