Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
4ce57464
Commit
4ce57464
authored
Dec 19, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
missed to commit files
parent
d74fb32e
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
71 additions
and
50 deletions
+71
-50
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+66
-47
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+5
-3
No files found.
src/main/resources/kernels/TileProcessor.cuh
View file @
4ce57464
...
@@ -921,7 +921,7 @@ __device__ void resetCorrelation(
...
@@ -921,7 +921,7 @@ __device__ void resetCorrelation(
__device__
void
normalizeTileAmplitude
(
__device__
void
normalizeTileAmplitude
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
fat_zero
);
// fat zero is absolute, scale it outside
float
fat_zero
2
);
// fat zero is absolute, scale it outside
__device__
void
imclt8threads
(
// for 8 threads
__device__
void
imclt8threads
(
// for 8 threads
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
int
do_acc
,
// 1 - add to previous value, 0 - overwrite
...
@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles(
...
@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles(
int
kernels_vert
,
//);
int
kernels_vert
,
//);
int
tilesx
);
int
tilesx
);
extern
"C"
__global__
void
correlate2D_inner
(
extern
"C"
__global__
void
correlate2D_inner
(
int
num_cams
,
int
num_cams
,
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt
,
// [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...
@@ -1076,20 +1075,21 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1076,20 +1075,21 @@ extern "C" __global__ void correlate2D_inner(
float
scale0
,
// scale for R
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute
size_
t
num_corr_tiles
,
// number of correlation tiles to process
in
t
num_corr_tiles
,
// number of correlation tiles to process
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
gpu_corr_indices
,
// packed tile+pair
const
size_t
corr_stride
,
// in floats
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
int
corr_radius
0
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
float
*
gpu_corrs
);
// correlation output data
(either pixel domain or transform domain
extern
"C"
__global__
void
corr2D_normalize_inner
(
extern
"C"
__global__
void
corr2D_normalize_inner
(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// (in floats) stride for the input TD correlations
const
size_t
corr_stride_td
,
// (in floats) stride for the input TD correlations
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per-tile weight (fat_zero2 will be divided by it)
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
extern
"C"
__global__
void
corr2D_combine_inner
(
extern
"C"
__global__
void
corr2D_combine_inner
(
...
@@ -1151,8 +1151,8 @@ __device__ int get_textures_shared_size( // in bytes
...
@@ -1151,8 +1151,8 @@ __device__ int get_textures_shared_size( // in bytes
* @param scale0 scale red (or mono) component before mixing
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero
add this value squared to the sum of squared components before normalization\
* @param fat_zero
2 add this value squared to the sum of squared components before normalization (squared)
* @param gpu_ftasks flattened tasks, 2
7 floats for quad EO, 99
floats for LWIR16
* @param gpu_ftasks flattened tasks, 2
9 floats for quad EO, 101
floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (now bits 4..9 - correlation pairs)
// * @param gpu_tasks array of per-tile tasks (now bits 4..9 - correlation pairs)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param tilesx number of tile rows
* @param tilesx number of tile rows
...
@@ -1174,14 +1174,14 @@ extern "C" __global__ void correlate2D(
...
@@ -1174,14 +1174,14 @@ extern "C" __global__ void correlate2D(
float
scale0
,
// scale for R
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
float
*
gpu_ftasks
,
// flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
int
tilesx
,
// number of tile rows
int
tilesx
,
// number of tile rows
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
const
size_t
corr_stride
,
// in floats
// const size_t corr_stride, // in floats
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
)
// correlation output data
float
*
gpu_corrs
)
// correlation output data
{
{
...
@@ -1191,14 +1191,12 @@ extern "C" __global__ void correlate2D(
...
@@ -1191,14 +1191,12 @@ extern "C" __global__ void correlate2D(
*
pnum_corr_tiles
=
0
;
*
pnum_corr_tiles
=
0
;
index_correlate
<<<
blocks0
,
threads0
>>>
(
index_correlate
<<<
blocks0
,
threads0
>>>
(
num_cams
,
// int num_cams,
num_cams
,
// int num_cams,
// sel_pairs, // int * sel_pairs,
sel_pairs0
,
// int sel_pairs0,
sel_pairs0
,
// int sel_pairs0,
sel_pairs1
,
// int sel_pairs1,
sel_pairs1
,
// int sel_pairs1,
sel_pairs2
,
// int sel_pairs2,
sel_pairs2
,
// int sel_pairs2,
sel_pairs3
,
// int sel_pairs3,
sel_pairs3
,
// int sel_pairs3,
gpu_ftasks
,
// float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_ftasks
,
// float * gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles
,
// int num_tiles, // number of tiles in task
num_tiles
,
// int num_tiles, // number of tiles in task
tilesx
,
// int width, // number of tiles in a row
tilesx
,
// int width, // number of tiles in a row
gpu_corr_indices
,
// int * gpu_corr_indices, // array of correlation tasks
gpu_corr_indices
,
// int * gpu_corr_indices, // array of correlation tasks
...
@@ -1213,7 +1211,7 @@ extern "C" __global__ void correlate2D(
...
@@ -1213,7 +1211,7 @@ extern "C" __global__ void correlate2D(
scale0
,
// float scale0, // scale for R
scale0
,
// float scale0, // scale for R
scale1
,
// float scale1, // scale for B
scale1
,
// float scale1, // scale for B
scale2
,
// float scale2, // scale for G
scale2
,
// float scale2, // scale for G
fat_zero
,
// float fat_zero
, // here - absolute
fat_zero
2
,
// float fat_zero2
, // here - absolute
*
pnum_corr_tiles
,
// size_t num_corr_tiles, // number of correlation tiles to process
*
pnum_corr_tiles
,
// size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile+pair
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile+pair
corr_stride
,
// const size_t corr_stride, // in floats
corr_stride
,
// const size_t corr_stride, // in floats
...
@@ -1233,7 +1231,7 @@ extern "C" __global__ void correlate2D(
...
@@ -1233,7 +1231,7 @@ extern "C" __global__ void correlate2D(
* @param scale0 scale red (or mono) component before mixing
* @param scale0 scale red (or mono) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale1 scale blue (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param scale2 scale green (if colors = 3) component before mixing
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2
add this value squared to the sum of squared components before normalization
* @param num_corr_tiles number of correlation tiles to process
* @param num_corr_tiles number of correlation tiles to process
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param gpu_corr_indices packed array (each element, integer contains tile+pair) of correlation tasks
* @param corr_stride, stride (in floats) for correlation outputs.
* @param corr_stride, stride (in floats) for correlation outputs.
...
@@ -1247,13 +1245,17 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1247,13 +1245,17 @@ extern "C" __global__ void correlate2D_inner(
float
scale0
,
// scale for R
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute
size_
t
num_corr_tiles
,
// number of correlation tiles to process
in
t
num_corr_tiles
,
// number of correlation tiles to process
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
gpu_corr_indices
,
// packed tile+pair
const
size_t
corr_stride
,
// in floats
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
int
corr_radius
0
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
)
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
)
// correlation output data (either pixel domain or transform domain
{
{
// int corr_radius = corr_radius0 & 0x1f;// minimal "bad"
// int corr_radius = corr_radius0 & 0xf; // maximal "good"
int
corr_radius
=
corr_radius0
&
0x7
;
// actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
float
scales
[
3
]
=
{
scale0
,
scale1
,
scale2
};
float
scales
[
3
]
=
{
scale0
,
scale1
,
scale2
};
int
corr_in_block
=
threadIdx
.
y
;
int
corr_in_block
=
threadIdx
.
y
;
int
corr_num
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK
+
corr_in_block
;
// 4
int
corr_num
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK
+
corr_in_block
;
// 4
...
@@ -1269,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1269,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner(
if
(
corr_pair
>
pair_list_len
){
if
(
corr_pair
>
pair_list_len
){
return
;
// BUG - should not happen
return
;
// BUG - should not happen
}
}
// int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
// int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
int
cam1
=
all_pairs
[
pair_list_start
+
corr_pair
][
0
];
// number of the first camera in a pair
int
cam1
=
all_pairs
[
pair_list_start
+
corr_pair
][
0
];
// number of the first camera in a pair
int
cam2
=
all_pairs
[
pair_list_start
+
corr_pair
][
1
];
// number of the second camera in a pair
int
cam2
=
all_pairs
[
pair_list_start
+
corr_pair
][
1
];
// number of the second camera in a pair
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
...
@@ -1282,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1282,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner(
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
mclt_corr
=
((
float
*
)
mlt_corrs
)
+
corr_in_block
*
(
DTT_SIZE2M1
*
DTT_SIZE2M1
);
float
*
mclt_corr
=
((
float
*
)
mlt_corrs
)
+
corr_in_block
*
(
DTT_SIZE2M1
*
DTT_SIZE2M1
);
resetCorrelation
(
clt_corr
);
resetCorrelation
(
clt_corr
);
__syncthreads
();
/// ***** Was not here: probably not needed
for
(
int
color
=
0
;
color
<
colors
;
color
++
){
for
(
int
color
=
0
;
color
<
colors
;
color
++
){
// copy clt (frequency domain data)
// copy clt (frequency domain data)
float
*
clt_tile1
=
((
float
*
)
clt_tiles1
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
float
*
clt_tile1
=
((
float
*
)
clt_tiles1
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
...
@@ -1381,18 +1382,19 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1381,18 +1382,19 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
}
// if (color == 1){ // LPF only after B (nothing in mono)
}
// if (color == 1){ // LPF only after B (nothing in mono)
}
// for (int color = 0; color < colors; color++){
}
// for (int color = 0; color < colors; color++){
// corr_radius = 7;
// Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
// Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
if
(
corr_radius
>
0
)
{
if
(
corr_radius
>
0
)
{
normalizeTileAmplitude
(
normalizeTileAmplitude
(
clt_corr
,
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
clt_corr
,
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero
);
// float fat_zero
) // fat zero is absolute, scale it outside
fat_zero
2
);
// float fat_zero2
) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
// Low Pass Filter from constant area (is it possible to replace?)
__syncthreads
();
/// ***** Was not here: probably not needed
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
printf
(
"
\n
correlate2D CORRELATION NORMALIZED, fat_zero
=%f
\n
"
,
fat_zero
);
printf
(
"
\n
correlate2D CORRELATION NORMALIZED, fat_zero
2=%f
\n
"
,
fat_zero2
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
...
@@ -1409,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1409,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
#endif
#endif
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
float
*
clt
=
clt_corr
+
threadIdx
.
x
;
#pragma unroll
#pragma unroll
for
(
int
q
=
0
;
q
<
4
;
q
++
){
for
(
int
q
=
0
;
q
<
4
;
q
++
){
...
@@ -1423,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1423,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner(
}
}
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
// corr_radius = 7;
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
...
@@ -1435,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1435,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
#endif
#endif
dttii_2d
(
clt_corr
);
dttii_2d
(
clt_corr
);
// has __syncthreads() inside
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
...
@@ -1446,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1446,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
#endif
#endif
__syncthreads
();
// corr_radius = 7;
corrUnfoldTile
(
corrUnfoldTile
(
corr_radius
,
// int corr_radius,
corr_radius
,
// int corr_radius,
(
float
*
)
clt_corr
,
// float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(
float
*
)
clt_corr
,
// float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(
float
*
)
mclt_corr
);
// float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
(
float
*
)
mclt_corr
);
// float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads
();
__syncthreads
();
#ifdef DBG_TILE
#ifdef DBG_TILE
...
@@ -1468,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner(
...
@@ -1468,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
int
size2r1
=
2
*
corr_radius
+
1
;
int
size2r1
=
2
*
corr_radius
+
1
;
// 15 for full corr tile
int
len2r1x2r1
=
size2r1
*
size2r1
;
int
len2r1x2r1
=
size2r1
*
size2r1
;
// 225 for full corr tile
int
corr_tile_offset
=
+
corr_stride
*
corr_num
;
int
corr_tile_offset
=
+
corr_stride
*
corr_num
;
float
*
mem_corr
=
gpu_corrs
+
corr_tile_offset
;
float
*
mem_corr
=
gpu_corrs
+
corr_tile_offset
;
#pragma unroll
#pragma unroll
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
for
(
int
offs
=
threadIdx
.
x
;
offs
<
len2r1x2r1
;
offs
+=
CORR_THREADS_PER_TILE
){
// variable number of cycles per thread
for
(
int
offs
=
threadIdx
.
x
;
offs
<
len2r1x2r1
;
offs
+=
CORR_THREADS_PER_TILE
){
// variable number of cycles per thread
mem_corr
[
offs
]
=
mclt_corr
[
offs
];
mem_corr
[
offs
]
=
mclt_corr
[
offs
];
// copy OK
}
}
__syncthreads
();
__syncthreads
();
...
@@ -1697,18 +1698,20 @@ extern "C" __global__ void corr2D_combine_inner(
...
@@ -1697,18 +1698,20 @@ extern "C" __global__ void corr2D_combine_inner(
* @param num_tiles number of correlation tiles to process
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2 add this value squared to the sum of squared components before normalization (squared)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
*/
extern
"C"
__global__
void
corr2D_normalize
(
extern
"C"
__global__
void
corr2D_normalize
(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
const
size_t
corr_stride_td
,
// in floats
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per correlation tile weight (fat_zero2 will be divided by it)
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
int
corr_radius
)
// radius of the output correlation (7 for 15x15)
int
corr_radius
)
// radius of the output correlation (7 for 15x15)
{
{
if
(
threadIdx
.
x
==
0
)
{
// only 1 thread, 1 block
if
(
threadIdx
.
x
==
0
)
{
// only 1 thread, 1 block
...
@@ -1718,9 +1721,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1718,9 +1721,10 @@ extern "C" __global__ void corr2D_normalize(
num_corr_tiles
,
// int num_corr_tiles, // number of correlation tiles to process
num_corr_tiles
,
// int num_corr_tiles, // number of correlation tiles to process
corr_stride_td
,
// const size_t corr_stride, // in floats
corr_stride_td
,
// const size_t corr_stride, // in floats
gpu_corrs_td
,
// float * gpu_corrs_td, // correlation tiles in transform domain
gpu_corrs_td
,
// float * gpu_corrs_td, // correlation tiles in transform domain
corr_weights
,
// float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
corr_stride
,
// const size_t corr_stride, // in floats
corr_stride
,
// const size_t corr_stride, // in floats
gpu_corrs
,
// float * gpu_corrs, // correlation output data (either pixel domain or transform domain
gpu_corrs
,
// float * gpu_corrs, // correlation output data (either pixel domain or transform domain
fat_zero
,
// float fat_zero
, // here - absolute
fat_zero
2
,
// float fat_zero2
, // here - absolute
corr_radius
);
// int corr_radius, // radius of the output correlation (7 for 15x15)
corr_radius
);
// int corr_radius, // radius of the output correlation (7 for 15x15)
}
}
}
}
...
@@ -1732,9 +1736,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -1732,9 +1736,10 @@ extern "C" __global__ void corr2D_normalize(
* @param num_tiles number of correlation tiles to process
* @param num_tiles number of correlation tiles to process
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param corr_stride_td, stride (in floats) for correlation input (transform domain).
* @param gpu_corrs_td correlation data in transform domain
* @param gpu_corrs_td correlation data in transform domain
* @param corr_weights null or per-tile weight (fat_zero2 will be divided by it)
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param corr_stride, stride (in floats) for correlation pixel-domain outputs.
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param gpu_corrs allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
* @param fat_zero
add this value squared to the sum of squared components before normalization
* @param fat_zero
2
add this value squared to the sum of squared components before normalization
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
* @param corr_radius, radius of the output correlation (maximal 7 for 15x15)
*/
*/
...
@@ -1742,11 +1747,15 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1742,11 +1747,15 @@ extern "C" __global__ void corr2D_normalize_inner(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// (in floats) stride for the input TD correlations
const
size_t
corr_stride_td
,
// (in floats) stride for the input TD correlations
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per-tile weight (fat_zero2 will be divided by it)
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
int
corr_radius
)
// radius of the output correlation (7 for 15x15)
int
corr_radius
)
// radius of the output correlation (7 for 15x15)
{
{
corr_radius
&=
0x7
;
// actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
int
corr_in_block
=
threadIdx
.
y
;
int
corr_in_block
=
threadIdx
.
y
;
int
corr_num
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK_NORMALIZE
+
corr_in_block
;
// 4
int
corr_num
=
blockIdx
.
x
*
CORR_TILES_PER_BLOCK_NORMALIZE
+
corr_in_block
;
// 4
if
(
corr_num
>=
num_corr_tiles
){
if
(
corr_num
>=
num_corr_tiles
){
...
@@ -1755,6 +1764,7 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1755,6 +1764,7 @@ extern "C" __global__ void corr2D_normalize_inner(
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
__shared__
float
clt_corrs
[
CORR_TILES_PER_BLOCK_NORMALIZE
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
clt_corrs
[
CORR_TILES_PER_BLOCK_NORMALIZE
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
__shared__
float
mlt_corrs
[
CORR_TILES_PER_BLOCK_NORMALIZE
][
DTT_SIZE2M1
][
DTT_SIZE2M1
];
// result correlation
__shared__
float
mlt_corrs
[
CORR_TILES_PER_BLOCK_NORMALIZE
][
DTT_SIZE2M1
][
DTT_SIZE2M1
];
// result correlation
__shared__
float
norm_fat_zero
[
CORR_TILES_PER_BLOCK_NORMALIZE
];
// set clt_corr to all zeros
// set clt_corr to all zeros
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
clt_corr
=
((
float
*
)
clt_corrs
)
+
corr_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
mclt_corr
=
((
float
*
)
mlt_corrs
)
+
corr_in_block
*
(
DTT_SIZE2M1
*
DTT_SIZE2M1
);
float
*
mclt_corr
=
((
float
*
)
mlt_corrs
)
+
corr_in_block
*
(
DTT_SIZE2M1
*
DTT_SIZE2M1
);
...
@@ -1770,16 +1780,25 @@ extern "C" __global__ void corr2D_normalize_inner(
...
@@ -1770,16 +1780,25 @@ extern "C" __global__ void corr2D_normalize_inner(
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
if
(
threadIdx
.
x
==
0
){
norm_fat_zero
[
corr_in_block
]
=
fat_zero2
;
if
(
corr_weights
)
{
// same for all
norm_fat_zero
[
corr_in_block
]
/=
*
(
corr_weights
+
corr_num
);
}
}
__syncthreads
();
// __syncwarp();
// normalize Amplitude
// normalize Amplitude
normalizeTileAmplitude
(
normalizeTileAmplitude
(
clt_corr
,
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
clt_corr
,
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero
);
// float fat_zero
) // fat zero is absolute, scale it outside
norm_fat_zero
[
corr_in_block
]);
// fat_zero2); // float fat_zero2
) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
printf
(
"
\n
correlate2D CORRELATION NORMALIZED, fat_zero
=%f
\n
"
,
fat_zero
);
printf
(
"
\n
correlate2D CORRELATION NORMALIZED, fat_zero
2=%f
\n
"
,
fat_zero2
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
...
@@ -3978,12 +3997,12 @@ __device__ void resetCorrelation(
...
@@ -3978,12 +3997,12 @@ __device__ void resetCorrelation(
* Called from correlate2D()->correlate2D_inner()
* Called from correlate2D()->correlate2D_inner()
*
*
* @param clt_tile pointer to a correlation result tile [4][8][8+1] to be normalized
* @param clt_tile pointer to a correlation result tile [4][8][8+1] to be normalized
* @param fat_zero
value to add to amplitudes for regularization. Absolute value,
* @param fat_zero
2
value to add to amplitudes for regularization. Absolute value,
* scale if needed outside.
* scale if needed outside.
*/
*/
__device__
void
normalizeTileAmplitude
(
__device__
void
normalizeTileAmplitude
(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
fat_zero
)
// fat zero is absolute, scale it outside
float
fat_zero
2
)
// fat zero is absolute, scale it outside
{
{
int
joffs
=
threadIdx
.
x
*
DTT_SIZE1
;
int
joffs
=
threadIdx
.
x
*
DTT_SIZE1
;
float
*
clt_tile_j0
=
clt_tile
+
joffs
;
// ==&clt_tile[0][j][0]
float
*
clt_tile_j0
=
clt_tile
+
joffs
;
// ==&clt_tile[0][j][0]
...
@@ -3992,7 +4011,7 @@ __device__ void normalizeTileAmplitude(
...
@@ -3992,7 +4011,7 @@ __device__ void normalizeTileAmplitude(
float
*
clt_tile_j3
=
clt_tile_j2
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[3][j][0]
float
*
clt_tile_j3
=
clt_tile_j2
+
(
DTT_SIZE1
*
DTT_SIZE
);
// ==&clt_tile[3][j][0]
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
)
{
float
s2
=
fat_zero
*
fat_zero
+
float
s2
=
fat_zero
2
+
*
(
clt_tile_j0
)
*
*
(
clt_tile_j0
)
+
*
(
clt_tile_j0
)
*
*
(
clt_tile_j0
)
+
*
(
clt_tile_j1
)
*
*
(
clt_tile_j1
)
+
*
(
clt_tile_j1
)
*
*
(
clt_tile_j1
)
+
*
(
clt_tile_j2
)
*
*
(
clt_tile_j2
)
+
*
(
clt_tile_j2
)
*
*
(
clt_tile_j2
)
+
...
...
src/main/resources/kernels/TileProcessor.h
View file @
4ce57464
...
@@ -75,14 +75,15 @@ extern "C" __global__ void correlate2D(
...
@@ -75,14 +75,15 @@ extern "C" __global__ void correlate2D(
float
scale0
,
// scale for R
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
float
*
gpu_ftasks
,
// flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
float
*
gpu_ftasks
,
// flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
int
tilesx
,
// number of tile rows
int
tilesx
,
// number of tile rows
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
const
size_t
corr_stride
,
// in floats
size_t
corr_stride
,
// in floats
// int corr_stride, // in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
float
*
gpu_corrs
);
// correlation output data
...
@@ -90,9 +91,10 @@ extern "C" __global__ void corr2D_normalize(
...
@@ -90,9 +91,10 @@ extern "C" __global__ void corr2D_normalize(
int
num_corr_tiles
,
// number of correlation tiles to process
int
num_corr_tiles
,
// number of correlation tiles to process
const
size_t
corr_stride_td
,
// in floats
const
size_t
corr_stride_td
,
// in floats
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
gpu_corrs_td
,
// correlation tiles in transform domain
float
*
corr_weights
,
// null or per-tile weight (fat_zero2 will be divided by it)
const
size_t
corr_stride
,
// in floats
const
size_t
corr_stride
,
// in floats
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
*
gpu_corrs
,
// correlation output data (either pixel domain or transform domain
float
fat_zero
,
// here - absolute
float
fat_zero
2
,
// here - absolute, squared
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
int
corr_radius
);
// radius of the output correlation (7 for 15x15)
extern
"C"
__global__
void
corr2D_combine
(
extern
"C"
__global__
void
corr2D_combine
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment