Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
fa5947b6
Commit
fa5947b6
authored
Aug 27, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
tested gpu intrascene
parent
2eae496d
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
364 additions
and
56 deletions
+364
-56
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+298
-51
QuadCLT.java
src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+66
-5
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
fa5947b6
...
...
@@ -100,6 +100,8 @@ public class GPUTileProcessor {
static
String
GPU_CONVERT_DIRECT_NAME
=
"convert_direct"
;
// name in C code
static
String
GPU_IMCLT_ALL_NAME
=
"imclt_rbg_all"
;
static
String
GPU_CORRELATE2D_NAME
=
"correlate2D"
;
// name in C code
static
String
GPU_CORR2D_COMBINE_NAME
=
"corr2D_combine"
;
// name in C code
static
String
GPU_CORR2D_NORMALIZE_NAME
=
"corr2D_normalize"
;
// name in C code
static
String
GPU_TEXTURES_NAME
=
"textures_nonoverlap"
;
// name in C code
static
String
GPU_RBGA_NAME
=
"generate_RBGA"
;
// name in C code
static
String
GPU_ROT_DERIV
=
"calc_rot_deriv"
;
// calculate rotation matrices and derivatives
...
...
@@ -122,6 +124,8 @@ public class GPUTileProcessor {
static
int
TILES_PER_BLOCK
=
4
;
// 8 - slower
static
int
CORR_THREADS_PER_TILE
=
8
;
static
int
CORR_TILES_PER_BLOCK
=
4
;
static
int
CORR_TILES_PER_BLOCK_NORMALIZE
=
4
;
// maybe change to 8?
static
int
CORR_TILES_PER_BLOCK_COMBINE
=
4
;
// increase to 16?
static
int
TEXTURE_THREADS_PER_TILE
=
8
;
// 16;
static
int
TEXTURE_TILES_PER_BLOCK
=
1
;
static
int
IMCLT_THREADS_PER_TILE
=
16
;
...
...
@@ -159,6 +163,8 @@ public class GPUTileProcessor {
private
CUfunction
GPU_CONVERT_DIRECT_kernel
=
null
;
private
CUfunction
GPU_IMCLT_ALL_kernel
=
null
;
private
CUfunction
GPU_CORRELATE2D_kernel
=
null
;
private
CUfunction
GPU_CORR2D_COMBINE_kernel
=
null
;
private
CUfunction
GPU_CORR2D_NORMALIZE_kernel
=
null
;
private
CUfunction
GPU_TEXTURES_kernel
=
null
;
private
CUfunction
GPU_RBGA_kernel
=
null
;
private
CUfunction
GPU_ROT_DERIV_kernel
=
null
;
...
...
@@ -248,6 +254,8 @@ public class GPUTileProcessor {
"#define TILES_PER_BLOCK "
+
TILES_PER_BLOCK
+
"\n"
+
"#define CORR_THREADS_PER_TILE "
+
CORR_THREADS_PER_TILE
+
"\n"
+
"#define CORR_TILES_PER_BLOCK "
+
CORR_TILES_PER_BLOCK
+
"\n"
+
"#define CORR_TILES_PER_BLOCK_NORMALIZE "
+
CORR_TILES_PER_BLOCK_NORMALIZE
+
"\n"
+
"#define CORR_TILES_PER_BLOCK_COMBINE "
+
CORR_TILES_PER_BLOCK_COMBINE
+
"\n"
+
"#define TEXTURE_THREADS_PER_TILE "
+
TEXTURE_THREADS_PER_TILE
+
"\n"
+
"#define TEXTURE_TILES_PER_BLOCK "
+
TEXTURE_TILES_PER_BLOCK
+
"\n"
+
"#define IMCLT_THREADS_PER_TILE "
+
IMCLT_THREADS_PER_TILE
+
"\n"
+
...
...
@@ -267,8 +275,6 @@ public class GPUTileProcessor {
"#define RBYRDIST_LEN "
+
RBYRDIST_LEN
+
"\n"
+
"#define RBYRDIST_STEP "
+
RBYRDIST_STEP
+
"\n"
+
"#define TILES_PER_BLOCK_GEOM "
+
TILES_PER_BLOCK_GEOM
+
"\n"
;
}
public
GPUTileProcessor
(
...
...
@@ -338,6 +344,8 @@ public class GPUTileProcessor {
GPU_CONVERT_DIRECT_NAME
,
GPU_IMCLT_ALL_NAME
,
GPU_CORRELATE2D_NAME
,
GPU_CORR2D_COMBINE_NAME
,
GPU_CORR2D_NORMALIZE_NAME
,
GPU_TEXTURES_NAME
,
GPU_RBGA_NAME
,
GPU_ROT_DERIV
,
...
...
@@ -351,16 +359,20 @@ public class GPUTileProcessor {
GPU_CONVERT_DIRECT_kernel
=
functions
[
0
];
GPU_IMCLT_ALL_kernel
=
functions
[
1
];
GPU_CORRELATE2D_kernel
=
functions
[
2
];
GPU_TEXTURES_kernel
=
functions
[
3
];
GPU_RBGA_kernel
=
functions
[
4
];
GPU_ROT_DERIV_kernel
=
functions
[
5
];
GPU_SET_TILES_OFFSETS_kernel
=
functions
[
6
];
GPU_CALC_REVERSE_DISTORTION_kernel
=
functions
[
7
];
GPU_CORR2D_COMBINE_kernel
=
functions
[
3
];
GPU_CORR2D_NORMALIZE_kernel
=
functions
[
4
];
GPU_TEXTURES_kernel
=
functions
[
5
];
GPU_RBGA_kernel
=
functions
[
6
];
GPU_ROT_DERIV_kernel
=
functions
[
7
];
GPU_SET_TILES_OFFSETS_kernel
=
functions
[
8
];
GPU_CALC_REVERSE_DISTORTION_kernel
=
functions
[
9
];
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
GPU_CONVERT_DIRECT_kernel
.
toString
());
System
.
out
.
println
(
GPU_IMCLT_ALL_kernel
.
toString
());
System
.
out
.
println
(
GPU_CORRELATE2D_kernel
.
toString
());
System
.
out
.
println
(
GPU_CORR2D_COMBINE_kernel
.
toString
());
System
.
out
.
println
(
GPU_CORR2D_NORMALIZE_kernel
.
toString
());
System
.
out
.
println
(
GPU_TEXTURES_kernel
.
toString
());
System
.
out
.
println
(
GPU_RBGA_kernel
.
toString
());
System
.
out
.
println
(
GPU_ROT_DERIV_kernel
.
toString
());
...
...
@@ -372,7 +384,7 @@ public class GPUTileProcessor {
/**
 * Returns the layer titles used when displaying 2D correlation images.
 * <p>
 * The first six titles name the individual correlation pairs; the last two
 * ("quad" and "cross") name the combined correlations. A fresh array is
 * returned on every call, so callers may modify it freely.
 *
 * @return a new 8-element array of layer titles
 */
public static String [] getCorrTitles() {
    // Only the 8-title variant is kept: a second return statement after the first
    // one is unreachable code and does not compile.
    return new String []{
            "hor-top",
            "hor-bottom",
            "vert-left",
            "vert-right",
            "diag-main",
            "diag-other",
            "quad",
            "cross"};
}
public
static
double
[][]
getCorr2DView
(
int
tilesX
,
...
...
@@ -383,28 +395,48 @@ public class GPUTileProcessor {
if
((
corr2d
==
null
)
||
(
corr2d
.
length
==
0
))
{
return
new
double
[
NUM_PAIRS
][
0
];
}
int
num_pairs
=
-
1
;
// corr2d.length;
for
(
int
n
=
0
;
n
<
indices
.
length
;
n
++)
{
int
np
=
indices
[
n
]
&
CORR_PAIRS_MASK
;
// ((1 << CORR_NTILE_SHIFT) - 1); // np should
if
(
np
>
num_pairs
)
num_pairs
=
np
;
}
num_pairs
++;
if
(
num_pairs
<
1
)
{
return
new
double
[
NUM_PAIRS
][
0
];
}
boolean
[]
bpairs
=
new
boolean
[
num_pairs
];
for
(
int
n
=
0
;
n
<
indices
.
length
;
n
++)
{
bpairs
[
indices
[
n
]
&
CORR_PAIRS_MASK
]
=
true
;
}
int
first_pair
=
-
1
;
for
(
int
i
=
0
;
(
i
<
bpairs
.
length
)
&&
(
first_pair
<
0
);
i
++)
{
if
(
bpairs
[
i
])
first_pair
=
i
;
}
int
corr_size
=
(
int
)(
Math
.
round
(
Math
.
sqrt
(
corr2d
[
0
].
length
)));
// make smaller later?
int
width
=
tilesX
*
(
corr_size
+
1
)
+
1
;
int
height
=
tilesY
*
(
corr_size
+
1
)
+
1
;
double
[][]
data
=
new
double
[
NUM_PAIRS
][];
data
[
0
]
=
new
double
[
height
*
width
];
double
[][]
data
=
new
double
[
num_pairs
][];
data
[
first_pair
]
=
new
double
[
height
*
width
];
for
(
int
ty
=
0
;
ty
<
tilesY
;
ty
++)
{
for
(
int
tx
=
0
;
tx
<
tilesX
;
tx
++)
{
for
(
int
i
=
0
;
i
<
corr_size
;
i
++)
{
for
(
int
j
=
0
;
j
<
corr_size
;
j
++)
{
data
[
0
][(
ty
*
(
corr_size
+
1
)
+
i
+
1
)
*
width
+
(
tx
*
(
corr_size
+
1
)
+
j
+
1
)]
=
Double
.
NaN
;
data
[
first_pair
][(
ty
*
(
corr_size
+
1
)
+
i
+
1
)
*
width
+
(
tx
*
(
corr_size
+
1
)
+
j
+
1
)]
=
Double
.
NaN
;
}
}
}
}
for
(
int
np
=
first_pair
+
1
;
np
<
num_pairs
;
np
++)
{
if
(
bpairs
[
np
])
{
data
[
np
]
=
data
[
first_pair
].
clone
();
}
for
(
int
np
=
1
;
np
<
NUM_PAIRS
;
np
++)
{
data
[
np
]
=
data
[
0
].
clone
();
}
for
(
int
n
=
0
;
n
<
indices
.
length
;
n
++)
{
int
nt
=
indices
[
n
]
>>
CORR_NTILE_SHIFT
;
int
np
=
indices
[
n
]
&
CORR_PAIRS_MASK
;
// ((1 << CORR_NTILE_SHIFT) - 1); // np should
assert
np
<
NUM_PAIRS
:
"invalid correllation pair"
;
assert
np
<
num_pairs
:
"invalid correllation pair"
;
int
tx
=
nt
%
tilesX
;
int
ty
=
nt
/
tilesX
;
for
(
int
i
=
0
;
i
<
corr_size
;
i
++)
{
...
...
@@ -534,10 +566,15 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_bayer
;
private
CUdeviceptr
gpu_tasks
;
private
CUdeviceptr
gpu_corrs
;
private
CUdeviceptr
gpu_corrs_td
;
private
CUdeviceptr
gpu_corrs_combo
;
private
CUdeviceptr
gpu_corrs_combo_td
;
private
CUdeviceptr
gpu_textures
;
private
CUdeviceptr
gpu_clt
;
private
CUdeviceptr
gpu_4_images
;
private
CUdeviceptr
gpu_corr_indices
;
private
CUdeviceptr
gpu_corr_combo_indices
;
private
CUdeviceptr
gpu_num_corr_tiles
;
private
CUdeviceptr
gpu_texture_indices_ovlp
;
private
CUdeviceptr
gpu_num_texture_ovlp
;
...
...
@@ -556,12 +593,18 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_num_active_tiles
;
private
int
mclt_stride
;
private
int
corr_stride
;
private
int
corr_stride_td
;
private
int
corr_stride_combo
;
private
int
corr_stride_combo_td
;
private
int
imclt_stride
;
private
int
texture_stride
;
private
int
texture_stride_rgba
;
private
int
num_task_tiles
;
private
int
num_corr_tiles
;
private
int
num_texture_tiles
;
private
int
num_pairs
=
6
;
// number of correlation pairs per tile (should match tasks)
private
int
num_corr_combo_tiles
;
private
boolean
geometry_correction_set
=
false
;
private
boolean
geometry_correction_vector_set
=
false
;
public
GpuQuad
(
...
...
@@ -598,10 +641,17 @@ public class GPUTileProcessor {
gpu_bayer
=
new
CUdeviceptr
();
gpu_tasks
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT
gpu_corrs
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.FLOAT
gpu_corrs_td
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * NUM_PAIRS * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT
gpu_corrs_combo
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * CORR_SIZE * Sizeof.FLOAT
gpu_corrs_combo_td
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT
gpu_textures
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
gpu_clt
=
new
CUdeviceptr
();
gpu_4_images
=
new
CUdeviceptr
();
gpu_corr_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.FLOAT
// May add separate gpu_corr_indices_td here
gpu_corr_combo_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 1 * Sizeof.FLOAT
gpu_num_corr_tiles
=
new
CUdeviceptr
();
// allocate 1 * Sizeof.FLOAT (a single counter value)
gpu_texture_indices_ovlp
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.FLOAT
gpu_num_texture_ovlp
=
new
CUdeviceptr
();
// 8 ints
...
...
@@ -695,6 +745,8 @@ public class GPUTileProcessor {
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
cuMemAlloc
(
gpu_corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_corr_combo_indices
,
tilesX
*
tilesY
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_num_corr_tiles
,
1
*
Sizeof
.
FLOAT
);
//#define TILESYA ((TILESY +3) & (~3))
...
...
@@ -716,6 +768,7 @@ public class GPUTileProcessor {
cuMemAlloc
(
gpu_active_tiles
,
tilesX
*
tilesY
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_num_active_tiles
,
1
*
Sizeof
.
FLOAT
);
// allocate space for pixel-domain correlations (6 per tile)
cuMemAllocPitch
(
gpu_corrs
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
...
...
@@ -723,6 +776,34 @@ public class GPUTileProcessor {
NUM_PAIRS
*
tilesX
*
tilesY
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
corr_stride
=
(
int
)(
device_stride
[
0
]
/
Sizeof
.
FLOAT
);
// allocate space for transform-domain correlations (6 per tile)
cuMemAllocPitch
(
gpu_corrs_td
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
4
*
DTT_SIZE
*
DTT_SIZE
*
Sizeof
.
FLOAT
,
// long WidthInBytes,
NUM_PAIRS
*
tilesX
*
tilesY
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
corr_stride_td
=
(
int
)(
device_stride
[
0
]
/
Sizeof
.
FLOAT
);
// allocate space for pixel-domain combined correlations (1 per tile)
cuMemAllocPitch
(
gpu_corrs_combo
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
CORR_SIZE
*
Sizeof
.
FLOAT
,
// long WidthInBytes,
tilesX
*
tilesY
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
corr_stride_combo
=
(
int
)(
device_stride
[
0
]
/
Sizeof
.
FLOAT
);
// allocate space for transform-domain combined correlations (1 per tile)
cuMemAllocPitch
(
gpu_corrs_combo_td
,
// CUdeviceptr dptr,
device_stride
,
// long[] pPitch,
4
*
DTT_SIZE
*
DTT_SIZE
*
Sizeof
.
FLOAT
,
// long WidthInBytes,
tilesX
*
tilesY
,
// long Height,
Sizeof
.
FLOAT
);
// int ElementSizeBytes)
corr_stride_combo_td
=
(
int
)(
device_stride
[
0
]
/
Sizeof
.
FLOAT
);
int
max_texture_size
=
(
num_colors
+
1
+
(
num_cams
+
num_colors
+
1
))
*
(
2
*
DTT_SIZE
)*
(
2
*
DTT_SIZE
);
cuMemAllocPitch
(
gpu_textures
,
// CUdeviceptr dptr,
...
...
@@ -1556,6 +1637,134 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
}
/**
 * Run the 2D correlation kernel on the CLT data, leaving the result in the
 * transform domain (no normalization).
 * <p>
 * Shows an error dialog and returns without doing anything when the correlation
 * kernel has not been loaded.
 *
 * @param scales R,G,B weights; at most the first three entries are used, weights
 *               that are not supplied are passed to the kernel as 0
 */
public void execCorr2D_TD(
        double [] scales)
{
    if (GPU_CORRELATE2D_kernel == null) {
        IJ.showMessage("Error", "No GPU kernel: GPU_CORRELATE2D_kernel");
        return;
    }
    final int tiles_in_row = img_width / DTT_SIZE;
    final int colors = Math.min(scales.length, 3);
    // The first weight is always read; the remaining ones default to 0 when absent.
    final float [] weights = {(float) scales[0], 0.0f, 0.0f};
    for (int c = 1; c < colors; c++) {
        weights[c] = (float) scales[c];
    }
    // A single block with a single thread is launched.
    // NOTE(review): presumably the kernel distributes the work itself - confirm against the CUDA source.
    final int [] grid  = {1, 1, 1};
    final int [] block = {1, 1, 1};
    // The argument order has to follow the kernel signature exactly.
    final Pointer args = Pointer.to(
            Pointer.to(gpu_clt),                        // CLT data
            Pointer.to(new int   [] {colors}),          // number of colors
            Pointer.to(new float [] {weights[0]}),      // scale0
            Pointer.to(new float [] {weights[1]}),      // scale1
            Pointer.to(new float [] {weights[2]}),      // scale2
            Pointer.to(new float [] {0.0f}),            // fat_zero (not used here: always 0)
            Pointer.to(gpu_tasks),                      // tile tasks
            Pointer.to(new int   [] {num_task_tiles}),  // number of tiles in the task
            Pointer.to(new int   [] {tiles_in_row}),    // tilesx
            Pointer.to(gpu_corr_indices),               // packed tile+pair indices
            Pointer.to(gpu_num_corr_tiles),             // number of correlation tiles
            Pointer.to(new int   [] {corr_stride_td}),  // output stride, in floats
            Pointer.to(new int   [] {0}),               // corr_radius slot: 0 is what the original marks as "generate TD"
            Pointer.to(gpu_corrs_td));                  // transform-domain correlation output
    cuCtxSynchronize();
    cuLaunchKernel(
            GPU_CORRELATE2D_kernel,
            grid[0],  grid[1],  grid[2],    // grid dimension
            block[0], block[1], block[2],   // block dimension
            0, null,                        // dynamic shared memory size and stream
            args, null);                    // kernel and extra parameters
    cuCtxSynchronize();
}
/**
 * Combine intra-scene correlations in transform domain (possible to accumulate more).
 * <p>
 * Reads the number of per-pair correlation tiles from the device, derives the number
 * of combined tiles from it (count / {@code num_pairs_in}), stores both
 * {@code num_pairs} and {@code num_corr_combo_tiles} in this instance and launches
 * the combine kernel. Shows an error dialog and returns without side effects when
 * the kernel has not been loaded.
 *
 * @param init_corr    true: init output to 0 before accumulating, false: add to current value
 * @param num_pairs_in typically 6 - number of pairs per tile (the tile task should have the
 *                     same number for each tile). Must be positive.
 * @param pairs_mask   selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 * @throws IllegalArgumentException if {@code num_pairs_in} is not positive
 */
public void execCorr2D_combine(
        boolean init_corr,
        int     num_pairs_in,
        int     pairs_mask)
{
    if (GPU_CORR2D_COMBINE_kernel == null) {
        IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_COMBINE_kernel");
        return;
    }
    // Validate before touching any state: a zero divisor would otherwise fail only
    // after num_pairs had already been overwritten.
    if (num_pairs_in <= 0) {
        throw new IllegalArgumentException("num_pairs_in must be positive, got " + num_pairs_in);
    }
    num_pairs = num_pairs_in;
    // The device counter is an int: read it as an int instead of going through
    // float + Float.floatToIntBits(), which canonicalizes NaN bit patterns.
    int [] num_corrs = new int [1];
    cuMemcpyDtoH(Pointer.to(num_corrs), gpu_num_corr_tiles, 1 * Sizeof.INT);
    num_corr_combo_tiles = num_corrs[0] / num_pairs; // number of combined correlation tiles
    // A single block with a single thread is launched.
    // NOTE(review): presumably the kernel distributes the work itself - confirm against the CUDA source.
    int [] GridFullWarps    = {1, 1, 1};
    int [] ThreadsFullWarps = {1, 1, 1};
    // The argument order has to follow the kernel signature exactly.
    Pointer kernelParameters = Pointer.to(
            Pointer.to(new int [] {num_corr_combo_tiles}), // number of tiles to combine
            Pointer.to(new int [] {num_pairs}),            // pairs per tile (should be the same for all tiles)
            Pointer.to(new int [] {init_corr ? 1 : 0}),    // 1 - reset output tiles to zero before accumulating
            Pointer.to(new int [] {pairs_mask}),           // selected pairs
            Pointer.to(gpu_corr_indices),                  // input: packed tile+pair
            Pointer.to(gpu_corr_combo_indices),            // output: packed tile+pairs_mask
            Pointer.to(new int [] {corr_stride_td}),       // (in floats) stride of the input TD correlations
            Pointer.to(gpu_corrs_td),                      // input correlation tiles
            Pointer.to(new int [] {corr_stride_combo_td}), // (in floats) stride of the output TD correlations
            Pointer.to(gpu_corrs_combo_td));               // combined correlation output (one per tile)
    cuCtxSynchronize();
    // Call the kernel function
    cuLaunchKernel(GPU_CORR2D_COMBINE_kernel,
            GridFullWarps[0],    GridFullWarps[1],    GridFullWarps[2],    // Grid dimension
            ThreadsFullWarps[0], ThreadsFullWarps[1], ThreadsFullWarps[2], // Block dimension
            0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
            kernelParameters, null); // Kernel- and extra parameters
    cuCtxSynchronize();
}
/**
 * Normalize the combined 2D correlations, transform and unfold them.
 * <p>
 * Operates on the {@code num_corr_combo_tiles} combined transform-domain tiles and
 * writes the result to the pixel-domain combined buffer. Shows an error dialog and
 * returns without doing anything when the normalize kernel has not been loaded.
 *
 * @param fat_zero    absolute fat zero - added to correlations before normalization
 * @param corr_radius correlation result size (maximal 7 for 15x15)
 */
public void execCorr2D_normalize(
        double fat_zero,
        int    corr_radius)
{
    if (GPU_CORR2D_NORMALIZE_kernel == null) {
        IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_NORMALIZE_kernel");
        return;
    }
    // A single block with a single thread is launched.
    // NOTE(review): presumably the kernel distributes the work itself - confirm against the CUDA source.
    final int [] grid  = {1, 1, 1};
    final int [] block = {1, 1, 1};
    // The argument order has to follow the kernel signature exactly.
    final Pointer args = Pointer.to(
            Pointer.to(new int   [] {num_corr_combo_tiles}), // number of correlation tiles to process
            Pointer.to(new int   [] {corr_stride_combo_td}), // transform-domain stride, in floats
            Pointer.to(gpu_corrs_combo_td),                  // combined transform-domain correlations (one per tile)
            Pointer.to(new int   [] {corr_stride_combo}),    // pixel-domain stride, in floats
            Pointer.to(gpu_corrs_combo),                     // correlation output data (pixel domain)
            Pointer.to(new float [] {(float) fat_zero}),     // fat zero, absolute
            Pointer.to(new int   [] {corr_radius}));         // radius of the output correlation (7 for 15x15)
    cuCtxSynchronize();
    cuLaunchKernel(
            GPU_CORR2D_NORMALIZE_kernel,
            grid[0],  grid[1],  grid[2],    // grid dimension
            block[0], block[1], block[2],   // block dimension
            0, null,                        // dynamic shared memory size and stream
            args, null);                    // kernel and extra parameters
    cuCtxSynchronize();
}
/**
* Generate combined (overlapping) texture
* @param color_weights - [3] (RGB) or [1] (mono) color weights for matching
...
...
@@ -1734,6 +1943,20 @@ public class GPUTileProcessor {
}
/**
 * Copy the indices of the combined correlation tiles from the device.
 * <p>
 * Reads {@code num_corr_combo_tiles} entries, i.e. as many as the most recent
 * {@code execCorr2D_combine()} call recorded.
 *
 * @return a new array with one packed index per combined correlation tile
 */
public int [] getCorrComboIndices()
{
    // The device buffer holds ints, so copy straight into an int array. Going through
    // a float array and Float.floatToIntBits() would collapse every bit pattern that
    // happens to be a NaN into the canonical NaN value and so corrupt such indices.
    int [] corr_combo_indices = new int [num_corr_combo_tiles];
    cuMemcpyDtoH(Pointer.to(corr_combo_indices), gpu_corr_combo_indices, num_corr_combo_tiles * Sizeof.INT);
    return corr_combo_indices;
}
public
float
[][]
getCorr2D
(
int
corr_rad
){
int
corr_size
=
(
2
*
corr_rad
+
1
)
*
(
2
*
corr_rad
+
1
);
float
[]
cpu_corrs
=
new
float
[
num_corr_tiles
*
corr_size
];
...
...
@@ -1758,6 +1981,30 @@ public class GPUTileProcessor {
return
corrs
;
}
/**
 * Copy the combined pixel-domain 2D correlations from the device.
 * <p>
 * Each device row (pitch {@code corr_stride_combo} floats) holds one tile; only the
 * first {@code (2 * corr_rad + 1)^2} floats of every row are transferred.
 *
 * @param corr_rad correlation radius; each returned tile has (2 * corr_rad + 1)^2 elements
 * @return array of {@code num_corr_combo_tiles} tiles, each a separate float array
 */
public float [][] getCorr2DCombo(
        int corr_rad)
{
    final int side      = 2 * corr_rad + 1;
    final int corr_size = side * side;
    final int num_tiles = num_corr_combo_tiles;
    // Host buffer receives the tiles tightly packed, one after another.
    final float [] packed = new float [num_tiles * corr_size];

    final CUDA_MEMCPY2D d2h = new CUDA_MEMCPY2D();
    // source: pitched device memory
    d2h.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
    d2h.srcDevice     = gpu_corrs_combo;
    d2h.srcPitch      = corr_stride_combo * Sizeof.FLOAT;
    // destination: packed host memory
    d2h.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
    d2h.dstHost       = Pointer.to(packed);
    d2h.dstPitch      = corr_size * Sizeof.FLOAT;
    // transfer geometry: one row per tile
    d2h.WidthInBytes  = corr_size * Sizeof.FLOAT;
    d2h.Height        = num_tiles;
    cuMemcpy2D(d2h);

    // Split the packed buffer into one array per tile.
    final float [][] corrs = new float [num_tiles][corr_size];
    int src_offset = 0;
    for (float [] tile : corrs) {
        System.arraycopy(packed, src_offset, tile, 0, corr_size);
        src_offset += corr_size;
    }
    return corrs;
}
//
/**
...
...
src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
View file @
fa5947b6
...
...
@@ -657,6 +657,30 @@ public class QuadCLT extends QuadCLTCPU {
clt_parameters
.
gpu_corr_rad
);
// int corr_radius
long
endCorr2d
=
System
.
nanoTime
();
// Should be done before execCorr2D_TD as corr_indices are shared to save memory
int
[]
corr_indices
=
quadCLT_main
.
getGPU
().
getCorrIndices
();
// the following is not yet shared
float
[][]
corr2D
=
quadCLT_main
.
getGPU
().
getCorr2D
(
clt_parameters
.
gpu_corr_rad
);
// int corr_rad);
// calculate correlations, keep TD
quadCLT_main
.
getGPU
().
execCorr2D_TD
(
scales
);
// double [] scales,
quadCLT_main
.
getGPU
().
execCorr2D_combine
(
// calculate cross pairs
true
,
// boolean init_corr, // initialize output tiles (false - add to current)
6
,
// int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x0f
);
// int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
quadCLT_main
.
getGPU
().
execCorr2D_normalize
(
fat_zero
,
// double fat_zero);
clt_parameters
.
gpu_corr_rad
);
// int corr_radius
// run textures
long
startTextures
=
System
.
nanoTime
();
// System.nanoTime();
boolean
calc_textures
=
clt_parameters
.
gpu_show_jtextures
;
// true;
...
...
@@ -770,16 +794,52 @@ public class QuadCLT extends QuadCLTCPU {
//Show 2D correlations
int
[]
wh
=
new
int
[
2
];
if
(
clt_parameters
.
show_corr
)
{
int
[]
corr_indices
=
quadCLT_main
.
getGPU
().
getCorrIndices
();
float
[][]
corr2D
=
quadCLT_main
.
getGPU
().
getCorr2D
(
clt_parameters
.
gpu_corr_rad
);
// int corr_rad);
// convert to 6-layer image using tasks
double
[][]
dbg_corr
=
GPUTileProcessor
.
getCorr2DView
(
int
[]
corr_quad_indices
=
quadCLT_main
.
getGPU
().
getCorrComboIndices
();
// get quad
float
[][]
corr2D_quad
=
quadCLT_main
.
getGPU
().
getCorr2DCombo
(
clt_parameters
.
gpu_corr_rad
);
// calculate and get cross here!
quadCLT_main
.
getGPU
().
execCorr2D_combine
(
// calculate cross pairs
true
,
// boolean init_corr, // initialize output tiles (false - add to current)
6
,
// int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x30
);
// int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
quadCLT_main
.
getGPU
().
execCorr2D_normalize
(
fat_zero
,
// double fat_zero);
clt_parameters
.
gpu_corr_rad
);
// int corr_radius
int
[]
corr_cross_indices
=
quadCLT_main
.
getGPU
().
getCorrComboIndices
();
// get quad
float
[][]
corr2D_cross
=
quadCLT_main
.
getGPU
().
getCorr2DCombo
(
clt_parameters
.
gpu_corr_rad
);
double
[][]
dbg_corr_pairs
=
GPUTileProcessor
.
getCorr2DView
(
tilesX
,
tilesY
,
corr_indices
,
corr2D
,
wh
);
double
[][]
dbg_corr_quad
=
GPUTileProcessor
.
getCorr2DView
(
tilesX
,
tilesY
,
corr_quad_indices
,
corr2D_quad
,
wh
);
double
[][]
dbg_corr_cross
=
GPUTileProcessor
.
getCorr2DView
(
tilesX
,
tilesY
,
corr_cross_indices
,
corr2D_cross
,
wh
);
double
[][]
dbg_corr
=
{
dbg_corr_pairs
[
0
],
dbg_corr_pairs
[
1
],
dbg_corr_pairs
[
2
],
dbg_corr_pairs
[
3
],
dbg_corr_pairs
[
4
],
dbg_corr_pairs
[
5
],
dbg_corr_quad
[
15
],
dbg_corr_cross
[
48
]
};
(
new
ShowDoubleFloatArrays
()).
showArrays
(
dbg_corr
,
wh
[
0
],
...
...
@@ -788,6 +848,7 @@ public class QuadCLT extends QuadCLTCPU {
name
+
"-CORR2D-D"
+
clt_parameters
.
disparity
,
GPUTileProcessor
.
getCorrTitles
());
}
// convert to overlapping and show
if
(
clt_parameters
.
gen_chn_img
)
{
// save and show 4-slice image
// combine to a sliced color image
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment