Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
651267f2
Commit
651267f2
authored
Mar 12, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added variable-size 2D correlation output
parent
84eeaf35
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
109 additions
and
54 deletions
+109
-54
CLTParameters.java
src/main/java/com/elphel/imagej/cameras/CLTParameters.java
+11
-0
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+10
-8
ImageDtt.java
src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+33
-0
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+6
-4
TileProcessor.cuh
src/main/resources/TileProcessor.cuh
+49
-42
No files found.
src/main/java/com/elphel/imagej/cameras/CLTParameters.java
View file @
651267f2
...
...
@@ -772,6 +772,7 @@ public class CLTParameters {
public
double
gpu_sigma_b
=
1.1
;
public
double
gpu_sigma_g
=
0.7
;
public
double
gpu_sigma_m
=
0.7
;
public
double
gpu_sigma_rb_corr
=
0.5
;
// apply LPF after accumulating R and B correlation before G,
public
double
gpu_sigma_corr
=
0.9
;
public
double
gpu_sigma_corr_m
=
0.15
;
public
double
gpu_fatz
=
30.0
;
...
...
@@ -837,6 +838,9 @@ public class CLTParameters {
return
monochrome
?
gpu_sigma_corr_m
:
gpu_sigma_corr
;
}
public
double
getGpuCorrRBSigma
(
boolean
monochrome
)
{
return
monochrome
?
1.0
:
gpu_sigma_rb_corr
;
}
public
double
getScaleStrength
(
boolean
aux
)
{
return
aux
?
scale_strength_aux
:
scale_strength_main
;
...
...
@@ -1540,6 +1544,7 @@ public class CLTParameters {
properties
.
setProperty
(
prefix
+
"gpu_sigma_b"
,
this
.
gpu_sigma_b
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_sigma_g"
,
this
.
gpu_sigma_g
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_sigma_m"
,
this
.
gpu_sigma_m
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_sigma_rb_corr"
,
this
.
gpu_sigma_rb_corr
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_sigma_corr"
,
this
.
gpu_sigma_corr
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_sigma_corr_m"
,
this
.
gpu_sigma_corr_m
+
""
);
properties
.
setProperty
(
prefix
+
"gpu_fatz"
,
this
.
gpu_fatz
+
""
);
...
...
@@ -2305,6 +2310,7 @@ public class CLTParameters {
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_b"
)!=
null
)
this
.
gpu_sigma_b
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_b"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_g"
)!=
null
)
this
.
gpu_sigma_g
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_g"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_m"
)!=
null
)
this
.
gpu_sigma_m
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_m"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_rb_corr"
)!=
null
)
this
.
gpu_sigma_rb_corr
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_rb_corr"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_corr"
)!=
null
)
this
.
gpu_sigma_corr
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_corr"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_corr_m"
)!=
null
)
this
.
gpu_sigma_corr_m
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_sigma_corr_m"
));
if
(
properties
.
getProperty
(
prefix
+
"gpu_fatz"
)!=
null
)
this
.
gpu_fatz
=
Double
.
parseDouble
(
properties
.
getProperty
(
prefix
+
"gpu_fatz"
));
...
...
@@ -3211,6 +3217,7 @@ public class CLTParameters {
"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b"
);
gd
.
addNumericField
(
"Correlation weight B"
,
this
.
gpu_weight_b
,
4
,
6
,
""
,
"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b"
);
gd
.
addMessage
(
"--- LPF sigmas used for images/textures ---"
);
gd
.
addNumericField
(
"Color LPF sigma R"
,
this
.
gpu_sigma_r
,
4
,
6
,
"pix"
,
"LPF sigma to process color components during aberration correction"
);
gd
.
addNumericField
(
"Color LPF sigma B"
,
this
.
gpu_sigma_b
,
4
,
6
,
"pix"
,
...
...
@@ -3219,6 +3226,9 @@ public class CLTParameters {
"LPF sigma to process color components during aberration correction"
);
gd
.
addNumericField
(
"Monochrome LPF sigma"
,
this
.
gpu_sigma_m
,
4
,
6
,
"pix"
,
"LPF sigma to process monochrome (e.g.LWIR) during aberration correction"
);
gd
.
addMessage
(
"--- LPF sigmas used for 2D phase correlation ---"
);
gd
.
addNumericField
(
"LPF sigma Red/Blue (extra)"
,
this
.
gpu_sigma_rb_corr
,
4
,
6
,
"pix"
,
"LPF sigma to apply to the composite R+B 2D correlation before adding G (next LPF will be applied to R+B+G correlation"
);
gd
.
addNumericField
(
"LPF sigma for correlation, color"
,
this
.
gpu_sigma_corr
,
4
,
6
,
"pix"
,
"LPF sigma to apply to the composite 2D correlation for RGB images"
);
gd
.
addNumericField
(
"LPF sigma for correlation, mono"
,
this
.
gpu_sigma_corr_m
,
4
,
6
,
"pix"
,
...
...
@@ -3964,6 +3974,7 @@ public class CLTParameters {
this
.
gpu_sigma_b
=
gd
.
getNextNumber
();
this
.
gpu_sigma_g
=
gd
.
getNextNumber
();
this
.
gpu_sigma_m
=
gd
.
getNextNumber
();
this
.
gpu_sigma_rb_corr
=
gd
.
getNextNumber
();
this
.
gpu_sigma_corr
=
gd
.
getNextNumber
();
this
.
gpu_sigma_corr_m
=
gd
.
getNextNumber
();
this
.
gpu_fatz
=
gd
.
getNextNumber
();
...
...
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
651267f2
...
...
@@ -765,7 +765,8 @@ public class GPUTileProcessor {
public
void
execCorr2D
(
double
[]
scales
,
double
fat_zero
)
{
double
fat_zero
,
int
corr_radius
)
{
if
(
GPU_CORRELATE2D_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CORRELATE2D_kernel"
);
...
...
@@ -788,6 +789,7 @@ public class GPUTileProcessor {
Pointer
.
to
(
new
int
[]
{
num_corr_tiles
}),
// lpf_mask
Pointer
.
to
(
gpu_corr_indices
),
Pointer
.
to
(
new
int
[]
{
corr_stride
}),
Pointer
.
to
(
new
int
[]
{
corr_radius
}),
Pointer
.
to
(
gpu_corrs
)
// lpf_mask
);
cuCtxSynchronize
();
...
...
@@ -800,9 +802,9 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
}
public
float
[][]
getCorr2D
(){
float
[]
cpu_corrs
=
new
float
[
num_corr_tiles
*
CORR_SIZE
]
;
public
float
[][]
getCorr2D
(
int
corr_rad
){
int
corr_size
=
(
2
*
corr_rad
+
1
)
*
(
2
*
corr_rad
+
1
)
;
float
[]
cpu_corrs
=
new
float
[
num_corr_tiles
*
corr_size
];
CUDA_MEMCPY2D
copyD2H
=
new
CUDA_MEMCPY2D
();
copyD2H
.
srcMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_DEVICE
;
copyD2H
.
srcDevice
=
gpu_corrs
;
...
...
@@ -810,16 +812,16 @@ public class GPUTileProcessor {
copyD2H
.
dstMemoryType
=
CUmemorytype
.
CU_MEMORYTYPE_HOST
;
copyD2H
.
dstHost
=
Pointer
.
to
(
cpu_corrs
);
copyD2H
.
dstPitch
=
CORR_SIZE
*
Sizeof
.
FLOAT
;
copyD2H
.
dstPitch
=
corr_size
*
Sizeof
.
FLOAT
;
copyD2H
.
WidthInBytes
=
CORR_SIZE
*
Sizeof
.
FLOAT
;
copyD2H
.
WidthInBytes
=
corr_size
*
Sizeof
.
FLOAT
;
copyD2H
.
Height
=
num_corr_tiles
;
cuMemcpy2D
(
copyD2H
);
// run copy
float
[][]
corrs
=
new
float
[
num_corr_tiles
][
CORR_SIZE
];
float
[][]
corrs
=
new
float
[
num_corr_tiles
][
corr_size
];
for
(
int
ncorr
=
0
;
ncorr
<
num_corr_tiles
;
ncorr
++)
{
System
.
arraycopy
(
cpu_corrs
,
ncorr
*
CORR_SIZE
,
corrs
[
ncorr
],
0
,
CORR_SIZE
);
System
.
arraycopy
(
cpu_corrs
,
ncorr
*
corr_size
,
corrs
[
ncorr
],
0
,
corr_size
);
}
return
corrs
;
}
...
...
src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
View file @
651267f2
...
...
@@ -9113,11 +9113,15 @@ public class ImageDtt {
final
int
transform_len
=
transform_size
*
transform_size
;
final
double
afat_zero2
=
clt_parameters
.
getGpuFatZero
(
is_mono
);
// ? clt_parameters.gpu_fatz_m : clt_parameters.gpu_fatz;
final
double
sigma_corr
=
clt_parameters
.
getGpuCorrSigma
(
is_mono
);
// ? clt_parameters.gpu_sigma_corr_m : clt_parameters.gpu_sigma_corr;
final
double
sigma_rb_corr
=
clt_parameters
.
getGpuCorrRBSigma
(
is_mono
);
final
int
corr_radius
=
clt_parameters
.
gpu_corr_rad
;
// final double[] lpf = getLpf(sigma_corr);
final
double
[]
lpf_fd
=
doubleGetCltLpfFd
(
sigma_corr
);
final
double
[]
lpf_rb_fd
=
doubleGetCltLpfFd
(
sigma_rb_corr
);
for
(
int
ithread
=
0
;
ithread
<
threads
.
length
;
ithread
++)
{
threads
[
ithread
]
=
new
Thread
()
{
@Override
...
...
@@ -9199,6 +9203,35 @@ public class ImageDtt {
}
}
}
//c
if
(
c
==
1
)
{
// after red and blue, color only
if
(
lpf_rb_fd
!=
null
)
{
for
(
int
n
=
0
;
n
<
4
;
n
++)
{
for
(
int
i
=
0
;
i
<
transform_len
;
i
++)
{
tcorr
[
n
][
i
]
*=
lpf_rb_fd
[
i
];
}
}
if
(
debug_gpu
&&
debug_tile
)
{
System
.
out
.
println
(
"=== LPF-RB for CORRELATION ==="
);
for
(
int
i
=
0
;
i
<
transform_size
;
i
++)
{
for
(
int
j
=
0
;
j
<
transform_size
;
j
++)
{
System
.
out
.
print
(
String
.
format
(
"%10.5f "
,
lpf_rb_fd
[
transform_size
*
i
+
j
]));
}
System
.
out
.
println
();
}
System
.
out
.
println
(
"=== R+B LPF-ed CORRELATION ==="
);
for
(
int
dct_mode
=
0
;
dct_mode
<
4
;
dct_mode
++)
{
System
.
out
.
println
(
"------dct_mode="
+
dct_mode
);
for
(
int
i
=
0
;
i
<
transform_size
;
i
++)
{
for
(
int
j
=
0
;
j
<
transform_size
;
j
++)
{
System
.
out
.
print
(
String
.
format
(
"%10.3f "
,
tcorr
[
dct_mode
][
transform_size
*
i
+
j
]));
}
System
.
out
.
println
();
}
}
}
}
}
}
}
corrs2d
[
n_corr
]
=
correlation2d
.
normalizeConvertCorr
(
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
651267f2
...
...
@@ -1994,7 +1994,8 @@ public class TwoQuadCLT {
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
gPUTileProcessor
.
execCorr2D
(
scales
,
// double [] scales,
fat_zero
);
// double fat_zero);
fat_zero
,
// double fat_zero);
clt_parameters
.
gpu_corr_rad
);
// int corr_radius
long
endCorr2d
=
System
.
nanoTime
();
...
...
@@ -2037,12 +2038,13 @@ public class TwoQuadCLT {
debugLevel
);
}
float
[][]
corr2D
=
gPUTileProcessor
.
getCorr2D
();
float
[][]
corr2D
=
gPUTileProcessor
.
getCorr2D
(
clt_parameters
.
gpu_corr_rad
);
// int corr_rad);
// convert to 6-layer image using tasks
int
tilesX
=
GPUTileProcessor
.
IMG_WIDTH
/
GPUTileProcessor
.
DTT_SIZE
;
int
tilesY
=
GPUTileProcessor
.
IMG_HEIGHT
/
GPUTileProcessor
.
DTT_SIZE
;
int
[]
wh
=
new
int
[
2
];
double
[][]
dbg_corr
=
g
PUTileProcessor
.
getCorr2DView
(
double
[][]
dbg_corr
=
G
PUTileProcessor
.
getCorr2DView
(
tilesX
,
tilesY
,
corr_indices
,
...
...
@@ -2054,7 +2056,7 @@ public class TwoQuadCLT {
wh
[
1
],
true
,
"CORR2D"
,
g
PUTileProcessor
.
getCorrTitles
());
G
PUTileProcessor
.
getCorrTitles
());
if
(
clt_parameters
.
gen_chn_img
)
{
// combine to a sliced color image
...
...
src/main/resources/TileProcessor.cuh
View file @
651267f2
...
...
@@ -57,7 +57,8 @@
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_PAIR_SHIFT 8 // 8 lower bits - number of a pair, other bits tile number
#define TASK_CORR_BITS 4
#define CORR_OUT_RAD 7
#define CORR_OUT_RAD 4
//7
#endif
...
...
@@ -376,6 +377,7 @@ __device__ void debug_print_mclt(
float
*
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const
int
color
);
__device__
void
debug_print_corr_15x15
(
int
corr_radius
,
float
*
mclt_tile
,
//DTT_SIZE2M1 x DTT_SIZE2M1
const
int
color
);
// Fractional pixel shift (phase rotation), horizontal. In-place.
...
...
@@ -400,6 +402,7 @@ __device__ void normalizeTileAmplitude(
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
fat_zero
);
// fat zero is absolute, scale it outside
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
);
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__device__
void
imclt
(
// implemented, used // why is it twice?
...
...
@@ -426,6 +429,7 @@ __global__ void correlate2D(
size_t
num_corr_tiles
,
// number of correlation tiles to process
int
*
gpu_corr_indices
,
// packed tile+pair
const
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
)
// correlation output data
{
/// int thr3 = threadIdx.x >> 3; // now zero?
...
...
@@ -559,27 +563,17 @@ __global__ void correlate2D(
#endif
#endif
// now new part - need to transform with DCT-II and make 15x15
/*
// quadrant 0 dct_ii hor, dct_ii vert,
// quadrant 1 dct_ii hor, dst_ii vert,
// quadrant 2 dst_ii hor, dct_ii vert,
// quadrant 3 dst_ii hor, dst_ii vert,
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
//
hor pass
//
vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for
(
int
q
=
0
;
q
<
4
;
q
++
){
int
is_sin
=
(
q
>>
1
)
&
1
;
// int is_sin = q & 1;
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
dttii_shared_mem_nonortho
(
clt_corr
+
q
*
(
DTT_SIZE1
*
DTT_SIZE
)
+
threadIdx
.
x
,
DTT_SIZE1
,
is_sin
);
// vertical pass, thread is column
}
__syncthreads
();
...
...
@@ -593,26 +587,25 @@ Java code:
#endif
#endif
//
vert pass
//
hor pass, corresponding to vert pass in Java
for
(
int
q
=
0
;
q
<
4
;
q
++
){
int
is_sin
=
q
&
1
;
// int is_sin = (q >> 1) & 1;
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
dttii_shared_mem_nonortho
(
clt_corr
+
(
q
*
DTT_SIZE
+
threadIdx
.
x
)
*
DTT_SIZE1
,
1
,
is_sin
);
// horizontal pass, tread is row
}
__syncthreads
();
#ifdef DBG_TILE
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
printf
(
"
\n
correlate2D AFTER HOSIZONTAL (VERTICAL) PASS
\n
"
);
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
4
)){
printf
(
"
\n
correlate2D AFTER HOSIZONTAL (VERTICAL) PASS
, corr_radius=%d
\n
"
,
corr_radius
);
debug_print_clt1
(
clt_corr
,
-
1
,
0xf
);
}
__syncthreads
();
// __syncwarp();
#endif
#endif
corrUnfoldTile
(
corr_radius
,
// int corr_radius,
(
float
*
)
clt_corr
,
// float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(
float
*
)
mclt_corr
);
// float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
...
...
@@ -621,23 +614,28 @@ Java code:
#ifdef DBG_TILE
#ifdef DEBUG6
if
((
tile_num
==
DBG_TILE
)
&&
(
corr_pair
==
0
)
&&
(
threadIdx
.
x
==
0
)){
printf
(
"
\n
correlate2D after UNFOLD
\n
"
);
debug_print_corr_15x15
(
mclt_corr
,
-
1
);
printf
(
"
\n
correlate2D after UNFOL, corr_radius=%d
\n
"
,
corr_radius
);
debug_print_corr_15x15
(
corr_radius
,
// int corr_radius,
mclt_corr
,
-
1
);
}
__syncthreads
();
// __syncwarp();
#endif
#endif
// copy 15x15 tile to main memory
// searching for bug. Uncomment later
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
int
size2r1
=
2
*
corr_radius
+
1
;
int
len2r1x2r1
=
size2r1
*
size2r1
;
int
corr_tile_offset
=
+
corr_stride
*
corr_num
;
float
*
mem_corr
=
gpu_corrs
+
corr_tile_offset
;
//CORR_THREADS_PER_TILE
// int offs = threadIdx.x;
#pragma unroll
for
(
int
offs
=
threadIdx
.
x
;
offs
<
DTT_SIZE2M1
*
DTT_SIZE2M1
;
offs
+=
CORR_THREADS_PER_TILE
){
// variable number of cycles per thread
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
for
(
int
offs
=
threadIdx
.
x
;
offs
<
len2r1x2r1
;
offs
+=
CORR_THREADS_PER_TILE
){
// variable number of cycles per thread
mem_corr
[
offs
]
=
mclt_corr
[
offs
];
}
__syncthreads
();
#ifdef DBG_TILE
#ifdef DEBUG6
...
...
@@ -969,27 +967,36 @@ Converted from DttRad2.java:443
)
*/
__device__
void
corrUnfoldTile
(
int
corr_radius
,
float
*
qdata0
,
// [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float
*
rslt
)
// [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
const
int
rslt_base_index
=
DTT_SIZE2M1
*
(
DTT_SIZE
)
-
DTT_SIZE
;
int
size2r1
=
2
*
corr_radius
+
1
;
// 15
int
crp1
=
corr_radius
+
1
;
//8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int
rslt_base_index
=
size2r1
*
crp1
-
crp1
;
// offset of the center
float
*
qdata1
=
qdata0
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata2
=
qdata1
+
(
DTT_SIZE
*
DTT_SIZE1
);
float
*
qdata3
=
qdata2
+
(
DTT_SIZE
*
DTT_SIZE1
);
int
i
=
threadIdx
.
x
;
if
(
i
>
corr_radius
)
{
return
;
// not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float
corr_pixscale
=
0.25
f
;
int
i_transform_size
=
i
*
DTT_SIZE1
;
// used to address source rows which are 9 long
int
im1_transform_size
=
i_transform_size
-
DTT_SIZE1
;
// negative for i = 0, use only after divergence
int
rslt_row_offs
=
i
*
DTT_SIZE2M1
;
/// int rslt_row_offs = i * DTT_SIZE2M1;
int
rslt_row_offs
=
i
*
size2r1
;
int
rslt_base_index_p
=
rslt_base_index
+
rslt_row_offs
;
// i * DTT_SIZE2M1;
int
rslt_base_index_m
=
rslt_base_index
-
rslt_row_offs
;
// i * DTT_SIZE2M1;
rslt
[
rslt_base_index_p
]
=
corr_pixscale
*
qdata0
[
i_transform_size
];
// incomplete, will only be used for thread i=0
rslt
[
rslt_base_index_m
]
=
rslt
[
rslt_base_index_p
];
// nop for i=0 incomplete, will only be used for thread i=0
for
(
int
j
=
1
;
j
<
DTT_SIZE
;
j
++
)
{
/// for (int j = 1; j < DTT_SIZE; j++) {
for
(
int
j
=
1
;
j
<=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
/// int rslt_base_index_mp = rslt_base_index_m + j;
/// int rslt_base_index_mm = rslt_base_index_m - j;
rslt
[
rslt_base_index_pp
]
=
corr_pixscale
*
(
qdata0
[
i_transform_size
+
j
]
+
qdata1
[
i_transform_size
+
j
-
1
]);
// incomplete, will only be used for thread i=0
...
...
@@ -1000,12 +1007,11 @@ __device__ void corrUnfoldTile(
if
(
i
==
0
)
{
return
;
}
/// int im1 = i-1;
im1_transform_size
=
i_transform_size
-
DTT_SIZE1
;
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float
d
=
corr_pixscale
*
qdata2
[
im1_transform_size
];
rslt
[
rslt_base_index_p
]
+=
d
;
rslt
[
rslt_base_index_m
]
-=
d
;
for
(
int
j
=
1
;
j
<
DTT_SIZE
;
j
++
)
{
for
(
int
j
=
1
;
j
<
=
corr_radius
;
j
++
)
{
int
rslt_base_index_pp
=
rslt_base_index_p
+
j
;
int
rslt_base_index_pm
=
rslt_base_index_p
-
j
;
int
rslt_base_index_mp
=
rslt_base_index_m
+
j
;
...
...
@@ -1068,14 +1074,15 @@ __device__ void debug_print_mclt(
}
__device__
void
debug_print_corr_15x15
(
int
corr_radius
,
float
*
mclt_tile
,
//DTT_SIZE2M1 x DTT_SIZE2M1
const
int
color
)
{
int
size2r1
=
2
*
corr_radius
+
1
;
if
(
color
>=
0
)
printf
(
"----------- Color = %d -----------
\n
"
,
color
);
for
(
int
dbg_row
=
0
;
dbg_row
<
DTT_SIZE2M
1
;
dbg_row
++
){
for
(
int
dbg_col
=
0
;
dbg_col
<
DTT_SIZE2M
1
;
dbg_col
++
){
printf
(
"%10.5f "
,
mclt_tile
[
dbg_row
*
DTT_SIZE2M
1
+
dbg_col
]);
for
(
int
dbg_row
=
0
;
dbg_row
<
size2r
1
;
dbg_row
++
){
for
(
int
dbg_col
=
0
;
dbg_col
<
size2r
1
;
dbg_col
++
){
printf
(
"%10.5f "
,
mclt_tile
[
dbg_row
*
size2r
1
+
dbg_col
]);
}
printf
(
"
\n
"
);
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment