Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
4fb94627
Commit
4fb94627
authored
Apr 16, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changing corr2D to CDP
parent
20df596a
Changes
4
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
233 additions
and
322 deletions
+233
-322
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+35
-16
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+5
-4
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+131
-207
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+62
-95
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
4fb94627
...
...
@@ -190,6 +190,7 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_4_images
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_corr_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_num_corr_tiles
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_texture_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_port_offsets
=
new
CUdeviceptr
();
// allocate Quad * 2 * Sizeof.POINTER
private
CUdeviceptr
gpu_woi
=
new
CUdeviceptr
();
// 4 integers (x, y, width, height) Rectangle - in tiles
...
...
@@ -575,8 +576,8 @@ public class GPUTileProcessor {
cuMemAlloc
(
gpu_tasks
,
tilesX
*
tilesY
*
TPTASK_SIZE
*
Sizeof
.
FLOAT
);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
);
cuMemAlloc
(
gpu_
corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_
num_corr_tiles
,
1
*
Sizeof
.
FLOAT
);
//#define TILESYA ((TILESY +3) & (~3))
int
tilesYa
=
(
tilesY
+
3
)
&
~
3
;
...
...
@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
// remove later
}
public
void
execConverDirect
()
{
public
void
execConver
t
Direct
()
{
if
(
GPU_CONVERT_DIRECT_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CONVERT_DIRECT_kernel"
);
...
...
@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
float
fscale0
=
(
float
)
scales
[
0
];
float
fscale1
=
(
num_colors
>
1
)?((
float
)
scales
[
1
]):
0.0f
;
float
fscale2
=
(
num_colors
>
2
)?((
float
)
scales
[
2
]):
0.0f
;
int
[]
GridFullWarps
=
{(
num_corr_tiles
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
CORR_THREADS_PER_TILE
,
CORR_TILES_PER_BLOCK
,
1
};
// int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
// int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
int
[]
GridFullWarps
=
{
1
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
1
,
1
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
new
int
[]
{
num_colors
}),
Pointer
.
to
(
new
float
[]
{
fscale0
}),
Pointer
.
to
(
new
float
[]
{
fscale1
}),
Pointer
.
to
(
new
float
[]
{
fscale2
}),
Pointer
.
to
(
new
float
[]
{(
float
)
fat_zero
}),
Pointer
.
to
(
new
int
[]
{
num_corr_tiles
}),
// lpf_mask
Pointer
.
to
(
gpu_corr_indices
),
Pointer
.
to
(
new
int
[]
{
corr_stride
}),
Pointer
.
to
(
new
int
[]
{
corr_radius
}),
Pointer
.
to
(
gpu_corrs
)
// lpf_mask
Pointer
.
to
(
gpu_clt
),
// float ** gpu_clt,
Pointer
.
to
(
new
int
[]
{
num_colors
}),
// int colors, // number of colors (3/1)
Pointer
.
to
(
new
float
[]
{
fscale0
}),
// float scale0, // scale for R
Pointer
.
to
(
new
float
[]
{
fscale1
}),
// float scale1, // scale for B
Pointer
.
to
(
new
float
[]
{
fscale2
}),
// float scale2, // scale for G
Pointer
.
to
(
new
float
[]
{(
float
)
fat_zero
}),
// float fat_zero, // here - absolute
Pointer
.
to
(
gpu_tasks
),
// struct tp_task * gpu_tasks,
Pointer
.
to
(
new
int
[]
{
num_task_tiles
}),
// int num_tiles // number of tiles in task
Pointer
.
to
(
gpu_corr_indices
),
// int * gpu_corr_indices, // packed tile+pair
Pointer
.
to
(
gpu_num_corr_tiles
),
// int * pnum_corr_tiles, // pointer to a number of tiles to process
Pointer
.
to
(
new
int
[]
{
corr_stride
}),
// const size_t corr_stride, // in floats
Pointer
.
to
(
new
int
[]
{
corr_radius
}),
// int corr_radius, // radius of the output correlation (7 for 15x15)
Pointer
.
to
(
gpu_corrs
)
// float * gpu_corrs); // correlation output data
);
cuCtxSynchronize
();
// Call the kernel function
...
...
@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
}
return
corrs
;
}
public
int
[]
getCorrIndices
()
{
float
[]
fnum_corrs
=
new
float
[
1
];
cuMemcpyDtoH
(
Pointer
.
to
(
fnum_corrs
),
gpu_num_corr_tiles
,
1
*
Sizeof
.
FLOAT
);
int
num_corrs
=
Float
.
floatToIntBits
(
fnum_corrs
[
0
]);
float
[]
fcorr_indices
=
new
float
[
num_corrs
];
cuMemcpyDtoH
(
Pointer
.
to
(
fcorr_indices
),
gpu_corr_indices
,
num_corrs
*
Sizeof
.
FLOAT
);
int
[]
corr_indices
=
new
int
[
num_corrs
];
for
(
int
i
=
0
;
i
<
num_corrs
;
i
++)
{
corr_indices
[
i
]
=
Float
.
floatToIntBits
(
fcorr_indices
[
i
]);
}
num_corr_tiles
=
num_corrs
;
return
corr_indices
;
}
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
4fb94627
...
...
@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
use_aux
);
// boolean use_aux)
int
[]
corr_indices
=
gPUTileProcessor
.
getCorrTasks
(
tp_tasks
);
//
int [] corr_indices = gPUTileProcessor.getCorrTasks(
//
tp_tasks);
// corr_indices array of integers to be passed to GPU
gPUTileProcessor
.
setCorrIndices
(
corr_indices
);
//
gPUTileProcessor.setCorrIndices(corr_indices);
int
[]
texture_indices
=
gPUTileProcessor
.
getTextureTasks
(
tp_tasks
);
...
...
@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
long
startDirectConvert
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execConverDirect
();
gPUTileProcessor
.
execConver
t
Direct
();
}
// run imclt;
...
...
@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
int
tilesY
=
GPUTileProcessor
.
IMG_HEIGHT
/
GPUTileProcessor
.
DTT_SIZE
;
int
[]
wh
=
new
int
[
2
];
if
(
clt_parameters
.
show_corr
)
{
int
[]
corr_indices
=
gPUTileProcessor
.
getCorrIndices
();
float
[][]
corr2D
=
gPUTileProcessor
.
getCorr2D
(
clt_parameters
.
gpu_corr_rad
);
// int corr_rad);
// convert to 6-layer image using tasks
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
4fb94627
This diff is collapsed.
Click to expand it.
src/main/resources/kernels/TileProcessor.h
View file @
4fb94627
...
...
@@ -31,24 +31,19 @@
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern
"C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
);
// indices to gpu_tasks // should be initialized to zero
extern
"C"
__global__
void
convert_direct
(
// called with a single block,
CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
extern
"C"
__global__
void
convert_direct
(
// called with a single block,
single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
...
...
@@ -64,49 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
extern
"C"
__global__
void
correlate2D
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
struct
tp_task
*
gpu_tasks
,
// array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
const
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
height
);
// <= TILESY, use for faster processing of LWIR images
extern
"C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
extern
"C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
textures_accumulate
(
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -126,7 +94,7 @@ extern "C" __global__ void textures_accumulate(
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t
texture_rbg_stride
,
// in floats
float
*
gpu_texture_rbg
,
// (number of colors +1 + ?)*16*16 rgba texture tiles
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
...
...
@@ -154,18 +122,17 @@ extern "C" __global__ void imclt_rbg(
int
woi_theight
,
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment