Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
4fb94627
Commit
4fb94627
authored
Apr 16, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
changing corr2D to CDP
parent
20df596a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
233 additions
and
322 deletions
+233
-322
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+35
-16
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+5
-4
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+131
-207
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+62
-95
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
4fb94627
...
...
@@ -190,6 +190,7 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_4_images
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_corr_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_num_corr_tiles
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_texture_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_port_offsets
=
new
CUdeviceptr
();
// allocate Quad * 2 * Sizeof.POINTER
private
CUdeviceptr
gpu_woi
=
new
CUdeviceptr
();
// 4 integers (x, y, width, height) Rectangle - in tiles
...
...
@@ -575,8 +576,8 @@ public class GPUTileProcessor {
cuMemAlloc
(
gpu_tasks
,
tilesX
*
tilesY
*
TPTASK_SIZE
*
Sizeof
.
FLOAT
);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
);
cuMemAlloc
(
gpu_
corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_
num_corr_tiles
,
1
*
Sizeof
.
FLOAT
);
//#define TILESYA ((TILESY +3) & (~3))
int
tilesYa
=
(
tilesY
+
3
)
&
~
3
;
...
...
@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
// remove later
}
public
void
execConverDirect
()
{
public
void
execConver
t
Direct
()
{
if
(
GPU_CONVERT_DIRECT_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_CONVERT_DIRECT_kernel"
);
...
...
@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
float
fscale0
=
(
float
)
scales
[
0
];
float
fscale1
=
(
num_colors
>
1
)?((
float
)
scales
[
1
]):
0.0f
;
float
fscale2
=
(
num_colors
>
2
)?((
float
)
scales
[
2
]):
0.0f
;
int
[]
GridFullWarps
=
{(
num_corr_tiles
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
CORR_THREADS_PER_TILE
,
CORR_TILES_PER_BLOCK
,
1
};
// int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
// int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
int
[]
GridFullWarps
=
{
1
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
1
,
1
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
new
int
[]
{
num_colors
}),
Pointer
.
to
(
new
float
[]
{
fscale0
}),
Pointer
.
to
(
new
float
[]
{
fscale1
}),
Pointer
.
to
(
new
float
[]
{
fscale2
}),
Pointer
.
to
(
new
float
[]
{(
float
)
fat_zero
}),
Pointer
.
to
(
new
int
[]
{
num_corr_tiles
}),
// lpf_mask
Pointer
.
to
(
gpu_corr_indices
),
Pointer
.
to
(
new
int
[]
{
corr_stride
}),
Pointer
.
to
(
new
int
[]
{
corr_radius
}),
Pointer
.
to
(
gpu_corrs
)
// lpf_mask
Pointer
.
to
(
gpu_clt
),
// float ** gpu_clt,
Pointer
.
to
(
new
int
[]
{
num_colors
}),
// int colors, // number of colors (3/1)
Pointer
.
to
(
new
float
[]
{
fscale0
}),
// float scale0, // scale for R
Pointer
.
to
(
new
float
[]
{
fscale1
}),
// float scale1, // scale for B
Pointer
.
to
(
new
float
[]
{
fscale2
}),
// float scale2, // scale for G
Pointer
.
to
(
new
float
[]
{(
float
)
fat_zero
}),
// float fat_zero, // here - absolute
Pointer
.
to
(
gpu_tasks
),
// struct tp_task * gpu_tasks,
Pointer
.
to
(
new
int
[]
{
num_task_tiles
}),
// int num_tiles // number of tiles in task
Pointer
.
to
(
gpu_corr_indices
),
// int * gpu_corr_indices, // packed tile+pair
Pointer
.
to
(
gpu_num_corr_tiles
),
// int * pnum_corr_tiles, // pointer to a number of tiles to process
Pointer
.
to
(
new
int
[]
{
corr_stride
}),
// const size_t corr_stride, // in floats
Pointer
.
to
(
new
int
[]
{
corr_radius
}),
// int corr_radius, // radius of the output correlation (7 for 15x15)
Pointer
.
to
(
gpu_corrs
)
// float * gpu_corrs); // correlation output data
);
cuCtxSynchronize
();
// Call the kernel function
...
...
@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
}
return
corrs
;
}
public
int
[]
getCorrIndices
()
{
float
[]
fnum_corrs
=
new
float
[
1
];
cuMemcpyDtoH
(
Pointer
.
to
(
fnum_corrs
),
gpu_num_corr_tiles
,
1
*
Sizeof
.
FLOAT
);
int
num_corrs
=
Float
.
floatToIntBits
(
fnum_corrs
[
0
]);
float
[]
fcorr_indices
=
new
float
[
num_corrs
];
cuMemcpyDtoH
(
Pointer
.
to
(
fcorr_indices
),
gpu_corr_indices
,
num_corrs
*
Sizeof
.
FLOAT
);
int
[]
corr_indices
=
new
int
[
num_corrs
];
for
(
int
i
=
0
;
i
<
num_corrs
;
i
++)
{
corr_indices
[
i
]
=
Float
.
floatToIntBits
(
fcorr_indices
[
i
]);
}
num_corr_tiles
=
num_corrs
;
return
corr_indices
;
}
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
4fb94627
...
...
@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
use_aux
);
// boolean use_aux)
int
[]
corr_indices
=
gPUTileProcessor
.
getCorrTasks
(
tp_tasks
);
//
int [] corr_indices = gPUTileProcessor.getCorrTasks(
//
tp_tasks);
// corr_indices array of integers to be passed to GPU
gPUTileProcessor
.
setCorrIndices
(
corr_indices
);
//
gPUTileProcessor.setCorrIndices(corr_indices);
int
[]
texture_indices
=
gPUTileProcessor
.
getTextureTasks
(
tp_tasks
);
...
...
@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
long
startDirectConvert
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execConverDirect
();
gPUTileProcessor
.
execConver
t
Direct
();
}
// run imclt;
...
...
@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
int
tilesY
=
GPUTileProcessor
.
IMG_HEIGHT
/
GPUTileProcessor
.
DTT_SIZE
;
int
[]
wh
=
new
int
[
2
];
if
(
clt_parameters
.
show_corr
)
{
int
[]
corr_indices
=
gPUTileProcessor
.
getCorrIndices
();
float
[][]
corr2D
=
gPUTileProcessor
.
getCorr2D
(
clt_parameters
.
gpu_corr_rad
);
// int corr_rad);
// convert to 6-layer image using tasks
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
4fb94627
...
...
@@ -112,20 +112,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
//56494
// struct tp_task
//#define TASK_SIZE 12
#if 0
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float xy[NUM_CAMS][2];
};
#endif
struct
CltExtra
{
float
data_x
;
// kernel data is relative to this displacement X (0.5 pixel increments)
float
data_y
;
// kernel data is relative to this displacement Y (0.5 pixel increments)
...
...
@@ -835,25 +821,24 @@ __device__ void imclt_plane( // not implemented, not used
float
*
gpu_rbg
,
// WIDTH, HEIGHT
const
size_t
dstride
);
// in floats (pixels)
#if 0
extern "C"
//extern "C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
height
);
// <= TILESY, use for faster processing of LWIR images
extern "C"
//
extern "C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
extern "C"
//
extern "C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
woi
);
// x,y,width,height of the woi
extern "C"
//
extern "C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
...
...
@@ -861,51 +846,101 @@ __global__ void gen_texture_list(
int
*
num_texture_tiles
,
// number of texture tiles to process
int
*
woi
);
// x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
//extern "C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
//extern "C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks // should be initialized to zero
__global__
void
index_correlate
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
pnum_corr_tiles
);
// pointer to the length of correlation tasks array
//extern "C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
extern
"C"
__global__
void
correlate2D_inner
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
size_t
num_corr_tiles
,
// number of correlation tiles to process
int
*
gpu_corr_indices
,
// packed tile+pair
const
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
extern "C"
__global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
const size_t dstride); // in floats (pixels)
//===========================
#endif
// ====== end of local declarations ====
extern
"C"
__global__
void
correlate2D
(
extern
"C"
__global__
void
correlate2D
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
struct
tp_task
*
gpu_tasks
,
// array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
const
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
)
// correlation output data
{
dim3
threads0
(
CONVERT_DIRECT_INDEXING_THREADS
,
1
,
1
);
dim3
blocks0
((
num_tiles
+
CONVERT_DIRECT_INDEXING_THREADS
-
1
)
>>
CONVERT_DIRECT_INDEXING_THREADS_LOG2
,
1
,
1
);
if
(
threadIdx
.
x
==
0
)
{
// only 1 thread, 1 block
*
pnum_corr_tiles
=
0
;
index_correlate
<<<
blocks0
,
threads0
>>>
(
gpu_tasks
,
// struct tp_task * gpu_tasks,
num_tiles
,
// int num_tiles, // number of tiles in task
gpu_corr_indices
,
// int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles
);
// int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize
();
dim3
threads_corr
(
CORR_THREADS_PER_TILE
,
CORR_TILES_PER_BLOCK
,
1
);
dim3
grid_corr
((
*
pnum_corr_tiles
+
CORR_TILES_PER_BLOCK
-
1
)
/
CORR_TILES_PER_BLOCK
,
1
,
1
);
correlate2D_inner
<<<
grid_corr
,
threads_corr
>>>
(
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
colors
,
// int colors, // number of colors (3/1)
scale0
,
// float scale0, // scale for R
scale1
,
// float scale1, // scale for B
scale2
,
// float scale2, // scale for G
fat_zero
,
// float fat_zero, // here - absolute
*
pnum_corr_tiles
,
// size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices
,
// int * gpu_corr_indices, // packed tile+pair
corr_stride
,
// const size_t corr_stride, // in floats
corr_radius
,
// int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs
);
// float * gpu_corrs); // correlation output data
}
}
extern
"C"
__global__
void
correlate2D_inner
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
...
...
@@ -1304,7 +1339,8 @@ __global__ void generate_RBGA(
}
// blockDim.x * gridDim.x >= width
extern
"C"
__global__
void
clear_texture_rbga
(
//extern "C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
// aligned to DTT_SIZE
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
...
...
@@ -1329,7 +1365,7 @@ extern "C" __global__ void clear_texture_rbga(
* prepare list of texture tiles, woi, and calculate orthogonal neighbors for tiles (in 4 bits of the task field
* use 4x8=32 threads,
*/
extern
"C"
//
extern "C"
__global__
void
prepare_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
...
...
@@ -1402,7 +1438,7 @@ __global__ void prepare_texture_list(
}
// blockDim.x * gridDim.x >= width
extern
"C"
//
extern "C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
...
...
@@ -1417,7 +1453,7 @@ __global__ void clear_texture_list(
}
// treads (*,1,1), blocks = (*,1,1)
extern
"C"
//
extern "C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
...
...
@@ -1437,7 +1473,7 @@ __global__ void mark_texture_tiles(
// treads (*,1,1), blocks = (*,1,1)
extern
"C"
//
extern "C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
...
...
@@ -1471,7 +1507,7 @@ __global__ void mark_texture_neighbor_tiles(
gpu_tasks
[
task_num
].
task
=
((
task
^
d
)
&
TASK_TEXTURE_BITS
)
^
task
;
}
extern
"C"
//
extern "C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
...
...
@@ -1543,32 +1579,50 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
extern
"C"
__global__
void
index_direct
(
//extern "C"
__global__
void
index_direct
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
)
// indices to gpu_tasks // should be initialized to zero
int
*
p
num_active_tiles
)
// indices to gpu_tasks // should be initialized to zero
{
int
num_tile
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
num_tile
>=
num_tiles
){
return
;
}
if
(
gpu_tasks
[
num_tile
].
task
!=
0
)
{
active_tiles
[
atomicAdd
(
num_active_tiles
,
1
)]
=
num_tile
;
active_tiles
[
atomicAdd
(
pnum_active_tiles
,
1
)]
=
num_tile
;
}
}
__global__
void
index_correlate
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task
int
*
gpu_corr_indices
,
// array of correlation tasks
int
*
pnum_corr_tiles
)
// pointer to the length of correlation tasks array
{
int
num_tile
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
if
(
num_tile
>=
num_tiles
){
return
;
}
int
cm
=
(
gpu_tasks
[
num_tile
].
task
>>
TASK_CORR_BITS
)
&
((
1
<<
NUM_PAIRS
)
-
1
);
if
(
cm
!=
0
)
{
int
nb
=
__popc
(
cm
);
// number of non-zero bits
int
indx
=
atomicAdd
(
pnum_corr_tiles
,
nb
);
int
txy
=
gpu_tasks
[
num_tile
].
txy
;
int
tx
=
txy
&
0xffff
;
int
ty
=
txy
>>
16
;
int
nt
=
ty
*
TILESX
+
tx
;
for
(
int
b
=
0
;
b
<
NUM_PAIRS
;
b
++
)
if
((
cm
&
(
1
<<
b
))
!=
0
)
{
gpu_corr_indices
[
indx
++
]
=
(
nt
<<
CORR_NTILE_SHIFT
)
|
b
;
}
}
}
extern
"C"
__global__
void
convert_direct
(
// called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
extern
"C"
__global__
void
convert_direct
(
// called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
...
...
@@ -1587,7 +1641,7 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
{
dim3
threads0
(
CONVERT_DIRECT_INDEXING_THREADS
,
1
,
1
);
dim3
blocks0
((
num_tiles
+
CONVERT_DIRECT_INDEXING_THREADS
-
1
)
>>
CONVERT_DIRECT_INDEXING_THREADS_LOG2
,
1
,
1
);
if
(
threadIdx
.
x
==
0
)
{
//
of CONVERT_DIRECT_INDEXING_THREADS
if
(
threadIdx
.
x
==
0
)
{
//
always 1
*
pnum_active_tiles
=
0
;
index_direct
<<<
blocks0
,
threads0
>>>
(
gpu_tasks
,
// struct tp_task * gpu_tasks,
...
...
@@ -1614,139 +1668,9 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
kernels_vert
);
// int kernels_vert); // varaible to swict between EO and LWIR
}
}
#if 0 // trying to keep the same order
extern "C" __global__ void index_direct_init(
int num_chunks, // number of tiles in task
struct convert_direct_tmp* tmp)
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern "C" __global__ void build_index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
extern
"C"
__global__
void
convert_correct_tiles
(
//extern "C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
...
...
src/main/resources/kernels/TileProcessor.h
View file @
4fb94627
...
...
@@ -31,82 +31,50 @@
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern
"C"
__global__
void
index_direct
(
extern
"C"
__global__
void
convert_direct
(
// called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
*
active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
num_active_tiles
);
// indices to gpu_tasks // should be initialized to zero
extern
"C"
__global__
void
convert_direct
(
// called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
,
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
,
int
*
gpu_active_tiles
,
// pointer to the calculated number of non-zero tiles
int
*
pnum_active_tiles
);
// indices to gpu_tasks
extern
"C"
__global__
void
convert_correct_tiles
(
float
**
gpu_kernel_offsets
,
// [NUM_CAMS],
float
**
gpu_kernels
,
// [NUM_CAMS],
float
**
gpu_images
,
// [NUM_CAMS],
struct
tp_task
*
gpu_tasks
,
int
*
gpu_active_tiles
,
// indices in gpu_tasks to non-zero tiles
int
num_active_tiles
,
// number of tiles in task
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
// int num_tiles, // number of tiles in task
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
height
);
// <= TILESY, use for faster processing of LWIR images
extern
"C"
__global__
void
correlate2D
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int
colors
,
// number of colors (3/1)
float
scale0
,
// scale for R
float
scale1
,
// scale for B
float
scale2
,
// scale for G
float
fat_zero
,
// here - absolute
struct
tp_task
*
gpu_tasks
,
// array of per-tile tasks (now bits 4..9 - correlation pairs)
int
num_tiles
,
// number of tiles in task
int
*
gpu_corr_indices
,
// packed tile+pair
int
*
pnum_corr_tiles
,
// pointer to a number of correlation tiles to process
const
size_t
corr_stride
,
// in floats
int
corr_radius
,
// radius of the output correlation (7 for 15x15)
float
*
gpu_corrs
);
// correlation output data
extern
"C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
extern
"C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
textures_accumulate
(
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -126,7 +94,7 @@ extern "C" __global__ void textures_accumulate(
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t
texture_rbg_stride
,
// in floats
float
*
gpu_texture_rbg
,
// (number of colors +1 + ?)*16*16 rgba texture tiles
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
...
...
@@ -154,33 +122,32 @@ extern "C" __global__ void imclt_rbg(
int
woi_theight
,
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment