Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
975dadb4
Commit
975dadb4
authored
Apr 04, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated to new GPU kernels
parent
907cda8d
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
834 additions
and
26 deletions
+834
-26
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+98
-8
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+736
-18
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
975dadb4
...
...
@@ -91,7 +91,10 @@ public class GPUTileProcessor {
static
String
GPU_CONVERT_CORRECT_TILES_NAME
=
"convert_correct_tiles"
;
// name in C code
static
String
GPU_IMCLT_RBG_NAME
=
"imclt_rbg"
;
// name in C code
static
String
GPU_CORRELATE2D_NAME
=
"correlate2D"
;
// name in C code
static
String
GPU_TEXTURES_NAME
=
"textures_gen"
;
// name in C code
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static
String
GPU_TEXTURES_NAME
=
"textures_accumulate"
;
// name in C code
// pass some defines to gpu source code with #ifdef JCUDA
public
static
int
DTT_SIZE
=
8
;
static
int
THREADSX
=
DTT_SIZE
;
...
...
@@ -120,13 +123,20 @@ public class GPUTileProcessor {
public
static
int
CORR_PAIRS_MASK
=
0x3f
;
// lower bits used to address correlation pair for the selected tile
public
static
int
CORR_TEXTURE_BIT
=
7
;
// bit 7 used to request texture for the tile
public
static
int
TASK_CORR_BITS
=
4
;
// start of pair mask
public
static
int
TASK_TEXTURE_BIT
=
3
;
// bit to request texture calculation int task field of struct tp_task
public
static
int
TASK_TEXTURE_N_BIT
=
0
;
// Texture with North neighbor
public
static
int
TASK_TEXTURE_E_BIT
=
1
;
// Texture with East neighbor
public
static
int
TASK_TEXTURE_S_BIT
=
2
;
// Texture with South neighbor
public
static
int
TASK_TEXTURE_W_BIT
=
3
;
// Texture with West neighbor
// public static int TASK_TEXTURE_BIT = 3; // bit to request texture calculation int task field of struct tp_task
public
static
int
LIST_TEXTURE_BIT
=
7
;
// bit to request texture calculation
public
static
int
CORR_OUT_RAD
=
4
;
// output radius of the correlations (implemented)
public
static
double
FAT_ZERO_WEIGHT
=
0.0001
;
// add to port weights to avoid nan
public
static
int
THREADS_DYNAMIC_BITS
=
5
;
// treads in block for CDP creation of the texture list
public
static
int
TASK_TEXTURE_BITS
=
((
1
<<
TASK_TEXTURE_N_BIT
)
|
(
1
<<
TASK_TEXTURE_E_BIT
)
|
(
1
<<
TASK_TEXTURE_S_BIT
)
|
(
1
<<
TASK_TEXTURE_W_BIT
));
int
DTTTEST_BLOCK_WIDTH
=
32
;
// may be read from the source code
int
DTTTEST_BLOCK_HEIGHT
=
16
;
// may be read from the source code
...
...
@@ -330,7 +340,10 @@ public class GPUTileProcessor {
"#define CORR_PAIRS_MASK "
+
CORR_PAIRS_MASK
+
"\n"
+
"#define CORR_TEXTURE_BIT "
+
CORR_TEXTURE_BIT
+
"\n"
+
"#define TASK_CORR_BITS "
+
TASK_CORR_BITS
+
"\n"
+
"#define TASK_TEXTURE_BIT "
+
TASK_TEXTURE_BIT
+
"\n"
+
"#define TASK_TEXTURE_N_BIT "
+
TASK_TEXTURE_N_BIT
+
"\n"
+
"#define TASK_TEXTURE_E_BIT "
+
TASK_TEXTURE_E_BIT
+
"\n"
+
"#define TASK_TEXTURE_S_BIT "
+
TASK_TEXTURE_S_BIT
+
"\n"
+
"#define TASK_TEXTURE_W_BIT "
+
TASK_TEXTURE_W_BIT
+
"\n"
+
"#define LIST_TEXTURE_BIT "
+
LIST_TEXTURE_BIT
+
"\n"
+
"#define CORR_OUT_RAD "
+
CORR_OUT_RAD
+
"\n"
+
"#define FAT_ZERO_WEIGHT "
+
FAT_ZERO_WEIGHT
+
"\n"
+
...
...
@@ -439,7 +452,11 @@ public class GPUTileProcessor {
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
cuMemAlloc
(
gpu_corr_indices
,
tilesX
*
tilesY
*
NUM_PAIRS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_texture_indices
,
tilesX
*
tilesY
*
Sizeof
.
POINTER
);
//#define TILESYA ((TILESY +3) & (~3))
int
tilesYa
=
(
tilesY
+
3
)
&
~
3
;
// cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
cuMemAlloc
(
gpu_texture_indices
,
tilesX
*
tilesYa
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_port_offsets
,
NUM_CAMS
*
2
*
Sizeof
.
POINTER
);
...
...
@@ -711,7 +728,7 @@ public class GPUTileProcessor {
int
tilesX
=
IMG_WIDTH
/
DTT_SIZE
;
int
num_textures
=
0
;
for
(
TpTask
tt:
tp_tasks
)
{
if
((
tt
.
task
&
TASK_TEXTURE_BIT
)
!=
0
)
{
if
((
tt
.
task
&
TASK_TEXTURE_BIT
S
)
!=
0
)
{
num_textures
++;
}
}
...
...
@@ -720,7 +737,7 @@ public class GPUTileProcessor {
num_textures
=
0
;
int
b
=
(
1
<<
LIST_TEXTURE_BIT
);
for
(
TpTask
tt:
tp_tasks
)
{
if
((
tt
.
task
&
TASK_TEXTURE_BIT
)
!=
0
)
{
if
((
tt
.
task
&
TASK_TEXTURE_BIT
S
)
!=
0
)
{
int
tile
=
(
tt
.
ty
*
tilesX
+
tt
.
tx
);
iarr
[
num_textures
++]
=
(
tile
<<
CORR_NTILE_SHIFT
)
|
b
;
}
...
...
@@ -901,7 +918,7 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
}
public
void
execTextures
(
public
void
execTextures
Old
(
double
[][]
port_offsets
,
double
[]
color_weights
,
boolean
is_lwir
,
...
...
@@ -966,6 +983,75 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
}
public
void
execTextures
(
double
[][]
port_offsets
,
double
[]
color_weights
,
boolean
is_lwir
,
double
min_shot
,
// 10.0
double
scale_shot
,
// 3.0
double
diff_sigma
,
// pixel value/pixel change
double
diff_threshold
,
// pixel value/pixel change
double
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean
dust_remove
,
boolean
keep_weights
)
{
if
(
GPU_TEXTURES_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_TEXTURES_kernel"
);
return
;
}
float
[]
fport_offsets
=
new
float
[
port_offsets
.
length
*
2
];
for
(
int
cam
=
0
;
cam
<
port_offsets
.
length
;
cam
++)
{
fport_offsets
[
2
*
cam
+
0
]
=
(
float
)
port_offsets
[
cam
][
0
];
fport_offsets
[
2
*
cam
+
1
]
=
(
float
)
port_offsets
[
cam
][
1
];
}
cuMemcpyHtoD
(
gpu_port_offsets
,
Pointer
.
to
(
fport_offsets
),
fport_offsets
.
length
*
Sizeof
.
FLOAT
);
int
num_colors
=
color_weights
.
length
;
if
(
num_colors
>
3
)
num_colors
=
3
;
float
weighht0
=
(
float
)
color_weights
[
0
];
float
weighht1
=
(
num_colors
>
1
)?((
float
)
color_weights
[
1
]):
0.0f
;
float
weighht2
=
(
num_colors
>
2
)?((
float
)
color_weights
[
2
]):
0.0f
;
int
iis_lwir
=
(
is_lwir
)?
1
:
0
;
int
idust_remove
=
(
dust_remove
)?
1
:
0
;
int
ikeep_weights
=
(
keep_weights
)?
1
:
0
;
int
[]
GridFullWarps
=
{(
num_texture_tiles
+
TEXTURE_TILES_PER_BLOCK
-
1
)
/
TEXTURE_TILES_PER_BLOCK
,
1
,
1
};
int
[]
ThreadsFullWarps
=
{
TEXTURE_THREADS_PER_TILE
,
NUM_CAMS
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
new
int
[]
{
0
}),
// 0, // int border_tile, // if 1 - watch for border
Pointer
.
to
(
gpu_texture_indices
),
// int * woi, - not used
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
new
int
[]
{
num_texture_tiles
}),
Pointer
.
to
(
gpu_texture_indices
),
Pointer
.
to
(
gpu_port_offsets
),
Pointer
.
to
(
new
int
[]
{
num_colors
}),
Pointer
.
to
(
new
int
[]
{
iis_lwir
}),
Pointer
.
to
(
new
float
[]
{(
float
)
min_shot
}),
Pointer
.
to
(
new
float
[]
{(
float
)
scale_shot
}),
Pointer
.
to
(
new
float
[]
{(
float
)
diff_sigma
}),
Pointer
.
to
(
new
float
[]
{(
float
)
diff_threshold
}),
Pointer
.
to
(
new
float
[]
{(
float
)
min_agree
}),
Pointer
.
to
(
new
float
[]
{
weighht0
}),
Pointer
.
to
(
new
float
[]
{
weighht1
}),
Pointer
.
to
(
new
float
[]
{
weighht2
}),
Pointer
.
to
(
new
int
[]
{
idust_remove
}),
Pointer
.
to
(
new
int
[]
{
ikeep_weights
}),
Pointer
.
to
(
new
int
[]
{
0
}),
// 0, // const size_t texture_rbg_stride, // in floats - DISABLE GENERATION!
Pointer
.
to
(
gpu_textures
),
// new Pointer(), // Pointer.to(gpu_textures),
Pointer
.
to
(
new
int
[]
{
texture_stride
}),
// can be a null pointer - will not be used! float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer
.
to
(
gpu_textures
)
);
cuCtxSynchronize
();
// Call the kernel function
cuLaunchKernel
(
GPU_TEXTURES_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
cuCtxSynchronize
();
}
public
float
[][]
getCorr2D
(
int
corr_rad
){
...
...
@@ -1173,7 +1259,11 @@ public class GPUTileProcessor {
CUlinkState
state
=
new
CUlinkState
();
cuLinkCreate
(
jitOptions
,
state
);
cuLinkAddFile
(
state
,
CU_JIT_INPUT_LIBRARY
,
LIBRARY_PATH
,
jitOptions
);
cuLinkAddData
(
state
,
CU_JIT_INPUT_PTX
,
Pointer
.
to
(
ptxData
),
ptxData
.
length
,
"input.ptx"
,
jitOptions
);
System
.
out
.
println
(
"ptxData.length="
+
ptxData
.
length
);
// System.out.println( ptx[0]);
cuLinkAddData
(
state
,
CU_JIT_INPUT_PTX
,
Pointer
.
to
(
ptxData
),
ptxData
.
length
,
"input.ptx"
,
jitOptions
);
// CUDA_ERROR_INVALID_PTX
long
size
[]
=
{
0
};
Pointer
image
=
new
Pointer
();
cuLinkComplete
(
state
,
image
,
size
);
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
975dadb4
...
...
@@ -61,6 +61,10 @@
#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4
...
...
@@ -69,6 +73,7 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#undef HAS_PRINTF
//#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
...
...
@@ -81,8 +86,16 @@
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
#endif
//#define IMCLT14
//#define NOICLT 1
//#define TEST_IMCLT
...
...
@@ -131,6 +144,8 @@
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
...
...
@@ -162,9 +177,10 @@
//#define TASK_SIZE 12
struct
tp_task
{
int
task
;
int
txy
;
// short ty;
// short tx;
union
{
int
txy
;
unsigned
short
sxy
[
2
];
};
float
xy
[
NUM_CAMS
][
2
];
};
struct
CltExtra
{
...
...
@@ -1101,11 +1117,60 @@ __device__ void imclt_plane( // not implemented, not used
float
*
gpu_rbg
,
// WIDTH, HEIGHT
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
height
);
// <= TILESY, use for faster processing of LWIR images
extern
"C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
extern
"C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process
int
*
woi
);
// x,y,width,height of the woi
extern
"C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
textures_accumulate
(
int
border_tile
,
// if 1 - watch for border
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t
texture_rbg_stride
,
// in floats
float
*
gpu_texture_rbg
,
// (number of colors +1 + ?)*16*16 rgba texture tiles
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
correlate2D
(
...
...
@@ -1377,6 +1442,175 @@ Java code:
}
#define USE_CDP
#ifdef USE_CDP
extern
"C"
__global__
void
generate_RBGA
(
// Parameters to generate texture tasks
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
// declare arrays in device code?
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process (8 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int
height
,
// <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
)
// (number of colors +1 + ?)*16*16 rgba texture tiles
{
// TODO use atomic_add to increment num_texture_tiles
// TODO calculate woi
dim3
threads0
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_x
=
(
width
+
((
1
<<
THREADS_DYNAMIC_BITS
)
-
1
))
>>
THREADS_DYNAMIC_BITS
;
dim3
blocks0
(
blocks_x
,
height
,
1
);
if
(
threadIdx
.
x
==
0
)
{
clear_texture_list
<<<
blocks0
,
threads0
>>>
(
gpu_texture_indices
,
width
,
height
);
cudaDeviceSynchronize
();
// not needed yet, just for testing
dim3
threads
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_t
=
(
num_tiles
+
((
1
<<
THREADS_DYNAMIC_BITS
))
-
1
)
>>
THREADS_DYNAMIC_BITS
;
//
dim3
blocks
(
blocks_t
,
1
,
1
);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize
();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*
(
woi
+
0
)
=
TILESX
;
*
(
woi
+
1
)
=
TILESY
;
*
(
woi
+
2
)
=
0
;
// maximal x
*
(
woi
+
3
)
=
0
;
// maximal y
mark_texture_neighbor_tiles
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
woi
);
// min_x, min_y, max_x, max_y
cudaDeviceSynchronize
();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*
(
num_texture_tiles
+
0
)
=
0
;
*
(
num_texture_tiles
+
1
)
=
0
;
*
(
num_texture_tiles
+
2
)
=
0
;
*
(
num_texture_tiles
+
3
)
=
0
;
*
(
num_texture_tiles
+
4
)
=
0
;
*
(
num_texture_tiles
+
5
)
=
0
;
*
(
num_texture_tiles
+
6
)
=
0
;
*
(
num_texture_tiles
+
7
)
=
0
;
gen_texture_list
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
num_texture_tiles
,
// number of texture tiles to process
woi
);
// x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize
();
// not needed yet, just for testing
*
(
woi
+
2
)
+=
1
-
*
(
woi
+
0
);
// width
*
(
woi
+
3
)
+=
1
-
*
(
woi
+
1
);
// height
}
__syncthreads
();
// Zero output textures. Trim
// texture_rbga_stride
int
texture_width
=
*
(
woi
+
2
)
*
DTT_SIZE
;
int
texture_tiles_height
=
*
(
woi
+
3
)
*
DTT_SIZE
;
int
texture_height
=
texture_tiles_height
*
DTT_SIZE
;
int
texture_slices
=
colors
+
1
;
if
(
threadIdx
.
x
==
0
)
{
dim3
threads2
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_x
=
(
texture_width
+
((
1
<<
THREADS_DYNAMIC_BITS
)
-
1
))
>>
THREADS_DYNAMIC_BITS
;
dim3
blocks2
(
blocks_x
,
texture_tiles_height
*
texture_slices
,
1
);
// each thread - 8 vertical
clear_texture_rbga
<<<
blocks2
,
threads2
>>>
(
texture_width
,
texture_tiles_height
*
texture_slices
,
// int texture_slice_height,
texture_rbga_stride
,
// const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles
)
;
// float * gpu_texture_tiles);
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x116 tiles overhang by 4 pixels)
cudaDeviceSynchronize
();
// not needed yet, just for testing
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
){
dim3
threads_texture
(
TEXTURE_THREADS_PER_TILE
,
NUM_CAMS
,
1
);
// TEXTURE_TILES_PER_BLOCK, 1);
int
border_tile
=
pass
>>
2
;
size_t
ntt
=
*
(
num_texture_tiles
+
(
2
*
(
pass
&
3
))
+
border_tile
);
dim3
grid_texture
((
ntt
+
TEXTURE_TILES_PER_BLOCK
-
1
)
/
TEXTURE_TILES_PER_BLOCK
,
1
,
1
);
int
ti_offset
=
(
pass
&
3
)
*
(
TILESX
*
(
TILESYA
>>
2
));
// 1/4
if
(
border_tile
){
ti_offset
+=
TILESX
*
(
TILESYA
>>
2
)
-
ntt
;
}
/* */
textures_accumulate
<<<
grid_texture
,
threads_texture
>>>
(
border_tile
,
// int border_tile, // if 1 - watch for border
woi
,
// int * woi, // x, y, width,height
gpu_clt
,
// float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ntt
,
// size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices
+
ti_offset
,
// int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets
,
// float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
colors
,
// int colors, // number of colors (3/1)
is_lwir
,
// int is_lwir, // do not perform shot correction
min_shot
,
// float min_shot, // 10.0
scale_shot
,
// float scale_shot, // 3.0
diff_sigma
,
// float diff_sigma, // pixel value/pixel change
diff_threshold
,
// float diff_threshold, // pixel value/pixel change
min_agree
,
// float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weight0
,
// float weight0, // scale for R
weight1
,
// float weight1, // scale for B
weight2
,
// float weight2, // scale for G
dust_remove
,
// int dust_remove, // Do not reduce average weight when only one image differs much from the average
0
,
// int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
texture_rbga_stride
,
// size_t texture_rbg_stride, // in floats
gpu_texture_tiles
,
// float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0
,
// size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles
);
// (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
/* */
}
}
__syncthreads
();
}
// blockDim.x * gridDim.x >= width
extern
"C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
)
// (number of colors +1 + ?)*16*16 rgba texture tiles
{
int
col
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
col
>
texture_width
)
{
return
;
}
int
row
=
(
blockIdx
.
y
<<
3
);
// includes slices
float
*
pix
=
gpu_texture_tiles
+
col
+
row
*
texture_rbga_stride
;
#pragma unroll
for
(
int
n
=
0
;
n
<
DTT_SIZE
;
n
++
)
{
*
(
pix
)
=
0.0
;
pix
+=
texture_rbga_stride
;
}
}
/**
* prepare list of texture tiles, woi, and calculate orthogonal neighbors for tiles (in 4 bits of the task field
* use 4x8=32 threads,
...
...
@@ -1386,41 +1620,187 @@ __global__ void prepare_texture_list(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process
// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
// border - down from the sam3\e 1/4 of the buffer
int
*
num_texture_tiles
,
// number of texture tiles to process (4 separate elements for accumulation)
int
*
woi
,
// x,y,width,height of the woi
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
width
,
// <= TILESX, use for faster processing of LWIR images
(should be actual + 1)
int
height
)
// <= TILESY, use for faster processing of LWIR images
{
// TODO use atomic_add to increment num_texture_tiles
// TODO calculate woi
// int task_num = blockIdx.x;
// int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
dim3
threads
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_x
=
(
width
+
1
)
>>
THREADS_DYNAMIC_BITS
;
dim3
blocks
(
blocks_x
,
height
,
1
);
dim3
threads0
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
// int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
int
blocks_x
=
(
width
+
((
1
<<
THREADS_DYNAMIC_BITS
)
-
1
))
>>
THREADS_DYNAMIC_BITS
;
dim3
blocks0
(
blocks_x
,
height
,
1
);
if
(
threadIdx
.
x
==
0
)
{
clear_texture_list
<<<
blocks
,
threads
>>>
(
clear_texture_list
<<<
blocks
0
,
threads0
>>>
(
gpu_texture_indices
,
width
,
height
);
cudaDeviceSynchronize
();
// not needed yet, just for testing
dim3
threads
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_t
=
(
num_tiles
+
((
1
<<
THREADS_DYNAMIC_BITS
))
-
1
)
>>
THREADS_DYNAMIC_BITS
;
//
dim3
blocks
(
blocks_t
,
1
,
1
);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
);
// packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize
();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*
(
woi
+
0
)
=
TILESX
;
*
(
woi
+
1
)
=
TILESY
;
*
(
woi
+
2
)
=
0
;
// maximal x
*
(
woi
+
3
)
=
0
;
// maximal y
mark_texture_neighbor_tiles
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
woi
);
// min_x, min_y, max_x, max_y
cudaDeviceSynchronize
();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*
(
num_texture_tiles
+
0
)
=
0
;
*
(
num_texture_tiles
+
1
)
=
0
;
*
(
num_texture_tiles
+
2
)
=
0
;
*
(
num_texture_tiles
+
3
)
=
0
;
*
(
num_texture_tiles
+
4
)
=
0
;
*
(
num_texture_tiles
+
5
)
=
0
;
*
(
num_texture_tiles
+
6
)
=
0
;
*
(
num_texture_tiles
+
7
)
=
0
;
gen_texture_list
<<<
blocks
,
threads
>>>
(
gpu_tasks
,
num_tiles
,
// number of tiles in task list
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
num_texture_tiles
,
// number of texture tiles to process
woi
);
// x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize
();
// not needed yet, just for testing
*
(
woi
+
2
)
+=
1
-
*
(
woi
+
0
);
// width
*
(
woi
+
3
)
+=
1
-
*
(
woi
+
1
);
// height
}
__syncthreads
();
}
// blockDim.x * gridDim.x >= width
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
width
,
// <= TILESX, use for faster processing of LWIR images
int
height
)
// <= TILESY, use for faster processing of LWIR images
{
int
col
=
threadIdx
.
x
+
blockDim
.
x
*
block
Idx
.
x
;
int
col
=
blockDim
.
x
*
blockIdx
.
x
+
thread
Idx
.
x
;
int
row
=
blockIdx
.
y
;
if
(
col
>
width
)
{
return
;
}
*
(
gpu_texture_indices
+
col
+
row
*
TILESX
)
=
0.0
;
*
(
gpu_texture_indices
+
col
+
row
*
TILESX
)
=
0
;
}
// treads (*,1,1), blocks = (*,1,1)
extern
"C"
__global__
void
mark_texture_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
)
// packed tile + bits (now only (1 << 7)
{
int
task_num
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
task_num
>=
num_tiles
)
{
return
;
// nothing to do
}
int
task
=
gpu_tasks
[
task_num
].
task
;
if
(
!
(
task
&
TASK_TEXTURE_BITS
)){
// here any bit in TASK_TEXTURE_BITS is sufficient
return
;
// NOP tile
}
int
cxy
=
gpu_tasks
[
task_num
].
txy
;
*
(
gpu_texture_indices
+
(
cxy
&
0xffff
)
+
(
cxy
>>
16
)
*
TILESX
)
=
1
;
}
// treads (*,1,1), blocks = (*,1,1)
extern
"C"
__global__
void
mark_texture_neighbor_tiles
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
woi
)
// x,y,width,height of the woi
{
int
task_num
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
task_num
>=
num_tiles
)
{
return
;
// nothing to do
}
// struct tp_task * gpu_task = &gpu_tasks[task_num];
// int task = gpu_task->task;
int
task
=
gpu_tasks
[
task_num
].
task
;
if
(
!
(
task
&
TASK_TEXTURE_BITS
)){
// here any bit in TASK_TEXTURE_BITS is sufficient
return
;
// NOP tile
}
// int cxy = gpu_task->txy;
int
cxy
=
gpu_tasks
[
task_num
].
txy
;
int
x
=
(
cxy
&
0xffff
);
int
y
=
(
cxy
>>
16
);
atomicMin
(
woi
+
0
,
x
);
atomicMin
(
woi
+
1
,
y
);
atomicMax
(
woi
+
2
,
x
);
atomicMax
(
woi
+
3
,
y
);
int
d
=
0
;
if
((
y
>
0
)
&&
*
(
gpu_texture_indices
+
x
+
(
y
-
1
)
*
TILESX
))
d
|=
(
1
<<
TASK_TEXTURE_N_BIT
);
if
((
x
<
(
TILESX
-
1
))
&&
*
(
gpu_texture_indices
+
(
x
+
1
)
+
y
*
TILESX
))
d
|=
(
1
<<
TASK_TEXTURE_E_BIT
);
if
((
y
<
(
TILESY
-
1
))
&&
*
(
gpu_texture_indices
+
x
+
(
y
+
1
)
*
TILESX
))
d
|=
(
1
<<
TASK_TEXTURE_S_BIT
);
if
((
x
>
0
)
&&
*
(
gpu_texture_indices
+
(
x
-
1
)
+
y
*
TILESX
))
d
|=
(
1
<<
TASK_TEXTURE_W_BIT
);
gpu_tasks
[
task_num
].
task
=
((
task
^
d
)
&
TASK_TEXTURE_BITS
)
^
task
;
}
extern
"C"
__global__
void
gen_texture_list
(
struct
tp_task
*
gpu_tasks
,
int
num_tiles
,
// number of tiles in task list
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
num_texture_tiles
,
// number of texture tiles to process
int
*
woi
)
// min_x, min_y, max_x, max_y input
{
int
task_num
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
if
(
task_num
>=
num_tiles
)
{
return
;
// nothing to do
}
int
task
=
gpu_tasks
[
task_num
].
task
&
TASK_TEXTURE_BITS
;
if
(
!
task
){
// here any bit in TASK_TEXTURE_BITS is sufficient
return
;
// NOP tile
}
int
cxy
=
gpu_tasks
[
task_num
].
txy
;
int
x
=
(
cxy
&
0xffff
);
int
y
=
(
cxy
>>
16
);
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if calculate extra pixels that still fit into memory
int
is_border
=
(
x
==
woi
[
0
])
||
(
y
==
woi
[
1
])
||
(
x
==
(
TILESX
-
1
))
||
(
y
==
(
TILESY
-
1
));
if
(
x
&
1
)
{
gpu_texture_indices
+=
TILESX
*
(
TILESYA
>>
2
);
//TILESYA - 2 LSB == 00
num_texture_tiles
+=
2
;
// int *
}
if
(
y
&
1
)
{
gpu_texture_indices
+=
TILESX
*
(
TILESYA
>>
1
);
num_texture_tiles
+=
4
;
// int *
}
if
(
is_border
){
gpu_texture_indices
+=
(
TILESX
*
(
TILESYA
>>
2
)
-
1
);
// end of the buffer
num_texture_tiles
+=
1
;
// int *
}
// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
int
buf_offset
=
atomicAdd
(
num_texture_tiles
,
1
);
if
(
is_border
){
buf_offset
=
-
buf_offset
;
}
*
(
gpu_texture_indices
+
buf_offset
)
=
task
|
((
x
+
y
*
TILESX
)
<<
CORR_NTILE_SHIFT
)
|
(
1
<<
LIST_TEXTURE_BIT
);
}
#endif //#ifdef USE_CDP
...
...
@@ -1504,6 +1884,9 @@ __global__ void convert_correct_tiles(
}
}
}
//#undef USE_textures_gen
#ifdef USE_textures_gen
extern
"C"
__global__
void
textures_gen
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -1768,10 +2151,347 @@ __global__ void textures_gen(
}
__syncthreads
();
// __syncwarp();
#endif
}
#endif // ifdef USE_textures_gen
extern
"C"
__global__
void
textures_accumulate
(
int
border_tile
,
// if 1 - watch for border
int
*
woi
,
// x, y, width,height
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t
texture_rbg_stride
,
// in floats
float
*
gpu_texture_rbg
,
// (number of colors +1 + ?)*16*16 rgba texture tiles
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
float
*
gpu_texture_tiles
)
// (number of colors +1 + ?)*16*16 rgba texture tiles
{
float
weights
[
3
]
=
{
weight0
,
weight1
,
weight2
};
// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
int
camera_num
=
threadIdx
.
y
;
int
tile_indx
=
blockIdx
.
x
;
// * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if
(
tile_indx
>=
num_texture_tiles
){
return
;
// nothing to do
}
// get number of tile
int
tile_code
=
gpu_texture_indices
[
tile_indx
];
if
((
tile_code
&
(
1
<<
CORR_TEXTURE_BIT
))
==
0
){
return
;
// nothing to do
}
int
tile_num
=
tile_code
>>
CORR_NTILE_SHIFT
;
__shared__
float
mclt_tiles
[
NUM_CAMS
][
NUM_COLORS
][
2
*
DTT_SIZE
][
DTT_SIZE21
];
__shared__
union
{
float
clt_tiles
[
NUM_CAMS
][
NUM_COLORS
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
// NUM_CAMS == 4
float
mclt_debayer
[
NUM_CAMS
][
NUM_COLORS
][
MCLT_UNION_LEN
];
// to align with clt_tiles
}
shr
;
__shared__
union
{
float
mclt_tmp
[
NUM_CAMS
][
NUM_COLORS
][
DTT_SIZE2
][
DTT_SIZE21
];
float
rgbaw
[
NUM_COLORS
+
1
+
NUM_CAMS
+
NUM_COLORS
+
1
][
DTT_SIZE2
][
DTT_SIZE21
];
// add more
}
shr1
;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
// __shared__ float color_avg [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
__shared__
float
port_offsets
[
NUM_CAMS
][
2
];
__shared__
float
ports_rgb
[
NUM_CAMS
][
NUM_COLORS
];
// return to system memory (optionally pass null to skip calculation)
__shared__
float
max_diff
[
NUM_CAMS
];
// return to system memory (optionally pass null to skip calculation)
if
(
threadIdx
.
x
<
2
){
port_offsets
[
camera_num
][
threadIdx
.
x
]
=
*
(
gpu_port_offsets
+
2
*
camera_num
+
threadIdx
.
x
);
}
#ifdef DBG_TILE
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen tile = %d
\n
"
,
tile_num
);
// debug_print_clt1(clt_tile1, color, 0xf); //
// printf("\textures_gen tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
// debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads
();
// __syncwarp();
#endif
#endif
// serially for each color, parallel for each camera
// copy clt (frequency domain data)
for
(
int
color
=
0
;
color
<
colors
;
color
++
){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
float
*
clt_tile
=
((
float
*
)
shr
.
clt_tiles
[
camera_num
][
color
]);
// start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
float
*
clt_tilei
=
clt_tile
+
threadIdx
.
x
;
float
*
gpu_tile
=
((
float
*
)
gpu_clt
[
camera_num
])
+
(
tile_num
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
float
*
mclt_tile
=
(
float
*
)
mclt_tiles
[
camera_num
][
color
];
float
*
mclt_dst
=
(
float
*
)
shr
.
mclt_debayer
[
camera_num
][
color
];
float
*
mclt_tmp
=
(
float
*
)
shr1
.
mclt_tmp
[
camera_num
][
color
];
// float scale = 0.25;
#pragma unroll
for
(
int
q
=
0
;
q
<
4
;
q
++
)
{
float
*
lpf
=
lpf_data
[(
colors
>
1
)
?
color
:
3
]
+
threadIdx
.
x
;
// lpf_data[3] - mono
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
// copy 32 rows (4 quadrants of 8 rows)
// *clt_tilei = *gpu_tile * (*lpf) * scale;
*
clt_tilei
=
*
gpu_tile
*
(
*
lpf
);
clt_tilei
+=
DTT_SIZE1
;
gpu_tile
+=
DTT_SIZE
;
lpf
+=
DTT_SIZE
;
}
}
__syncthreads
();
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen LPF for color = %d
\n
"
,
color
);
debug_print_lpf
(
lpf_data
[(
colors
>
1
)
?
color
:
3
]);
printf
(
"
\n
textures_gen tile = %d, color = %d
\n
"
,
tile_num
,
color
);
debug_print_clt_scaled
(
clt_tile
,
color
,
0xf
,
0.25
);
//
}
__syncthreads
();
// __syncwarp();
#endif
// perform idct
imclt8threads
(
0
,
// int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile
,
// float * mclt_tile )
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
__syncthreads
();
// __syncwarp();
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen mclt color = %d
\n
"
,
color
);
debug_print_mclt
(
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color
);
}
__syncthreads
();
// __syncwarp();
#endif
if
(
colors
>
1
)
{
debayer_shot
(
(
color
<
2
),
// const int rb_mode, // 0 - green, 1 - r/b
min_shot
,
// float min_shot, // 10.0
scale_shot
,
// float scale_shot, // 3.0 (0.0 for mono)
mclt_tile
,
// float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst
,
// float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp
,
// float * mclt_tmp,
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
// int debug);
__syncthreads
();
// __syncwarp();
}
else
{
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#pragma unroll
for
(
int
n
=
0
;
n
<=
DTT_SIZE
;
n
+=
DTT_SIZE
){
float
*
msp
=
mclt_tile
+
threadIdx
.
x
+
n
;
float
*
dst
=
mclt_dst
+
threadIdx
.
x
+
n
;
#pragma unroll
for
(
int
row
=
0
;
row
<
DTT_SIZE2
;
row
++
){
*
dst
=
*
msp
;
msp
+=
DTT_SIZE21
;
dst
+=
DTT_SIZE21
;
}
}
__syncthreads
();
}
#ifdef DEBUG77
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
printf
(
"
\n
textures_gen AFTER DEBAER cam= %d, color = %d
\n
"
,
threadIdx
.
y
,
color
);
debug_print_mclt
(
mclt_dst
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
printf
(
"
\n
textures_gen AFTER DEBAER0 cam= %d, color = %d
\n
"
,
threadIdx
.
y
,
0
);
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
0
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
}
// for (int color = 0; color < colors; color++)
__syncthreads
();
// __syncwarp();
/// return;
#ifdef DEBUG77
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for
(
int
nncol
=
0
;
nncol
<
colors
;
nncol
++
){
printf
(
"
\n
textures_gen AFTER DEBAER1 cam= %d, color = %d
\n
"
,
ccam
,
nncol
);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
nncol
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
}
}
__syncthreads
();
// __syncwarp();
#endif
#ifdef DEBUG77
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
for
(
int
nncol
=
0
;
nncol
<
colors
;
nncol
++
){
printf
(
"
\n
textures_gen AFTER DEBAER1 cam= %d, color = %d
\n
"
,
ccam
,
nncol
);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
nncol
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
tile_combine_rgba
(
colors
,
// int colors, // number of colors
(
float
*
)
shr
.
mclt_debayer
,
// float * mclt_tile, // debayer // has gaps to align with union !
(
float
*
)
mclt_tiles
,
// float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(
float
*
)
shr1
.
rgbaw
,
// float * rgba, // result
(
float
*
)
0
,
// float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(
float
*
)
0
,
// float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(
float
*
)
port_offsets
,
// float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma
,
// float diff_sigma, // pixel value/pixel change
diff_threshold
,
// float diff_threshold, // pixel value/pixel change
min_agree
,
// float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights
,
// float * chn_weights, // color channel weights, sum == 1.0
dust_remove
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights
,
// int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(
tile_num
==
DBG_TILE
)
);
//int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
// multiply all 4(2) slices by a window (if not all directions)
if
(
gpu_texture_tiles
&&
(
texture_stride
!=
0
)){
// generate non-ovelapping tiles
float
*
gpu_texture_tile
=
gpu_texture_tiles
+
tile_indx
*
texture_stride
;
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
int
i
=
row
*
DTT_SIZE21
+
col
;
int
gi
=
row
*
DTT_SIZE2
+
col
;
float
*
gpu_texture_tile_gi
=
gpu_texture_tile
+
gi
;
float
*
rgba_i
=
((
float
*
)
shr1
.
rgbaw
)
+
i
;
// always copy 3 (1) colors + alpha
if
(
colors
==
3
){
if
(
keep_weights
)
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
+
NUM_CAMS
+
NUM_COLORS
+
1
;
ncol
++
)
{
// 12
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
else
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
;
ncol
++
)
{
// 4
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
}
else
{
// assuming colors = 1
if
(
keep_weights
)
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
+
NUM_CAMS
+
1
+
1
;
ncol
++
)
{
// 8
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
else
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
;
ncol
++
)
{
// 2
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
}
}
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen tile done = %d, texture_stride= %d
\n
"
,
tile_num
,
(
int
)
texture_stride
);
}
__syncthreads
();
// __syncwarp();
#endif
}
// if (gpu_texture_tiles){ // generate non-ovelapping tiles
tile_code
&=
TASK_TEXTURE_BITS
;
if
(
!
tile_code
){
return
;
// should not happen
}
if
(
gpu_texture_rbg
&&
(
texture_rbg_stride
!=
0
))
{
// generate RGBA
if
(
tile_code
!=
TASK_TEXTURE_BITS
){
// only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
int
i
=
row
*
DTT_SIZE21
+
col
;
int
gi
=
row
*
DTT_SIZE2
+
col
;
float
*
rgba_i
=
((
float
*
)
shr1
.
rgbaw
)
+
i
;
// always copy 3 (1) colors + alpha
if
(
colors
==
3
){
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
;
ncol
++
)
{
// 4
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
tile_code
][
gi
];
// reduce [tile_code] by LUT
}
}
else
{
// assuming colors = 1
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
;
ncol
++
)
{
// 2
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
))
*=
alphaFade
[
tile_code
][
gi
];
// reduce [tile_code] by LUT
}
}
}
}
int
slice_stride
=
texture_rbg_stride
*
*
(
woi
+
3
);
// offset to the next color
int
tileY
=
tile_num
/
TILESX
;
// slow, but 1 per tile
int
tileX
=
tile_num
-
tileY
*
TILESX
;
int
tile_x0
=
(
tileX
-
*
(
woi
+
0
))
*
DTT_SIZE
-
(
DTT_SIZE
/
2
);
// may be negative == -4
int
tile_y0
=
(
tileY
-
*
(
woi
+
1
))
*
DTT_SIZE
-
(
DTT_SIZE
/
2
);
// may be negative == -4
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
// row inside a tile (0..15)
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
// column inside a tile (0..15)
int
g_row
=
row
+
tile_y0
;
int
g_col
=
col
+
tile_x0
;
int
i
=
row
*
DTT_SIZE21
+
col
;
int
gi
=
g_row
*
texture_rbg_stride
+
g_col
;
// offset to the top left corner
float
*
gpu_texture_rbg_gi
=
gpu_texture_rbg
+
gi
;
float
*
rgba_i
=
((
float
*
)
shr1
.
rgbaw
)
+
i
;
if
(
!
border_tile
||
((
g_row
>=
0
)
&&
(
g_col
>=
0
)
&&
(
g_row
<
(
DTT_SIZE
*
TILESX
))
&&
(
g_col
<
(
DTT_SIZE
*
TILESY
)))){
// always copy 3 (1) colors + alpha
if
(
colors
==
3
){
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
;
ncol
++
)
{
// 4
*
(
gpu_texture_rbg_gi
+
ncol
*
slice_stride
)
+=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
else
{
// assuming colors = 1
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
;
ncol
++
)
{
// 2
*
(
gpu_texture_rbg_gi
+
ncol
*
slice_stride
)
+=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
}
}
}
// if (gpu_texture_rbg) { // generate RGBA
}
// textures_accumulate()
}
extern
"C"
...
...
@@ -2801,9 +3521,7 @@ __device__ void convertCorrectTile(
//#endif
}
//#ifndef NOICLT1
#ifdef NOICLT1
extern
"C"
__global__
void
test_imclt
(
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -2859,7 +3577,7 @@ __global__ void test_imclt(
__syncthreads
();
// __syncwarp();
}
}
#endif // NOICLT1
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment