Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
0bb31239
Commit
0bb31239
authored
Apr 14, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
4 images with CDP
parent
095bd8c2
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
154 additions
and
315 deletions
+154
-315
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+55
-6
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+2
-1
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+76
-302
TileProcessor.h
src/main/resources/kernels/TileProcessor.h
+18
-3
geometry_correction.h
src/main/resources/kernels/geometry_correction.h
+3
-3
No files found.
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
0bb31239
...
@@ -107,6 +107,7 @@ public class GPUTileProcessor {
...
@@ -107,6 +107,7 @@ public class GPUTileProcessor {
static
String
GPU_RBGA_NAME
=
"generate_RBGA"
;
// name in C code
static
String
GPU_RBGA_NAME
=
"generate_RBGA"
;
// name in C code
static
String
GPU_ROT_DERIV
=
"calc_rot_deriv"
;
// calculate rotation matrices and derivatives
static
String
GPU_ROT_DERIV
=
"calc_rot_deriv"
;
// calculate rotation matrices and derivatives
static
String
SET_TILES_OFFSETS
=
"get_tiles_offsets"
;
// calculate pixel offsets and disparity distortions
static
String
SET_TILES_OFFSETS
=
"get_tiles_offsets"
;
// calculate pixel offsets and disparity distortions
static
String
GPU_IMCLT_ALL_NAME
=
"imclt_rbg_all"
;
// pass some defines to gpu source code with #ifdef JCUDA
// pass some defines to gpu source code with #ifdef JCUDA
...
@@ -168,6 +169,8 @@ public class GPUTileProcessor {
...
@@ -168,6 +169,8 @@ public class GPUTileProcessor {
private
CUfunction
GPU_RBGA_kernel
=
null
;
private
CUfunction
GPU_RBGA_kernel
=
null
;
private
CUfunction
GPU_ROT_DERIV_kernel
=
null
;
private
CUfunction
GPU_ROT_DERIV_kernel
=
null
;
private
CUfunction
SET_TILES_OFFSETS_kernel
=
null
;
private
CUfunction
SET_TILES_OFFSETS_kernel
=
null
;
private
CUfunction
GPU_IMCLT_ALL_kernel
=
null
;
// CPU arrays of pointers to GPU memory
// CPU arrays of pointers to GPU memory
// These arrays may go to methods, they are here just to be able to free GPU memory if needed
// These arrays may go to methods, they are here just to be able to free GPU memory if needed
...
@@ -186,6 +189,7 @@ public class GPUTileProcessor {
...
@@ -186,6 +189,7 @@ public class GPUTileProcessor {
private
CUdeviceptr
gpu_corrs
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
private
CUdeviceptr
gpu_corrs
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
private
CUdeviceptr
gpu_textures
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
private
CUdeviceptr
gpu_textures
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_clt
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_4_images
=
new
CUdeviceptr
();
private
CUdeviceptr
gpu_corr_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_corr_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_texture_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_texture_indices
=
new
CUdeviceptr
();
// allocate tilesX * tilesY * 6 * Sizeof.POINTER
private
CUdeviceptr
gpu_port_offsets
=
new
CUdeviceptr
();
// allocate Quad * 2 * Sizeof.POINTER
private
CUdeviceptr
gpu_port_offsets
=
new
CUdeviceptr
();
// allocate Quad * 2 * Sizeof.POINTER
...
@@ -466,7 +470,8 @@ public class GPUTileProcessor {
...
@@ -466,7 +470,8 @@ public class GPUTileProcessor {
GPU_TEXTURES_NAME
,
GPU_TEXTURES_NAME
,
GPU_RBGA_NAME
,
GPU_RBGA_NAME
,
GPU_ROT_DERIV
,
GPU_ROT_DERIV
,
SET_TILES_OFFSETS
SET_TILES_OFFSETS
,
GPU_IMCLT_ALL_NAME
};
};
CUfunction
[]
functions
=
createFunctions
(
kernelSources
,
CUfunction
[]
functions
=
createFunctions
(
kernelSources
,
func_names
,
func_names
,
...
@@ -479,7 +484,7 @@ public class GPUTileProcessor {
...
@@ -479,7 +484,7 @@ public class GPUTileProcessor {
GPU_RBGA_kernel
=
functions
[
4
];
GPU_RBGA_kernel
=
functions
[
4
];
GPU_ROT_DERIV_kernel
=
functions
[
5
];
GPU_ROT_DERIV_kernel
=
functions
[
5
];
SET_TILES_OFFSETS_kernel
=
functions
[
6
];
SET_TILES_OFFSETS_kernel
=
functions
[
6
];
GPU_IMCLT_ALL_kernel
=
functions
[
7
];
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
"GPU kernel functions initialized"
);
System
.
out
.
println
(
GPU_CONVERT_CORRECT_TILES_kernel
.
toString
());
System
.
out
.
println
(
GPU_CONVERT_CORRECT_TILES_kernel
.
toString
());
...
@@ -531,10 +536,13 @@ public class GPUTileProcessor {
...
@@ -531,10 +536,13 @@ public class GPUTileProcessor {
cuMemAlloc
(
gpu_kernel_offsets
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_kernel_offsets
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_bayer
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_bayer
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_clt
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_clt
,
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemAlloc
(
gpu_4_images
,
NUM_CAMS
*
Sizeof
.
POINTER
);
long
[]
gpu_kernels_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_kernels_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_kernel_offsets_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_kernel_offsets_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_bayer_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_bayer_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_clt_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_clt_l
=
new
long
[
NUM_CAMS
];
long
[]
gpu_4_images_l
=
new
long
[
NUM_CAMS
];
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_kernels_l
[
ncam
]
=
getPointerAddress
(
gpu_kernels_h
[
ncam
]);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_kernels_l
[
ncam
]
=
getPointerAddress
(
gpu_kernels_h
[
ncam
]);
cuMemcpyHtoD
(
gpu_kernels
,
Pointer
.
to
(
gpu_kernels_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemcpyHtoD
(
gpu_kernels
,
Pointer
.
to
(
gpu_kernels_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
...
@@ -548,6 +556,9 @@ public class GPUTileProcessor {
...
@@ -548,6 +556,9 @@ public class GPUTileProcessor {
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_clt_l
[
ncam
]
=
getPointerAddress
(
gpu_clt_h
[
ncam
]);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_clt_l
[
ncam
]
=
getPointerAddress
(
gpu_clt_h
[
ncam
]);
cuMemcpyHtoD
(
gpu_clt
,
Pointer
.
to
(
gpu_clt_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
cuMemcpyHtoD
(
gpu_clt
,
Pointer
.
to
(
gpu_clt_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++)
gpu_4_images_l
[
ncam
]
=
getPointerAddress
(
gpu_corr_images_h
[
ncam
]);
cuMemcpyHtoD
(
gpu_4_images
,
Pointer
.
to
(
gpu_4_images_l
),
NUM_CAMS
*
Sizeof
.
POINTER
);
// Set GeometryCorrection data
// Set GeometryCorrection data
cuMemAlloc
(
gpu_geometry_correction
,
GeometryCorrection
.
arrayLength
(
NUM_CAMS
)
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_geometry_correction
,
GeometryCorrection
.
arrayLength
(
NUM_CAMS
)
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_rByRDist
,
RBYRDIST_LEN
*
Sizeof
.
FLOAT
);
cuMemAlloc
(
gpu_rByRDist
,
RBYRDIST_LEN
*
Sizeof
.
FLOAT
);
...
@@ -1093,9 +1104,12 @@ public class GPUTileProcessor {
...
@@ -1093,9 +1104,12 @@ public class GPUTileProcessor {
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
gpu_clt
),
Pointer
.
to
(
new
int
[]
{
mclt_stride
}),
Pointer
.
to
(
new
int
[]
{
mclt_stride
}),
Pointer
.
to
(
new
int
[]
{
num_task_tiles
}),
Pointer
.
to
(
new
int
[]
{
num_task_tiles
}),
// move lpf to 4-image generator kernel
// move lpf to 4-image generator kernel - DONE
// Pointer.to(new int[] { 7 }) // lpf_mask ??? (C-code has it 0)
Pointer
.
to
(
new
int
[]
{
0
}),
// lpf_mask
Pointer
.
to
(
new
int
[]
{
0
})
// lpf_mask ??? (C-code has it 0)
Pointer
.
to
(
new
int
[]
{
IMG_WIDTH
}),
// int woi_width,
Pointer
.
to
(
new
int
[]
{
IMG_HEIGHT
}),
// int woi_height,
Pointer
.
to
(
new
int
[]
{
KERNELS_HOR
}),
// int kernels_hor,
Pointer
.
to
(
new
int
[]
{
KERNELS_VERT
})
// int kernels_vert);
);
);
cuCtxSynchronize
();
cuCtxSynchronize
();
...
@@ -1132,10 +1146,12 @@ public class GPUTileProcessor {
...
@@ -1132,10 +1146,12 @@ public class GPUTileProcessor {
Pointer
.
to
(
gpu_clt_h
[
ncam
]),
Pointer
.
to
(
gpu_clt_h
[
ncam
]),
Pointer
.
to
(
gpu_corr_images_h
[
ncam
]),
Pointer
.
to
(
gpu_corr_images_h
[
ncam
]),
Pointer
.
to
(
new
int
[]
{
apply_lpf
}),
Pointer
.
to
(
new
int
[]
{
apply_lpf
}),
Pointer
.
to
(
new
int
[]
{
is_mono
?
1
:
0
}),
Pointer
.
to
(
new
int
[]
{
is_mono
?
1
:
NUM_COLORS
}),
// now - NUM_COLORS
Pointer
.
to
(
new
int
[]
{
color
}),
Pointer
.
to
(
new
int
[]
{
color
}),
Pointer
.
to
(
new
int
[]
{
v_offs
}),
Pointer
.
to
(
new
int
[]
{
v_offs
}),
Pointer
.
to
(
new
int
[]
{
h_offs
}),
Pointer
.
to
(
new
int
[]
{
h_offs
}),
Pointer
.
to
(
new
int
[]
{
tilesX
}),
Pointer
.
to
(
new
int
[]
{
tilesY
}),
Pointer
.
to
(
new
int
[]
{
imclt_stride
})
// lpf_mask
Pointer
.
to
(
new
int
[]
{
imclt_stride
})
// lpf_mask
);
);
cuCtxSynchronize
();
cuCtxSynchronize
();
...
@@ -1152,6 +1168,39 @@ public class GPUTileProcessor {
...
@@ -1152,6 +1168,39 @@ public class GPUTileProcessor {
cuCtxSynchronize
();
cuCtxSynchronize
();
}
}
public
void
execImcltRbgAll
(
boolean
is_mono
)
{
if
(
GPU_IMCLT_ALL_kernel
==
null
)
{
IJ
.
showMessage
(
"Error"
,
"No GPU kernel: GPU_IMCLT_ALL_kernel"
);
return
;
}
int
apply_lpf
=
1
;
int
tilesX
=
IMG_WIDTH
/
DTT_SIZE
;
int
tilesY
=
IMG_HEIGHT
/
DTT_SIZE
;
int
[]
ThreadsFullWarps
=
{
1
,
1
,
1
};
int
[]
GridFullWarps
=
{
1
,
1
,
1
};
Pointer
kernelParameters
=
Pointer
.
to
(
Pointer
.
to
(
gpu_clt
),
// float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
Pointer
.
to
(
gpu_4_images
),
// float ** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
Pointer
.
to
(
new
int
[]
{
apply_lpf
}),
// int apply_lpf,
Pointer
.
to
(
new
int
[]
{
is_mono
?
1
:
NUM_COLORS
}),
// int colors,
Pointer
.
to
(
new
int
[]
{
tilesX
}),
// int woi_twidth,
Pointer
.
to
(
new
int
[]
{
tilesY
}),
// int woi_theight,
Pointer
.
to
(
new
int
[]
{
imclt_stride
})
// const size_t dstride); // in floats (pixels)
);
cuCtxSynchronize
();
// Call the kernel function
cuLaunchKernel
(
GPU_IMCLT_ALL_kernel
,
GridFullWarps
[
0
],
GridFullWarps
[
1
],
GridFullWarps
[
2
],
// Grid dimension
ThreadsFullWarps
[
0
],
ThreadsFullWarps
[
1
],
ThreadsFullWarps
[
2
],
// Block dimension
0
,
null
,
// Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters
,
null
);
// Kernel- and extra parameters
cuCtxSynchronize
();
}
public
void
execCorr2D
(
public
void
execCorr2D
(
double
[]
scales
,
double
[]
scales
,
double
fat_zero
,
double
fat_zero
,
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
0bb31239
...
@@ -2119,7 +2119,8 @@ public class TwoQuadCLT {
...
@@ -2119,7 +2119,8 @@ public class TwoQuadCLT {
// run imclt;
// run imclt;
long
startIMCLT
=
System
.
nanoTime
();
long
startIMCLT
=
System
.
nanoTime
();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
{
gPUTileProcessor
.
execImcltRbg
(
quadCLT_main
.
isMonochrome
());
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor
.
execImcltRbgAll
(
quadCLT_main
.
isMonochrome
());
}
}
long
endImcltTime
=
System
.
nanoTime
();
long
endImcltTime
=
System
.
nanoTime
();
// run correlation
// run correlation
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
0bb31239
...
@@ -732,8 +732,6 @@ __device__ void convertCorrectTile(
...
@@ -732,8 +732,6 @@ __device__ void convertCorrectTile(
const
int
lpf_mask
,
const
int
lpf_mask
,
const
float
centerX
,
const
float
centerX
,
const
float
centerY
,
const
float
centerY
,
// const short tx,
// const short ty,
const
int
txy
,
const
int
txy
,
const
size_t
dstride
,
// in floats (pixels)
const
size_t
dstride
,
// in floats (pixels)
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float
*
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
...
@@ -742,7 +740,11 @@ __device__ void convertCorrectTile(
...
@@ -742,7 +740,11 @@ __device__ void convertCorrectTile(
float
residual_shift
[
2
],
float
residual_shift
[
2
],
float
window_hor_cos
[
2
*
DTT_SIZE
],
float
window_hor_cos
[
2
*
DTT_SIZE
],
float
window_hor_sin
[
2
*
DTT_SIZE
],
float
window_hor_sin
[
2
*
DTT_SIZE
],
float
window_vert_cos
[
2
*
DTT_SIZE
]);
float
window_vert_cos
[
2
*
DTT_SIZE
],
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
__device__
void
debug_print_lpf
(
__device__
void
debug_print_lpf
(
float
*
lpf_tile
);
float
*
lpf_tile
);
...
@@ -1551,7 +1553,11 @@ __global__ void convert_correct_tiles(
...
@@ -1551,7 +1553,11 @@ __global__ void convert_correct_tiles(
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
int
lpf_mask
)
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
)
{
{
dim3
t
=
threadIdx
;
dim3
t
=
threadIdx
;
int
tile_in_block
=
threadIdx
.
y
;
int
tile_in_block
=
threadIdx
.
y
;
...
@@ -1615,280 +1621,18 @@ __global__ void convert_correct_tiles(
...
@@ -1615,280 +1621,18 @@ __global__ void convert_correct_tiles(
residual_shift
[
tile_in_block
],
// float frac_topleft [NUM_COLORS][2],
residual_shift
[
tile_in_block
],
// float frac_topleft [NUM_COLORS][2],
window_hor_cos
[
tile_in_block
],
// float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_cos
[
tile_in_block
],
// float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_sin
[
tile_in_block
],
//float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_hor_sin
[
tile_in_block
],
//float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_vert_cos
[
tile_in_block
]);
//float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
window_vert_cos
[
tile_in_block
],
//float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
woi_width
,
// int woi_width,
woi_height
,
// int woi_height,
kernels_hor
,
// int kernels_hor,
kernels_vert
);
//int kernels_vert)
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
}
}
}
}
}
}
//#undef USE_textures_gen
//#undef USE_textures_gen
#ifdef USE_textures_gen
extern
"C"
__global__
void
textures_gen
(
float
**
gpu_clt
,
// [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
num_texture_tiles
,
// number of texture tiles to process
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
float
*
gpu_port_offsets
,
// relative ports x,y offsets - just to scale differences, may be approximate
int
colors
,
// number of colors (3/1)
int
is_lwir
,
// do not perform shot correction
float
min_shot
,
// 10.0
float
scale_shot
,
// 3.0
float
diff_sigma
,
// pixel value/pixel change
float
diff_threshold
,
// pixel value/pixel change
// int diff_gauss, // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
float
min_agree
,
// minimal number of channels to agree on a point (real number to work with fuzzy averages)
float
weight0
,
// scale for R
float
weight1
,
// scale for B
float
weight2
,
// scale for G
int
dust_remove
,
// Do not reduce average weight when only one image differs much from the average
int
keep_weights
,
// return channel weights after A in RGBA (was removed)
const
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
float
*
gpu_texture_tiles
)
// (number of colors +1 + ?)*16*16 rgba texture tiles
{
float
weights
[
3
]
=
{
weight0
,
weight1
,
weight2
};
// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
int
camera_num
=
threadIdx
.
y
;
int
tile_indx
=
blockIdx
.
x
;
// * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if
(
tile_indx
>=
num_texture_tiles
){
return
;
// nothing to do
}
// get number of tile
int
tile_code
=
gpu_texture_indices
[
tile_indx
];
if
((
tile_code
&
(
1
<<
CORR_TEXTURE_BIT
))
==
0
){
return
;
// nothing to do
}
int
tile_num
=
tile_code
>>
CORR_NTILE_SHIFT
;
__shared__
float
mclt_tiles
[
NUM_CAMS
][
NUM_COLORS
][
2
*
DTT_SIZE
][
DTT_SIZE21
];
__shared__
union
{
float
clt_tiles
[
NUM_CAMS
][
NUM_COLORS
][
4
][
DTT_SIZE
][
DTT_SIZE1
];
// NUM_CAMS == 4
float
mclt_debayer
[
NUM_CAMS
][
NUM_COLORS
][
MCLT_UNION_LEN
];
// to align with clt_tiles
}
shr
;
__shared__
union
{
float
mclt_tmp
[
NUM_CAMS
][
NUM_COLORS
][
DTT_SIZE2
][
DTT_SIZE21
];
float
rgbaw
[
NUM_COLORS
+
1
+
NUM_CAMS
+
NUM_COLORS
+
1
][
DTT_SIZE2
][
DTT_SIZE21
];
// add more
}
shr1
;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
// __shared__ float color_avg [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
__shared__
float
port_offsets
[
NUM_CAMS
][
2
];
__shared__
float
ports_rgb
[
NUM_CAMS
][
NUM_COLORS
];
// return to system memory (optionally pass null to skip calculation)
__shared__
float
max_diff
[
NUM_CAMS
];
// return to system memory (optionally pass null to skip calculation)
if
(
threadIdx
.
x
<
2
){
port_offsets
[
camera_num
][
threadIdx
.
x
]
=
*
(
gpu_port_offsets
+
2
*
camera_num
+
threadIdx
.
x
);
}
#ifdef DBG_TILE
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen tile = %d
\n
"
,
tile_num
);
// debug_print_clt1(clt_tile1, color, 0xf); //
// printf("\textures_gen tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
// debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads
();
// __syncwarp();
#endif
#endif
// serially for each color, parallel for each camera
// copy clt (frequency domain data)
for
(
int
color
=
0
;
color
<
colors
;
color
++
){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
float
*
clt_tile
=
((
float
*
)
shr
.
clt_tiles
[
camera_num
][
color
]);
// start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
float
*
clt_tilei
=
clt_tile
+
threadIdx
.
x
;
float
*
gpu_tile
=
((
float
*
)
gpu_clt
[
camera_num
])
+
(
tile_num
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
float
*
mclt_tile
=
(
float
*
)
mclt_tiles
[
camera_num
][
color
];
float
*
mclt_dst
=
(
float
*
)
shr
.
mclt_debayer
[
camera_num
][
color
];
float
*
mclt_tmp
=
(
float
*
)
shr1
.
mclt_tmp
[
camera_num
][
color
];
// float scale = 0.25;
#pragma unroll
for
(
int
q
=
0
;
q
<
4
;
q
++
)
{
float
*
lpf
=
lpf_data
[(
colors
>
1
)
?
color
:
3
]
+
threadIdx
.
x
;
// lpf_data[3] - mono
#pragma unroll
for
(
int
i
=
0
;
i
<
DTT_SIZE
;
i
++
){
// copy 32 rows (4 quadrants of 8 rows)
// *clt_tilei = *gpu_tile * (*lpf) * scale;
*
clt_tilei
=
*
gpu_tile
*
(
*
lpf
);
clt_tilei
+=
DTT_SIZE1
;
gpu_tile
+=
DTT_SIZE
;
lpf
+=
DTT_SIZE
;
}
}
__syncthreads
();
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen LPF for color = %d
\n
"
,
color
);
debug_print_lpf
(
lpf_data
[(
colors
>
1
)
?
color
:
3
]);
printf
(
"
\n
textures_gen tile = %d, color = %d
\n
"
,
tile_num
,
color
);
debug_print_clt_scaled
(
clt_tile
,
color
,
0xf
,
0.25
);
//
}
__syncthreads
();
// __syncwarp();
#endif
// perform idct
imclt8threads
(
0
,
// int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile
,
// float * mclt_tile )
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
__syncthreads
();
// __syncwarp();
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen mclt color = %d
\n
"
,
color
);
debug_print_mclt
(
mclt_tile
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color
);
}
__syncthreads
();
// __syncwarp();
#endif
if
(
colors
>
1
)
{
debayer_shot
(
(
color
<
2
),
// const int rb_mode, // 0 - green, 1 - r/b
min_shot
,
// float min_shot, // 10.0
scale_shot
,
// float scale_shot, // 3.0 (0.0 for mono)
mclt_tile
,
// float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst
,
// float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp
,
// float * mclt_tmp,
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)));
// int debug);
__syncthreads
();
// __syncwarp();
}
else
{
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#pragma unroll
for
(
int
n
=
0
;
n
<=
DTT_SIZE
;
n
+=
DTT_SIZE
){
float
*
msp
=
mclt_tile
+
threadIdx
.
x
+
n
;
float
*
dst
=
mclt_dst
+
threadIdx
.
x
+
n
;
#pragma unroll
for
(
int
row
=
0
;
row
<
DTT_SIZE2
;
row
++
){
*
dst
=
*
msp
;
msp
+=
DTT_SIZE21
;
dst
+=
DTT_SIZE21
;
}
}
__syncthreads
();
}
#ifdef DEBUG77
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
printf
(
"
\n
textures_gen AFTER DEBAER cam= %d, color = %d
\n
"
,
threadIdx
.
y
,
color
);
debug_print_mclt
(
mclt_dst
,
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
printf
(
"
\n
textures_gen AFTER DEBAER0 cam= %d, color = %d
\n
"
,
threadIdx
.
y
,
0
);
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
0
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
}
// for (int color = 0; color < colors; color++)
__syncthreads
();
// __syncwarp();
/// return;
#ifdef DEBUG77
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for
(
int
nncol
=
0
;
nncol
<
colors
;
nncol
++
){
printf
(
"
\n
textures_gen AFTER DEBAER1 cam= %d, color = %d
\n
"
,
ccam
,
nncol
);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
nncol
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
}
}
__syncthreads
();
// __syncwarp();
#endif
#ifdef DEBUG77
for
(
int
ccam
=
0
;
ccam
<
NUM_CAMS
;
ccam
++
)
{
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
ccam
)){
for
(
int
nncol
=
0
;
nncol
<
colors
;
nncol
++
){
printf
(
"
\n
textures_gen AFTER DEBAER1 cam= %d, color = %d
\n
"
,
ccam
,
nncol
);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt
(
(
float
*
)
shr
.
mclt_debayer
[
ccam
][
nncol
],
// [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-
1
);
}
}
__syncthreads
();
// __syncwarp();
}
__syncthreads
();
// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
tile_combine_rgba
(
colors
,
// int colors, // number of colors
(
float
*
)
shr
.
mclt_debayer
,
// float * mclt_tile, // debayer // has gaps to align with union !
(
float
*
)
mclt_tiles
,
// float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(
float
*
)
shr1
.
rgbaw
,
// float * rgba, // result
(
float
*
)
0
,
// float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(
float
*
)
0
,
// float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(
float
*
)
port_offsets
,
// float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma
,
// float diff_sigma, // pixel value/pixel change
diff_threshold
,
// float diff_threshold, // pixel value/pixel change
min_agree
,
// float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights
,
// float * chn_weights, // color channel weights, sum == 1.0
dust_remove
,
// int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights
,
// int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(
tile_num
==
DBG_TILE
)
);
//int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
float
*
gpu_texture_tile
=
gpu_texture_tiles
+
tile_indx
*
texture_stride
;
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
int
i
=
row
*
DTT_SIZE21
+
col
;
int
gi
=
row
*
DTT_SIZE2
+
col
;
float
*
gpu_texture_tile_gi
=
gpu_texture_tile
+
gi
;
float
*
rgba_i
=
((
float
*
)
shr1
.
rgbaw
)
+
i
;
// always copy 3 (1) colors + alpha
if
(
colors
==
3
){
if
(
keep_weights
)
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
+
NUM_CAMS
+
NUM_COLORS
+
1
;
ncol
++
)
{
// 12
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
else
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
NUM_COLORS
+
1
;
ncol
++
)
{
// 4
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
}
else
{
// assuming colors = 1
if
(
keep_weights
)
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
+
NUM_CAMS
+
1
+
1
;
ncol
++
)
{
// 8
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
else
{
#pragma unroll
for
(
int
ncol
=
0
;
ncol
<
1
+
1
;
ncol
++
)
{
// 2
*
(
gpu_texture_tile_gi
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE2
))
=
*
(
rgba_i
+
ncol
*
(
DTT_SIZE2
*
DTT_SIZE21
));
}
}
}
}
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_gen tile done = %d, texture_stride= %d
\n
"
,
tile_num
,
(
int
)
texture_stride
);
}
__syncthreads
();
// __syncwarp();
#endif
}
#endif // ifdef USE_textures_gen
extern
"C"
extern
"C"
__global__
void
textures_accumulate
(
__global__
void
textures_accumulate
(
int
*
woi
,
// x, y, width,height
int
*
woi
,
// x, y, width,height
...
@@ -2300,31 +2044,71 @@ __global__ void textures_accumulate(
...
@@ -2300,31 +2044,71 @@ __global__ void textures_accumulate(
}
// textures_accumulate()
}
// textures_accumulate()
extern
"C"
__global__
void
imclt_rbg_all
(
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
**
gpu_corr_images
,
// [NUM_CAMS][WIDTH, 3 * HEIGHT]
int
apply_lpf
,
int
colors
,
int
woi_twidth
,
int
woi_theight
,
const
size_t
dstride
)
// in floats (pixels)
{
dim3
threads_imclt
(
IMCLT_THREADS_PER_TILE
,
IMCLT_TILES_PER_BLOCK
,
1
);
if
(
threadIdx
.
x
==
0
)
{
// anyway 1,1,1
for
(
int
ncam
=
0
;
ncam
<
NUM_CAMS
;
ncam
++
)
{
for
(
int
color
=
0
;
color
<
colors
;
color
++
)
{
for
(
int
v_offs
=
0
;
v_offs
<
2
;
v_offs
++
){
for
(
int
h_offs
=
0
;
h_offs
<
2
;
h_offs
++
){
int
tilesy_half
=
(
woi_theight
+
(
v_offs
^
1
))
>>
1
;
int
tilesx_half
=
(
woi_twidth
+
(
h_offs
^
1
))
>>
1
;
int
tiles_in_pass
=
tilesy_half
*
tilesx_half
;
dim3
grid_imclt
((
tiles_in_pass
+
IMCLT_TILES_PER_BLOCK
-
1
)
/
IMCLT_TILES_PER_BLOCK
,
1
,
1
);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg
<<<
grid_imclt
,
threads_imclt
>>>
(
gpu_clt
[
ncam
],
// float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images
[
ncam
],
// float * gpu_rbg, // WIDTH, 3 * HEIGHT
1
,
// int apply_lpf,
colors
,
// int colors, // defines lpf filter
color
,
// int color, // defines location of clt data
v_offs
,
// int v_offset,
h_offs
,
// int h_offset,
woi_twidth
,
// int woi_twidth, // will increase by DTT_SIZE (todo - cut away?)
woi_theight
,
// int woi_theight, // will increase by DTT_SIZE (todo - cut away?)
dstride
);
// const size_t dstride); // in floats (pixels)
cudaDeviceSynchronize
();
}
}
}
}
}
}
extern
"C"
extern
"C"
__global__
void
imclt_rbg
(
__global__
void
imclt_rbg
(
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_rbg
,
// WIDTH, 3 * HEIGHT
float
*
gpu_rbg
,
// WIDTH, 3 * HEIGHT
int
apply_lpf
,
int
apply_lpf
,
int
mono
,
int
colors
,
// was mono
int
color
,
int
color
,
int
v_offset
,
int
v_offset
,
int
h_offset
,
int
h_offset
,
int
woi_twidth
,
// will increase by DTT_SIZE (todo - cut away?)
int
woi_theight
,
// will increase by DTT_SIZE (todo - cut away?)
const
size_t
dstride
)
// in floats (pixels)
const
size_t
dstride
)
// in floats (pixels)
{
{
float
*
color_plane
=
gpu_rbg
+
dstride
*
(
IMG_HEIGHT
+
DTT_SIZE
)
*
color
;
float
*
color_plane
=
gpu_rbg
+
dstride
*
(
woi_theight
*
DTT_SIZE
+
DTT_SIZE
)
*
color
;
int
pass
=
(
v_offset
<<
1
)
+
h_offset
;
// 0..3 to correctly acummulate 16x16 tiles stride 8
int
pass
=
(
v_offset
<<
1
)
+
h_offset
;
// 0..3 to correctly acummulate 16x16 tiles stride 8
int
tile_in_block
=
threadIdx
.
y
;
int
tile_in_block
=
threadIdx
.
y
;
int
tile_num
=
blockIdx
.
x
*
IMCLT_TILES_PER_BLOCK
+
tile_in_block
;
int
tile_num
=
blockIdx
.
x
*
IMCLT_TILES_PER_BLOCK
+
tile_in_block
;
// if (tile_num >= (TILESY * TILESX)) {
int
tilesx_half
=
(
woi_twidth
+
(
h_offset
^
1
))
>>
1
;
// return; // just testing with a single tile
// }
// int tilesy_half = (TILESY + (v_offset ^ 1)) >> 1;
int
tilesx_half
=
(
TILESX
+
(
h_offset
^
1
))
>>
1
;
int
tileY_half
=
tile_num
/
tilesx_half
;
int
tileY_half
=
tile_num
/
tilesx_half
;
int
tileX_half
=
tile_num
-
tileY_half
*
tilesx_half
;
int
tileX_half
=
tile_num
-
tileY_half
*
tilesx_half
;
int
tileY
=
(
tileY_half
<<
1
)
+
v_offset
;
int
tileY
=
(
tileY_half
<<
1
)
+
v_offset
;
int
tileX
=
(
tileX_half
<<
1
)
+
h_offset
;
int
tileX
=
(
tileX_half
<<
1
)
+
h_offset
;
if
(
tileY
>=
TILESY
)
{
if
(
tileY
>=
woi_theight
)
{
return
;
// just testing with a single tile
return
;
// just testing with a single tile
}
}
#ifdef DEBUG4
#ifdef DEBUG4
...
@@ -2351,14 +2135,13 @@ __global__ void imclt_rbg(
...
@@ -2351,14 +2135,13 @@ __global__ void imclt_rbg(
// copy clt (frequency domain data)
// copy clt (frequency domain data)
float
*
clt_tile
=
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
float
*
clt_tile
=
((
float
*
)
clt_tiles
)
+
tile_in_block
*
(
4
*
DTT_SIZE
*
DTT_SIZE1
);
// top left quadrant0
// float * gpu_tile = ((float *) gpu_clt) + ((DBG_TILE_Y * TILESX + DBG_TILE_X) * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE); // top left quadrant0
float
*
gpu_tile
=
((
float
*
)
gpu_clt
)
+
((
tileY
*
woi_twidth
+
tileX
)
*
colors
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
);
// top left quadrant0
float
*
gpu_tile
=
((
float
*
)
gpu_clt
)
+
((
tileY
*
TILESX
+
tileX
)
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
);
// top left quadrant0
clt_tile
+=
column
+
thr3
;
// first 2 rows
clt_tile
+=
column
+
thr3
;
// first 2 rows
gpu_tile
+=
column
;
// first 2 rows
gpu_tile
+=
column
;
// first 2 rows
if
(
apply_lpf
)
{
if
(
apply_lpf
)
{
// lpf - covers 2 rows, as there there are 16 threads
// lpf - covers 2 rows, as there there are 16 threads
float
*
lpf0
=
lpf_data
[
mono
?
3
:
color
]
+
threadIdx
.
x
;
// lpf_data[3] - mono
float
*
lpf0
=
lpf_data
[
(
colors
==
1
)
?
3
:
color
]
+
threadIdx
.
x
;
// lpf_data[3] - mono
#pragma unroll
#pragma unroll
for
(
int
q
=
0
;
q
<
4
;
q
++
){
for
(
int
q
=
0
;
q
<
4
;
q
++
){
float
*
lpf
=
lpf0
;
float
*
lpf
=
lpf0
;
...
@@ -2787,16 +2570,19 @@ __device__ void convertCorrectTile(
...
@@ -2787,16 +2570,19 @@ __device__ void convertCorrectTile(
float
residual_shift
[
2
],
float
residual_shift
[
2
],
float
window_hor_cos
[
2
*
DTT_SIZE
],
float
window_hor_cos
[
2
*
DTT_SIZE
],
float
window_hor_sin
[
2
*
DTT_SIZE
],
float
window_hor_sin
[
2
*
DTT_SIZE
],
float
window_vert_cos
[
2
*
DTT_SIZE
])
float
window_vert_cos
[
2
*
DTT_SIZE
],
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
)
{
{
// TODO: pass these values instead of constants to handle EO/LWIR
// TODO: pass these values instead of constants to handle EO/LWIR
int
max_px
=
IMG_WIDTH
-
1
;
// odd
int
max_px
=
woi_width
-
1
;
//
IMG_WIDTH - 1; // odd
int
max_py
=
IMG_HEIGHT
-
1
;
// odd
int
max_py
=
woi_height
-
1
;
//
IMG_HEIGHT - 1; // odd
int
max_pxm1
=
max_px
-
1
;
// even
int
max_pxm1
=
max_px
-
1
;
// even
int
max_pym1
=
max_py
-
1
;
// even
int
max_pym1
=
max_py
-
1
;
// even
int
max_kernel_hor
=
KERNELS_HOR
-
1
;
int
max_kernel_hor
=
kernels_hor
-
1
;
//
KERNELS_HOR -1;
int
max_kernel_vert
=
KERNELS_VERT
-
1
;
int
max_kernel_vert
=
kernels_vert
-
1
;
//
KERNELS_VERT-1;
int
ktileX
,
ktileY
;
int
ktileX
,
ktileY
;
int
kernel_index
;
// common for all coors
int
kernel_index
;
// common for all coors
...
@@ -2806,7 +2592,7 @@ __device__ void convertCorrectTile(
...
@@ -2806,7 +2592,7 @@ __device__ void convertCorrectTile(
ktileY
=
min
(
max_kernel_vert
,
max
(
0
,
((
int
)
lrintf
(
centerY
*
(
1.0
/
KERNELS_STEP
)
+
1
))));
ktileY
=
min
(
max_kernel_vert
,
max
(
0
,
((
int
)
lrintf
(
centerY
*
(
1.0
/
KERNELS_STEP
)
+
1
))));
kdx
=
centerX
-
(
ktileX
<<
KERNELS_LSTEP
)
+
(
1
<<
(
KERNELS_LSTEP
-
1
));
// difference in pixel
kdx
=
centerX
-
(
ktileX
<<
KERNELS_LSTEP
)
+
(
1
<<
(
KERNELS_LSTEP
-
1
));
// difference in pixel
kdy
=
centerY
-
(
ktileY
<<
KERNELS_LSTEP
)
+
(
1
<<
(
KERNELS_LSTEP
-
1
));
// difference in pixel
kdy
=
centerY
-
(
ktileY
<<
KERNELS_LSTEP
)
+
(
1
<<
(
KERNELS_LSTEP
-
1
));
// difference in pixel
kernel_index
=
(
ktileX
+
ktileY
*
KERNELS_HOR
)
*
NUM_COLORS
;
kernel_index
=
(
ktileX
+
ktileY
*
kernels_hor
)
*
NUM_COLORS
;
}
}
// broadcast kernel_index
// broadcast kernel_index
kernel_index
=
__shfl_sync
(
kernel_index
=
__shfl_sync
(
...
@@ -3157,7 +2943,6 @@ __device__ void convertCorrectTile(
...
@@ -3157,7 +2943,6 @@ __device__ void convertCorrectTile(
}
}
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG3
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
if
((
threadIdx
.
x
)
==
0
){
printf
(
"
\n
DTT Tiles after LPF, color = %d
\n
"
,
color
);
printf
(
"
\n
DTT Tiles after LPF, color = %d
\n
"
,
color
);
...
@@ -3165,33 +2950,22 @@ __device__ void convertCorrectTile(
...
@@ -3165,33 +2950,22 @@ __device__ void convertCorrectTile(
printf
(
"
\n
DTT All done
\n
"
);
printf
(
"
\n
DTT All done
\n
"
);
}
}
__syncthreads
();
// __syncwarp();
__syncthreads
();
// __syncwarp();
#endif
#endif
#endif
}
}
// const int tx = txy & 0xffff; // slow again
// const int ty = txy >> 16;
int
offset_src
=
threadIdx
.
x
;
int
offset_src
=
threadIdx
.
x
;
// int offset_dst = ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
// int offset_dst = ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
#ifdef USE_UMUL24
int
offset_dst
=
__umul24
(
__umul24
(
__umul24
(
txy
>>
16
,
TILESX
)
+
(
txy
&
0xfff
)
,
NUM_COLORS
)
+
color
,
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
#else
int
offset_dst
=
(((
txy
>>
16
)
*
TILESX
+
(
txy
&
0xfff
))
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
int
offset_dst
=
(((
txy
>>
16
)
*
TILESX
+
(
txy
&
0xfff
))
*
NUM_COLORS
+
color
)
*
(
4
*
DTT_SIZE
*
DTT_SIZE
)
+
threadIdx
.
x
;
#endif
float
*
clt_src
=
clt_tile
+
offset_src
;
// threadIdx.x;
float
*
clt_src
=
clt_tile
+
offset_src
;
// threadIdx.x;
float
*
clt_dst
=
gpu_clt
+
offset_dst
;
// ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE1) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float
*
clt_dst
=
gpu_clt
+
offset_dst
;
// ((ty * TILESX + tx)*NUM_COLORS + color)* ( 4 * DTT_SIZE * DTT_SIZE1) + threadIdx.x; // gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
//#ifndef NOICLT
//#ifndef NOICLT
#ifdef DBG_TILE
#ifdef DEBUG3
#ifdef DEBUG3
if
((
threadIdx
.
x
)
==
0
){
if
((
threadIdx
.
x
)
==
0
){
printf
(
"clt_src = 0x%lx
\n
"
,
clt_src
);
printf
(
"clt_src = 0x%lx
\n
"
,
clt_src
);
printf
(
"clt_dst = 0x%lx
\n
"
,
clt_dst
);
printf
(
"clt_dst = 0x%lx
\n
"
,
clt_dst
);
}
}
#endif
#endif
#endif
...
...
src/main/resources/kernels/TileProcessor.h
View file @
0bb31239
...
@@ -51,7 +51,12 @@ __global__ void convert_correct_tiles(
...
@@ -51,7 +51,12 @@ __global__ void convert_correct_tiles(
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t
dstride
,
// in floats (pixels)
size_t
dstride
,
// in floats (pixels)
int
num_tiles
,
// number of tiles in task
int
num_tiles
,
// number of tiles in task
int
lpf_mask
);
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
lpf_mask
,
// apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int
woi_width
,
int
woi_height
,
int
kernels_hor
,
int
kernels_vert
);
extern
"C"
__global__
void
clear_texture_list
(
extern
"C"
__global__
void
clear_texture_list
(
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
int
*
gpu_texture_indices
,
// packed tile + bits (now only (1 << 7)
...
@@ -104,6 +109,16 @@ extern "C" __global__ void textures_accumulate(
...
@@ -104,6 +109,16 @@ extern "C" __global__ void textures_accumulate(
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
size_t
texture_stride
,
// in floats (now 256*4 = 1024)
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
extern
"C"
__global__
void
imclt_rbg_all
(
float
**
gpu_clt
,
// [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
**
gpu_corr_images
,
// [NUM_CAMS][WIDTH, 3 * HEIGHT]
int
apply_lpf
,
int
colors
,
int
woi_twidth
,
int
woi_theight
,
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
__global__
void
imclt_rbg
(
extern
"C"
__global__
void
imclt_rbg
(
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_clt
,
// [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float
*
gpu_rbg
,
// WIDTH, 3 * HEIGHT
float
*
gpu_rbg
,
// WIDTH, 3 * HEIGHT
...
@@ -112,6 +127,8 @@ extern "C" __global__ void imclt_rbg(
...
@@ -112,6 +127,8 @@ extern "C" __global__ void imclt_rbg(
int
color
,
// defines location of clt data
int
color
,
// defines location of clt data
int
v_offset
,
int
v_offset
,
int
h_offset
,
int
h_offset
,
int
woi_twidth
,
int
woi_theight
,
const
size_t
dstride
);
// in floats (pixels)
const
size_t
dstride
);
// in floats (pixels)
extern
"C"
extern
"C"
...
@@ -144,5 +161,3 @@ __global__ void generate_RBGA(
...
@@ -144,5 +161,3 @@ __global__ void generate_RBGA(
const
size_t
texture_rbga_stride
,
// in floats
const
size_t
texture_rbga_stride
,
// in floats
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
float
*
gpu_texture_tiles
);
// (number of colors +1 + ?)*16*16 rgba texture tiles
src/main/resources/kernels/geometry_correction.h
View file @
0bb31239
...
@@ -114,9 +114,9 @@ struct gc {
...
@@ -114,9 +114,9 @@ struct gc {
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA7
;
//r^7 (normalized to focal length or to sensor half width?)
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
float
distortionA8
;
//r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
#ifndef NVRTC_BUG
//
};
};
//
float rad_coeff [7];
float
rad_coeff
[
7
];
//
};
};
#endif
#endif
// parameters, common for all sensors
// parameters, common for all sensors
float
elevation
;
// degrees, up - positive;
float
elevation
;
// degrees, up - positive;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment