Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
c198d5f3
Commit
c198d5f3
authored
Oct 01, 2018
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
more testing
parent
b0ba6ef5
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
79 additions
and
43 deletions
+79
-43
TileProcessor.cuh
src/main/resources/TileProcessor.cuh
+79
-43
No files found.
src/main/resources/TileProcessor.cuh
View file @
c198d5f3
...
@@ -40,7 +40,13 @@
...
@@ -40,7 +40,13 @@
#pragma once
#pragma once
#include "dtt8x8.cuh"
#include "dtt8x8.cuh"
// Not enough shared memory to have more threads per block,even just for the result clt tiles
// Not enough shared memory to have more threads per block,even just for the result clt tiles
#define TILES_PER_BLOCK 2
// What to do:
// 1) make single image aberration correction: 1/4 of the result tiles
// With 4 cameras = calculate correlations (9x9), reusing kernel or just clt ones after color reducing, then output them to device memory
//Average run time =12502.638672 - with 2 tiles/block it is longer!
//#define TILES_PER_BLOCK 2
//Average run time =9656.743164 ms
#define TILES_PER_BLOCK 1
#define THREADS_PER_TILE 32
#define THREADS_PER_TILE 32
#define IMG_WIDTH 2592
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define IMG_HEIGHT 1936
...
@@ -70,9 +76,9 @@
...
@@ -70,9 +76,9 @@
#define DBG_TILE_X 174
#define DBG_TILE_X 174
#define DBG_TILE_Y 118
#define DBG_TILE_Y 118
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
//
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DEBUG1 1
//
#define DEBUG1 1
#undef DEBUG2
//
#undef DEBUG2
//56494
//56494
// struct tp_task
// struct tp_task
//#define TASK_SIZE 12
//#define TASK_SIZE 12
...
@@ -312,6 +318,7 @@ __global__ void tileProcessor(
...
@@ -312,6 +318,7 @@ __global__ void tileProcessor(
window_hor_cos
[
tile_in_block
],
// float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_cos
[
tile_in_block
],
// float window_hor_cos [NUM_COLORS][2*DTT_SIZE],
window_hor_sin
[
tile_in_block
],
//float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_hor_sin
[
tile_in_block
],
//float window_hor_sin [NUM_COLORS][2*DTT_SIZE],
window_vert_cos
[
tile_in_block
]);
//float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
window_vert_cos
[
tile_in_block
]);
//float window_vert_cos [NUM_COLORS][2*DTT_SIZE]);
__syncthreads
();
}
}
}
}
...
@@ -593,6 +600,7 @@ __device__ void convertCorrectTile(
...
@@ -593,6 +600,7 @@ __device__ void convertCorrectTile(
py
=
centerY
-
DTT_SIZE
-
(
clt_extra
->
data_y
+
clt_extra
->
dyc_dx
*
kdx
+
clt_extra
->
dyc_dy
*
kdy
)
;
// fractional top corner
py
=
centerY
-
DTT_SIZE
-
(
clt_extra
->
data_y
+
clt_extra
->
dyc_dx
*
kdx
+
clt_extra
->
dyc_dy
*
kdy
)
;
// fractional top corner
int
itly
=
(
int
)
floorf
(
py
+
0.5
f
);
int
itly
=
(
int
)
floorf
(
py
+
0.5
f
);
int_topleft
[
bayer_color
][
1
]
=
itly
;
int_topleft
[
bayer_color
][
1
]
=
itly
;
/// float shift_vert = py - itly;
/// float shift_vert = py - itly;
float
shift_vert
=
itly
-
py
;
float
shift_vert
=
itly
-
py
;
residual_shift
[
bayer_color
][
1
]
=
shift_vert
;
residual_shift
[
bayer_color
][
1
]
=
shift_vert
;
...
@@ -635,27 +643,42 @@ __device__ void convertCorrectTile(
...
@@ -635,27 +643,42 @@ __device__ void convertCorrectTile(
// prepare, fold and write data to DTT buffers
// prepare, fold and write data to DTT buffers
int
dstride2
=
dstride
<<
1
;
// in floats (pixels)
int
dstride2
=
dstride
<<
1
;
// in floats (pixels)
int
bayer_color
=
min
((
NUM_COLORS
-
1
),
threadIdx
.
y
);
int
bayer_color
=
min
((
NUM_COLORS
-
1
),
threadIdx
.
y
);
// TODO: Make a special case for border tiles
if
(
bayer_color
<
BAYER_GREEN
){
// process R and B (2 * 8 threads) threads 0..15
if
(
bayer_color
<
BAYER_GREEN
){
// process R and B (2 * 8 threads) threads 0..15
// Find correct column and start row for each of the 8 participating threads
// Find correct column and start row for each of the 8 participating threads
int
col_tl
=
int_topleft
[
bayer_color
][
0
];
// + (threadIdx.x << 1);
int
col_tl
=
int_topleft
[
bayer_color
][
0
];
// + (threadIdx.x << 1);
int
row_tl
=
int_topleft
[
bayer_color
][
1
];
int
row_tl
=
int_topleft
[
bayer_color
][
1
];
int
local_col
=
((
col_tl
&
1
)
^
BAYER_RED_COL
^
bayer_color
)
+
(
threadIdx
.
x
<<
1
);
int
local_col
=
((
col_tl
&
1
)
^
BAYER_RED_COL
^
bayer_color
)
+
(
threadIdx
.
x
<<
1
);
int
local_row
=
((
row_tl
&
1
)
^
BAYER_RED_ROW
^
bayer_color
);
int
local_row
=
((
row_tl
&
1
)
^
BAYER_RED_ROW
^
bayer_color
);
float
*
image_p
=
gpu_images
+
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
float
hwind_cos
=
window_hor_cos
[
bayer_color
][
local_col
];
float
hwind_cos
=
window_hor_cos
[
bayer_color
][
local_col
];
int
dtt_offset
=
fold_indx2
[
local_row
][
local_col
];
int
dtt_offset
=
fold_indx2
[
local_row
][
local_col
];
int
dtt_offset_inc
=
fold_inc
[
local_row
];
int
dtt_offset_inc
=
fold_inc
[
local_row
];
float
*
dtt_buf
=
(
float
*
)
clt_tile
[
bayer_color
][
0
];
float
*
dtt_buf
=
(
float
*
)
clt_tile
[
bayer_color
][
0
];
if
((
col_tl
>=
0
)
&&
((
col_tl
<
(
IMG_WIDTH
-
DTT_SIZE
*
2
)))
&&
(
row_tl
>=
0
)
&&
((
row_tl
<
(
IMG_HEIGHT
-
DTT_SIZE
*
2
))))
{
float
*
image_p
=
gpu_images
+
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows (DTT_SIZE1)
dtt_buf
[
dtt_offset1
]
=
(
*
image_p
)
*
hwind_cos
*
window_vert_cos
[
bayer_color
][
local_row
];
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset_inc
>>=
4
;
local_row
+=
2
;
image_p
+=
dstride2
;
}
}
else
{
// handling border tiles
int
eff_col
=
(
min
(
IMG_HEIGHT
/
2
-
1
,
max
(
0
,
col_tl
>>
1
))
<<
1
)
+
(
col_tl
&
1
);
int
row_lsb
=
row_tl
&
1
;
int
row_pair
=
row_tl
>>
1
;
float
*
image_p
=
gpu_images
+
dstride
*
local_row
+
(
eff_col
+
local_col
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
// dtt_buf[dtt_offset] = (*image_p) * hwind_cos * window_vert_cos[bayer_color][local_row];
int
eff_row
=
(
min
(
IMG_WIDTH
/
2
-
1
,
max
(
0
,
row_pair
+
i
))
<<
1
)
+
row_lsb
;
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows (DTT_SIZE1)
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows (DTT_SIZE1)
float
dbg_pix
=
(
*
image_p
);
dtt_buf
[
dtt_offset1
]
=
image_p
[
dstride
*
eff_row
]
*
hwind_cos
*
window_vert_cos
[
bayer_color
][
local_row
];
dtt_buf
[
dtt_offset1
]
=
(
*
image_p
)
*
hwind_cos
*
window_vert_cos
[
bayer_color
][
local_row
];
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset_inc
>>=
4
;
dtt_offset_inc
>>=
4
;
local_row
+=
2
;
local_row
+=
2
;
}
image_p
+=
dstride2
;
}
}
}
else
{
// process green color threads 16..31
}
else
{
// process green color threads 16..31
// no need to sync here
// no need to sync here
...
@@ -664,30 +687,43 @@ __device__ void convertCorrectTile(
...
@@ -664,30 +687,43 @@ __device__ void convertCorrectTile(
// Find correct column and start row for each of the 8 participating threads
// Find correct column and start row for each of the 8 participating threads
int
col_tl
=
int_topleft
[
BAYER_GREEN
][
0
];
// + (threadIdx.x << 1);
int
col_tl
=
int_topleft
[
BAYER_GREEN
][
0
];
// + (threadIdx.x << 1);
int
row_tl
=
int_topleft
[
BAYER_GREEN
][
1
];
int
row_tl
=
int_topleft
[
BAYER_GREEN
][
1
];
int
local_col
=
((
col_tl
&
1
)
^
(
BAYER_RED_COL
^
1
)
^
ipass
)
+
(
threadIdx
.
x
<<
1
);
// green red row: invert column from red
int
local_col
=
((
col_tl
&
1
)
^
(
BAYER_RED_COL
^
1
)
^
ipass
)
+
(
threadIdx
.
x
<<
1
);
// green red row: invert column from red
int
local_row
=
((
row_tl
&
1
)
^
BAYER_RED_ROW
^
ipass
);
// use red row
int
local_row
=
((
row_tl
&
1
)
^
BAYER_RED_ROW
^
ipass
);
// use red row
int
dbg_image_offset
=
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
float
hwind_cos
=
window_hor_cos
[
BAYER_GREEN
][
local_col
];
float
*
image_p
=
gpu_images
+
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
float
hwind_sin
=
window_hor_sin
[
BAYER_GREEN
][
local_col
];
float
dbg_pix1
=
gpu_images
[
dbg_image_offset
];
int
dtt_offset
=
fold_indx2
[
local_row
][
local_col
];
float
*
dbg_pix2_p
=
gpu_images
+
dbg_image_offset
;
int
dtt_offset_inc
=
fold_inc
[
local_row
];
float
dbg_pix2
=
*
dbg_pix2_p
;
float
*
dct_buf
=
(
float
*
)
clt_tile
[
BAYER_GREEN
][
ipass
<<
1
];
// use 2 buffers, second - borrowing from rotated DTT
float
hwind_cos
=
window_hor_cos
[
BAYER_GREEN
][
local_col
];
float
*
dst_buf
=
(
float
*
)
clt_tile
[
BAYER_GREEN
][(
ipass
<<
1
)
+
1
];
float
hwind_sin
=
window_hor_sin
[
BAYER_GREEN
][
local_col
];
if
((
col_tl
>=
0
)
&&
((
col_tl
<
(
IMG_WIDTH
-
DTT_SIZE
*
2
)))
&&
(
row_tl
>=
0
)
&&
((
row_tl
<
(
IMG_HEIGHT
-
DTT_SIZE
*
2
))))
{
int
dtt_offset
=
fold_indx2
[
local_row
][
local_col
];
float
*
image_p
=
gpu_images
+
dstride
*
(
row_tl
+
local_row
)
+
col_tl
+
local_col
;
int
dtt_offset_inc
=
fold_inc
[
local_row
];
#pragma unroll
float
*
dct_buf
=
(
float
*
)
clt_tile
[
BAYER_GREEN
][
ipass
<<
1
];
// use 2 buffers, second - borrowing from rotated DTT
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
float
*
dst_buf
=
(
float
*
)
clt_tile
[
BAYER_GREEN
][(
ipass
<<
1
)
+
1
];
float
d
=
(
*
image_p
)
*
window_vert_cos
[
BAYER_GREEN
][
local_row
];
//warp illegal address (0,2,1)
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows
dct_buf
[
dtt_offset1
]
=
d
*
hwind_cos
;
// was +=
dst_buf
[
dtt_offset1
]
=
d
*
hwind_sin
;
// was +=
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset_inc
>>=
4
;
local_row
+=
2
;
image_p
+=
dstride2
;
}
}
else
{
// handling border tiles
int
eff_col
=
(
min
(
IMG_HEIGHT
/
2
-
1
,
max
(
0
,
col_tl
>>
1
))
<<
1
)
+
(
col_tl
&
1
);
int
row_lsb
=
row_tl
&
1
;
int
row_pair
=
row_tl
>>
1
;
float
*
image_p
=
gpu_images
+
dstride
*
local_row
+
(
eff_col
+
local_col
);
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
float
d
=
(
*
image_p
)
*
window_vert_cos
[
BAYER_GREEN
][
local_row
]
;
int
eff_row
=
(
min
(
IMG_WIDTH
/
2
-
1
,
max
(
0
,
row_pair
+
i
))
<<
1
)
+
row_lsb
;
float
dbg_pix
=
(
*
image_p
)
;
float
d
=
image_p
[
dstride
*
eff_row
]
*
window_vert_cos
[
BAYER_GREEN
][
local_row
]
;
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows
int
dtt_offset1
=
dtt_offset
+
(
dtt_offset
>>
3
);
// converting for 9-long rows
dct_buf
[
dtt_offset1
]
=
d
*
hwind_cos
;
// was +=
dct_buf
[
dtt_offset1
]
=
d
*
hwind_cos
;
// was +=
dst_buf
[
dtt_offset1
]
=
d
*
hwind_sin
;
// was +=
dst_buf
[
dtt_offset1
]
=
d
*
hwind_sin
;
// was +=
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset
=
(
dtt_offset
+
((
dtt_offset_inc
&
0xf
)
<<
3
))
&
0x3f
;
dtt_offset_inc
>>=
4
;
dtt_offset_inc
>>=
4
;
local_row
+=
2
;
local_row
+=
2
;
image_p
+=
dstride2
;
}
}
}
}
}
__syncthreads
();
__syncthreads
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment