Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
I
imagej-elphel
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
3
Issues
3
List
Board
Labels
Milestones
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
Elphel
imagej-elphel
Commits
b5bfb231
Commit
b5bfb231
authored
Apr 05, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
debugging RGBA texture generation
parent
975dadb4
Changes
6
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
355 additions
and
83 deletions
+355
-83
ThermalColor.java
src/main/java/com/elphel/imagej/cameras/ThermalColor.java
+13
-0
GPUTileProcessor.java
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+103
-38
QuadCLT.java
src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+78
-15
TwoQuadCLT.java
...main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+60
-8
TileProcessor.cuh
src/main/resources/kernels/TileProcessor.cuh
+98
-21
dtt8x8.cuh
src/main/resources/kernels/dtt8x8.cuh
+3
-1
No files found.
src/main/java/com/elphel/imagej/cameras/ThermalColor.java
View file @
b5bfb231
...
...
@@ -38,6 +38,19 @@ public class ThermalColor {
return
rslt
;
}
/**
 * Converts a scalar value into an interpolated palette color.
 *
 * <p>The input is mapped linearly so that {@code min} corresponds to the first palette
 * entry and {@code max} to the last one; positions in between are linearly interpolated
 * between the two neighboring entries. Every component is multiplied by
 * {@code out_range / PALETTE_RANGE}.
 *
 * <p>Inputs at or below {@code min} yield the first palette entry and inputs at or above
 * {@code max} yield the last one. The end entries are read directly: the earlier
 * implementation re-entered this method with {@code max}, which mapped to the same
 * out-of-range index again and recursed without bound. A NaN input produces NaN
 * components.
 *
 * @param v value to convert, nominally within {@code min}..{@code max}
 * @return a new 3-element array holding the R, G, B components
 */
public float [] getRGB(float v) {
    final double k = out_range / PALETTE_RANGE;
    final int last = this.palette.length - 1;
    // Fractional position of v inside the palette: 0 -> first entry, last -> last entry
    final double value = (v - min) / (max - min) * last;
    int lo;
    int hi;
    double a; // interpolation weight of the upper neighbor, 0..1
    if (value <= 0) {
        // Clamp on the un-truncated position: (int) truncates toward zero, so testing the
        // truncated index would let -1 < value < 0 extrapolate below the first entry.
        lo = 0;
        hi = 0;
        a = 0.0;
    } else if (value >= last) {
        lo = last;
        hi = last;
        a = 0.0;
    } else {
        lo = (int) value; // NaN is converted to 0 and then propagates through 'a'
        hi = Math.min(lo + 1, last);
        a = value - lo;
    }
    final float [] rslt = new float [3];
    for (int c = 0; c < rslt.length; c++) {
        rslt[c] = (float) (k * ((1 - a) * this.palette[lo][c] + a * this.palette[hi][c]));
    }
    return rslt;
}
private
int
[]
setupPalette
(
int
indx
)
{
//https://stackoverflow.com/questions/28495390/thermal-imaging-palette
...
...
src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
View file @
b5bfb231
This diff is collapsed.
Click to expand it.
src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
View file @
b5bfb231
...
...
@@ -5350,11 +5350,73 @@ public class QuadCLT {
ShowDoubleFloatArrays
sdfa_instance
=
new
ShowDoubleFloatArrays
();
// just for debugging?
// convert to ImageStack of 3 slices
String
[]
sliceNames
=
{
"red"
,
"blue"
,
"green"
};
int
green_index
=
2
;
float
[][]
rbg_in
=
{
iclt_data
[
0
],
iclt_data
[
1
],
iclt_data
[
2
]};
float
[]
alpha
=
null
;
// (0..1.0)
float
[][]
rgb_in
=
{
iclt_data
[
0
],
iclt_data
[
1
],
iclt_data
[
2
]};
//
float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
if
(
iclt_data
.
length
>
3
)
alpha
=
iclt_data
[
3
];
if
(
isLwir
())
{
String
[]
rgb_titles
=
{
"red"
,
"green"
,
"blue"
};
String
[]
rgba_titles
=
{
"red"
,
"green"
,
"blue"
,
"alpha"
};
String
[]
titles
=
(
alpha
==
null
)
?
rgb_titles
:
rgba_titles
;
int
num_slices
=
(
alpha
==
null
)
?
3
:
4
;
double
mn
=
colorProcParameters
.
lwir_low
;
double
mx
=
colorProcParameters
.
lwir_high
;
double
[]
cold_hot
=
getColdHot
();
if
(
cold_hot
!=
null
)
{
mn
=
cold_hot
[
0
];
mx
=
cold_hot
[
1
];
}
double
offset
=
getLwirOffset
();
if
(!
Double
.
isNaN
(
offset
))
{
mn
-=
offset
;
mx
-=
offset
;
}
ThermalColor
tc
=
new
ThermalColor
(
colorProcParameters
.
lwir_palette
,
mn
,
mx
,
255.0
);
float
[][]
rgba
=
new
float
[
num_slices
][];
for
(
int
i
=
0
;
i
<
3
;
i
++)
rgba
[
i
]
=
new
float
[
iclt_data
[
green_index
].
length
];
for
(
int
i
=
0
;
i
<
rbg_in
[
green_index
].
length
;
i
++)
{
if
(
i
==
700
)
{
System
.
out
.
println
(
"linearStackToColor(): i="
+
i
);
}
float
[]
rgb
=
tc
.
getRGB
(
iclt_data
[
green_index
][
i
]);
rgba
[
0
][
i
]
=
rgb
[
0
];
// red
rgba
[
1
][
i
]
=
rgb
[
1
];
// green
rgba
[
2
][
i
]
=
rgb
[
2
];
// blue
}
if
(
alpha
!=
null
)
{
rgba
[
3
]
=
alpha
;
// 0..1
}
ImageStack
stack
=
sdfa_instance
.
makeStack
(
rgba
,
// iclt_data,
width
,
// (tilesX + 0) * clt_parameters.transform_size,
height
,
// (tilesY + 0) * clt_parameters.transform_size,
titles
,
// or use null to get chn-nn slice names
true
);
// replace NaN with 0.0
ImagePlus
imp_rgba
=
EyesisCorrections
.
convertRGBAFloatToRGBA32
(
stack
,
// ImageStack stackFloat, //r,g,b,a
// name+"ARGB"+suffix, // String title,
name
+
suffix
,
// String title,
0.0
,
// double r_min,
255.0
,
// double r_max,
0.0
,
// double g_min,
255.0
,
// double g_max,
0.0
,
// double b_min,
255.0
,
// double b_max,
0.0
,
// double alpha_min,
1.0
);
// double alpha_max)
return
imp_rgba
;
}
ImageStack
stack
=
sdfa_instance
.
makeStack
(
rgb_in
,
// iclt_data,
// rgb_in, // iclt_data,
rbg_in
,
// iclt_data,
width
,
// (tilesX + 0) * clt_parameters.transform_size,
height
,
// (tilesY + 0) * clt_parameters.transform_size,
sliceNames
,
// or use null to get chn-nn slice names
...
...
@@ -5375,8 +5437,6 @@ public class QuadCLT {
height
,
// int height, // int tilesY,
scaleExposure
,
// double scaleExposure,
debugLevel
);
//int debugLevel
}
// double data
...
...
@@ -5460,16 +5520,16 @@ public class QuadCLT {
true
);
// replace NaN with 0.0
ImagePlus
imp_rgba
=
EyesisCorrections
.
convertRGBAFloatToRGBA32
(
stack
,
// ImageStack stackFloat, //r,g,b,a
// name+"ARGB"+suffix, // String title,
name
+
suffix
,
// String title,
0.0
,
// double r_min,
255.0
,
// double r_max,
0.0
,
// double g_min,
255.0
,
// double g_max,
0.0
,
// double b_min,
255.0
,
// double b_max,
0.0
,
// double alpha_min,
1.0
);
// double alpha_max)
// name+"ARGB"+suffix, // String title,
name
+
suffix
,
// String title,
0.0
,
// double r_min,
255.0
,
// double r_max,
0.0
,
// double g_min,
255.0
,
// double g_max,
0.0
,
// double b_min,
255.0
,
// double b_max,
0.0
,
// double alpha_min,
1.0
);
// double alpha_max)
return
imp_rgba
;
}
...
...
@@ -5499,10 +5559,13 @@ public class QuadCLT {
debugLevel
);
//int debugLevel
}
// Convert a single value pixels to color (r,b,g) values to be processed instead of the normal colors
public
ImagePlus
linearStackToColor
(
// USED in lwir
public
ImagePlus
linearStackToColor
(
CLTParameters
clt_parameters
,
ColorProcParameters
colorProcParameters
,
EyesisCorrectionParameters
.
RGBParameters
rgbParameters
,
...
...
src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
View file @
b5bfb231
...
...
@@ -2061,21 +2061,36 @@ public class TwoQuadCLT {
clt_parameters
.
diff_threshold
,
// double diff_threshold, // pixel value/pixel change
clt_parameters
.
min_agree
,
// double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters
.
dust_remove
,
// boolean dust_remove,
clt_parameters
.
keep_weights
);
// boolean keep_weights); // int corr_radius
clt_parameters
.
keep_weights
);
// boolean keep_weights);
long
endTextures
=
System
.
nanoTime
();
// run texturesRBGA
long
startTexturesRBGA
=
System
.
nanoTime
();
// System.nanoTime();
for
(
int
i
=
0
;
i
<
NREPEAT
;
i
++
)
gPUTileProcessor
.
execRBGA
(
port_offsets
,
// double [][] port_offsets,
col_weights
,
// double [] color_weights,
quadCLT_main
.
isLwir
(),
// boolean is_lwir,
clt_parameters
.
min_shot
,
// double min_shot, // 10.0
clt_parameters
.
scale_shot
,
// double scale_shot, // 3.0
clt_parameters
.
diff_sigma
,
// double diff_sigma, // pixel value/pixel change
clt_parameters
.
diff_threshold
,
// double diff_threshold, // pixel value/pixel change
clt_parameters
.
min_agree
,
// double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters
.
dust_remove
);
// boolean dust_remove,
long
endTexturesRBGA
=
System
.
nanoTime
();
long
endGPUTime
=
System
.
nanoTime
();
long
firstGPUTime
=
(
startIMCLT
-
startGPU
)/
NREPEAT
;
long
runImcltTime
=
(
endImcltTime
-
startIMCLT
)/
NREPEAT
;
long
runCorr2DTime
=
(
endCorr2d
-
startCorr2d
)/
NREPEAT
;
long
runTexturesTime
=
(
endTextures
-
startTextures
)/
NREPEAT
;
long
runGPUTime
=
(
endGPUTime
-
startGPU
)/
NREPEAT
;
long
firstGPUTime
=
(
startIMCLT
-
startGPU
)
/
NREPEAT
;
long
runImcltTime
=
(
endImcltTime
-
startIMCLT
)
/
NREPEAT
;
long
runCorr2DTime
=
(
endCorr2d
-
startCorr2d
)
/
NREPEAT
;
long
runTexturesTime
=
(
endTextures
-
startTextures
)
/
NREPEAT
;
long
runTexturesRBGATime
=
(
endTexturesRBGA
-
startTexturesRBGA
)/
NREPEAT
;
long
runGPUTime
=
(
endGPUTime
-
startGPU
)
/
NREPEAT
;
// run corr2d
System
.
out
.
println
(
"\n------------ End of running GPU "
+
NREPEAT
+
" times ----------------"
);
System
.
out
.
println
(
"GPU run time ="
+(
runGPUTime
*
1.0
e
-
6
)+
"ms, (direct conversion: "
+(
firstGPUTime
*
1.0
e
-
6
)+
"ms, imclt: "
+
(
runImcltTime
*
1.0
e
-
6
)+
"ms), corr2D: "
+(
runCorr2DTime
*
1.0
e
-
6
)+
"ms), textures: "
+(
runTexturesTime
*
1.0
e
-
6
)+
"ms"
);
(
runImcltTime
*
1.0
e
-
6
)+
"ms), corr2D: "
+(
runCorr2DTime
*
1.0
e
-
6
)+
"ms), textures: "
+(
runTexturesTime
*
1.0
e
-
6
)+
"ms, RGBA: "
+
(
runTexturesRBGATime
*
1.0
e
-
6
)+
"ms"
);
// get data back from GPU
float
[][][]
iclt_fimg
=
new
float
[
GPUTileProcessor
.
NUM_CAMS
][][];
for
(
int
ncam
=
0
;
ncam
<
iclt_fimg
.
length
;
ncam
++)
{
...
...
@@ -2190,6 +2205,41 @@ public class TwoQuadCLT {
debugLevel
);
}
// Use GPU prepared RBGA
if
(
clt_parameters
.
show_rgba_color
)
{
Rectangle
woi
=
new
Rectangle
();
float
[][]
rbga
=
gPUTileProcessor
.
getRBGA
(
(
is_mono
?
1
:
3
),
// int num_colors,
woi
);
// for now - use just RGB. Later add option for RGBA
float
[][]
rgb_main
=
{
rbga
[
0
],
rbga
[
1
],
rbga
[
2
]};
float
[][]
rgba_main
=
{
rbga
[
0
],
rbga
[
1
],
rbga
[
2
],
rbga
[
3
]};
ImagePlus
imp_rgba_main
=
quadCLT_main
.
linearStackToColor
(
clt_parameters
,
colorProcParameters
,
rgbParameters
,
name
+
"-texture"
,
// String name,
"-D"
+
clt_parameters
.
disparity
+
"-MAINGPU"
,
//String suffix, // such as disparity=...
toRGB
,
!
quadCLT_main
.
correctionsParameters
.
jpeg
,
// boolean bpp16, // 16-bit per channel color mode for result
false
,
// true, // boolean saveShowIntermediate, // save/show if set globally
false
,
// true, // boolean saveShowFinal, // save/show result (color image?)
((
clt_parameters
.
alpha1
>
0
)?
rgba_main:
rgb_main
),
tilesX
*
image_dtt
.
transform_size
,
tilesY
*
image_dtt
.
transform_size
,
1.0
,
// double scaleExposure, // is it needed?
debugLevel
);
int
width
=
imp_rgba_main
.
getWidth
();
int
height
=
imp_rgba_main
.
getHeight
();
ImageStack
texture_stack
=
new
ImageStack
(
width
,
height
);
texture_stack
.
addSlice
(
"main"
,
imp_rgba_main
.
getProcessor
().
getPixels
());
// single slice
ImagePlus
imp_texture_stack
=
new
ImagePlus
(
name
+
"-RGBA-D"
+
clt_parameters
.
disparity
,
texture_stack
);
imp_texture_stack
.
getProcessor
().
resetMinAndMax
();
imp_texture_stack
.
show
();
}
// convert textures to RGBA in Java
if
(
clt_parameters
.
show_rgba_color
)
{
int
numcol
=
quadCLT_main
.
isMonochrome
()?
1
:
3
;
int
ports
=
imp_quad_main
.
length
;
...
...
@@ -2311,6 +2361,8 @@ public class TwoQuadCLT {
}
return
results
;
}
...
...
src/main/resources/kernels/TileProcessor.cuh
View file @
b5bfb231
...
...
@@ -72,8 +72,8 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#undef HAS_PRINTF
//
#define HAS_PRINTF
//
#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
...
...
@@ -87,7 +87,8 @@
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
...
...
@@ -1533,10 +1534,15 @@ __global__ void generate_RBGA(
int
texture_slices
=
colors
+
1
;
if
(
threadIdx
.
x
==
0
)
{
//DTT_SIZE_LOG2
// dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
// int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
// dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
dim3
threads2
((
1
<<
THREADS_DYNAMIC_BITS
),
1
,
1
);
int
blocks_x
=
(
texture_width
+
((
1
<<
THREADS_DYNAMIC_BITS
)
-
1
))
>>
THREADS_DYNAMIC_BITS
;
int
blocks_x
=
(
texture_width
+
((
1
<<
(
THREADS_DYNAMIC_BITS
+
DTT_SIZE_LOG2
))
-
1
))
>>
(
THREADS_DYNAMIC_BITS
+
DTT_SIZE_LOG2
)
;
dim3
blocks2
(
blocks_x
,
texture_tiles_height
*
texture_slices
,
1
);
// each thread - 8 vertical
clear_texture_rbga
<<<
blocks2
,
threads2
>>>
(
clear_texture_rbga
<<<
blocks2
,
threads2
>>>
(
// illegal value error
texture_width
,
texture_tiles_height
*
texture_slices
,
// int texture_slice_height,
texture_rbga_stride
,
// const size_t texture_rbga_stride, // in floats 8*stride
...
...
@@ -1547,12 +1553,23 @@ __global__ void generate_RBGA(
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
){
dim3
threads_texture
(
TEXTURE_THREADS_PER_TILE
,
NUM_CAMS
,
1
);
// TEXTURE_TILES_PER_BLOCK, 1);
int
border_tile
=
pass
>>
2
;
size_t
ntt
=
*
(
num_texture_tiles
+
(
2
*
(
pass
&
3
)
)
+
border_tile
);
int
ntt
=
*
(
num_texture_tiles
+
((
pass
&
3
)
<<
1
)
+
border_tile
);
dim3
grid_texture
((
ntt
+
TEXTURE_TILES_PER_BLOCK
-
1
)
/
TEXTURE_TILES_PER_BLOCK
,
1
,
1
);
int
ti_offset
=
(
pass
&
3
)
*
(
TILESX
*
(
TILESYA
>>
2
));
// 1/4
if
(
border_tile
){
ti_offset
+=
TILESX
*
(
TILESYA
>>
2
)
-
ntt
;
}
#ifdef DEBUG12
printf
(
"
\n
generate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d
\n
"
,
pass
,
border_tile
,
ti_offset
,
ntt
);
printf
(
"
\n
generate_RBGA() gpu_texture_indices= 0x%x, gpu_texture_indices + ti_offset=0x%x
\n
"
,
(
int
)
gpu_texture_indices
,
(
int
)
(
gpu_texture_indices
+
ti_offset
));
printf
(
"
\n
generate_RBGA() grid_texture={%d, %d, %d)
\n
"
,
grid_texture
.
x
,
grid_texture
.
y
,
grid_texture
.
z
);
printf
(
"
\n
generate_RBGA() threads_texture={%d, %d, %d)
\n
"
,
threads_texture
.
x
,
threads_texture
.
y
,
threads_texture
.
z
);
printf
(
"
\n
"
);
#endif
/* */
textures_accumulate
<<<
grid_texture
,
threads_texture
>>>
(
border_tile
,
// int border_tile, // if 1 - watch for border
...
...
@@ -1578,9 +1595,8 @@ __global__ void generate_RBGA(
gpu_texture_tiles
,
// float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0
,
// size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles
);
// (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
cudaDeviceSynchronize
();
// not needed yet, just for testing
/* */
}
}
...
...
@@ -1590,21 +1606,20 @@ __global__ void generate_RBGA(
// blockDim.x * gridDim.x >= width
extern
"C"
__global__
void
clear_texture_rbga
(
int
texture_width
,
int
texture_width
,
// aligned to DTT_SIZE
int
texture_slice_height
,
const
size_t
texture_rbga_stride
,
// in floats 8*stride
float
*
gpu_texture_tiles
)
// (number of colors +1 + ?)*16*16 rgba texture tiles
{
int
col
=
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
;
int
col
=
(
blockDim
.
x
*
blockIdx
.
x
+
threadIdx
.
x
)
<<
DTT_SIZE_LOG2
;
if
(
col
>
texture_width
)
{
return
;
}
int
row
=
(
blockIdx
.
y
<<
3
)
;
// includes slices
int
row
=
blockIdx
.
y
;
;
// includes slices
float
*
pix
=
gpu_texture_tiles
+
col
+
row
*
texture_rbga_stride
;
#pragma unroll
for
(
int
n
=
0
;
n
<
DTT_SIZE
;
n
++
)
{
*
(
pix
)
=
0.0
;
pix
+=
texture_rbga_stride
;
*
(
pix
++
)
=
0.0
;
}
}
...
...
@@ -1778,26 +1793,51 @@ __global__ void gen_texture_list(
int
cxy
=
gpu_tasks
[
task_num
].
txy
;
int
x
=
(
cxy
&
0xffff
);
int
y
=
(
cxy
>>
16
);
#ifdef DEBUG12
if
((
x
==
DBG_TILE_X
)
&&
(
y
==
DBG_TILE_Y
)){
printf
(
"
\n
gen_texture_list() x = %d, y= %d
\n
"
,
x
,
y
);
printf
(
"
\n
gen_texture_list() num_texture_tiles = %d(%d) %d(%d) %d(%d) %d(%d)
\n
"
,
num_texture_tiles
[
0
],
num_texture_tiles
[
1
],
num_texture_tiles
[
2
],
num_texture_tiles
[
3
],
num_texture_tiles
[
4
],
num_texture_tiles
[
5
],
num_texture_tiles
[
6
],
num_texture_tiles
[
7
]);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if calculate extra pixels that still fit into memory
int
is_border
=
(
x
==
woi
[
0
])
||
(
y
==
woi
[
1
])
||
(
x
==
(
TILESX
-
1
))
||
(
y
==
(
TILESY
-
1
));
int
buff_head
=
0
;
int
num_offset
=
0
;
if
(
x
&
1
)
{
gpu_texture_indices
+=
TILESX
*
(
TILESYA
>>
2
);
//TILESYA - 2 LSB == 00
num_
texture_tiles
+=
2
;
// int *
buff_head
+=
TILESX
*
(
TILESYA
>>
2
);
//TILESYA - 2 LSB == 00
num_
offset
+=
2
;
// int *
}
if
(
y
&
1
)
{
gpu_texture_indices
+=
TILESX
*
(
TILESYA
>>
1
);
num_
texture_tiles
+=
4
;
// int *
buff_head
+=
TILESX
*
(
TILESYA
>>
1
);
num_
offset
+=
4
;
// int *
}
if
(
is_border
){
gpu_texture_indices
+=
(
TILESX
*
(
TILESYA
>>
2
)
-
1
);
// end of the buffer
num_
texture_tiles
+=
1
;
// int *
buff_head
+=
(
TILESX
*
(
TILESYA
>>
2
)
-
1
);
// end of the buffer
num_
offset
+=
1
;
// int *
}
gpu_texture_indices
+=
buff_head
;
num_texture_tiles
+=
num_offset
;
// using atomic operation in global memory - slow, but as operations here are per-tile, not per-pixel, it should be OK
int
buf_offset
=
atomicAdd
(
num_texture_tiles
,
1
);
if
(
is_border
){
buf_offset
=
-
buf_offset
;
}
#ifdef DEBUG12
if
((
x
==
DBG_TILE_X
)
&&
(
y
==
DBG_TILE_Y
)){
printf
(
"
\n
gen_texture_list() buff_head=%d, buf_offset = %d, num_offset= %d, is_border=%d
\n
"
,
buff_head
,
buf_offset
,
num_offset
,
is_border
);
printf
(
"
\n
gen_texture_list() gpu_texture_indices = 0x%x, gpu_texture_indices + buf_offset = 0x%x
\n
"
,
(
int
)
gpu_texture_indices
,
(
int
)
(
gpu_texture_indices
+
buf_offset
));
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
*
(
gpu_texture_indices
+
buf_offset
)
=
task
|
((
x
+
y
*
TILESX
)
<<
CORR_NTILE_SHIFT
)
|
(
1
<<
LIST_TEXTURE_BIT
);
}
...
...
@@ -2420,7 +2460,7 @@ __global__ void textures_accumulate(
}
#ifdef DEBUG7
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\
n
textures_gen
tile done = %d, texture_stride= %d
\n
"
,
tile_num
,
(
int
)
texture_stride
);
printf
(
"
\
t
extures_accumulate
tile done = %d, texture_stride= %d
\n
"
,
tile_num
,
(
int
)
texture_stride
);
}
__syncthreads
();
// __syncwarp();
#endif
...
...
@@ -2432,6 +2472,20 @@ __global__ void textures_accumulate(
}
if
(
gpu_texture_rbg
&&
(
texture_rbg_stride
!=
0
))
{
// generate RGBA
#ifdef DEBUG12
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d
\n
"
,
tile_num
,
(
int
)
tile_code
,
border_tile
);
for
(
int
ncol
=
0
;
ncol
<=
colors
;
ncol
++
)
{
printf
(
"
\n
tile[%d]
\n
"
,
ncol
);
debug_print_mclt
(
(
float
*
)
(
shr1
.
rgbaw
[
ncol
]),
-
1
);
}
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
if
(
tile_code
!=
TASK_TEXTURE_BITS
){
// only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
...
...
@@ -2453,12 +2507,26 @@ __global__ void textures_accumulate(
}
}
}
int
slice_stride
=
texture_rbg_stride
*
*
(
woi
+
3
);
// offset to the next color
int
slice_stride
=
texture_rbg_stride
*
*
(
woi
+
3
)
*
DTT_SIZE
;
// offset to the next color
int
tileY
=
tile_num
/
TILESX
;
// slow, but 1 per tile
int
tileX
=
tile_num
-
tileY
*
TILESX
;
int
tile_x0
=
(
tileX
-
*
(
woi
+
0
))
*
DTT_SIZE
-
(
DTT_SIZE
/
2
);
// may be negative == -4
int
tile_y0
=
(
tileY
-
*
(
woi
+
1
))
*
DTT_SIZE
-
(
DTT_SIZE
/
2
);
// may be negative == -4
#ifdef DEBUG12
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d
\n
"
,
tileX
,
tileY
,
tile_x0
,
tile_y0
,
slice_stride
);
for
(
int
ncol
=
0
;
ncol
<=
colors
;
ncol
++
)
{
printf
(
"
\n
tile[%d]
\n
"
,
ncol
);
debug_print_mclt
(
(
float
*
)
(
shr1
.
rgbaw
[
ncol
]),
-
1
);
}
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
for
(
int
pass
=
0
;
pass
<
8
;
pass
++
)
{
int
row
=
pass
*
2
+
(
threadIdx
.
y
>>
1
);
// row inside a tile (0..15)
int
col
=
((
threadIdx
.
y
&
1
)
<<
3
)
+
threadIdx
.
x
;
// column inside a tile (0..15)
...
...
@@ -2468,6 +2536,15 @@ __global__ void textures_accumulate(
int
gi
=
g_row
*
texture_rbg_stride
+
g_col
;
// offset to the top left corner
float
*
gpu_texture_rbg_gi
=
gpu_texture_rbg
+
gi
;
float
*
rgba_i
=
((
float
*
)
shr1
.
rgbaw
)
+
i
;
#ifdef DEBUG12
if
((
tile_num
==
DBG_TILE
)
&&
(
threadIdx
.
x
==
0
)
&&
(
threadIdx
.
y
==
0
)){
printf
(
"
\n
textures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d
\n
"
,
pass
,
row
,
col
,
g_row
,
g_col
,
i
,
gi
);
}
__syncthreads
();
// __syncwarp();
#endif // DEBUG12
if
(
!
border_tile
||
((
g_row
>=
0
)
&&
(
g_col
>=
0
)
&&
(
g_row
<
(
DTT_SIZE
*
TILESX
))
&&
(
g_col
<
(
DTT_SIZE
*
TILESY
)))){
// always copy 3 (1) colors + alpha
...
...
src/main/resources/kernels/dtt8x8.cuh
View file @
b5bfb231
...
...
@@ -45,9 +45,11 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE 8
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment