Commit b5bfb231 authored by Andrey Filippov's avatar Andrey Filippov

debugging RGBA texture generation

parent 975dadb4
......@@ -38,6 +38,19 @@ public class ThermalColor {
return rslt;
}
public float [] getRGB(float v) { // Get R,G,B (0..255) triplet for input value in the range 0..1
double k = out_range/PALETTE_RANGE;
double value = (v-min)/(max-min) * (this.palette.length - 1);
int ivalue = (int) (value);
if (ivalue < 0) return getRGB((float) min); // this.palette[0];
if (ivalue >= (this.palette.length -1)) return getRGB((float) max); // this.palette[this.palette.length -1];
double a = (value-ivalue); // 0..1
float [] rslt = {
(float) (k*((1 - a) * this.palette[ivalue][0] + a * this.palette[ivalue+1][0])),
(float) (k*((1 - a) * this.palette[ivalue][1] + a * this.palette[ivalue+1][1])),
(float) (k*((1 - a) * this.palette[ivalue][2] + a * this.palette[ivalue+1][2]))};
return rslt;
}
private int [] setupPalette(int indx) {
//https://stackoverflow.com/questions/28495390/thermal-imaging-palette
......
......@@ -45,6 +45,7 @@ import static jcuda.driver.JCudaDriver.cuLinkDestroy;
import static jcuda.driver.JCudaDriver.cuMemAlloc;
import static jcuda.driver.JCudaDriver.cuMemAllocPitch;
import static jcuda.driver.JCudaDriver.cuMemcpy2D;
import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
import static jcuda.driver.JCudaDriver.cuModuleGetGlobal;
......@@ -93,10 +94,12 @@ public class GPUTileProcessor {
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
// pass some defines to gpu source code with #ifdef JCUDA
public static int DTT_SIZE = 8;
public static int DTT_SIZE_LOG2 = 3;
public static int DTT_SIZE = (1 << DTT_SIZE_LOG2);
static int THREADSX = DTT_SIZE;
public static int NUM_CAMS = 4;
public static int NUM_PAIRS = 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
......@@ -147,6 +150,8 @@ public class GPUTileProcessor {
private CUfunction GPU_IMCLT_RBG_kernel = null;
private CUfunction GPU_CORRELATE2D_kernel = null;
private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null;
// CPU arrays of pointers to GPU memory
// These arrays may go to method, they are here just to be able to free GPU memory if needed
private CUdeviceptr [] gpu_kernels_h = new CUdeviceptr[NUM_CAMS];
......@@ -166,11 +171,16 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER
private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
private CUdeviceptr gpu_num_texture_tiles = new CUdeviceptr(); // 8 ints
private CUdeviceptr gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
CUmodule module; // to access constants memory
private int mclt_stride;
private int corr_stride;
private int imclt_stride;
private int texture_stride;
private int texture_stride_rgba;
public int num_task_tiles;
public int num_corr_tiles;
public int num_texture_tiles;
......@@ -318,7 +328,7 @@ public class GPUTileProcessor {
ClassLoader classLoader = getClass().getClassLoader();
String kernelSource =
"#define JCUDA\n"+
"#define DTT_SIZE " + DTT_SIZE+"\n"+
"#define DTT_SIZE_LOG2 " + DTT_SIZE_LOG2+"\n"+
"#define THREADSX " + THREADSX+"\n"+
"#define NUM_CAMS " + NUM_CAMS+"\n"+
"#define NUM_PAIRS " + NUM_PAIRS+"\n"+
......@@ -372,7 +382,12 @@ public class GPUTileProcessor {
}
// Create the kernel functions (first - just test)
String [] func_names = {GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME, GPU_CORRELATE2D_NAME, GPU_TEXTURES_NAME};
String [] func_names = {
GPU_CONVERT_CORRECT_TILES_NAME,
GPU_IMCLT_RBG_NAME,
GPU_CORRELATE2D_NAME,
GPU_TEXTURES_NAME,
GPU_RBGA_NAME};
CUfunction[] functions = createFunctions(kernelSource,
func_names,
capability); // on my - 75
......@@ -381,12 +396,14 @@ public class GPUTileProcessor {
this.GPU_IMCLT_RBG_kernel = functions[1];
this.GPU_CORRELATE2D_kernel = functions[2];
this.GPU_TEXTURES_kernel= functions[3];
this.GPU_RBGA_kernel= functions[4];
System.out.println("GPU kernel functions initialized");
System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
System.out.println(GPU_IMCLT_RBG_kernel.toString());
System.out.println(GPU_CORRELATE2D_kernel.toString());
System.out.println(GPU_TEXTURES_kernel.toString());
System.out.println(GPU_RBGA_kernel.toString());
// Init data arrays for all kernels
int tilesX = IMG_WIDTH / DTT_SIZE;
......@@ -459,6 +476,8 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
cuMemAlloc(gpu_port_offsets, NUM_CAMS * 2 * Sizeof.POINTER);
cuMemAlloc(gpu_woi, 4 * Sizeof.POINTER); // may be hidden in device code as a static array?
cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.POINTER); // may be hidden in device code as a static array?
cuMemAllocPitch (
gpu_corrs, // CUdeviceptr dptr,
......@@ -476,7 +495,17 @@ public class GPUTileProcessor {
tilesX * tilesY, // long Height,
Sizeof.FLOAT); // int ElementSizeBytes)
texture_stride = (int)(device_stride[0] / Sizeof.FLOAT);
int max_rgba_width = tilesX * DTT_SIZE;
int max_rgba_height = tilesY * DTT_SIZE;
int max_rbga_slices = NUM_COLORS + 1;
cuMemAllocPitch (
gpu_textures_rgba, // CUdeviceptr dptr,
device_stride, // long[] pPitch,
max_rgba_width * Sizeof.FLOAT, // long WidthInBytes,
max_rgba_height * max_rbga_slices, // long Height,
Sizeof.FLOAT); // int ElementSizeBytes)
texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
}
......@@ -819,12 +848,9 @@ public class GPUTileProcessor {
Pointer.to(gpu_bayer),
Pointer.to(gpu_tasks),
Pointer.to(gpu_clt),
/* 2020*/// Pointer.to(gpu_corrs),
/* 2020*/// Pointer.to(gpu_corr_indices), // corr indices (tile_num <<8 + pair_index
/* 2020*/// Pointer.to(new int[] { num_corr_tiles }), // total number of 2D correlations to calculate
Pointer.to(new int[] { mclt_stride }),
/* 2020*/// Pointer.to(new int[] { corr_stride }),
Pointer.to(new int[] { num_task_tiles }),
// move lpf to 4-image generator kernel
// Pointer.to(new int[] { 7 }) // lpf_mask ??? (C-code has it 0)
Pointer.to(new int[] { 0 }) // lpf_mask ??? (C-code has it 0)
);
......@@ -918,7 +944,7 @@ public class GPUTileProcessor {
cuCtxSynchronize();
}
public void execTexturesOld(
public void execRBGA(
double [][] port_offsets,
double [] color_weights,
boolean is_lwir,
......@@ -927,9 +953,8 @@ public class GPUTileProcessor {
double diff_sigma, // pixel value/pixel change
double diff_threshold, // pixel value/pixel change
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove,
boolean keep_weights) {
if (GPU_TEXTURES_kernel == null)
boolean dust_remove) {
if (GPU_RBGA_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
return;
......@@ -948,34 +973,41 @@ public class GPUTileProcessor {
float weighht2 = (num_colors >2)?((float) color_weights[2]):0.0f;
int iis_lwir = (is_lwir)? 1:0;
int idust_remove = (dust_remove)? 1 : 0;
int ikeep_weights = (keep_weights)? 1 : 0;
int [] GridFullWarps = {(num_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1};
int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
// uses dynamic parallelization, top kernel is a single-thread one
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt),
Pointer.to(new int[] { num_texture_tiles }),
Pointer.to(gpu_texture_indices),
Pointer.to(gpu_port_offsets),
Pointer.to(new int[] { num_colors }),
Pointer.to(new int[] { iis_lwir }),
Pointer.to(new float[] {(float) min_shot }),
Pointer.to(new float[] {(float) scale_shot }),
Pointer.to(new float[] {(float) diff_sigma }),
Pointer.to(new float[] {(float) diff_threshold }),
Pointer.to(new float[] {(float) min_agree }),
Pointer.to(new float[] {weighht0 }),
Pointer.to(new float[] {weighht1 }),
Pointer.to(new float[] {weighht2 }),
Pointer.to(new int[] { idust_remove }),
Pointer.to(new int[] { ikeep_weights }),
Pointer.to(new int[] { texture_stride }),
Pointer.to(gpu_textures) // lpf_mask
);
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }), // int num_tiles, // number of tiles in task list
// declare arrays in device code?
Pointer.to(gpu_texture_indices), // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
Pointer.to(gpu_num_texture_tiles), // int * num_texture_tiles, // number of texture tiles to process (8 elements)
Pointer.to(gpu_woi), // int * woi, // x,y,width,height of the woi
// set smaller for LWIR - it is used to reduce work aread
Pointer.to(new int[] {IMG_WIDTH / DTT_SIZE}), // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
Pointer.to(new int[] {IMG_HEIGHT / DTT_SIZE}), // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
Pointer.to(gpu_clt), // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
Pointer.to(gpu_port_offsets), // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
Pointer.to(new int[] {num_colors}), // int colors, // number of colors (3/1)
Pointer.to(new int[] {iis_lwir}), // int is_lwir, // do not perform shot correction
Pointer.to(new float[] {(float) min_shot}), // float min_shot, // 10.0
Pointer.to(new float[] {(float) scale_shot}), // float scale_shot, // 3.0
Pointer.to(new float[] {(float) diff_sigma}), // float diff_sigma, // pixel value/pixel change
Pointer.to(new float[] {(float) diff_threshold}),// float diff_threshold, // pixel value/pixel change
Pointer.to(new float[] {(float) min_agree}), // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
Pointer.to(new float[] {weighht0}), // float weight0, // scale for R
Pointer.to(new float[] {weighht1}), // float weight1, // scale for B
Pointer.to(new float[] {weighht2}), // float weight2, // scale for G
Pointer.to(new int[] { idust_remove }), // int dust_remove, // Do not reduce average weight when only one image differes much from the average
Pointer.to(new int[] {0}), // int keep_weights, // return channel weights after A in RGBA
Pointer.to(new int[] { texture_stride_rgba }), // const size_t texture_rbga_stride, // in floats
Pointer.to(gpu_textures_rgba)); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_TEXTURES_kernel,
cuLaunchKernel(GPU_RBGA_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
......@@ -1038,7 +1070,7 @@ public class GPUTileProcessor {
Pointer.to(new int[] { idust_remove }),
Pointer.to(new int[] { ikeep_weights }),
Pointer.to(new int[] {0}),// 0, // const size_t texture_rbg_stride, // in floats - DISABLE GENERATION!
Pointer.to(gpu_textures), // new Pointer(), // Pointer.to(gpu_textures),
Pointer.to(new int[] {0}), // null, // new Pointer(), //Pointer.to(gpu_textures), // new Pointer(), // Pointer.to(gpu_textures),
Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
Pointer.to(gpu_textures)
);
......@@ -1053,7 +1085,6 @@ public class GPUTileProcessor {
}
public float [][] getCorr2D(int corr_rad){
int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
......@@ -1078,6 +1109,41 @@ public class GPUTileProcessor {
return corrs;
}
/**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
* @param num_colors number of colors (1 or 3)
* @param woi should be initialized as Rectangle(). x,y,width, height will be populated (in pixels,)
* @return RBGA slices, last (alpha) in 0.0... 1.0 range, colors match input range
*/
public float [][] getRBGA(
int num_colors,
Rectangle woi) { // will update to woi
// first - read woi
float [] fwoi = new float[4];
cuMemcpyDtoH(Pointer.to(fwoi), gpu_woi, 4 * Sizeof.FLOAT);
woi.x = Float.floatToIntBits(fwoi[0]) * DTT_SIZE;
woi.y = Float.floatToIntBits(fwoi[1]) * DTT_SIZE;
woi.width = Float.floatToIntBits(fwoi[2]) * DTT_SIZE;
woi.height = Float.floatToIntBits(fwoi[3]) * DTT_SIZE;
float [][] rslt = new float[num_colors + 1][woi.width * woi.height];
CUDA_MEMCPY2D copy_rbga = new CUDA_MEMCPY2D();
copy_rbga.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
copy_rbga.srcDevice = gpu_textures_rgba;
copy_rbga.srcPitch = texture_stride_rgba * Sizeof.FLOAT;
copy_rbga.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
// copy_woi.dstHost = Pointer.to(rslt);
copy_rbga.dstPitch = woi.width * Sizeof.FLOAT;
copy_rbga.WidthInBytes = woi.width * Sizeof.FLOAT;
copy_rbga.Height = woi.height;
for (int ncol = 0; ncol<= num_colors; ncol++ ) {
copy_rbga.dstHost = Pointer.to(rslt[ncol]);
copy_rbga.srcY = woi.height * ncol;
cuMemcpy2D(copy_rbga); // run copy
}
return rslt;
}
public float [] getFlatTextures(
int num_colors,
boolean keep_weights){
......@@ -1104,8 +1170,7 @@ public class GPUTileProcessor {
return cpu_textures;
}
public float [][][] getTextures(
public float [][][] getTextures( // todo - get rid of copying by multiple CUDA_MEMCPY2D?
int num_colors,
boolean keep_weights){
......
......@@ -5350,11 +5350,73 @@ public class QuadCLT {
ShowDoubleFloatArrays sdfa_instance = new ShowDoubleFloatArrays(); // just for debugging?
// convert to ImageStack of 3 slices
String [] sliceNames = {"red", "blue", "green"};
int green_index = 2;
float [][] rbg_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
float [] alpha = null; // (0..1.0)
float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
// float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
if (iclt_data.length > 3) alpha = iclt_data[3];
if (isLwir()) {
String [] rgb_titles = {"red","green","blue"};
String [] rgba_titles = {"red","green","blue","alpha"};
String [] titles = (alpha == null) ? rgb_titles : rgba_titles;
int num_slices = (alpha == null) ? 3 : 4;
double mn = colorProcParameters.lwir_low;
double mx = colorProcParameters.lwir_high;
double [] cold_hot = getColdHot();
if (cold_hot != null) {
mn = cold_hot[0];
mx = cold_hot[1];
}
double offset = getLwirOffset();
if (!Double.isNaN(offset)) {
mn -= offset;
mx -= offset;
}
ThermalColor tc = new ThermalColor(
colorProcParameters.lwir_palette,
mn,
mx,
255.0);
float [][] rgba = new float [num_slices][];
for (int i = 0; i < 3; i++) rgba[i] = new float [iclt_data[green_index].length];
for (int i = 0; i < rbg_in[green_index].length; i++) {
if (i == 700) {
System.out.println("linearStackToColor(): i="+i);
}
float [] rgb = tc.getRGB(iclt_data[green_index][i]);
rgba[0][i] = rgb[0]; // red
rgba[1][i] = rgb[1]; // green
rgba[2][i] = rgb[2]; // blue
}
if (alpha != null) {
rgba[3] = alpha; // 0..1
}
ImageStack stack = sdfa_instance.makeStack(
rgba, // iclt_data,
width, // (tilesX + 0) * clt_parameters.transform_size,
height, // (tilesY + 0) * clt_parameters.transform_size,
titles, // or use null to get chn-nn slice names
true); // replace NaN with 0.0
ImagePlus imp_rgba = EyesisCorrections.convertRGBAFloatToRGBA32(
stack, // ImageStack stackFloat, //r,g,b,a
// name+"ARGB"+suffix, // String title,
name+suffix, // String title,
0.0, // double r_min,
255.0, // double r_max,
0.0, // double g_min,
255.0, // double g_max,
0.0, // double b_min,
255.0, // double b_max,
0.0, // double alpha_min,
1.0); // double alpha_max)
return imp_rgba;
}
ImageStack stack = sdfa_instance.makeStack(
rgb_in, // iclt_data,
// rgb_in, // iclt_data,
rbg_in, // iclt_data,
width, // (tilesX + 0) * clt_parameters.transform_size,
height, // (tilesY + 0) * clt_parameters.transform_size,
sliceNames, // or use null to get chn-nn slice names
......@@ -5375,8 +5437,6 @@ public class QuadCLT {
height, // int height, // int tilesY,
scaleExposure, // double scaleExposure,
debugLevel); //int debugLevel
}
// double data
......@@ -5460,16 +5520,16 @@ public class QuadCLT {
true); // replace NaN with 0.0
ImagePlus imp_rgba = EyesisCorrections.convertRGBAFloatToRGBA32(
stack, // ImageStack stackFloat, //r,g,b,a
// name+"ARGB"+suffix, // String title,
name+suffix, // String title,
0.0, // double r_min,
255.0, // double r_max,
0.0, // double g_min,
255.0, // double g_max,
0.0, // double b_min,
255.0, // double b_max,
0.0, // double alpha_min,
1.0); // double alpha_max)
// name+"ARGB"+suffix, // String title,
name+suffix, // String title,
0.0, // double r_min,
255.0, // double r_max,
0.0, // double g_min,
255.0, // double g_max,
0.0, // double b_min,
255.0, // double b_max,
0.0, // double alpha_min,
1.0); // double alpha_max)
return imp_rgba;
}
......@@ -5499,10 +5559,13 @@ public class QuadCLT {
debugLevel); //int debugLevel
}
// Convert a single value pixels to color (r,b,g) values to be processed instead of the normal colors
public ImagePlus linearStackToColor( // USED in lwir
public ImagePlus linearStackToColor(
CLTParameters clt_parameters,
ColorProcParameters colorProcParameters,
EyesisCorrectionParameters.RGBParameters rgbParameters,
......
......@@ -2061,21 +2061,36 @@ public class TwoQuadCLT {
clt_parameters.diff_threshold, // double diff_threshold, // pixel value/pixel change
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // boolean dust_remove,
clt_parameters.keep_weights); // boolean keep_weights); // int corr_radius
clt_parameters.keep_weights); // boolean keep_weights);
long endTextures = System.nanoTime();
// run texturesRBGA
long startTexturesRBGA = System.nanoTime(); // System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
port_offsets, // double [][] port_offsets,
col_weights, // double [] color_weights,
quadCLT_main.isLwir(), // boolean is_lwir,
clt_parameters.min_shot, // double min_shot, // 10.0
clt_parameters.scale_shot, // double scale_shot, // 3.0
clt_parameters.diff_sigma, // double diff_sigma, // pixel value/pixel change
clt_parameters.diff_threshold, // double diff_threshold, // pixel value/pixel change
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove); // boolean dust_remove,
long endTexturesRBGA = System.nanoTime();
long endGPUTime = System.nanoTime();
long firstGPUTime= (startIMCLT- startGPU)/NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT)/NREPEAT;
long runCorr2DTime = (endCorr2d - startCorr2d)/NREPEAT;
long runTexturesTime = (endTextures - startTextures)/NREPEAT;
long runGPUTime = (endGPUTime - startGPU)/NREPEAT;
long firstGPUTime= (startIMCLT- startGPU) /NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT;
long runCorr2DTime = (endCorr2d - startCorr2d) /NREPEAT;
long runTexturesTime = (endTextures - startTextures) /NREPEAT;
long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA)/NREPEAT;
long runGPUTime = (endGPUTime - startGPU) /NREPEAT;
// run corr2d
System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
System.out.println("GPU run time ="+(runGPUTime * 1.0e-6)+"ms, (direct conversion: "+(firstGPUTime*1.0e-6)+"ms, imclt: "+
(runImcltTime*1.0e-6)+"ms), corr2D: "+(runCorr2DTime*1.0e-6)+"ms), textures: "+(runTexturesTime*1.0e-6)+"ms");
(runImcltTime*1.0e-6)+"ms), corr2D: "+(runCorr2DTime*1.0e-6)+"ms), textures: "+(runTexturesTime*1.0e-6)+"ms, RGBA: "+
(runTexturesRBGATime*1.0e-6)+"ms");
// get data back from GPU
float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
......@@ -2190,6 +2205,41 @@ public class TwoQuadCLT {
debugLevel);
}
// Use GPU prepared RBGA
if (clt_parameters.show_rgba_color) {
Rectangle woi = new Rectangle();
float [][] rbga = gPUTileProcessor.getRBGA(
(is_mono?1:3), // int num_colors,
woi);
// for now - use just RGB. Later add option for RGBA
float [][] rgb_main = {rbga[0],rbga[1],rbga[2]};
float [][] rgba_main = {rbga[0],rbga[1],rbga[2],rbga[3]};
ImagePlus imp_rgba_main = quadCLT_main.linearStackToColor(
clt_parameters,
colorProcParameters,
rgbParameters,
name+"-texture", // String name,
"-D"+clt_parameters.disparity+"-MAINGPU", //String suffix, // such as disparity=...
toRGB,
!quadCLT_main.correctionsParameters.jpeg, // boolean bpp16, // 16-bit per channel color mode for result
false, // true, // boolean saveShowIntermediate, // save/show if set globally
false, // true, // boolean saveShowFinal, // save/show result (color image?)
((clt_parameters.alpha1 > 0)? rgba_main: rgb_main),
tilesX * image_dtt.transform_size,
tilesY * image_dtt.transform_size,
1.0, // double scaleExposure, // is it needed?
debugLevel );
int width = imp_rgba_main.getWidth();
int height =imp_rgba_main.getHeight();
ImageStack texture_stack=new ImageStack(width,height);
texture_stack.addSlice("main", imp_rgba_main.getProcessor().getPixels()); // single slice
ImagePlus imp_texture_stack = new ImagePlus(name+"-RGBA-D"+clt_parameters.disparity, texture_stack);
imp_texture_stack.getProcessor().resetMinAndMax();
imp_texture_stack.show();
}
// convert textures to RGBA in Java
if (clt_parameters.show_rgba_color) {
int numcol = quadCLT_main.isMonochrome()?1:3;
int ports = imp_quad_main.length;
......@@ -2311,6 +2361,8 @@ public class TwoQuadCLT {
}
return results;
}
......
......@@ -72,8 +72,8 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#undef HAS_PRINTF
//#define HAS_PRINTF
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
......@@ -87,7 +87,8 @@
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
......@@ -1533,10 +1534,15 @@ __global__ void generate_RBGA(
int texture_slices = colors + 1;
if (threadIdx.x == 0) {
//DTT_SIZE_LOG2
// dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
// int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
// dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
clear_texture_rbga<<<blocks2,threads2>>>(
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
......@@ -1547,12 +1553,23 @@ __global__ void generate_RBGA(
for (int pass = 0; pass < 8; pass++){
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2)); // 1/4
if (border_tile){
ti_offset += TILESX * (TILESYA >> 2) - ntt;
}
#ifdef DEBUG12
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= 0x%x, gpu_texture_indices + ti_offset=0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
textures_accumulate<<<grid_texture,threads_texture>>>(
border_tile, // int border_tile, // if 1 - watch for border
......@@ -1578,9 +1595,8 @@ __global__ void generate_RBGA(
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles); // (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
}
}
......@@ -1590,21 +1606,20 @@ __global__ void generate_RBGA(
// blockDim.x * gridDim.x >= width
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_width, // aligned to DTT_SIZE
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
if (col > texture_width) {
return;
}
int row = (blockIdx.y << 3); // includes slices
int row = blockIdx.y;; // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix) = 0.0;
pix += texture_rbga_stride;
*(pix++) = 0.0;
}
}
......@@ -1778,26 +1793,51 @@ __global__ void gen_texture_list(
int cxy = gpu_tasks[task_num].txy;
int x = (cxy & 0xffff);
int y = (cxy >> 16);
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() x = %d, y= %d\n",x, y);
printf("\ngen_texture_list() num_texture_tiles = %d(%d) %d(%d) %d(%d) %d(%d)\n",
num_texture_tiles[0],num_texture_tiles[1],num_texture_tiles[2],num_texture_tiles[3],
num_texture_tiles[4],num_texture_tiles[5],num_texture_tiles[6],num_texture_tiles[7]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if calculate extra pixels that still fit into memory
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
int buff_head = 0;
int num_offset = 0;
if (x & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_texture_tiles += 2; // int *
buff_head += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_offset += 2; // int *
}
if (y & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 1);
num_texture_tiles += 4; // int *
buff_head += TILESX * (TILESYA >> 1);
num_offset += 4; // int *
}
if (is_border){
gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_texture_tiles += 1; // int *
buff_head += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_offset += 1; // int *
}
gpu_texture_indices += buff_head;
num_texture_tiles += num_offset;
// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
int buf_offset = atomicAdd(num_texture_tiles, 1);
if (is_border){
buf_offset = -buf_offset;
}
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() buff_head=%d, buf_offset = %d, num_offset= %d, is_border=%d\n",
buff_head, buf_offset, num_offset,is_border);
printf("\ngen_texture_list() gpu_texture_indices = 0x%x, gpu_texture_indices + buf_offset = 0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + buf_offset));
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
*(gpu_texture_indices + buf_offset) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
......@@ -2420,7 +2460,7 @@ __global__ void textures_accumulate(
}
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
printf("\textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
__syncthreads();// __syncwarp();
#endif
......@@ -2432,6 +2472,20 @@ __global__ void textures_accumulate(
}
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
tile_num, (int) tile_code, border_tile);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
......@@ -2453,12 +2507,26 @@ __global__ void textures_accumulate(
}
}
}
int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color
int tileY = tile_num / TILESX; // slow, but 1 per tile
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
tileX, tileY, tile_x0, tile_y0, slice_stride);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1); // row inside a tile (0..15)
int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
......@@ -2468,6 +2536,15 @@ __global__ void textures_accumulate(
int gi = g_row * texture_rbg_stride + g_col; // offset to the top left corner
float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
pass, row, col, g_row, g_col, i, gi);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (!border_tile ||
((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
// always copy 3 (1) colors + alpha
......
......@@ -45,9 +45,11 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE 8
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment