debugging RGBA texture generation

b5bfb231 · Andrey Filippov · 975dadb4 · b5bfb231 · b5bfb231 · b5bfb231
Commit b5bfb231 authored Apr 05, 2020 by Andrey Filippov
6 changed files
--- a/src/main/java/com/elphel/imagej/cameras/ThermalColor.java
+++ b/src/main/java/com/elphel/imagej/cameras/ThermalColor.java
@@ -38,6 +38,19 @@ public class ThermalColor {
 		return rslt;
 	}
+	public float [] getRGB(float v) { // Get R,G,B (0..255) triplet for input value in the range 0..1
+		double k = out_range/PALETTE_RANGE;
+		double value = (v-min)/(max-min) * (this.palette.length - 1);
+		int ivalue = (int) (value);
+		if (ivalue < 0)                         return getRGB((float) min); // this.palette[0];
+		if (ivalue >= (this.palette.length -1)) return getRGB((float) max); // this.palette[this.palette.length -1];
+		double a = (value-ivalue); // 0..1
+		float [] rslt = {
+				(float) (k*((1 - a) * this.palette[ivalue][0] + a * this.palette[ivalue+1][0])),
+				(float) (k*((1 - a) * this.palette[ivalue][1] + a * this.palette[ivalue+1][1])),
+				(float) (k*((1 - a) * this.palette[ivalue][2] + a * this.palette[ivalue+1][2]))};
+		return rslt;
+	}
 	private int []  setupPalette(int indx) {
 		//https://stackoverflow.com/questions/28495390/thermal-imaging-palette

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -45,6 +45,7 @@ import static jcuda.driver.JCudaDriver.cuLinkDestroy;
 import static jcuda.driver.JCudaDriver.cuMemAlloc;
 import static jcuda.driver.JCudaDriver.cuMemAllocPitch;
 import static jcuda.driver.JCudaDriver.cuMemcpy2D;
+import static jcuda.driver.JCudaDriver.cuMemcpyDtoH;
 import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
 import static jcuda.driver.JCudaDriver.cuModuleGetGlobal;
@@ -93,10 +94,12 @@ public class GPUTileProcessor {
 	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
 //	static String GPU_TEXTURES_NAME =              "textures_gen"; // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
+	static String GPU_RBGA_NAME =                  "generate_RBGA"; // name in C code
 //  pass some defines to gpu source code with #ifdef JCUDA
-	public static int DTT_SIZE =                  8;
+	public static int DTT_SIZE_LOG2 =             3;
+	public static int DTT_SIZE =                  (1 << DTT_SIZE_LOG2);
 	static int        THREADSX =                  DTT_SIZE;
 	public static int NUM_CAMS =                  4;
 	public static int NUM_PAIRS =                 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
@@ -147,6 +150,8 @@ public class GPUTileProcessor {
    private CUfunction GPU_IMCLT_RBG_kernel =             null;
    private CUfunction GPU_CORRELATE2D_kernel =           null;
    private CUfunction GPU_TEXTURES_kernel =              null;
+    private CUfunction GPU_RBGA_kernel =                  null;
    // CPU arrays of pointers to GPU memory
    // These arrays may go to method, they are here just to be able to free GPU memory if needed
    private CUdeviceptr [] gpu_kernels_h =        new CUdeviceptr[NUM_CAMS];
@@ -166,11 +171,16 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
+    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+    private CUdeviceptr gpu_num_texture_tiles =   new CUdeviceptr(); //  8 ints
+    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
    CUmodule    module; // to access constants memory
    private int mclt_stride;
    private int corr_stride;
    private int imclt_stride;
    private int texture_stride;
+    private int texture_stride_rgba;
    public int num_task_tiles;
    public int num_corr_tiles;
    public int num_texture_tiles;
@@ -318,7 +328,7 @@ public class GPUTileProcessor {
        ClassLoader classLoader = getClass().getClassLoader();
        String kernelSource =
        		"#define JCUDA\n"+
-        				"#define DTT_SIZE " +                 DTT_SIZE+"\n"+
+        				"#define DTT_SIZE_LOG2 " +            DTT_SIZE_LOG2+"\n"+
        				"#define THREADSX " +                 THREADSX+"\n"+
        				"#define NUM_CAMS " +                 NUM_CAMS+"\n"+
        				"#define NUM_PAIRS " +                NUM_PAIRS+"\n"+
@@ -372,7 +382,12 @@ public class GPUTileProcessor {
        }
        // Create the kernel functions (first - just test)
-        String [] func_names = {GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME, GPU_CORRELATE2D_NAME, GPU_TEXTURES_NAME};
+        String [] func_names = {
+        		GPU_CONVERT_CORRECT_TILES_NAME,
+        		GPU_IMCLT_RBG_NAME,
+        		GPU_CORRELATE2D_NAME,
+        		GPU_TEXTURES_NAME,
+        		GPU_RBGA_NAME};
        CUfunction[] functions = createFunctions(kernelSource,
        		                                 func_names,
        		                                 capability); // on my - 75
@@ -381,12 +396,14 @@ public class GPUTileProcessor {
        this.GPU_IMCLT_RBG_kernel =             functions[1];
        this.GPU_CORRELATE2D_kernel =           functions[2];
        this.GPU_TEXTURES_kernel=               functions[3];
+        this.GPU_RBGA_kernel=                   functions[4];
        System.out.println("GPU kernel functions initialized");
        System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
        System.out.println(GPU_IMCLT_RBG_kernel.toString());
        System.out.println(GPU_CORRELATE2D_kernel.toString());
        System.out.println(GPU_TEXTURES_kernel.toString());
+        System.out.println(GPU_RBGA_kernel.toString());
        // Init data arrays for all kernels
        int tilesX =  IMG_WIDTH / DTT_SIZE;
@@ -459,6 +476,8 @@ public class GPUTileProcessor {
    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.POINTER);
+    	cuMemAlloc(gpu_woi,                 4 * Sizeof.POINTER); // may be hidden in device code as a static array?
+    	cuMemAlloc(gpu_num_texture_tiles,   8 * Sizeof.POINTER); // may be hidden in device code as a static array?
        cuMemAllocPitch (
        		gpu_corrs,                             // CUdeviceptr dptr,
@@ -476,7 +495,17 @@ public class GPUTileProcessor {
        		tilesX * tilesY,             // long Height,
                Sizeof.FLOAT);                         // int ElementSizeBytes)
        texture_stride = (int)(device_stride[0] / Sizeof.FLOAT);
+        int max_rgba_width  =  tilesX * DTT_SIZE;
+        int max_rgba_height =  tilesY * DTT_SIZE;
+        int max_rbga_slices =  NUM_COLORS + 1;
+        cuMemAllocPitch (
+        		gpu_textures_rgba,                     // CUdeviceptr dptr,
+        		device_stride,                         // long[] pPitch,
+        		max_rgba_width * Sizeof.FLOAT,         // long WidthInBytes,
+        		max_rgba_height * max_rbga_slices,     // long Height,
+                Sizeof.FLOAT);                         // int ElementSizeBytes)
+        texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
    }
@@ -819,12 +848,9 @@ public class GPUTileProcessor {
            Pointer.to(gpu_bayer),
            Pointer.to(gpu_tasks),
            Pointer.to(gpu_clt),
-/* 2020*///   Pointer.to(gpu_corrs),
-/* 2020*///   Pointer.to(gpu_corr_indices),              // corr indices (tile_num <<8 + pair_index
-/* 2020*///   Pointer.to(new int[] { num_corr_tiles }),  // total number of 2D correlations to calculate
            Pointer.to(new int[] { mclt_stride }),
-/* 2020*///   Pointer.to(new int[] { corr_stride }),
            Pointer.to(new int[] { num_task_tiles }),
+            // move lpf to 4-image generator kernel
 //            Pointer.to(new int[] { 7 }) // lpf_mask ??? (C-code has it 0)
            Pointer.to(new int[] { 0 }) // lpf_mask ??? (C-code has it 0)
        );
@@ -918,7 +944,7 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }
-    public void execTexturesOld(
+    public void execRBGA(
    		double [][] port_offsets,
    		double [] color_weights,
    		boolean   is_lwir,
@@ -927,9 +953,8 @@ public class GPUTileProcessor {
    		double    diff_sigma,         // pixel value/pixel change
    		double    diff_threshold,     // pixel value/pixel change
    		double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-    		boolean   dust_remove,
+    		boolean   dust_remove) {
-    		boolean   keep_weights) {
+    	if (GPU_RBGA_kernel == null)
-    	if (GPU_TEXTURES_kernel == null)
    	{
    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
    		return;
@@ -948,34 +973,41 @@ public class GPUTileProcessor {
    	float weighht2 = (num_colors >2)?((float) color_weights[2]):0.0f;
    	int iis_lwir =      (is_lwir)? 1:0;
    	int idust_remove =  (dust_remove)? 1 : 0;
-    	int ikeep_weights = (keep_weights)? 1 : 0;
-		int [] GridFullWarps =    {(num_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1};
+    	// uses dynamic parallelization, top kernel is a single-thread one
-    	int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
+		int [] GridFullWarps =    {1, 1, 1};
+    	int [] ThreadsFullWarps = {1, 1, 1};
    	Pointer kernelParameters = Pointer.to(
-    			Pointer.to(gpu_clt),
+                Pointer.to(gpu_tasks),                           // struct tp_task   * gpu_tasks,
-    			Pointer.to(new int[] { num_texture_tiles }),
+                Pointer.to(new int[] { num_task_tiles }),        // int                num_tiles,          // number of tiles in task list
-    			Pointer.to(gpu_texture_indices),
+            	// declare arrays in device code?
-    			Pointer.to(gpu_port_offsets),
+                Pointer.to(gpu_texture_indices),                 // int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-    			Pointer.to(new int[] { num_colors }),
+    			Pointer.to(gpu_num_texture_tiles),               // int              * num_texture_tiles,  // number of texture tiles to process (8 elements)
-    			Pointer.to(new int[] { iis_lwir }),
+    			Pointer.to(gpu_woi),                             // int              * woi,                // x,y,width,height of the woi
-    			Pointer.to(new float[] {(float) min_shot }),
+    			// set smaller for LWIR - it is used to reduce work aread
-    			Pointer.to(new float[] {(float) scale_shot }),
+    			Pointer.to(new int[] {IMG_WIDTH / DTT_SIZE}),    // int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
-    			Pointer.to(new float[] {(float) diff_sigma }),
+    			Pointer.to(new int[] {IMG_HEIGHT / DTT_SIZE}),   // int                height); // <= TILESY, use for faster processing of LWIR images
-    			Pointer.to(new float[] {(float) diff_threshold }),
+    	    	// Parameters for the texture generation
-    			Pointer.to(new float[] {(float) min_agree }),
+    			Pointer.to(gpu_clt),                             // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-    			Pointer.to(new float[] {weighht0 }),
+	            Pointer.to(gpu_port_offsets),                    // float           * port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
-    			Pointer.to(new float[] {weighht1 }),
+	            Pointer.to(new int[]   {num_colors}),            // int               colors,             // number of colors (3/1)
-    			Pointer.to(new float[] {weighht2 }),
+	            Pointer.to(new int[]   {iis_lwir}),              // int               is_lwir,            // do not perform shot correction
-    			Pointer.to(new int[] { idust_remove }),
+	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
-    			Pointer.to(new int[] { ikeep_weights }),
+	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
-    			Pointer.to(new int[] { texture_stride }),
+	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
-    			Pointer.to(gpu_textures) // lpf_mask
+	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
-    			);
+	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+	            Pointer.to(new float[] {weighht0}),              // float             weight0,            // scale for R
+	            Pointer.to(new float[] {weighht1}),              // float             weight1,            // scale for B
+	            Pointer.to(new float[] {weighht2}),              // float             weight2,            // scale for G
+	            Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
+	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
+	            Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
+	            Pointer.to(gpu_textures_rgba));                  // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
    	cuCtxSynchronize();
    	// Call the kernel function
-    	cuLaunchKernel(GPU_TEXTURES_kernel,
+    	cuLaunchKernel(GPU_RBGA_kernel,
    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
@@ -1038,7 +1070,7 @@ public class GPUTileProcessor {
    			Pointer.to(new int[] { idust_remove }),
    			Pointer.to(new int[] { ikeep_weights }),
    			Pointer.to(new int[] {0}),//  0, // const size_t      texture_rbg_stride, // in floats - DISABLE GENERATION!
-    			Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
+    			Pointer.to(new int[] {0}), // null, //  new Pointer(),  //Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
    			Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
    			Pointer.to(gpu_textures)
    			);
@@ -1053,7 +1085,6 @@ public class GPUTileProcessor {
    }
    public float [][] getCorr2D(int corr_rad){
        int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
        float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
@@ -1078,6 +1109,41 @@ public class GPUTileProcessor {
        return corrs;
    }
+    /**
+     * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
+     * @param num_colors number of colors (1 or 3)
+     * @param woi should be initialized as Rectangle(). x,y,width, height will be populated (in pixels,)
+     * @return RBGA slices, last (alpha) in 0.0... 1.0 range, colors match input range
+     */
+    public float [][] getRBGA(
+    		int     num_colors,
+    		Rectangle woi) { // will update to woi
+    	// first - read woi
+    	float [] fwoi = new float[4];
+    	cuMemcpyDtoH(Pointer.to(fwoi), gpu_woi,  4 * Sizeof.FLOAT);
+    	woi.x =      Float.floatToIntBits(fwoi[0]) * DTT_SIZE;
+    	woi.y =      Float.floatToIntBits(fwoi[1]) * DTT_SIZE;
+    	woi.width =  Float.floatToIntBits(fwoi[2]) * DTT_SIZE;
+    	woi.height = Float.floatToIntBits(fwoi[3]) * DTT_SIZE;
+    	float [][] rslt = new float[num_colors + 1][woi.width * woi.height];
+        CUDA_MEMCPY2D copy_rbga =  new CUDA_MEMCPY2D();
+        copy_rbga.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        copy_rbga.srcDevice =       gpu_textures_rgba;
+        copy_rbga.srcPitch =        texture_stride_rgba * Sizeof.FLOAT;
+        copy_rbga.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
+//        copy_woi.dstHost =         Pointer.to(rslt);
+        copy_rbga.dstPitch =        woi.width * Sizeof.FLOAT;
+        copy_rbga.WidthInBytes =    woi.width * Sizeof.FLOAT;
+        copy_rbga.Height =          woi.height;
+        for (int ncol = 0; ncol<= num_colors; ncol++ ) {
+        	copy_rbga.dstHost =     Pointer.to(rslt[ncol]);
+            copy_rbga.srcY =        woi.height * ncol;
+            cuMemcpy2D(copy_rbga); // run copy
+        }
+        return rslt;
+    }
    public float [] getFlatTextures(
    		int     num_colors,
    		boolean keep_weights){
@@ -1104,8 +1170,7 @@ public class GPUTileProcessor {
        return cpu_textures;
    }
+    public float [][][] getTextures( // todo - get rid of copying by multiple CUDA_MEMCPY2D?
-    public float [][][] getTextures(
    		int     num_colors,
    		boolean keep_weights){

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -5350,11 +5350,73 @@ public class QuadCLT {
 		  ShowDoubleFloatArrays sdfa_instance = new ShowDoubleFloatArrays(); // just for debugging?
 		  // convert to ImageStack of 3 slices
 		  String [] sliceNames = {"red", "blue", "green"};
+		  int green_index = 2;
+		  float [][] rbg_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
 		  float []   alpha = null; // (0..1.0)
-		  float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
+//		  float [][] rgb_in = {iclt_data[0],iclt_data[1],iclt_data[2]};
 		  if (iclt_data.length > 3) alpha = iclt_data[3];
+		  if (isLwir()) {
+			  String [] rgb_titles =  {"red","green","blue"};
+			  String [] rgba_titles = {"red","green","blue","alpha"};
+			  String [] titles = (alpha == null) ? rgb_titles : rgba_titles;
+			  int num_slices = (alpha == null) ? 3 : 4;
+			  double mn = colorProcParameters.lwir_low;
+			  double mx = colorProcParameters.lwir_high;
+			  double [] cold_hot = getColdHot();
+			  if (cold_hot != null) {
+				  mn = cold_hot[0];
+				  mx = cold_hot[1];
+			  }
+			  double offset = getLwirOffset();
+			  if (!Double.isNaN(offset)) {
+				  mn -=  offset;
+				  mx -=  offset;
+			  }
+			  ThermalColor tc = new ThermalColor(
+					  colorProcParameters.lwir_palette,
+					  mn,
+					  mx,
+					  255.0);
+			  float [][] rgba = new float [num_slices][];
+			  for (int i = 0; i < 3; i++) rgba[i] = new float [iclt_data[green_index].length];
+			  for (int i = 0; i < rbg_in[green_index].length; i++) {
+				  if (i == 700) {
+					  System.out.println("linearStackToColor(): i="+i);
+				  }
+				  float [] rgb = tc.getRGB(iclt_data[green_index][i]);
+				  rgba[0][i] = rgb[0]; // red
+				  rgba[1][i] = rgb[1]; // green
+				  rgba[2][i] = rgb[2]; // blue
+			  }
+			  if (alpha != null) {
+				  rgba[3] = alpha; // 0..1
+			  }
+			  ImageStack stack = sdfa_instance.makeStack(
+					  rgba,       // iclt_data,
+					  width,      // (tilesX + 0) * clt_parameters.transform_size,
+					  height,     // (tilesY + 0) * clt_parameters.transform_size,
+					  titles,     // or use null to get chn-nn slice names
+					  true);      // replace NaN with 0.0
+			  ImagePlus imp_rgba =  EyesisCorrections.convertRGBAFloatToRGBA32(
+					  stack,   // ImageStack stackFloat, //r,g,b,a
+					  //						name+"ARGB"+suffix, // String title,
+					  name+suffix, // String title,
+					  0.0,   // double r_min,
+					  255.0, // double r_max,
+					  0.0,   // double g_min,
+					  255.0, // double g_max,
+					  0.0,   // double b_min,
+					  255.0, // double b_max,
+					  0.0,   // double alpha_min,
+					  1.0);  // double alpha_max)
+			  return imp_rgba;
+		  }
 		  ImageStack stack = sdfa_instance.makeStack(
-				  rgb_in, // iclt_data,
+//				  rgb_in, // iclt_data,
+				  rbg_in, // iclt_data,
 				  width,  // (tilesX + 0) * clt_parameters.transform_size,
 				  height, // (tilesY + 0) * clt_parameters.transform_size,
 				  sliceNames,  // or use null to get chn-nn slice names
@@ -5375,8 +5437,6 @@ public class QuadCLT {
 				  height, // int height, // int tilesY,
 				  scaleExposure, // double scaleExposure,
 				  debugLevel); //int debugLevel
 	  }
 	  // double data
@@ -5460,16 +5520,16 @@ public class QuadCLT {
 					  true);      // replace NaN with 0.0
 			  ImagePlus imp_rgba =  EyesisCorrections.convertRGBAFloatToRGBA32(
 					  stack,   // ImageStack stackFloat, //r,g,b,a
-//						name+"ARGB"+suffix, // String title,
+					  //						name+"ARGB"+suffix, // String title,
-						name+suffix, // String title,
+					  name+suffix, // String title,
-						0.0,   // double r_min,
+					  0.0,   // double r_min,
-						255.0, // double r_max,
+					  255.0, // double r_max,
-						0.0,   // double g_min,
+					  0.0,   // double g_min,
-						255.0, // double g_max,
+					  255.0, // double g_max,
-						0.0,   // double b_min,
+					  0.0,   // double b_min,
-						255.0, // double b_max,
+					  255.0, // double b_max,
-						0.0,   // double alpha_min,
+					  0.0,   // double alpha_min,
-						1.0);  // double alpha_max)
+					  1.0);  // double alpha_max)
 			  return imp_rgba;
 		  }
@@ -5499,10 +5559,13 @@ public class QuadCLT {
 				  debugLevel); //int debugLevel
 	  }
 	  // Convert a single value pixels to color (r,b,g) values to be processed instead of the normal colors
-	  public ImagePlus linearStackToColor( // USED in lwir
+	  public ImagePlus linearStackToColor(
 			  CLTParameters         clt_parameters,
 			  ColorProcParameters   colorProcParameters,
 			  EyesisCorrectionParameters.RGBParameters         rgbParameters,

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2061,21 +2061,36 @@ public class TwoQuadCLT {
 				clt_parameters.diff_threshold, // double    diff_threshold,     // pixel value/pixel change
 				clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 				clt_parameters.dust_remove,    // boolean   dust_remove,
-				clt_parameters.keep_weights);  // boolean   keep_weights); // int corr_radius
+				clt_parameters.keep_weights);  // boolean   keep_weights);
 		long endTextures = System.nanoTime();
+// run texturesRBGA
+		long startTexturesRBGA = System.nanoTime();   // System.nanoTime();
+		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
+				port_offsets,                  // double [][] port_offsets,
+				col_weights,                   // double [] color_weights,
+				quadCLT_main.isLwir(),         // boolean   is_lwir,
+				clt_parameters.min_shot,       // double    min_shot,           // 10.0
+				clt_parameters.scale_shot,     // double    scale_shot,         // 3.0
+				clt_parameters.diff_sigma,     // double    diff_sigma,         // pixel value/pixel change
+				clt_parameters.diff_threshold, // double    diff_threshold,     // pixel value/pixel change
+				clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+				clt_parameters.dust_remove);   // boolean   dust_remove,
+		long endTexturesRBGA = System.nanoTime();
 		long endGPUTime = System.nanoTime();
-		long firstGPUTime= (startIMCLT- startGPU)/NREPEAT;
+		long firstGPUTime=         (startIMCLT-       startGPU)         /NREPEAT;
-		long runImcltTime = (endImcltTime - startIMCLT)/NREPEAT;
+		long runImcltTime =        (endImcltTime -    startIMCLT)       /NREPEAT;
-		long runCorr2DTime = (endCorr2d - startCorr2d)/NREPEAT;
+		long runCorr2DTime =       (endCorr2d -       startCorr2d)      /NREPEAT;
-		long runTexturesTime = (endTextures - startTextures)/NREPEAT;
+		long runTexturesTime =     (endTextures -     startTextures)    /NREPEAT;
-		long runGPUTime = (endGPUTime - startGPU)/NREPEAT;
+		long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA)/NREPEAT;
+		long runGPUTime =          (endGPUTime -      startGPU)         /NREPEAT;
 		// run corr2d
 		System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
 		System.out.println("GPU run time ="+(runGPUTime * 1.0e-6)+"ms, (direct conversion: "+(firstGPUTime*1.0e-6)+"ms, imclt: "+
-				(runImcltTime*1.0e-6)+"ms), corr2D: "+(runCorr2DTime*1.0e-6)+"ms), textures: "+(runTexturesTime*1.0e-6)+"ms");
+				(runImcltTime*1.0e-6)+"ms), corr2D: "+(runCorr2DTime*1.0e-6)+"ms), textures: "+(runTexturesTime*1.0e-6)+"ms, RGBA: "+
+				(runTexturesRBGATime*1.0e-6)+"ms");
 		// get data back from GPU
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
@@ -2190,6 +2205,41 @@ public class TwoQuadCLT {
 					debugLevel);
 		}
+		// Use GPU prepared RBGA
+		if (clt_parameters.show_rgba_color) {
+			Rectangle woi = new Rectangle();
+			float [][] rbga = gPUTileProcessor.getRBGA(
+					(is_mono?1:3), // int     num_colors,
+					woi);
+			// for now - use just RGB. Later add option for RGBA
+			float [][] rgb_main = {rbga[0],rbga[1],rbga[2]};
+			float [][] rgba_main = {rbga[0],rbga[1],rbga[2],rbga[3]};
+			ImagePlus imp_rgba_main = quadCLT_main.linearStackToColor(
+					clt_parameters,
+					colorProcParameters,
+					rgbParameters,
+					name+"-texture", // String name,
+					"-D"+clt_parameters.disparity+"-MAINGPU", //String suffix, // such as disparity=...
+					toRGB,
+					!quadCLT_main.correctionsParameters.jpeg, // boolean bpp16, // 16-bit per channel color mode for result
+					false, // true, // boolean saveShowIntermediate, // save/show if set globally
+					false, // true, // boolean saveShowFinal,        // save/show result (color image?)
+					((clt_parameters.alpha1 > 0)? rgba_main: rgb_main),
+					tilesX *  image_dtt.transform_size,
+					tilesY *  image_dtt.transform_size,
+					1.0,         // double scaleExposure, // is it needed?
+					debugLevel );
+			int width = imp_rgba_main.getWidth();
+			int height =imp_rgba_main.getHeight();
+			ImageStack texture_stack=new ImageStack(width,height);
+			texture_stack.addSlice("main",      imp_rgba_main.getProcessor().getPixels()); // single slice
+			ImagePlus imp_texture_stack = new ImagePlus(name+"-RGBA-D"+clt_parameters.disparity, texture_stack);
+			imp_texture_stack.getProcessor().resetMinAndMax();
+			imp_texture_stack.show();
+		}
+		// convert textures to RGBA in Java
 		if (clt_parameters.show_rgba_color) {
 			int numcol = quadCLT_main.isMonochrome()?1:3;
 			int ports = imp_quad_main.length;
@@ -2311,6 +2361,8 @@ public class TwoQuadCLT {
 		}
 		return results;
 	}

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -72,8 +72,8 @@
 #define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list
-#undef HAS_PRINTF
+//#undef HAS_PRINTF
-//#define HAS_PRINTF
+#define HAS_PRINTF
 //7
 //#define DEBUG1 1
 //#define DEBUG2 1
@@ -87,7 +87,8 @@
 #define DEBUG9 1
 */
 #define DEBUG10 1
+#define DEBUG11 1
+#define DEBUG12 1
 //#define USE_textures_gen
 #endif //#ifndef JCUDA
@@ -1533,10 +1534,15 @@ __global__ void generate_RBGA(
 	 int texture_slices =       colors + 1;
 	 if (threadIdx.x == 0) {
+		 //DTT_SIZE_LOG2
+//		    dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
+//		    int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
+//		    dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
 		    dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
-		    int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
+		    int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
 		    dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
-		    clear_texture_rbga<<<blocks2,threads2>>>(
+		    clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
 		    		texture_width,
 					texture_tiles_height * texture_slices, // int               texture_slice_height,
 					texture_rbga_stride,                   // const size_t      texture_rbga_stride,     // in floats 8*stride
@@ -1547,12 +1553,23 @@ __global__ void generate_RBGA(
 			for (int pass = 0; pass < 8; pass++){
 			    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
 			    int border_tile =  pass >> 2;
-			    size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
+			    int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
 			    dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
 			    int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2));  // 1/4
 			    if (border_tile){
 			    	ti_offset += TILESX * (TILESYA >> 2) - ntt;
 			    }
+#ifdef DEBUG12
+				printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
+						pass, border_tile,ti_offset, ntt);
+				printf("\ngenerate_RBGA() gpu_texture_indices= 0x%x, gpu_texture_indices + ti_offset=0x%x\n",
+						(int) gpu_texture_indices, (int) (gpu_texture_indices + ti_offset));
+				printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
+						grid_texture.x, grid_texture.y, grid_texture.z);
+				printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
+						threads_texture.x, threads_texture.y, threads_texture.z);
+				printf("\n");
+#endif
 			    /* */
 			    textures_accumulate<<<grid_texture,threads_texture>>>(
 			    		border_tile,                     // int               border_tile,        // if 1 - watch for border
@@ -1578,9 +1595,8 @@ __global__ void generate_RBGA(
 						gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
 			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
 						gpu_texture_tiles); // (float *) 0 );                   // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+				cudaDeviceSynchronize(); // not needed yet, just for testing
 				/* */
 			}
 	 }
@@ -1590,21 +1606,20 @@ __global__ void generate_RBGA(
 // blockDim.x * gridDim.x >= width
 extern "C" __global__ void clear_texture_rbga(
-		int               texture_width,
+		int               texture_width, // aligned to DTT_SIZE
 		int               texture_slice_height,
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
 {
-	int col = blockDim.x * blockIdx.x + threadIdx.x;
+	int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
 	if (col > texture_width) {
 		return;
 	}
-	int row = (blockIdx.y << 3); // includes slices
+	int row = blockIdx.y;; // includes slices
 	float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
 #pragma unroll
 	for (int n = 0; n < DTT_SIZE; n++) {
-		*(pix) = 0.0;
+		*(pix++) = 0.0;
-		pix += texture_rbga_stride;
 	}
 }
@@ -1778,26 +1793,51 @@ __global__ void gen_texture_list(
 	int cxy = gpu_tasks[task_num].txy;
 	int x = (cxy & 0xffff);
 	int y = (cxy >> 16);
+#ifdef DEBUG12
+		if ((x == DBG_TILE_X)  && (y == DBG_TILE_Y)){
+			printf("\ngen_texture_list() x = %d, y= %d\n",x, y);
+			printf("\ngen_texture_list() num_texture_tiles = %d(%d) %d(%d) %d(%d) %d(%d)\n",
+					num_texture_tiles[0],num_texture_tiles[1],num_texture_tiles[2],num_texture_tiles[3],
+					num_texture_tiles[4],num_texture_tiles[5],num_texture_tiles[6],num_texture_tiles[7]);
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG12
 //	int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
 	// don't care if calculate extra pixels that still fit into memory
 	int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
+	int buff_head = 0;
+	int num_offset = 0;
 	if (x & 1) {
-		gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
+		buff_head += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
-		num_texture_tiles += 2; // int *
+		num_offset += 2; // int *
 	}
 	if (y & 1) {
-		gpu_texture_indices += TILESX * (TILESYA >> 1);
+		buff_head += TILESX * (TILESYA >> 1);
-		num_texture_tiles += 4; // int *
+		num_offset += 4; // int *
 	}
 	if (is_border){
-		gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
+		buff_head += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
-		num_texture_tiles += 1; // int *
+		num_offset += 1; // int *
 	}
+	gpu_texture_indices += buff_head;
+	num_texture_tiles += num_offset;
 	// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
 	int buf_offset = atomicAdd(num_texture_tiles, 1);
 	if (is_border){
 		buf_offset = -buf_offset;
 	}
+#ifdef DEBUG12
+	if ((x == DBG_TILE_X)  && (y == DBG_TILE_Y)){
+		printf("\ngen_texture_list() buff_head=%d, buf_offset = %d, num_offset= %d, is_border=%d\n",
+				buff_head, buf_offset, num_offset,is_border);
+		printf("\ngen_texture_list() gpu_texture_indices = 0x%x,  gpu_texture_indices + buf_offset = 0x%x\n",
+				 (int) gpu_texture_indices, (int) (gpu_texture_indices + buf_offset));
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG12
 	*(gpu_texture_indices + buf_offset) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 }
@@ -2420,7 +2460,7 @@ __global__ void textures_accumulate(
 		}
 #ifdef DEBUG7
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
+			printf("\textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
 		}
 		__syncthreads();// __syncwarp();
 #endif
@@ -2432,6 +2472,20 @@ __global__ void textures_accumulate(
 	}
 	if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
+#ifdef DEBUG12
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
+					tile_num, (int) tile_code, border_tile);
+			for (int ncol = 0; ncol <= colors; ncol++) {
+				printf("\ntile[%d]\n",ncol);
+				debug_print_mclt(
+						(float *) (shr1.rgbaw[ncol]),
+						-1);
+			}
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG12
 		if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
 			for (int pass = 0; pass < 8; pass ++) {
 				int row = pass * 2 + (threadIdx.y >> 1);
@@ -2453,12 +2507,26 @@ __global__ void textures_accumulate(
 				}
 			}
 		}
-		int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
+		int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color
 		int tileY = tile_num / TILESX; // slow, but 1 per tile
 		int tileX = tile_num - tileY * TILESX;
 		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
 		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
+#ifdef DEBUG12
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
+					tileX, tileY, tile_x0, tile_y0, slice_stride);
+			for (int ncol = 0; ncol <= colors; ncol++) {
+				printf("\ntile[%d]\n",ncol);
+				debug_print_mclt(
+						(float *) (shr1.rgbaw[ncol]),
+						-1);
+			}
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG12
 		for (int pass = 0; pass < 8; pass ++) {
 			int row = pass * 2 + (threadIdx.y >> 1);          // row    inside a tile (0..15)
 			int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
@@ -2468,6 +2536,15 @@ __global__ void textures_accumulate(
 			int gi = g_row * texture_rbg_stride  + g_col; // offset to the top left corner
 			float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
 			float * rgba_i = ((float *) shr1.rgbaw) + i;
+#ifdef DEBUG12
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
+					pass, row, col, g_row, g_col, i, gi);
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG12
 			if (!border_tile ||
 					((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
 				// always copy 3 (1) colors + alpha

--- a/src/main/resources/kernels/dtt8x8.cuh
+++ b/src/main/resources/kernels/dtt8x8.cuh
@@ -45,9 +45,11 @@
 * with Nvidia Nsight, driver API when calling these kernels from Java
 */
 #ifndef JCUDA
-#define DTT_SIZE                      8
+#define DTT_SIZE_LOG2                 3
+//#define DTT_SIZE                      8
 #endif
 #pragma once
+#define DTT_SIZE                     (1 << DTT_SIZE_LOG2)
 #define DTTTEST_BLOCK_WIDTH          32
 #define DTTTEST_BLOCK_HEIGHT         16
 #define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1)