added variable LPF through GPU constants memory

1338de2c · Andrey Filippov · 72b6bdce · 1338de2c · 1338de2c
Commit 1338de2c authored Oct 09, 2018 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 101 additions and 302 deletions

GPUTileProcessor.java src/main/java/GPUTileProcessor.java +85 -280

TwoQuadCLT.java src/main/java/TwoQuadCLT.java +16 -22

No files found.
--- a/src/main/java/GPUTileProcessor.java
+++ b/src/main/java/GPUTileProcessor.java
@@ -33,10 +33,10 @@ import static jcuda.driver.JCudaDriver.cuInit;
 import static jcuda.driver.JCudaDriver.cuLaunchKernel;
 import static jcuda.driver.JCudaDriver.cuMemAlloc;
 import static jcuda.driver.JCudaDriver.cuMemAllocPitch;
-import static jcuda.driver.JCudaDriver.cuMemFree;
 import static jcuda.driver.JCudaDriver.cuMemcpy2D;
 import static jcuda.driver.JCudaDriver.cuMemcpyHtoD;
 import static jcuda.driver.JCudaDriver.cuModuleGetFunction;
+import static jcuda.driver.JCudaDriver.cuModuleGetGlobal;
 import static jcuda.driver.JCudaDriver.cuModuleLoadData;
 import static jcuda.nvrtc.JNvrtc.nvrtcCompileProgram;
 import static jcuda.nvrtc.JNvrtc.nvrtcCreateProgram;
@@ -69,8 +69,8 @@ import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
 	static String GPU_KERNEL_FILE = "dtt8x8.cuh";
 	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
-	static String GPU_DTT24_NAME =  "GPU_DTT24_DRV";       // this.kernelFunction = createFunction(sourceCode, "GPU_DTT24_DRV"); // "invert");
 	static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles";
+	static String GPU_IMCLT_RBG_NAME = "imclt_rbg";
 //  pass some defines to gpu source code with #ifdef JCUDA
 	static int DTT_SIZE =                8;
 	static int THREADSX =         DTT_SIZE;
@@ -92,38 +92,12 @@ public class GPUTileProcessor {
 	static int KERN_SIZE =  KERN_TILES * 4 * 64;
-	/*
-extern "C"
-__global__ void convert_correct_tiles(
-		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS],
-		float           ** gpu_kernels,        // [NUM_CAMS],
-		float           ** gpu_images,         // [NUM_CAMS],
-		struct tp_task  * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		size_t            dstride,             // in floats (pixels)
-		int               num_tiles,           // number of tiles in task
-		int               lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
-	 */
-	static String GPU_IMCLT_RBG_NAME = "imclt_rbg";
-	/*
-extern "C"
-__global__ void imclt_rbg(
-		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
-		int               color,
-		int               v_offset,
-		int               h_offset,
-		const size_t      dstride)            // in floats (pixels)
-	 */
    int DTTTEST_BLOCK_WIDTH =        32; // may be read from the source code
    int DTTTEST_BLOCK_HEIGHT =       16; // may be read from the source code
    public boolean kernels_set = false;
    public boolean bayer_set =   false;
-    private CUfunction GPU_DTT24_kernel =                 null;
    private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null;
    private CUfunction GPU_IMCLT_RBG_kernel =             null;
    // CPU arrays of pointers to GPU memory
@@ -143,18 +117,17 @@ __global__ void imclt_rbg(
    private CUdeviceptr gpu_tasks =               new CUdeviceptr();
    private CUdeviceptr gpu_clt =                 new CUdeviceptr();
+//    private
+    CUmodule    module; // to access constants memory
 //    private CUdeviceptr gpu_lpf =            new CUdeviceptr();
    private int mclt_stride;
    private int imclt_stride;
    public int num_task_tiles;
-    /*
-    */
    public class TpTask {
    	public int   task;
    	public float target_disparity;
-//    	int   txy;
    	public int ty;
    	public int tx;
    	public float[][] xy = null;
@@ -167,27 +140,13 @@ __global__ void imclt_rbg(
    		this.target_disparity = target_disparity;
    		this.task = task;
    	}
-    	/*
-    	public TpTask(int main, int aux, float [] flt, int indx) {
-    		if (main > 0) xy =     new float[main][2];
-    		if (aux > 0)  xy_aux = new float[main][2];
-    		task =     Float.floatToRawIntBits(flt[indx++]);
-    		int txy =  Float.floatToRawIntBits(flt[indx++]);
-    		tx = txy & 0xffff;
-    		ty = txy >> 16;
-    		for (int i = 0; i < xy.length; i++) {
-    			xy[i][0] = flt[indx++];
-    			xy[i][1] = flt[indx++];
-    		}
-    	}
-    	*/
-    	// convert this class to float array to match layout of the C struct
+    	// convert this class instance to float array to match layout of the C struct
    	public float [] asFloatArray(boolean use_aux) {
    		float [] flt = new float [NUM_CAMS * 2 + 2];
    		return asFloatArray(flt, 0, use_aux);
    	}
-    	// convert this class to float array to match layout of the C struct,
+    	// convert this class instance to float array to match layout of the C struct,
    	// fill existing float array from the specified index
    	public float [] asFloatArray(float [] flt, int indx, boolean use_aux) {
    		flt[indx++] = Float.intBitsToFloat(task);
@@ -258,8 +217,6 @@ __global__ void imclt_rbg(
    	}
    };
    private static long getPointerAddress(CUdeviceptr p)
    {
@@ -278,50 +235,6 @@ __global__ void imclt_rbg(
        return new PointerWithAddress(p).getAddress();
    }
-/*
- *
-struct tp_task {
-	int   task;
-	int   txy;
-//	short ty;
-//	short tx;
-	float xy[NUM_CAMS][2];
-};
-struct CltExtra{
-	float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
-	float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
-	float center_x; // actual center X (use to find derivatives)
-	float center_y; // actual center X (use to find derivatives)
-	float dxc_dx;   // add this to data_x per each pixel X-shift relative to the kernel center location
-	float dxc_dy;   // same per each Y-shift pixel
-	float dyc_dx;
-	float dyc_dy;
-};
- *
- *
- intBitsToFloat(int)
-public static int floatToRawIntBits(float value)
-public static float intBitsToFloat(int bits)
-    // host array of pointers to GPU memory
-    float            * gpu_kernels_h        [NUM_CAMS];
-    struct CltExtra  * gpu_kernel_offsets_h [NUM_CAMS];
-    float            * gpu_images_h         [NUM_CAMS];
-    float              tile_coords_h        [NUM_CAMS][TILESX * TILESY][2];
-    float            * gpu_clt_h            [NUM_CAMS];
-    float            * gpu_lpf_h            [NUM_COLORS];
-#ifndef NOICLT
-    float            * gpu_corr_images_h    [NUM_CAMS];
-#endif
-    // GPU pointers to GPU pointers to memory
-    float           ** gpu_kernels; //           [NUM_CAMS];
-    struct CltExtra ** gpu_kernel_offsets; //    [NUM_CAMS];
-    float           ** gpu_images; //            [NUM_CAMS];
-    float           ** gpu_clt;    //           [NUM_CAMS];
-    float           ** gpu_lpf;    //           [NUM_CAMS];
- */
    public GPUTileProcessor() throws IOException
    {
    	// From code by Marco Hutter - http://www.jcuda.org
@@ -356,21 +269,7 @@ public static float intBitsToFloat(int bits)
        				"#define TILES_PER_BLOCK " +        TILES_PER_BLOCK+"\n"+
        				"#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+
        				"#define IMCLT_TILES_PER_BLOCK " +  IMCLT_TILES_PER_BLOCK+"\n";
-/*
-#define THREADSX         (DTT_SIZE)
-#define IMG_WIDTH       2592
-#define IMG_HEIGHT      1936
-#define KERNELS_HOR      164
-#define KERNELS_VERT     123
-#define NUM_CAMS           4
-#define NUM_COLORS         3
-#define KERNELS_LSTEP      4
-#define THREADS_PER_TILE   8
-#define TILES_PER_BLOCK    4
-#define IMCLT_THREADS_PER_TILE 16
-#define IMCLT_TILES_PER_BLOCK   4
- */
        for (String src_file:GPU_KERNEL_FILES) {
        	File file = new File(classLoader.getResource(src_file).getFile());
        	System.out.println(file.getAbsolutePath());
@@ -385,14 +284,10 @@ public static float intBitsToFloat(int bits)
        }
        // Create the kernel functions (first - just test)
-//        this.GPU_DTT24_kernel = createFunction(kernelSource, GPU_DTT24_NAME);
+        String [] func_names = {GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME};
-        String [] func_names = {GPU_DTT24_NAME, GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME};
        CUfunction[] functions = createFunctions(kernelSource, func_names);
-        this.GPU_DTT24_kernel =                 functions[0];
+        this.GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
-        this.GPU_CONVERT_CORRECT_TILES_kernel = functions[1];
+        this.GPU_IMCLT_RBG_kernel =             functions[1];
-        this.GPU_IMCLT_RBG_kernel =             functions[2];
-//        this.GPU_CONVERT_CORRECT_TILES = createFunction(kernelSource, GPU_CONVERT_CORRECT_TILES_NAME);
- //       this.GPU_IMCLT_RBG =             createFunction(kernelSource, GPU_IMCLT_RBG_NAME);
        System.out.println("GPU kernel functions initialized");
        System.out.println("Sizeof.POINTER="+Sizeof.POINTER);
        System.out.println(GPU_IMCLT_RBG_kernel.toString());
@@ -426,7 +321,6 @@ public static float intBitsToFloat(int bits)
            gpu_clt_h[ncam] = new CUdeviceptr();
        	cuMemAlloc(gpu_clt_h[ncam],tilesY * tilesX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT ); //     public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
-            //gpu_clt_h
        }
        // now create device arrays pointers
        if (Sizeof.POINTER != Sizeof.LONG) {
@@ -455,11 +349,9 @@ public static float intBitsToFloat(int bits)
        for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_clt_l[ncam] =            getPointerAddress(gpu_clt_h[ncam]);
        cuMemcpyHtoD(gpu_clt, Pointer.to(gpu_clt_l),                             NUM_CAMS * Sizeof.POINTER);
        // Set task array
    	cuMemAlloc(gpu_tasks,       tilesX * tilesY * TPTASK_SIZE * Sizeof.POINTER);
-//TPTASK_SIZE
    }
@@ -551,7 +443,6 @@ public static float intBitsToFloat(int bits)
        copyH2D.Height =          IMG_HEIGHT; // /4;
        cuMemcpy2D(copyH2D);
    }
    // combines 3 bayer channels into one and transfers to GPU memory
    public void setBayerImages(
 			double [][][]       bayer_data,
@@ -571,9 +462,6 @@ public static float intBitsToFloat(int bits)
 		bayer_set = true;
    }
    // prepare tasks for full frame, same dispaity.
    // need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
    public TpTask [] setFullFrameImages(
@@ -661,6 +549,7 @@ public static float intBitsToFloat(int bits)
    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize(); // remove later
    }
    public void execImcltRbg() {
@@ -695,11 +584,11 @@ public static float intBitsToFloat(int bits)
 								ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
 								0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
 								kernelParameters, null);   // Kernel- and extra parameters
-//						System.out.println("ncam = "+ncam+", color="+color+", v_offs="+v_offs+", h_offs="+h_offs);
    				}
    			}
    		}
    	}
+    	cuCtxSynchronize();
    }
    float [][] getRBG (int ncam){
@@ -731,159 +620,8 @@ public static float intBitsToFloat(int bits)
        return fimg;
    }
+//    private static CUfunction [] createFunctions(
+      private CUfunction [] createFunctions(
-    /*
-    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    	checkCudaErrors(cudaMemcpy2D( // segfault
-    			cpu_corr_image,
-				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
-				gpu_corr_images_h[ncam],
-				dstride_rslt,
-				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
-				3* (IMG_HEIGHT + DTT_SIZE),
-    			cudaMemcpyDeviceToHost));
-        printf("Writing RBG data to %s\n",  result_rbg_file[ncam]);
-    	writeFloatsToFile( // will have margins
-    			cpu_corr_image, // float *       data, // allocated array
-				rslt_img_size, // int           size, // length in elements
-				result_rbg_file[ncam]); // 			   const char *  path) // file path
-    }
-     */
-    // run kernel with dttx
-    public void exec_dtt24(float src_pixels[],float dst_pixels[], int width, int dtt_mode)
-    {
-        if (GPU_DTT24_kernel == null)
-        {
-            IJ.showMessage("Error", "No GPU kernel");
-            return;
-        }
-        int height = src_pixels.length / width;
-        long width_in_bytes = width * Sizeof.FLOAT;
-        CUdeviceptr src_dpointer = new CUdeviceptr();
-        CUdeviceptr dst_dpointer = new CUdeviceptr();
-        long [] device_stride = new long [1];
-        cuMemAllocPitch (
-        		src_dpointer,   // CUdeviceptr dptr,
-        		device_stride,  // long[] pPitch,
-        		width_in_bytes, // long WidthInBytes,
-        		height,         // long Height,
-                Sizeof.FLOAT);  // int ElementSizeBytes)
-        int pitchInElements = (int)(device_stride[0] / Sizeof.FLOAT);
-        cuMemAllocPitch (
-        		dst_dpointer,   // CUdeviceptr dptr,
-        		device_stride,  // long[] pPitch,
-        		width_in_bytes, // long WidthInBytes,
-        		height,         // long Height,
-                Sizeof.FLOAT);  // int ElementSizeBytes)
-        CUDA_MEMCPY2D copyH2D =   new CUDA_MEMCPY2D();
-        copyH2D.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
-        copyH2D.srcHost =         Pointer.to(src_pixels);
-        copyH2D.srcPitch =        width_in_bytes;
-        copyH2D.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
-        copyH2D.dstDevice =       src_dpointer;
-        copyH2D.dstPitch =        device_stride[0];
-        copyH2D.WidthInBytes =    width_in_bytes;
-        copyH2D.Height =          height; // /4;
-// for copying results to host
-        CUDA_MEMCPY2D copyD2H =   new CUDA_MEMCPY2D();
-        copyD2H.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
-        copyD2H.srcDevice =       dst_dpointer; // ((test & 1) ==0) ? src_dpointer : dst_dpointer; // copy same data
-        copyD2H.srcPitch =        device_stride[0];
-        copyD2H.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
-        copyD2H.dstHost =         Pointer.to(dst_pixels);
-        copyD2H.dstPitch =        width_in_bytes;
-        copyD2H.WidthInBytes =    width_in_bytes;
-        copyD2H.Height =          height; // /2;
-        // kernel parameters: pointer to pointers
-        Pointer kernelParameters = Pointer.to(
-            Pointer.to(dst_dpointer),
-            Pointer.to(src_dpointer),
-            Pointer.to(new int[] { pitchInElements }),
-            Pointer.to(new int[] { dtt_mode })
-        );
-        int [] GridFullWarps =    {width / DTTTEST_BLOCK_WIDTH, height / DTTTEST_BLOCK_HEIGHT, 1};
-        int [] ThreadsFullWarps = {DTT_SIZE, DTTTEST_BLOCK_WIDTH/DTT_SIZE, DTTTEST_BLOCK_HEIGHT/DTT_SIZE};
-        // Actual work starts here:
-        cuMemcpy2D(copyH2D);
-        cuCtxSynchronize();
-        	// Call the kernel function
-    	cuLaunchKernel(GPU_DTT24_kernel,
-    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],  // Grid dimension
-    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
-    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
-    			kernelParameters, null);   // Kernel- and extra parameters
-        // Copy the data from the device to the host
-        cuMemcpy2D(copyD2H);
-        // clean up
-        cuMemFree(src_dpointer);
-        cuMemFree(dst_dpointer);
-    }
-    /**
-     * Create the kernel function by its name in the  source code
-     * @param sourceCode The source code
-     * @param kernelName The kernel function name
-     * @return
-     * @throws IOException
-     */
-    /*
-    private static CUfunction createFunction(
-        String sourceCode, String kernelName) throws IOException
-    {
-    	boolean OK = false;
-        // Use the NVRTC to create a program by compiling the source code
-        nvrtcProgram program = new nvrtcProgram();
-        nvrtcCreateProgram(
-            program, sourceCode, null, 0, null, null);
-        try {
-        	nvrtcCompileProgram(program, 0, null);
-        	OK = true;
-    	} catch (Exception e) {
-    		System.out.println("nvrtcCompileProgram() FAILED");
-    	}
-        // Compilation log with errors/warnongs
-        String programLog[] = new String[1];
-        nvrtcGetProgramLog(program, programLog);
-        String log = programLog[0].trim();
-        if (!log.isEmpty())
-        {
-            System.err.println("Program compilation log:\n" + log);
-        }
-        if (!OK) {
-        	throw new IOException("Could not compile program");
-        }
-        // Get the PTX code of the compiled program (not the binary)
-        String[] ptx = new String[1];
-        nvrtcGetPTX(program, ptx);
-        nvrtcDestroyProgram(program);
-        // Create a CUDA module from the PTX code
-        CUmodule module = new CUmodule();
-        cuModuleLoadData(module, ptx[0]);
-        // Find the function in the source by name, get its pointer
-        CUfunction function = new CUfunction();
-        cuModuleGetFunction(function, module, kernelName);
-        return function;
-    }
-*/
-    private static CUfunction [] createFunctions(
            String sourceCode, String [] kernelNames) throws IOException
        {
    	CUfunction [] functions = new CUfunction [kernelNames.length];
@@ -916,7 +654,8 @@ public static float intBitsToFloat(int bits)
            nvrtcDestroyProgram(program);
            // Create a CUDA module from the PTX code
-            CUmodule module = new CUmodule();
+//            CUmodule
+            module = new CUmodule();
            cuModuleLoadData(module, ptx[0]);
            for (int i = 0; i < kernelNames.length; i++) {
@@ -928,9 +667,6 @@ public static float intBitsToFloat(int bits)
            return functions;
        }
    static String readFileAsString(String path)
    {
    	byte[] encoded;
@@ -1036,6 +772,75 @@ public static float intBitsToFloat(int bits)
 		ImageDtt.startAndJoin(threads);
 	}
+	public void setLpfRbg(
+			float sigma_r,
+			float sigma_b,
+			float sigma_g)
+	{
+		int dct_size = DTT_SIZE;
+		DttRad2 dtt = new DttRad2(dct_size);
+		double [][] lpf_rbg = {
+				dtt.dttt_iiie(setCltLpf(sigma_r)),
+				dtt.dttt_iiie(setCltLpf(sigma_b)),
+				dtt.dttt_iiie(setCltLpf(sigma_g))};
+		int l = dct_size*dct_size;
+		float []   lpf_flat = new float [3 * l];
+		for (int i = 0; i < 3;i++) {
+//			System.arraycopy(lpf_rbg[i], 0, lpf_flat, l* i, l);
+			for (int j = 0; j < l; j++) {
+				lpf_flat[j+i*l] = (float) (lpf_rbg[i][j]*2*dct_size);
+			}
+		}
+		CUdeviceptr constantMemoryPointer = new CUdeviceptr();
+		long constantMemorySizeArray[] = { 0 };
+		cuModuleGetGlobal(constantMemoryPointer, constantMemorySizeArray,  module, "lpf_data");
+		int constantMemorySize = (int)constantMemorySizeArray[0];
+//__constant__ float lpf_data[3][64]={
+		System.out.println("constantMemoryPointer: " + constantMemoryPointer);
+		System.out.println("constantMemorySize: " + constantMemorySize);
+        cuMemcpyHtoD(constantMemoryPointer, Pointer.to(lpf_flat), constantMemorySize);
+		System.out.println();
+	}
+	public double [] setCltLpf(
+			double   sigma)
+	{
+			int dct_size = DTT_SIZE;
+		double [] lpf = new double [dct_size*dct_size];
+		int dct_len = dct_size * dct_size;
+		if (sigma == 0.0f) {
+			lpf[0] = 1.0f;
+			for (int i = 1; i < dct_len; i++){
+				lpf[i] = 0.0f;
+			}
+		} else {
+			for (int i = 0; i < dct_size; i++){
+				for (int j = 0; j < dct_size; j++){
+					lpf[i*dct_size+j] = (float) Math.exp(-(i*i+j*j)/(2*sigma));
+				}
+			}
+			// normalize
+			double sum = 0;
+			for (int i = 0; i < dct_size; i++){
+				for (int j = 0; j < dct_size; j++){
+					double d = 	lpf[i*dct_size+j];
+					d*=Math.cos(Math.PI*i/(2*dct_size))*Math.cos(Math.PI*j/(2*dct_size));
+					if (i > 0) d*= 2.0;
+					if (j > 0) d*= 2.0;
+					sum +=d;
+				}
+			}
+			for (int i = 0; i< dct_len; i++){
+				lpf[i] /= sum;
+			}
+		}
+		return lpf;
+	}

--- a/src/main/java/TwoQuadCLT.java
+++ b/src/main/java/TwoQuadCLT.java
@@ -1355,27 +1355,7 @@ public class TwoQuadCLT {
 				dst_bayer[nc][i]= nc*main_bayer[nc].length + i;
 			}
 		}
-/*
-		int iwidth = imp_quad_main[0].getWidth();
-		String [] dbg_titles= {"src0","dst0","src1","dst1","src2","dst2","src3","dst3"};
-		for (int nc = 0; nc < main_bayer.length; nc++) {
-			gPUTileProcessor.exec_dtt24(
-					main_bayer[nc], // float src_pixels[],
-					dst_bayer[nc], // float dst_pixels[],
-					iwidth, // int width,
-					0); // int dtt_mode);
-		}
-		float [][] both_bayer = {main_bayer[0],dst_bayer[0],main_bayer[1],dst_bayer[1],main_bayer[2],dst_bayer[2],main_bayer[3],dst_bayer[3]};
-		(new showDoubleFloatArrays()).showArrays(
-				both_bayer,
-				iwidth,
-				main_bayer[0].length / iwidth,
-				true,
-				"converted",
-				dbg_titles);
-*/
 		double [][][]       port_xy_main_dbg = new double [tilesX*tilesY][][];
 		double [][][]       port_xy_aux_dbg = new double [tilesX*tilesY][][];
@@ -1832,6 +1812,12 @@ public class TwoQuadCLT {
 			final boolean    updateStatus,
 			final int        debugLevel){
+		gPUTileProcessor.setLpfRbg(
+				1.1f,  // float sigma_r,
+				1.1f,  // float sigma_b,
+				0.7f); // float sigma_g)
 		final boolean use_aux = false; // currently GPU is configured for a single quad camera
 		final boolean      batch_mode = clt_parameters.batch_run; //disable any debug images
@@ -1889,9 +1875,17 @@ public class TwoQuadCLT {
 				use_aux); // boolean use_aux)
 		// All set, run kernel (correct and convert)
-		gPUTileProcessor.execConverCorrectTiles();
+		int NREPEAT = 1; // 00;
+		System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
+		long startGPU=System.nanoTime();
+		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execConverCorrectTiles();
 		// run imclt;
-		gPUTileProcessor.execImcltRbg();
+		long firstGPUTime= (System.nanoTime() - startGPU)/NREPEAT;
+		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execImcltRbg();
+		long runGPUTime = (System.nanoTime() - startGPU)/NREPEAT;
+		System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
+		System.out.println("GPU run time ="+(runGPUTime * 1.0e-6)+"ms, (direct conversion: "+(firstGPUTime*1.0e-6)+"ms, imclt: "+
+				((runGPUTime - firstGPUTime)*1.0e-6)+"ms)");
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {