converting to multi-cam

58d01fd2 · Andrey Filippov · 6a300e5a · 58d01fd2 · 58d01fd2 · 58d01fd2
Commit 58d01fd2 authored Aug 06, 2020 by Andrey Filippov
4 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -92,13 +92,37 @@
 				<artifactId>loci_tools</artifactId>
 				<version>6.1.0</version>
 			</dependency>
+<!-- 			
 			<dependency>
 			  <groupId>com.drewnoakes</groupId>
 			  <artifactId>metadata-extractor</artifactId>
 			  <version>2.11.0</version>
 			  <type>java-source</type>
+			</dependency> -->
+<!-- https://mvnrepository.com/artifact/com.drewnoakes/metadata-extractor -->
+			<dependency>
+			    <groupId>com.drewnoakes</groupId>
+			    <artifactId>metadata-extractor</artifactId>
+			    <version>2.11.0</version>
 			</dependency>
 			
+			<!-- https://mvnrepository.com/artifact/net.sf.ehcache/ehcache-core -->
+			<!-- 
+			<dependency>
+			    <groupId>net.sf.ehcache</groupId>
+			    <artifactId>ehcache-core</artifactId>
+			    <version>2.6.2</version>
+			    </dependency>
+			 -->
+			<!-- https://mvnrepository.com/artifact/org.slf4j/jcl-over-slf4j -->
+			<!--
+			<dependency>
+			    <groupId>org.slf4j</groupId>
+			    <artifactId>jcl-over-slf4j</artifactId>
+			    <version>1.7.5</version>
+			</dependency>
+		  -->
+			
 		</dependencies>

 	<build>

--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
@@ -184,6 +184,9 @@ private Panel panel1,
   public static QuadCLT          QUAD_CLT_AUX =         null;
   public static TwoQuadCLT       TWO_QUAD_CLT =         null;
   public static GPUTileProcessor GPU_TILE_PROCESSOR =   null;
+   // Add macro for GPU_QUAD?
+   public static GPUTileProcessor.GpuQuad GPU_QUAD =     null;
+   public static GPUTileProcessor.GpuQuad GPU_QUAD_AUX = null;
   public static LwirReader       LWIR_READER = null;

   public static EyesisCorrectionParameters.DebayerParameters DEBAYER_PARAMETERS = new EyesisCorrectionParameters.DebayerParameters(
@@ -5803,12 +5806,31 @@ private Panel panel1,
 			} //final int        debugLevel);

 		}
+		
+		if (GPU_QUAD == null) {
+			try {
+				GPU_QUAD = GPU_TILE_PROCESSOR. new GpuQuad(
+						2592,
+						1936,
+						4,
+						3);
+			} catch (Exception e) {
+				System.out.println("Failed to initialize GpuQuad class");
+				// TODO Auto-generated catch block
+				e.printStackTrace();
+				return false;
+			} //final int        debugLevel);
+		}
+		// For now keep GPU_QUAD_AUX==null
+		
 		if (COLOR_PROC_PARAMETERS_AUX == null) {
 			COLOR_PROC_PARAMETERS_AUX = COLOR_PROC_PARAMETERS.clone();
 		}
 		try {
 			TWO_QUAD_CLT.processCLTQuadCorrPairsGpu(
-					GPU_TILE_PROCESSOR,
+//					GPU_TILE_PROCESSOR,
+					GPU_QUAD,     // GPUTileProcessor.GpuQuad                        gpuQuad_main,
+					GPU_QUAD_AUX, // GPUTileProcessor.GpuQuad                        gpuQuad_aux,
 					QUAD_CLT, // QuadCLT quadCLT_main,
 					QUAD_CLT_AUX, // QuadCLT quadCLT_aux,
 					CLT_PARAMETERS,  // EyesisCorrectionParameters.DCTParameters           dct_parameters,

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -92,24 +92,16 @@ public class GPUTileProcessor {
 	static String GPU_RESOURCE_DIR =              "kernels";
 	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
 	// "*" - generated defines, first index - separately compiled unit
-/*	static String [][] GPU_SRC_FILES = {
-	{"*","dtt8x8.h","dtt8x8.cu"},
-	{"*","dtt8x8.h","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
-*/
 	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
-//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
-	//	static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct"; // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
-//	static String GPU_TEXTURES_NAME =              "textures_accumulate"; // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
 	static String GPU_RBGA_NAME =                  "generate_RBGA"; // name in C code
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv"; // calculate rotation matrices and derivatives
 	static String GPU_SET_TILES_OFFSETS =          "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
 	static String GPU_CALC_REVERSE_DISTORTION =    "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction

-
 //  pass some defines to gpu source code with #ifdef JCUDA
 	public static int DTT_SIZE_LOG2 =             3;
 	public static int DTT_SIZE =                  (1 << DTT_SIZE_LOG2);
@@ -171,8 +163,11 @@ public class GPUTileProcessor {
    private CUfunction GPU_SET_TILES_OFFSETS_kernel =       null;
    private CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;

+    CUmodule    module; // to access constants memory
    
    // CPU arrays of pointers to GPU memory
+    // Moved to GpuQuad class
+/*    
    // These arrays may go to methods, they are here just to be able to free GPU memory if needed
    private CUdeviceptr [] gpu_kernels_h =        new CUdeviceptr[NUM_CAMS];
    private CUdeviceptr [] gpu_kernel_offsets_h = new CUdeviceptr[NUM_CAMS];
@@ -221,6 +216,10 @@ public class GPUTileProcessor {
    public int num_task_tiles;
    public int num_corr_tiles;
    public int num_texture_tiles;
+*/    
+//    public GpuQuad [][] gpuQuad; // array of GpuQuad instances 2x2? ({{rgb, rgb_macro}, {lwir, lwir_macro})
+    // initialize with 4 dimensions each
+    
    public class TpTask {
    	public int   task; // [0](+1) - generate 4 images, [4..9]+16..+512 - correlation pairs, 2 - generate texture tiles
    	public float target_disparity;
@@ -273,7 +272,8 @@ public class GPUTileProcessor {
    	}
    }

-    public class CltExtra{
+    /*
+    public class CltExtra{ // never used?
    	public float data_x;   // kernel data is relative to this displacement X (0.5 pixel increments)
    	public float data_y;   // kernel data is relative to this displacement Y (0.5 pixel increments)
    	public float center_x; // actual center X (use to find derivatives)
@@ -328,7 +328,8 @@ public class GPUTileProcessor {
    		flt[indx++] = this.dyc_dy;
    		return flt;
    	}
-    };
+    }
+    */

    private static long getPointerAddress(CUdeviceptr p)

@@ -451,7 +452,6 @@ public class GPUTileProcessor {

        // Create the kernel functions (first - just test)
        String [] func_names = {
-//        		GPU_CONVERT_CORRECT_TILES_NAME,
        		GPU_CONVERT_DIRECT_NAME,
        		GPU_IMCLT_ALL_NAME,
        		GPU_CORRELATE2D_NAME,
@@ -465,7 +465,6 @@ public class GPUTileProcessor {
        		                                 func_names,
        		                                 capability); // on my - 75

-//        GPU_CONVERT_CORRECT_TILES_kernel =   functions[0];
        GPU_CONVERT_DIRECT_kernel =          functions[0];
        GPU_IMCLT_ALL_kernel =               functions[1];
        GPU_CORRELATE2D_kernel =             functions[2];
@@ -476,7 +475,6 @@ public class GPUTileProcessor {
        GPU_CALC_REVERSE_DISTORTION_kernel = functions[7];

        System.out.println("GPU kernel functions initialized");
-//        System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
        System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
        System.out.println(GPU_IMCLT_ALL_kernel.toString());
        System.out.println(GPU_CORRELATE2D_kernel.toString());
@@ -486,11 +484,249 @@ public class GPUTileProcessor {
        System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
        System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());
        
+        // GPU data structures are now initialized through GpuQuad instances
+    }
+    
+    public static String [] getCorrTitles() { // returns six fixed title strings (presumably one per correlation pair, in pair-index order - confirm against NUM_PAIRS)
+    	return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other"};
+    }
+    public static double [][] getCorr2DView( // lays per-tile 2D correlation data out as one bordered tile grid per pair; returns [NUM_PAIRS][height*width]
+    		int tilesX, // number of tile columns in the output grid
+    		int tilesY, // number of tile rows in the output grid
+    		int [] indices, // packed entries: tile number is (value >> CORR_NTILE_SHIFT), pair number is (value & CORR_PAIRS_MASK)
+    		float [][] corr2d, // corr2d[n] is the flattened square tile for indices[n]; tile side is derived from corr2d[0].length
+    		int [] wh){ // if is [2] - return width, height
+    	if ((corr2d == null) || (corr2d.length == 0)) { // nothing to show: return NUM_PAIRS empty slices (wh is left untouched)
+    		return new double [NUM_PAIRS][0];
+    	}
+
+    	int corr_size = (int)(Math.round(Math.sqrt(corr2d[0].length)));//  make smaller later?
+    	int width =  tilesX * (corr_size + 1) + 1; // each tile cell is corr_size pixels plus a 1-pixel border; the extra +1 closes the last border
+    	int height = tilesY * (corr_size + 1) + 1; // same layout vertically
+    	double [][] data = new double [NUM_PAIRS][];
+    	data[0] = new double[height*width]; // zero-filled by Java, so border pixels stay 0.0
+    	for (int ty = 0; ty < tilesY; ty++) {
+    		for (int tx = 0; tx < tilesX; tx++) {
+    			for (int i = 0; i< corr_size; i++) {
+    				for (int j = 0; j < corr_size; j++) {
+    					data[0][(ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1)] = Double.NaN; // tile interiors are NaN until overwritten below
+    				}
+    			}
+    		}
+    	}
+		for (int np = 1; np < NUM_PAIRS; np++) {
+			data[np] = data[0].clone(); // every pair starts from the same NaN/border template
+		}
+		for (int n = 0; n < indices.length; n++) {
+			int nt = indices[n] >> CORR_NTILE_SHIFT; // tile number
+			int np = indices[n] & CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
+			assert np < NUM_PAIRS : "invalid correllation pair";
+			int tx = nt % tilesX; // tile column
+			int ty = nt / tilesX; // tile row
+			for (int i = 0; i< corr_size; i++) {
+				for (int j = 0; j < corr_size; j++) {
+					//java.lang.ArrayIndexOutOfBoundsException: 20081634
+					int indx1 = (ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1); // destination pixel. NOTE(review): nt is not checked against tilesX*tilesY, so a bad index can overrun data[np]
+					int indx2 = i*corr_size+j; // source element inside the flattened tile
+//    					if ((indx1 > data[0].length) || (indx1 > data[0].length)){
+//    						System.out.println("Bugggg!)");
+//    					}
+					data[np][indx1] = corr2d[n][indx2];
+				}
+			}
+		}
+		if (wh != null) { // optional output of the image dimensions
+			wh[0] = width;
+			wh[1] = height;
+		}
+    	return data;
+    }
+
+
+
+//    private static CUfunction [] createFunctions(
+    private CUfunction [] createFunctions( // compiles each source unit with NVRTC, links the PTX into one module (stored in the 'module' field) and looks up the named kernels
+    		String []  sourceCodeUnits, // one complete source text per separately compiled unit
+    		String []  kernelNames, // function names to look up; the returned array uses the same order
+    		int        capability // appended to "--gpu-architecture=compute_"
+    		) throws IOException // thrown with "Could not compile program" when the !OK check below fires
+    {
+    	CUfunction [] functions = new CUfunction [kernelNames.length];
+    	byte[][] ptxDataUnits = new byte [sourceCodeUnits.length][];
+    	boolean OK = false; // NOTE(review): declared outside the loop and never reset, so once one unit compiles a later unit's failure passes the !OK check
+       	for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
+       		String sourceCode = sourceCodeUnits[cunit];
+    		// Use the NVRTC to create a program by compiling the source code
+    		nvrtcProgram program = new nvrtcProgram();
+    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
+    		String options[] = {"--gpu-architecture=compute_"+capability};
+
+    		try {
+    			nvrtcCompileProgram(program, options.length, options);
+    			OK = true;
+    		} catch (Exception e) { // failure is only printed here so that the compilation log can be dumped before throwing
+    			System.out.println("nvrtcCompileProgram() FAILED");
+    		}
+    		// Compilation log with errors/warnings
+    		String programLog[] = new String[1];
+    		nvrtcGetProgramLog(program, programLog);
+    		String log = programLog[0].trim();
+    		if (!log.isEmpty())
+    		{
+    			System.err.println("Program compilation log:\n" + log);
+    		}
+    		if (!OK) {
+    			throw new IOException("Could not compile program");
+    		}
+
+    		// Get the PTX code of the compiled program (not the binary)
+    		String[] ptx = new String[1];
+    		nvrtcGetPTX(program, ptx);
+    		nvrtcDestroyProgram(program);
+    		ptxDataUnits[cunit] = ptx[0].getBytes(); // NOTE(review): uses the platform default charset
+    		System.out.println("ptxDataUnits["+cunit+"].length="+ptxDataUnits[cunit].length);
+    	}
+    	JITOptions jitOptions = new JITOptions();
+    	jitOptions.putInt(CU_JIT_LOG_VERBOSE, 1);
+    	CUlinkState state = new CUlinkState();
+    	cuLinkCreate(jitOptions, state);
+    	cuLinkAddFile(state, CU_JIT_INPUT_LIBRARY, LIBRARY_PATH, jitOptions); // LIBRARY_PATH is defined outside this block
+
+       	for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
+       		cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxDataUnits[cunit]), ptxDataUnits[cunit].length, "input"+cunit+".ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
+       	}
+    	long size[] = { 0 };
+    	Pointer image = new Pointer();
+    	JCudaDriver.setExceptionsEnabled(false); // presumably so cuLinkComplete returns its status instead of throwing - confirm against JCuda docs
+    	int cuda_result = cuLinkComplete(state, image, size); // NOTE(review): the status is only printed, a link failure is not acted upon here
+    	System.out.println("cuLinkComplete() -> "+cuda_result);
+    	JCudaDriver.setExceptionsEnabled(true);
+    	module = new CUmodule(); // stored in the instance field rather than a local
+    	cuModuleLoadDataEx(module, image, 0, new int[0], Pointer.to(new int[0]));
+    	cuLinkDestroy(state);
+
+    	for (int i = 0; i < kernelNames.length; i++) {
+    		// Find the function in the source by name, get its pointer
+    		functions[i] = new CUfunction();
+    		cuModuleGetFunction(functions[i] , module, kernelNames[i]);
+    	}
+    	return functions;
+    }
+
+    static String readFileAsString(String path) // reads the whole file at 'path' and decodes it as UTF-8; returns null instead of throwing when the read fails
+    {
+    	byte[] encoded;
+    	try {
+    		encoded = Files.readAllBytes(Paths.get(path));
+    	} catch (IOException e) { // NOTE(review): the exception is dropped, so callers must check for null
+    		return null;
+    	}
+    	return new String(encoded, StandardCharsets.UTF_8);
+    }
+
+   public class GpuQuad{ // quad camera description
+    	public final int IMG_WIDTH;
+    	public final int IMG_HEIGHT;
+    	public final int NUM_CAMS;
+    	public final int NUM_COLORS; // maybe should always be 3?
+//    	public final GPUTileProcessor gPUTileProcessor;
+        // CPU arrays of pointers to GPU memory
+        // These arrays may go to methods, they are here just to be able to free GPU memory if needed
+        private CUdeviceptr [] gpu_kernels_h;
+        private CUdeviceptr [] gpu_kernel_offsets_h;
+        private CUdeviceptr [] gpu_bayer_h;
+        private CUdeviceptr [] gpu_clt_h;
+        private CUdeviceptr [] gpu_corr_images_h;
+        // GPU pointers to array of GPU pointers
+        private CUdeviceptr gpu_kernels;
+        private CUdeviceptr gpu_kernel_offsets;
+        private CUdeviceptr gpu_bayer;
+        private CUdeviceptr gpu_tasks;
+        private CUdeviceptr gpu_corrs;
+        private CUdeviceptr gpu_textures;
+        private CUdeviceptr gpu_clt;
+        private CUdeviceptr gpu_4_images;
+        private CUdeviceptr gpu_corr_indices;
+        private CUdeviceptr gpu_num_corr_tiles;
+        private CUdeviceptr gpu_texture_indices_ovlp;
+        private CUdeviceptr gpu_num_texture_ovlp;
+        private CUdeviceptr gpu_texture_indices;
+        private CUdeviceptr gpu_texture_indices_len;
+        private CUdeviceptr gpu_diff_rgb_combo;
+        private CUdeviceptr gpu_color_weights;
+        private CUdeviceptr gpu_generate_RBGA_params;
+        private CUdeviceptr gpu_woi;
+        private CUdeviceptr gpu_textures_rgba;
+        private CUdeviceptr gpu_correction_vector;
+        private CUdeviceptr gpu_rot_deriv;
+        private CUdeviceptr gpu_geometry_correction;
+        private CUdeviceptr gpu_rByRDist;
+        private CUdeviceptr gpu_active_tiles;
+        private CUdeviceptr gpu_num_active_tiles;
+        private int mclt_stride;
+        private int corr_stride;
+        private int imclt_stride;
+        private int texture_stride;
+        private int texture_stride_rgba;
+        private int num_task_tiles;
+        private int num_corr_tiles;
+        private int num_texture_tiles;
+
+    	public GpuQuad(
+//    		final GPUTileProcessor gPUTileProcessor,	
+   			final int img_width,
+   			final int img_height,
+   			final int num_cams,
+   			final int num_colors
+) {
+//    		this.gPUTileProcessor = gPUTileProcessor;
+        	IMG_WIDTH =  img_width;
+        	IMG_HEIGHT = img_height;
+        	NUM_CAMS =   num_cams;
+        	NUM_COLORS = num_colors; // maybe should always be 3?
+        	
+            // CPU arrays of pointers to GPU memory
+            // These arrays may go to methods, they are here just to be able to free GPU memory if needed
+            gpu_kernels_h =           new CUdeviceptr[NUM_CAMS];
+            gpu_kernel_offsets_h =    new CUdeviceptr[NUM_CAMS];
+            gpu_bayer_h =             new CUdeviceptr[NUM_CAMS];
+            gpu_clt_h =               new CUdeviceptr[NUM_CAMS];
+            gpu_corr_images_h=        new CUdeviceptr[NUM_CAMS];
+            // GPU pointers to array of GPU pointers
+            gpu_kernels =             new CUdeviceptr();
+            gpu_kernel_offsets =      new CUdeviceptr();
+            gpu_bayer =               new CUdeviceptr();
+            gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT
+            gpu_corrs =               new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.FLOAT
+            gpu_textures =            new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
+            gpu_clt =                 new CUdeviceptr();
+            gpu_4_images =            new CUdeviceptr();
+            gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
+            gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int
+
+            gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
+            gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT
+
+            gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
+            gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
+
+            gpu_correction_vector=    new CUdeviceptr();
+            gpu_rot_deriv=            new CUdeviceptr(); //  used internally by device, may be read to CPU for testing
+            gpu_geometry_correction=  new CUdeviceptr();
+            gpu_rByRDist=             new CUdeviceptr(); //  calculated once for the camera distortion model in CPU (move to GPU?)
+
+            gpu_active_tiles =        new CUdeviceptr(); //  TILESX*TILESY*sizeof(int)
+            gpu_num_active_tiles =    new CUdeviceptr(); //  1 int
+        	
            // Init data arrays for all kernels
            int tilesX =  IMG_WIDTH / DTT_SIZE;
            int tilesY =  IMG_HEIGHT / DTT_SIZE;
            long [] device_stride = new long [1];
-
            for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
            	gpu_kernels_h[ncam] =        new CUdeviceptr();
            	cuMemAlloc(gpu_kernels_h[ncam],KERN_SIZE * Sizeof.FLOAT ); //     public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
@@ -504,7 +740,6 @@ public class GPUTileProcessor {
                		IMG_HEIGHT,               // long Height,
                        Sizeof.FLOAT);            // int ElementSizeBytes)
                mclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
-
                gpu_corr_images_h[ncam] =  new CUdeviceptr();
                cuMemAllocPitch (
                		gpu_corr_images_h[ncam],               // CUdeviceptr dptr,
@@ -515,7 +750,6 @@ public class GPUTileProcessor {
                imclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
                gpu_clt_h[ncam] = new CUdeviceptr();
            	cuMemAlloc(gpu_clt_h[ncam],tilesY * tilesX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT ); //     public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
-
            }
            // now create device arrays pointers
            if (Sizeof.POINTER != Sizeof.LONG) {
@@ -558,7 +792,7 @@ public class GPUTileProcessor {

            // Set task array
        	cuMemAlloc(gpu_tasks,      tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
-//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
+    //=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
        	// Set corrs array
        	cuMemAlloc(gpu_corr_indices,   tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
        	cuMemAlloc(gpu_num_corr_tiles,                    1 * Sizeof.FLOAT);
@@ -589,7 +823,6 @@ public class GPUTileProcessor {
            		NUM_PAIRS * tilesX * tilesY,             // long Height,
                    Sizeof.FLOAT);                         // int ElementSizeBytes)
            corr_stride = (int)(device_stride[0] / Sizeof.FLOAT);
-
            int max_texture_size = (NUM_COLORS + 1 + (NUM_CAMS + NUM_COLORS + 1)) * (2 * DTT_SIZE)* (2 * DTT_SIZE);
            cuMemAllocPitch (
            		gpu_textures,                             // CUdeviceptr dptr,
@@ -601,7 +834,6 @@ public class GPUTileProcessor {
            int max_rgba_width  =  (tilesX + 1) * DTT_SIZE;
            int max_rgba_height =  (tilesY + 1) * DTT_SIZE;
            int max_rbga_slices =  NUM_COLORS + 1;
-
            cuMemAllocPitch (
            		gpu_textures_rgba,                     // CUdeviceptr dptr,
            		device_stride,                         // long[] pPitch,
@@ -609,7 +841,6 @@ public class GPUTileProcessor {
            		max_rgba_height * max_rbga_slices,     // long Height,
                    Sizeof.FLOAT);                         // int ElementSizeBytes)
            texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
-
    	}
    	
        public void setGeometryCorrection(GeometryCorrection gc,
@@ -681,11 +912,7 @@ public class GPUTileProcessor {
        	return texture_indices;
        }

-
-//texture_indices
-
-
-
+    //texture_indices
        public void setConvolutionKernel(
        		float [] kernel,  // [tileY][tileX][color][..]
        		float [] kernel_offsets,
@@ -900,30 +1127,25 @@ public class GPUTileProcessor {
        				num_tiles--;
        			}
        		}
-//nextInt(int bound)
+    //nextInt(int bound)
        	}

        	if (dbg1) {
-//    		mask[(woi.y-1) * tilesX + (woi.x-1)] = true;
        		mask[(woi.y+woi.height) * tilesX + (woi.x+woi.width)] = true;
    			num_tiles += 1; // 2;

        	}

-//    	TpTask [] tp_tasks = new TpTask[tilesX*tilesY];
        	TpTask [] tp_tasks = new TpTask[num_tiles];

        	int indx = 0;
        	for (int ty = 0; ty < tilesY; ty++) {
            	for (int tx = 0; tx < tilesX; tx++) if (mask[ty * tilesX + tx]) {
-
-//        		tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx], 1); // task == 1 for now
-// Only generate for non-empty tasks, use 1 empty empty as a terminator?
+    // Only generate for non-empty tasks, use 1 empty task as a terminator?
            		tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx],
            				((out_images[indx] & 0x0f) << 0) |
            				((corr_mask [indx] & 0x3f) << 4)
            				); // task == 1 for now
-
            		indx++;
            	}
        	}
@@ -985,7 +1207,6 @@ public class GPUTileProcessor {
        			num_textures++;
        		}
        	}
-
        	int [] iarr = new int[num_textures];
        	num_textures = 0;
        	int b = (1 << LIST_TEXTURE_BIT);
@@ -998,67 +1219,7 @@ public class GPUTileProcessor {
        	return iarr;
        }

-
-
-    public static String [] getCorrTitles() {
-    	return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other"};
-    }
-    public static double [][] getCorr2DView(
-    		int tilesX,
-    		int tilesY,
-    		int [] indices,
-    		float [][] corr2d,
-    		int [] wh){ // if is [2] - return width, height
-    	if ((corr2d == null) || (corr2d.length == 0)) {
-    		return new double [NUM_PAIRS][0];
-    	}
-
-    	int corr_size = (int)(Math.round(Math.sqrt(corr2d[0].length)));//  make smaller later?
-    	int width =  tilesX * (corr_size + 1) + 1;
-    	int height = tilesY * (corr_size + 1) + 1;
-    	double [][] data = new double [NUM_PAIRS][];
-    	data[0] = new double[height*width];
-    	for (int ty = 0; ty < tilesY; ty++) {
-    		for (int tx = 0; tx < tilesX; tx++) {
-    			for (int i = 0; i< corr_size; i++) {
-    				for (int j = 0; j < corr_size; j++) {
-    					data[0][(ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1)] = Double.NaN;
-    				}
-    			}
-    		}
-    	}
-		for (int np = 1; np < NUM_PAIRS; np++) {
-			data[np] = data[0].clone();
-		}
-		for (int n = 0; n < indices.length; n++) {
-			int nt = indices[n] >> CORR_NTILE_SHIFT;
-			int np = indices[n] & CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
-			assert np < NUM_PAIRS : "invalid correllation pair";
-			int tx = nt % tilesX;
-			int ty = nt / tilesX;
-			for (int i = 0; i< corr_size; i++) {
-				for (int j = 0; j < corr_size; j++) {
-					//java.lang.ArrayIndexOutOfBoundsException: 20081634
-					int indx1 = (ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1);
-					int indx2 = i*corr_size+j;
-//					if ((indx1 > data[0].length) || (indx1 > data[0].length)){
-//						System.out.println("Bugggg!)");
-//					}
-					data[np][indx1] = corr2d[n][indx2];
-				}
-			}
-		}
-		if (wh != null) {
-			wh[0] = width;
-			wh[1] = height;
-		}
-    	return data;
-    }
-
-
-// All data is already copied to GPU memory
-
-
+    // All data is already copied to GPU memory

        public void execRotDerivs() {
            if (GPU_ROT_DERIV_kernel == null)
@@ -1083,6 +1244,7 @@ public class GPUTileProcessor {
        			kernelParameters, null);   // Kernel- and extra parameters
        	cuCtxSynchronize(); // remove later
        }
+        
        public void execCalcReverseDistortions() {
            if (GPU_CALC_REVERSE_DISTORTION_kernel == null)
            {
@@ -1201,7 +1363,6 @@ public class GPUTileProcessor {
        	cuCtxSynchronize();
        }

-
        public void execCorr2D(
        		double [] scales,
        		double fat_zero,
@@ -1217,8 +1378,6 @@ public class GPUTileProcessor {
        	float fscale0 = (float) scales[0];
        	float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
        	float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
-//		int [] GridFullWarps =    {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
-//    	int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
    		int [] GridFullWarps =    {1, 1, 1};
        	int [] ThreadsFullWarps = {1, 1, 1};
        	Pointer kernelParameters = Pointer.to(
@@ -1300,11 +1459,6 @@ public class GPUTileProcessor {
    	            Pointer.to(new int[]   {num_colors}),            // int               colors,             // number of colors (3/1)
    	            Pointer.to(new int[]   {iis_lwir}),              // int               is_lwir,            // do not perform shot correction
    	            Pointer.to(gpu_generate_RBGA_params),            // float             generate_RBGA_params[5],
-//	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
-//	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
-//	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
-//	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
-//	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
    	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
    	            Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
    	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
@@ -1371,15 +1525,8 @@ public class GPUTileProcessor {
        			Pointer.to(new int[] { num_colors }),
        			Pointer.to(new int[] { iis_lwir }),
    	            Pointer.to(gpu_generate_RBGA_params),            // float             generate_RBGA_params[5],
-//	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
-//	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
-//	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
-//	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
-//	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
    	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
        			Pointer.to(new int[] { idust_remove }),
-//    			Pointer.to(new int[] { 0}), // texture_stride }),        // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
-//    			Pointer.to(new int[] {0}),  // gpu_textures),
        			Pointer.to(new int[] {texture_stride}),        // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
        			Pointer.to(gpu_textures),
    	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
@@ -1393,9 +1540,6 @@ public class GPUTileProcessor {
        	cuCtxSynchronize();
        }

-
-
-
        public float [][] getCorr2D(int corr_rad){
            int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
            float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
@@ -1419,6 +1563,7 @@ public class GPUTileProcessor {
            }
            return corrs;
        }
+        
        public int [] getCorrIndices() {
        	float [] fnum_corrs = new float[1];
        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT);
@@ -1437,17 +1582,6 @@ public class GPUTileProcessor {
 //	        read extra data for macro generation: 4 DIFFs, 4 of R,  4 of B, 4 of G
        public float [][] getExtra(){
    		int [] texture_indices = getTextureIndices();
-/*
-    	float [] fnum_tiles = new float[1];
-    	cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_num_texture_ovlp,  1 * Sizeof.FLOAT);
-    	int num_tiles =      Float.floatToIntBits(fnum_tiles[0]);
-    	float [] ftiles = new float[num_tiles];
-    	cuMemcpyDtoH(Pointer.to(fnum_tiles), gpu_texture_indices_ovlp,  num_tiles * Sizeof.FLOAT);
-    	int [] tiles = new int[num_tiles];
-    	for (int i = 0; i < num_tiles; i++) {
-    		tiles[i] = Float.floatToIntBits(ftiles[i]);
-    	}
-*/
        	int num_tile_extra = NUM_CAMS*(NUM_COLORS+1);
        	float [] diff_rgb_combo = new float[texture_indices.length * num_tile_extra];
        	cuMemcpyDtoH(Pointer.to(diff_rgb_combo), gpu_diff_rgb_combo,  diff_rgb_combo.length * Sizeof.FLOAT);
@@ -1492,7 +1626,6 @@ public class GPUTileProcessor {
            copy_rbga.srcDevice =       gpu_textures_rgba;
            copy_rbga.srcPitch =        texture_stride_rgba * Sizeof.FLOAT;
            copy_rbga.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
-//        copy_woi.dstHost =         Pointer.to(rslt);
            copy_rbga.dstPitch =        woi.width * Sizeof.FLOAT;

            copy_rbga.WidthInBytes =    woi.width * Sizeof.FLOAT;
@@ -1516,7 +1649,6 @@ public class GPUTileProcessor {
        	int texture_slice_size = (2 * DTT_SIZE)* (2 * DTT_SIZE);        // number of (float) elements in a single slice of a tile
        	int texture_tile_size =  texture_slices * texture_slice_size;   // number of (float) elements in a multi-slice tile
        	int texture_size =       texture_tile_size * num_texture_tiles; // number of (float) elements in the whole texture
-//        float [] cpu_textures = new float [ num_texture_tiles * texture_size];
            float [] cpu_textures = new float [texture_size];
            CUDA_MEMCPY2D copyD2H =   new CUDA_MEMCPY2D();
            copyD2H.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
@@ -1588,7 +1720,6 @@ public class GPUTileProcessor {
        	return textures;
        }

-
        public double [][][][] doubleTextures(
        		Rectangle    woi,
        		int []       indices,
@@ -1617,9 +1748,6 @@ public class GPUTileProcessor {
        	return textures;
        }

-
-
-
        public float [][] getRBG (int ncam){
            int height = (IMG_HEIGHT + DTT_SIZE);
            int width =  (IMG_WIDTH + DTT_SIZE);
@@ -1649,98 +1777,6 @@ public class GPUTileProcessor {
            return fimg;
        }
        
-//    private static CUfunction [] createFunctions(
-    private CUfunction [] createFunctions(
-    		String []  sourceCodeUnits,
-    		String []  kernelNames,
-    		int        capability
-    		) throws IOException
-    {
-    	CUfunction [] functions = new CUfunction [kernelNames.length];
-    	byte[][] ptxDataUnits = new byte [sourceCodeUnits.length][];
-    	boolean OK = false;
-//    	for (String sourceCode: sourceCodeUnits) {
-       	for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
-       		String sourceCode = sourceCodeUnits[cunit];
-//       		System.out.print(sourceCode);
-    		// Use the NVRTC to create a program by compiling the source code
-    		nvrtcProgram program = new nvrtcProgram();
-    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);
-    		String options[] = {"--gpu-architecture=compute_"+capability};
-
-    		try {
-    			nvrtcCompileProgram(program, options.length, options);
-    			OK = true;
-    		} catch (Exception e) {
-    			System.out.println("nvrtcCompileProgram() FAILED");
-    		}
-    		// Compilation log with errors/warnings
-    		String programLog[] = new String[1];
-    		nvrtcGetProgramLog(program, programLog);
-    		String log = programLog[0].trim();
-    		if (!log.isEmpty())
-    		{
-    			System.err.println("Program compilation log:\n" + log);
-    		}
-    		if (!OK) {
-    			throw new IOException("Could not compile program");
-    		}
-
-    		// Get the PTX code of the compiled program (not the binary)
-    		String[] ptx = new String[1];
-    		nvrtcGetPTX(program, ptx);
-    		nvrtcDestroyProgram(program);
-//    		byte[] ptxData = ptx[0].getBytes();
-    		ptxDataUnits[cunit] = ptx[0].getBytes();
-    		System.out.println("ptxDataUnits["+cunit+"].length="+ptxDataUnits[cunit].length);
-    		//    	System.out.println( ptx[0]);
-    	}
-    	JITOptions jitOptions = new JITOptions();
-    	jitOptions.putInt(CU_JIT_LOG_VERBOSE, 1);
-    	CUlinkState state = new CUlinkState();
-    	cuLinkCreate(jitOptions, state);
-    	cuLinkAddFile(state, CU_JIT_INPUT_LIBRARY, LIBRARY_PATH, jitOptions);
-
-       	for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
-//	    	cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxData), ptxData.length, "input.ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
-       		cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxDataUnits[cunit]), ptxDataUnits[cunit].length, "input"+cunit+".ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
-//       		cuLinkAddData(state, CU_JIT_INPUT_PTX,     Pointer.to(ptxDataUnits[cunit]), ptxDataUnits[cunit].length, "input.ptx", jitOptions); // CUDA_ERROR_INVALID_PTX
-       	}
-//    	cuLinkAddFile(state, CU_JIT_INPUT_LIBRARY, LIBRARY_PATH, jitOptions);
-
-    	long size[] = { 0 };
-    	Pointer image = new Pointer();
-    	JCudaDriver.setExceptionsEnabled(false);
-    	int cuda_result = cuLinkComplete(state, image, size);
-    	System.out.println("cuLinkComplete() -> "+cuda_result);
-
-    	JCudaDriver.setExceptionsEnabled(true);
-
-
-    	module = new CUmodule();
-    	cuModuleLoadDataEx(module, image, 0, new int[0], Pointer.to(new int[0]));
-    	cuLinkDestroy(state);
-
-    	for (int i = 0; i < kernelNames.length; i++) {
-    		// Find the function in the source by name, get its pointer
-    		functions[i] = new CUfunction();
-    		cuModuleGetFunction(functions[i] , module, kernelNames[i]);
-    	}
-
-    	return functions;
-    }
-
-    static String readFileAsString(String path)
-    {
-    	byte[] encoded;
-    	try {
-    		encoded = Files.readAllBytes(Paths.get(path));
-    	} catch (IOException e) {
-    		return null;
-    	}
-    	return new String(encoded, StandardCharsets.UTF_8);
-    }
-
    	public void  getTileSubcamOffsets(
    			final TpTask[]            tp_tasks,        // will use // modify to have offsets for 8 cameras
    			final GeometryCorrection  geometryCorrection_main,
@@ -1937,9 +1973,6 @@ public class GPUTileProcessor {
    		return lpf;
    	}
        
+    } // end of public class GpuQuad

-
-
-
-
-}
+} // end of public class GPUTileProcessor
--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -452,7 +452,9 @@ public class TwoQuadCLT {
 	}

 	public void processCLTQuadCorrPairsGpu(
-			GPUTileProcessor                                gPUTileProcessor,
+//			GPUTileProcessor                                gPUTileProcessor,
+			GPUTileProcessor.GpuQuad                        gpuQuad_main,
+			GPUTileProcessor.GpuQuad                        gpuQuad_aux,
 			QuadCLT                                         quadCLT_main,
 			QuadCLT                                         quadCLT_aux,
 			CLTParameters       clt_parameters,
@@ -516,7 +518,9 @@ public class TwoQuadCLT {

 			// Tempporarily processing individaully with the old code
 			processCLTQuadCorrPairGpu(
-					gPUTileProcessor,           // GPUTileProcessor                               gPUTileProcessor,
+//					gPUTileProcessor,           // GPUTileProcessor                               gPUTileProcessor,
+					gpuQuad_main,               // GPUTileProcessor.GpuQuad                       gpuQuad_main,
+					gpuQuad_aux,                // GPUTileProcessor.GpuQuad                       gpuQuad_aux,
 					quadCLT_main,               // QuadCLT                                        quadCLT_main,
 					quadCLT_aux,                // QuadCLT                                        quadCLT_aux,
 					imp_srcs_main,              // ImagePlus []                                   imp_quad_main,
@@ -1934,7 +1938,9 @@ public class TwoQuadCLT {
 	}

 	public ImagePlus [] processCLTQuadCorrPairGpu(
-			GPUTileProcessor                                gPUTileProcessor,
+//			GPUTileProcessor                                gPUTileProcessor,
+			GPUTileProcessor.GpuQuad                        gpuQuad_main,
+			GPUTileProcessor.GpuQuad                        gpuQuad_aux,
 			QuadCLT                                         quadCLT_main,
 			QuadCLT                                         quadCLT_aux,
 			ImagePlus []                                    imp_quad_main,
@@ -1980,17 +1986,17 @@ public class TwoQuadCLT {
 			image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_g),
 			image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_m)
 		};
-		gPUTileProcessor.setLpfRbg(
+		gpuQuad_main.setLpfRbg(
 				lpf_rgb);

 		float [] lpf_flat = image_dtt.floatGetCltLpfFd(clt_parameters.getGpuCorrSigma(is_mono));

-		gPUTileProcessor.setLpfCorr(
+		gpuQuad_main.setLpfCorr(
 				"lpf_corr", // String const_name, // "lpf_corr"
 				lpf_flat);

 		float [] lpf_rb_flat = image_dtt.floatGetCltLpfFd(clt_parameters.getGpuCorrRBSigma(is_mono));
-		gPUTileProcessor.setLpfCorr(
+		gpuQuad_main.setLpfCorr(
 				"lpf_rb_corr", // String const_name, // "lpf_corr"
 				lpf_rb_flat);

@@ -2026,11 +2032,11 @@ public class TwoQuadCLT {
 				debugLevel);     // final int        debugLevel);


-		gPUTileProcessor.setConvolutionKernels(
+		gpuQuad_main.setConvolutionKernels(
 				(use_aux?quadCLT_aux.getCLTKernels() : quadCLT_main.getCLTKernels()), // double [][][][][][] clt_kernels,
 	    		false); // boolean force)

-		gPUTileProcessor.setBayerImages(
+		gpuQuad_main.setBayerImages(
 				(use_aux? quadCLT_aux.image_data: quadCLT_main.image_data), // double [][][]       bayer_data,
 	    		true); // boolean                  force);

@@ -2040,7 +2046,7 @@ public class TwoQuadCLT {
 				clt_parameters.gpu_woi_ty,
 				clt_parameters.gpu_woi_twidth,
 				clt_parameters.gpu_woi_theight);
-		GPUTileProcessor.TpTask [] tp_tasks  = gPUTileProcessor.setFullFrameImages(
+		GPUTileProcessor.TpTask [] tp_tasks  = gpuQuad_main.setFullFrameImages(
 				false,                                // boolean                   calc_offsets, // old way, now not needed with GPU calculation
 				twoi,                                 // Rectangle                 woi,
 				clt_parameters.gpu_woi_round,         // boolean                   round_woi,
@@ -2096,26 +2102,14 @@ public class TwoQuadCLT {
 			}
 		}

-		gPUTileProcessor.setTasks(
+		gpuQuad_main.setTasks(
 				tp_tasks, // TpTask [] tile_tasks,
 				use_aux); // boolean use_aux)

-
-//		int [] corr_indices = gPUTileProcessor.getCorrTasks(
-//				tp_tasks);
-		// corr_indices array of integers to be passed to GPU
-//		gPUTileProcessor.setCorrIndices(corr_indices);
-
-/*
-		int [] texture_indices = gPUTileProcessor.getTextureTasks(
-				tp_tasks);
-		gPUTileProcessor.setTextureIndices(
-				texture_indices);
-*/
-		gPUTileProcessor.setGeometryCorrection(
+		gpuQuad_main.setGeometryCorrection(
 				quadCLT_main.getGeometryCorrection(),
 				false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
-		gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
+		gpuQuad_main.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image

 /*		// TODO: calculate from the camera geometry?
 		double[][] port_offsets = { // used only in textures to scale differences
@@ -2129,33 +2123,33 @@ public class TwoQuadCLT {
 		System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
 		long startGPU=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execCalcReverseDistortions();
+			gpuQuad_main.execCalcReverseDistortions();
 		}
 		long startRotDerivs=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execRotDerivs();
+			gpuQuad_main.execRotDerivs();
 		}

 		long startTasksSetup=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execSetTilesOffsets();
+			gpuQuad_main.execSetTilesOffsets();
 		}

 		long startDirectConvert=System.nanoTime();

 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execConvertDirect();
+			gpuQuad_main.execConvertDirect();
 		}

 // run imclt;
 		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
+			gpuQuad_main.execImcltRbgAll(quadCLT_main.isMonochrome());
 		}
 		long endImcltTime = System.nanoTime();
 // run correlation
 		long startCorr2d=System.nanoTime();   // System.nanoTime();
-		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execCorr2D(
+		for (int i = 0; i < NREPEAT; i++ ) gpuQuad_main.execCorr2D(
 	    		scales,// double [] scales,
 	    		fat_zero, // double fat_zero);
 	    		clt_parameters.gpu_corr_rad); // int corr_radius
@@ -2163,7 +2157,7 @@ public class TwoQuadCLT {
 		long endCorr2d = System.nanoTime();
 // run textures
 		long startTextures = System.nanoTime();   // System.nanoTime();
-		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execTextures(
+		for (int i = 0; i < NREPEAT; i++ ) gpuQuad_main.execTextures(
 				col_weights,                   // double [] color_weights,
 				quadCLT_main.isLwir(),         // boolean   is_lwir,
 				clt_parameters.min_shot,       // double    min_shot,           // 10.0
@@ -2176,7 +2170,7 @@ public class TwoQuadCLT {
 // run texturesRBGA
 		long startTexturesRBGA = System.nanoTime();   // System.nanoTime();

-		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
+		for (int i = 0; i < NREPEAT; i++ ) gpuQuad_main.execRBGA(
 				col_weights,                   // double [] color_weights,
 				quadCLT_main.isLwir(),         // boolean   is_lwir,
 				clt_parameters.min_shot,       // double    min_shot,           // 10.0
@@ -2212,7 +2206,7 @@ public class TwoQuadCLT {
 		// get data back from GPU
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
-			iclt_fimg[ncam] = gPUTileProcessor.getRBG(ncam);
+			iclt_fimg[ncam] = gpuQuad_main.getRBG(ncam);
 		}

 		int out_width =  GPUTileProcessor.IMG_WIDTH +  GPUTileProcessor.DTT_SIZE;
@@ -2228,7 +2222,7 @@ public class TwoQuadCLT {
 				extra_titles[g * GPUTileProcessor.NUM_CAMS+ncam]= extra_group_titles[g]+"-"+ncam;
 			}
 		}
-		float [][] extra = gPUTileProcessor.getExtra();
+		float [][] extra = gpuQuad_main.getExtra();
 		(new ShowDoubleFloatArrays()).showArrays(
 				extra,
 				tilesX,
@@ -2260,8 +2254,8 @@ public class TwoQuadCLT {
 		//show_corr
 		int [] wh = new int[2];
 		if (clt_parameters.show_corr) {
-			int [] corr_indices = gPUTileProcessor.getCorrIndices();
-			float [][] corr2D = gPUTileProcessor.getCorr2D(
+			int [] corr_indices = gpuQuad_main.getCorrIndices();
+			float [][] corr2D = gpuQuad_main.getCorr2D(
 					clt_parameters.gpu_corr_rad); //  int corr_rad);
 			// convert to 6-layer image		 using tasks
 			double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
@@ -2344,7 +2338,7 @@ public class TwoQuadCLT {
 		// Use GPU prepared RBGA
 		if (clt_parameters.show_rgba_color) {
 			Rectangle woi = new Rectangle();
-			float [][] rbga = gPUTileProcessor.getRBGA(
+			float [][] rbga = gpuQuad_main.getRBGA(
 					(is_mono?1:3), // int     num_colors,
 					woi);
 			(new ShowDoubleFloatArrays()).showArrays(
@@ -2413,12 +2407,12 @@ public class TwoQuadCLT {
 		if (clt_parameters.show_rgba_color && (debugLevel > 100)) { // disabling
 			int numcol = quadCLT_main.isMonochrome()?1:3;
 			int ports = imp_quad_main.length;
-			int [] texture_indices = gPUTileProcessor.getTextureIndices();
+			int [] texture_indices = gpuQuad_main.getTextureIndices();
 			int          num_src_slices = numcol + 1 + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
 //			float [][][] ftextures = gPUTileProcessor.getTextures(
 //		    		(is_mono?1:3), // int     num_colors,
 //		    		clt_parameters.keep_weights); // boolean keep_weights);
-			float [] flat_textures =  gPUTileProcessor.getFlatTextures(
+			float [] flat_textures =  gpuQuad_main.getFlatTextures(
 					texture_indices.length,
 		    		(is_mono?1:3), // int     num_colors,
 		    		clt_parameters.keep_weights); // boolean keep_weights);
@@ -2447,7 +2441,7 @@ public class TwoQuadCLT {
 		    		}
 		    	}
 			}
-			double [][][][] texture_tiles =     gPUTileProcessor.doubleTextures(
+			double [][][][] texture_tiles =     gpuQuad_main.doubleTextures(
 		    		new Rectangle(0, 0, tilesX, tilesY), // Rectangle    woi,
 		    		texture_indices,                  // int []       indices,
 		    		flat_textures,                    // float [][][] ftextures,