Added GPU 2D phase correlation, related and debug functionality

84eeaf35 · Andrey Filippov · dc4e1f60 · 84eeaf35 · 84eeaf35 · 84eeaf35
Commit 84eeaf35 authored Mar 10, 2020 by Andrey Filippov
14 changed files
--- a/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
+++ b/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
@@ -30,6 +30,7 @@ public class CLTParameters {
 	public int        dbg_mode =            0;  // 0 - normal, +1 - no DCT/IDCT
 	public int        ishift_x =            0;  // debug feature - shift source image by this pixels left
 	public int        ishift_y =            0;  // debug feature - shift source image by this pixels down
+
 	private double    fat_zero =          0.05; // modify phase correlation to prevent division by very small numbers
 	private double    fat_zero_mono =     0.1;  // modify phase correlation to prevent division by very small numbers
 	private double    corr_sigma =        0.8;  // LPF correlation sigma
@@ -763,7 +764,18 @@ public class CLTParameters {
 	public boolean    taEnFlaps            = true;   // Enable cost of using supertile "flaps" (not in the center 8x8 tiles area)
 	public boolean    taEnMismatch         = false;  // Enable cost of a measurement layer not having same layer in the same location or near

-
+// gpu processing parameters
+	public int        gpu_corr_rad =        7;  // size of the correlation to save - initially only 15x15
+	public double     gpu_weight_r =      0.25; // weight of R for composite 2D correlation
+	public double     gpu_weight_b =      0.25; // weight of B for composite 2D correlation; weight g = 1.0 - gpu_weight_r - gpu_weight_b
+	public double     gpu_sigma_r =       1.1;  // LPF sigma (pix) for the R color component during aberration correction
+	public double     gpu_sigma_b =       1.1;  // LPF sigma (pix) for the B color component during aberration correction
+	public double     gpu_sigma_g =       0.7;  // LPF sigma (pix) for the G color component during aberration correction
+	public double     gpu_sigma_m =       0.7;  // LPF sigma (pix) for monochrome (e.g. LWIR) during aberration correction
+	public double     gpu_sigma_corr =    0.9;  // LPF sigma (pix) applied to the composite 2D correlation, RGB images
+	public double     gpu_sigma_corr_m =  0.15; // LPF sigma (pix) applied to the composite 2D correlation, monochrome images
+	public double     gpu_fatz =          30.0; // absolute fat zero for phase correlation of color images (its square is added to the sum of squared amplitudes)
+	public double     gpu_fatz_m =        30.0; // absolute fat zero for phase correlation of monochrome images

 	public boolean    replaceWeakOutliers =   true; // false;

@@ -817,6 +829,15 @@ public class CLTParameters {
 		return monochrome ? fat_zero_mono : fat_zero;
 	}

+	public double getGpuFatZero(boolean monochrome) { // fat zero for the GPU phase correlation, selected by image type
+		return monochrome ? gpu_fatz_m : gpu_fatz; // gpu_fatz_m for monochrome images, gpu_fatz for color images
+	}
+
+	public double getGpuCorrSigma(boolean monochrome) { // LPF sigma for the GPU composite 2D correlation, selected by image type
+		return monochrome ? gpu_sigma_corr_m : gpu_sigma_corr; // gpu_sigma_corr_m for monochrome images, gpu_sigma_corr for color images
+	}
+
+
 	public double getScaleStrength(boolean aux) {
 		return aux ? scale_strength_aux : scale_strength_main;
 	}
@@ -1512,6 +1533,18 @@ public class CLTParameters {
 		properties.setProperty(prefix+"taEnMismatch",               this.taEnMismatch +"");


+		properties.setProperty(prefix+"gpu_corr_rad",               this.gpu_corr_rad +"");
+		properties.setProperty(prefix+"gpu_weight_r",               this.gpu_weight_r +"");
+		properties.setProperty(prefix+"gpu_weight_b",               this.gpu_weight_b +"");
+		properties.setProperty(prefix+"gpu_sigma_r",                this.gpu_sigma_r +"");
+		properties.setProperty(prefix+"gpu_sigma_b",                this.gpu_sigma_b +"");
+		properties.setProperty(prefix+"gpu_sigma_g",                this.gpu_sigma_g +"");
+		properties.setProperty(prefix+"gpu_sigma_m",                this.gpu_sigma_m +"");
+		properties.setProperty(prefix+"gpu_sigma_corr",             this.gpu_sigma_corr +"");
+		properties.setProperty(prefix+"gpu_sigma_corr_m",           this.gpu_sigma_corr_m +"");
+		properties.setProperty(prefix+"gpu_fatz",                   this.gpu_fatz +"");
+		properties.setProperty(prefix+"gpu_fatz_m",                 this.gpu_fatz_m +"");
+
 		properties.setProperty(prefix+"debug_initial_discriminate",           this.debug_initial_discriminate+"");
 		properties.setProperty(prefix+"dbg_migrate",                          this.dbg_migrate+"");

@@ -2265,6 +2298,17 @@ public class CLTParameters {
 		if (properties.getProperty(prefix+"taEnFlaps")!=null)                   this.taEnFlaps=Boolean.parseBoolean(properties.getProperty(prefix+"taEnFlaps"));
 		if (properties.getProperty(prefix+"taEnMismatch")!=null)                this.taEnMismatch=Boolean.parseBoolean(properties.getProperty(prefix+"taEnMismatch"));

+		if (properties.getProperty(prefix+"gpu_corr_rad")!=null)                this.gpu_corr_rad=Integer.parseInt(properties.getProperty(prefix+"gpu_corr_rad"));
+		if (properties.getProperty(prefix+"gpu_weight_r")!=null)                this.gpu_weight_r=Double.parseDouble(properties.getProperty(prefix+"gpu_weight_r"));
+		if (properties.getProperty(prefix+"gpu_weight_b")!=null)                this.gpu_weight_b=Double.parseDouble(properties.getProperty(prefix+"gpu_weight_b"));
+		if (properties.getProperty(prefix+"gpu_sigma_r")!=null)                 this.gpu_sigma_r=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_r"));
+		if (properties.getProperty(prefix+"gpu_sigma_b")!=null)                 this.gpu_sigma_b=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_b"));
+		if (properties.getProperty(prefix+"gpu_sigma_g")!=null)                 this.gpu_sigma_g=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_g"));
+		if (properties.getProperty(prefix+"gpu_sigma_m")!=null)                 this.gpu_sigma_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_m"));
+		if (properties.getProperty(prefix+"gpu_sigma_corr")!=null)              this.gpu_sigma_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr"));
+		if (properties.getProperty(prefix+"gpu_sigma_corr_m")!=null)            this.gpu_sigma_corr_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr_m"));
+		if (properties.getProperty(prefix+"gpu_fatz")!=null)                    this.gpu_fatz=Double.parseDouble(properties.getProperty(prefix+"gpu_fatz"));
+		if (properties.getProperty(prefix+"gpu_fatz_m")!=null)                  this.gpu_fatz_m=Double.parseDouble(properties.getProperty(prefix+"gpu_fatz_m"));

 		if (properties.getProperty(prefix+"debug_initial_discriminate")!=null)           this.debug_initial_discriminate=Boolean.parseBoolean(properties.getProperty(prefix+"debug_initial_discriminate"));
 		if (properties.getProperty(prefix+"dbg_migrate")!=null)                          this.dbg_migrate=Boolean.parseBoolean(properties.getProperty(prefix+"dbg_migrate"));
@@ -3159,6 +3203,35 @@ public class CLTParameters {
 		gd.addCheckbox    ("Cost of using supertile \"flaps\" (not in the center 8x8 tiles area)",                      this.taEnFlaps);
 		gd.addCheckbox    ("Cost of a measurement layer not having same layer in the same location or near",            this.taEnMismatch);

+		gd.addTab         ("GPU", "Parameters for GPU development");
+		gd.addMessage     ("--- GPU processing parameters ---");
+		gd.addNumericField("Correlation radius",                                                                        this.gpu_corr_rad, 0, 6,"pix",
+				"Size of the 2D correlation - maximal radius = 7 corresponds to full 15x15 pixel tile");
+		gd.addNumericField("Correlation weight R",                                                                      this.gpu_weight_r, 4, 6,"",
+				"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b");
+		gd.addNumericField("Correlation weight B",                                                                      this.gpu_weight_b, 4, 6,"",
+				"Weight of R for composite 2D correlation (green weight is 1.0 -gpu_weight_r - gpu_weight_b");
+		gd.addNumericField("Color LPF sigma R",                                                                         this.gpu_sigma_r, 4, 6,"pix",
+				"LPF sigma to process color components during aberration correction");
+		gd.addNumericField("Color LPF sigma B",                                                                         this.gpu_sigma_b, 4, 6,"pix",
+				"LPF sigma to process color components during aberration correction");
+		gd.addNumericField("Color LPF sigma G",                                                                         this.gpu_sigma_g, 4, 6,"pix",
+				"LPF sigma to process color components during aberration correction");
+		gd.addNumericField("Monochrome LPF sigma",                                                                      this.gpu_sigma_m, 4, 6,"pix",
+				"LPF sigma to process monochrome (e.g.LWIR) during aberration correction");
+		gd.addNumericField("LPF sigma for correlation, color",                                                          this.gpu_sigma_corr, 4, 6,"pix",
+				"LPF sigma to apply to the composite 2D correlation for RGB images");
+		gd.addNumericField("LPF sigma for correlation, mono",                                                           this.gpu_sigma_corr_m, 4, 6,"pix",
+				"LPF sigma to apply to the composite 2D correlation for monochrome images");
+		gd.addNumericField("Fat zero (absolute) for phase correlation of color images",                                 this.gpu_fatz, 4, 6,"",
+				"Add squared fat zero to the sum of squared amplitudes, color images");
+		gd.addNumericField("Fat zero (absolute) for phase correlation of monochrome images",                            this.gpu_fatz_m, 4, 6,"",
+				"Add squared fat zero to the sum of squared amplitudes, monochrome images");
+
+		gd.addTab         ("LWIR", "parameters for LWIR/EO 8-camera rig");
+		this.lwir.dialogQuestions(gd);
+
+
 		gd.addTab         ("Debug", "Other debug images");
 		gd.addMessage     ("--- Other debug images ---");
 		//	clt_parameters.debug_initial_discriminate, // final boolean    debug_initial_discriminate,
@@ -3190,8 +3263,6 @@ public class CLTParameters {
 		gd.addMessage     ("Unity up vector in camera coordinate system (x - right, y - up, z - to camera): {"+
 				this.vertical_xyz[0]+","+          this.vertical_xyz[1]+","+          this.vertical_xyz[2]+"}");

-		gd.addTab         ("LWIR", "parameters for LWIR/EO 8-camera rig");
-		this.lwir.dialogQuestions(gd);

 		//  			gd.buildDialog();
 		gd.showDialog();
@@ -3886,6 +3957,20 @@ public class CLTParameters {
 		this.taEnFlaps=             gd.getNextBoolean();
 		this.taEnMismatch=          gd.getNextBoolean();

+		this.gpu_corr_rad =   (int) gd.getNextNumber();
+		this.gpu_weight_r =         gd.getNextNumber();
+		this.gpu_weight_b =         gd.getNextNumber();
+		this.gpu_sigma_r =          gd.getNextNumber();
+		this.gpu_sigma_b =          gd.getNextNumber();
+		this.gpu_sigma_g =          gd.getNextNumber();
+		this.gpu_sigma_m =          gd.getNextNumber();
+		this.gpu_sigma_corr =       gd.getNextNumber();
+		this.gpu_sigma_corr_m =     gd.getNextNumber();
+		this.gpu_fatz =             gd.getNextNumber();
+		this.gpu_fatz_m =           gd.getNextNumber();
+
+		this.lwir.dialogAnswers(gd);
+
 		this.debug_initial_discriminate= gd.getNextBoolean();
 		this.dbg_migrate=                gd.getNextBoolean();

@@ -3911,8 +3996,6 @@ public class CLTParameters {
 		this.show_first_clusters=   gd.getNextBoolean();
 		this.show_planes=           gd.getNextBoolean();

-		this.lwir.dialogAnswers(gd);
-
 		return true;
 	}


--- a/src/main/java/com/elphel/imagej/cameras/EyesisCorrectionParameters.java
+++ b/src/main/java/com/elphel/imagej/cameras/EyesisCorrectionParameters.java
@@ -231,6 +231,7 @@ public class EyesisCorrectionParameters {
  			cp.zcorrect=  			    this.zcorrect;
  			cp.saveSettings=  		    this.saveSettings;
  			cp.sourceDirectory=    	    this.sourceDirectory;
+  			cp.tile_processor_gpu =     this.tile_processor_gpu;
  			cp.use_set_dirs =           this.use_set_dirs;
 //  			cp.sourcePrefix=    	    this.sourcePrefix;
 //  			cp.sourceSuffix=    	    this.sourceSuffix;
@@ -388,6 +389,8 @@ public class EyesisCorrectionParameters {
  			properties.setProperty(prefix+"saveSettings",this.saveSettings+"");

    		properties.setProperty(prefix+"sourceDirectory",this.sourceDirectory);
+    		properties.setProperty(prefix+"tile_processor_gpu",this.tile_processor_gpu);
+
    		properties.setProperty(prefix+"use_set_dirs",   this.use_set_dirs+"");

    		properties.setProperty(prefix+"sourcePrefix",this.sourcePrefix);
@@ -543,6 +546,7 @@ public class EyesisCorrectionParameters {
  		    if (properties.getProperty(prefix+"zcorrect")!=null) this.zcorrect=Boolean.parseBoolean(properties.getProperty(prefix+"zcorrect"));
  		    if (properties.getProperty(prefix+"saveSettings")!=null) this.saveSettings=Boolean.parseBoolean(properties.getProperty(prefix+"saveSettings"));
 			if (properties.getProperty(prefix+"sourceDirectory")!=      null) this.sourceDirectory=properties.getProperty(prefix+"sourceDirectory");
+			if (properties.getProperty(prefix+"tile_processor_gpu")!=      null) this.tile_processor_gpu=properties.getProperty(prefix+"tile_processor_gpu");
  		    if (properties.getProperty(prefix+"firstSubCamera")!=       null) this.firstSubCamera=Integer.parseInt(properties.getProperty(prefix+"firstSubCamera"));
  		    if (properties.getProperty(prefix+"firstSubCameraConfig")!= null) this.firstSubCameraConfig=Integer.parseInt(properties.getProperty(prefix+"firstSubCameraConfig"));
  		    if (properties.getProperty(prefix+"numSubCameras")!=        null) this.numSubCameras=Integer.parseInt(properties.getProperty(prefix+"numSubCameras"));

--- a/src/main/java/com/elphel/imagej/correction/EyesisDCT.java
+++ b/src/main/java/com/elphel/imagej/correction/EyesisDCT.java
@@ -1220,7 +1220,7 @@ public class EyesisDCT {
 		  }

 		  if (this.correctionsParameters.deconvolve) { // process with DCT, otherwise use simple debayer
-			  ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
+			  ImageDtt image_dtt = new ImageDtt(dctParameters.dct_size, false, 1.0); // Bayer( not monochrome), scale correlation strengths
 			  double [][][][] dct_data = image_dtt.mdctStack(
 					  stack,
 					  channel,

--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
@@ -3075,7 +3075,7 @@ private Panel panel1,
           	}

           }
-		  ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
+		  ImageDtt image_dtt = new ImageDtt(DCT_PARAMETERS.dct_size, false, 1.0); // Bayer( not monochrome), scale correlation strengths
           double [][][][] dctdc_data = image_dtt.mdctScale(
           		DBG_IMP.getStack(),
           		DCT_PARAMETERS.kernel_chn,
@@ -3173,7 +3173,7 @@ private Panel panel1,
        	}

        }
-        ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
+        ImageDtt image_dtt = new ImageDtt(DCT_PARAMETERS.dct_size,false, 1.0); // Bayer( not monochrome), scale correlation strengths
        double [][][][] dctdc_data = image_dtt.mdctStack(
        		DBG_IMP.getStack(),
        		DCT_PARAMETERS.kernel_chn,
@@ -5723,6 +5723,7 @@ private Panel panel1,
 		if (!prepareRigImages()) return false;
 		String configPath=getSaveCongigPath();
 		if (configPath.equals("ABORT")) return false;
+//		if ((CORRECTION_PARAMETERS.tile_processor_gpu != null) &&

 		if (DEBUG_LEVEL > -2){
 			System.out.println("++++++++++++++ Calculating combined correlations ++++++++++++++");
@@ -5740,6 +5741,7 @@ private Panel panel1,

 		try {
 			TWO_QUAD_CLT.prepareFilesForGPUDebug(
+					CORRECTION_PARAMETERS.tile_processor_gpu,//			String                                         save_prefix, // absolute path to the cuda project root
 					QUAD_CLT, // QuadCLT quadCLT_main,
 					QUAD_CLT_AUX, // QuadCLT quadCLT_aux,
 					CLT_PARAMETERS,  // EyesisCorrectionParameters.DCTParameters           dct_parameters,
@@ -7048,7 +7050,10 @@ private Panel panel1,
 			}
 		}

-		ImageDtt image_dtt = new ImageDtt(false, 1.0); // Bayer( not monochrome), scale correlation strengths
+		ImageDtt image_dtt = new ImageDtt(
+				CLT_PARAMETERS.transform_size,
+				false,
+				1.0); // Bayer( not monochrome), scale correlation strengths
 		double [][][][][] clt_data = image_dtt.cltStack(
 				DBG_IMP.getStack(),
 				0, // CLT_PARAMETERS.kernel_chn,
@@ -7082,7 +7087,7 @@ private Panel panel1,
 			for (int chn = 0; chn < clt_data.length; chn++) {
 				clt_data[chn] = image_dtt.clt_shiftXY(
 						clt_data[chn],                  // final double [][][][] dct_data,  // array [tilesY][tilesX][4][dct_size*dct_size]
-						CLT_PARAMETERS.transform_size,  // final int             dct_size,
+///						CLT_PARAMETERS.transform_size,  // final int             dct_size,
 						CLT_PARAMETERS.shift_x,         // final double          shiftX,
 						CLT_PARAMETERS.shift_y,         // final double          shiftY,
 						(CLT_PARAMETERS.dbg_mode >> 2) & 3, // swap order hor/vert
@@ -7095,7 +7100,7 @@ private Panel panel1,
 		for (int chn=0; chn<iclt_data.length;chn++){
 			iclt_data[chn] = image_dtt.iclt_2d(
 					clt_data[chn],                  // scanline representation of dcd data, organized as dct_size x dct_size tiles
-					CLT_PARAMETERS.transform_size,  // final int
+///					CLT_PARAMETERS.transform_size,  // final int
 					CLT_PARAMETERS.clt_window,      //window_type
 					CLT_PARAMETERS.iclt_mask,       //which of 4 to transform back
 					CLT_PARAMETERS.dbg_mode,        //which of 4 to transform back
@@ -7178,7 +7183,10 @@ private Panel panel1,

        }
        String suffix = "-dx_"+(CLT_PARAMETERS.ishift_x+CLT_PARAMETERS.shift_x)+"_dy_"+(CLT_PARAMETERS.ishift_y+CLT_PARAMETERS.shift_y);
-        ImageDtt image_dtt = new ImageDtt(COLOR_PROC_PARAMETERS.isMonochrome(), CLT_PARAMETERS.getScaleStrength(false)); // Bayer, not monochrome
+        ImageDtt image_dtt = new ImageDtt(
+        		CLT_PARAMETERS.transform_size,
+        		COLOR_PROC_PARAMETERS.isMonochrome(),
+        		CLT_PARAMETERS.getScaleStrength(false)); // Bayer, not monochrome
        String [] titles = {
        		"redCC",  "redSC",  "redCS",  "redSS",
        		"blueCC", "blueSC", "blueCS", "blueSS",
@@ -7235,7 +7243,7 @@ private Panel panel1,
            for (int chn = 0; chn < clt_data.length; chn++) {
        	clt_data2[chn] = image_dtt.clt_shiftXY(
        			clt_data2[chn],                 // final double [][][][] dct_data,  // array [tilesY][tilesX][4][dct_size*dct_size]
-        			CLT_PARAMETERS.transform_size,  // final int             dct_size,
+///        			CLT_PARAMETERS.transform_size,  // final int             dct_size,
        			CLT_PARAMETERS.shift_x,         // final double          shiftX,
        			CLT_PARAMETERS.shift_y,         // final double          shiftY,
        			(CLT_PARAMETERS.dbg_mode >> 2) & 3, // swap order hor/vert
@@ -7266,7 +7274,7 @@ private Panel panel1,
        	clt_corr[chn] = image_dtt.clt_correlate(
        			clt_data[chn],                  // final double [][][][] data1,  // array [tilesY][tilesX][4][dct_size*dct_size]
        			clt_data2[chn],                 // final double [][][][] data2,  // array [tilesY][tilesX][4][dct_size*dct_size]
-        			CLT_PARAMETERS.transform_size,  // final int             dct_size,
+///        			CLT_PARAMETERS.transform_size,  // final int             dct_size,
        			CLT_PARAMETERS.getFatZero(image_dtt.isMonochrome()),  // final double          fat_zero,    // add to denominator to modify phase correlation (same units as data1, data2)
            		CLT_PARAMETERS.tileX, //final int debug_tileX
            		CLT_PARAMETERS.tileY, //final int debug_tileY
@@ -7297,7 +7305,7 @@ private Panel panel1,
        		image_dtt.clt_lpf( // filter in-place
        				CLT_PARAMETERS.getCorrSigma(image_dtt.isMonochrome()),            // final double          sigma,
        				clt_corr[chn],                        // final double [][][][] clt_data,
-        				CLT_PARAMETERS.transform_size,
+///        				CLT_PARAMETERS.transform_size,
        				THREADS_MAX,                          // maximal number of threads to launch
        				DEBUG_LEVEL);                        // globalDebugLevel)
        	}

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -74,12 +74,14 @@ import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
 	static String GPU_KERNEL_FILE = "dtt8x8.cuh";
 	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
-	static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles";
-	static String GPU_IMCLT_RBG_NAME = "imclt_rbg";
+	static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
+	static String GPU_IMCLT_RBG_NAME =             "imclt_rbg"; // name in C code
+	static String GPU_CORRELATE2D_NAME =           "correlate2D"; // name in C code
 //  pass some defines to gpu source code with #ifdef JCUDA
 	public static int DTT_SIZE =         8;
 	static int THREADSX =         DTT_SIZE;
 	public static int NUM_CAMS =         4;
+	public static int NUM_PAIRS =        6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
 	static int NUM_COLORS =              3;
 	public static int IMG_WIDTH =     2592;
 	public static int IMG_HEIGHT =    1936;
@@ -88,6 +90,8 @@ public class GPUTileProcessor {
 	static int KERNELS_LSTEP =           4;
 	static int THREADS_PER_TILE =        8;
 	static int TILES_PER_BLOCK =         4; // 8 - slower
+	static int CORR_THREADS_PER_TILE =   8; // exported to the GPU source as #define CORR_THREADS_PER_TILE
+	static int CORR_TILES_PER_BLOCK	=    4; // exported to the GPU source as #define CORR_TILES_PER_BLOCK
 	static int IMCLT_THREADS_PER_TILE = 16;
 	static int IMCLT_TILES_PER_BLOCK =   4;

@@ -95,6 +99,10 @@ public class GPUTileProcessor {
 	static int CLTEXTRA_SIZE = 8;
 	static int KERN_TILES = KERNELS_HOR *  KERNELS_VERT * NUM_COLORS;
 	static int KERN_SIZE =  KERN_TILES * 4 * 64;
+	static int CORR_SIZE =  (2* DTT_SIZE - 1) * (2* DTT_SIZE - 1); // 15x15
+	public static int CORR_PAIR_SHIFT = 8; // left shift applied to the tile number in a correlation index; the bits below it hold the pair number
+	public static int TASK_CORR_BITS =  4; // start of pair mask (bit offset inside TpTask.task)
+	public static int CORR_OUT_RAD =    7; // output radius of the correlations (implement)


    int DTTTEST_BLOCK_WIDTH =        32; // may be read from the source code
@@ -105,6 +113,7 @@ public class GPUTileProcessor {

    private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null;
    private CUfunction GPU_IMCLT_RBG_kernel =             null;
+    private CUfunction GPU_CORRELATE2D_kernel =           null;
    // CPU arrays of pointers to GPU memory
    // These arrays may go to method, they are here just to be able to free GPU memory if needed
    private CUdeviceptr [] gpu_kernels_h =        new CUdeviceptr[NUM_CAMS];
@@ -119,18 +128,25 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_kernel_offsets =      new CUdeviceptr();
    private CUdeviceptr gpu_bayer =               new CUdeviceptr();

-    private CUdeviceptr gpu_tasks =               new CUdeviceptr();
+    private CUdeviceptr gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.POINTER
+
+    private CUdeviceptr gpu_corrs =               new CUdeviceptr(); //  pitched allocation: NUM_PAIRS * tilesX * tilesY rows of CORR_SIZE floats

    private CUdeviceptr gpu_clt =                 new CUdeviceptr();
-//    private
+
+    private CUdeviceptr gpu_corr_indices =       new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
+
+    //    private
    CUmodule    module; // to access constants memory
 //    private CUdeviceptr gpu_lpf =            new CUdeviceptr();
    private int mclt_stride;
+    private int corr_stride;
    private int imclt_stride;
    public int num_task_tiles;
+    public int num_corr_tiles;

    public class TpTask {
-    	public int   task;
+    	public int   task; // [0](+1) - generate 4 images, [4..9]+16..+512 - correlation pairs
    	public float target_disparity;

    	public int ty;
@@ -264,6 +280,7 @@ public class GPUTileProcessor {
        				"#define DTT_SIZE " +               DTT_SIZE+"\n"+
        				"#define THREADSX " +               THREADSX+"\n"+
        				"#define NUM_CAMS " +               NUM_CAMS+"\n"+
+        				"#define NUM_PAIRS " +              NUM_PAIRS+"\n"+
        				"#define NUM_COLORS " +             NUM_COLORS+"\n"+
        				"#define IMG_WIDTH " +              IMG_WIDTH+"\n"+
        				"#define IMG_HEIGHT " +             IMG_HEIGHT+"\n"+
@@ -272,8 +289,13 @@ public class GPUTileProcessor {
        				"#define KERNELS_LSTEP " +          KERNELS_LSTEP+"\n"+
        				"#define THREADS_PER_TILE " +       THREADS_PER_TILE+"\n"+
        				"#define TILES_PER_BLOCK " +        TILES_PER_BLOCK+"\n"+
+        				"#define CORR_THREADS_PER_TILE " +  CORR_THREADS_PER_TILE+"\n"+
+        				"#define CORR_TILES_PER_BLOCK " +   CORR_TILES_PER_BLOCK+"\n"+
        				"#define IMCLT_THREADS_PER_TILE " + IMCLT_THREADS_PER_TILE+"\n"+
-        				"#define IMCLT_TILES_PER_BLOCK " +  IMCLT_TILES_PER_BLOCK+"\n";
+        				"#define IMCLT_TILES_PER_BLOCK " +  IMCLT_TILES_PER_BLOCK+"\n"+
+        				"#define CORR_PAIR_SHIFT " +        CORR_PAIR_SHIFT+"\n"+
+        				"#define TASK_CORR_BITS " +         TASK_CORR_BITS+"\n"+
+        				"#define CORR_OUT_RAD " +           CORR_OUT_RAD+"\n";

        for (String src_file:GPU_KERNEL_FILES) {
        	File file = null;
@@ -297,17 +319,21 @@ public class GPUTileProcessor {

        }
        // Create the kernel functions (first - just test)
-        String [] func_names = {GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME};
+        String [] func_names = {GPU_CONVERT_CORRECT_TILES_NAME, GPU_IMCLT_RBG_NAME, GPU_CORRELATE2D_NAME};
        CUfunction[] functions = createFunctions(kernelSource, func_names);
        this.GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
        this.GPU_IMCLT_RBG_kernel =             functions[1];
+        this.GPU_CORRELATE2D_kernel =           functions[2];
        System.out.println("GPU kernel functions initialized");
-        System.out.println("Sizeof.POINTER="+Sizeof.POINTER);
+//        System.out.println("Sizeof.POINTER="+Sizeof.POINTER);
+        System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
        System.out.println(GPU_IMCLT_RBG_kernel.toString());
+        System.out.println(GPU_CORRELATE2D_kernel.toString());

-        // Init data arrays
+        // Init data arrays for all kernels
        int tilesX =  IMG_WIDTH / DTT_SIZE;
        int tilesY =  IMG_HEIGHT / DTT_SIZE;
+        long [] device_stride = new long [1];

        for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        	gpu_kernels_h[ncam] =        new CUdeviceptr();
@@ -315,7 +341,6 @@ public class GPUTileProcessor {
        	gpu_kernel_offsets_h[ncam] = new CUdeviceptr();
        	cuMemAlloc(gpu_kernel_offsets_h[ncam],KERN_TILES * CLTEXTRA_SIZE * Sizeof.FLOAT ); //     public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
        	gpu_bayer_h[ncam] =          new CUdeviceptr();
-            long [] device_stride = new long [1];
            cuMemAllocPitch (
            		gpu_bayer_h[ncam],        // CUdeviceptr dptr,
            		device_stride,            // long[] pPitch,
@@ -323,6 +348,7 @@ public class GPUTileProcessor {
            		IMG_HEIGHT,               // long Height,
                    Sizeof.FLOAT);            // int ElementSizeBytes)
            mclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
+
            gpu_corr_images_h[ncam] =  new CUdeviceptr();
            cuMemAllocPitch (
            		gpu_corr_images_h[ncam],               // CUdeviceptr dptr,
@@ -365,10 +391,22 @@ public class GPUTileProcessor {
        // Set task array
    	cuMemAlloc(gpu_tasks,       tilesX * tilesY * TPTASK_SIZE * Sizeof.POINTER);

+    	// Set corrs array
+///    	cuMemAlloc(gpu_corrs,       tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
+    	cuMemAlloc(gpu_corr_indices,tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
+
+        cuMemAllocPitch (
+        		gpu_corrs,                             // CUdeviceptr dptr,
+        		device_stride,                         // long[] pPitch,
+        		CORR_SIZE * Sizeof.FLOAT,              // long WidthInBytes,
+        		NUM_PAIRS * tilesX * tilesY,             // long Height,
+                Sizeof.FLOAT);                         // int ElementSizeBytes)
+        corr_stride = (int)(device_stride[0] / Sizeof.FLOAT);
+
    }


-    public void setTasks(TpTask [] tile_tasks, boolean use_aux)
+    public void setTasks(TpTask [] tile_tasks, boolean use_aux) // why is it a class member? - just to be able to free
    {
    	num_task_tiles = tile_tasks.length;
    	float [] ftasks = new float [TPTASK_SIZE * num_task_tiles];
@@ -378,6 +416,17 @@ public class GPUTileProcessor {
        cuMemcpyHtoD(gpu_tasks,        Pointer.to(ftasks),         TPTASK_SIZE * num_task_tiles * Sizeof.FLOAT);
    }

+    public void setCorrIndices(int [] corr_indices) // copy the correlation index list to the GPU buffer gpu_corr_indices (presumably the array built by getCorrTasks() - confirm at the caller)
+    {
+    	num_corr_tiles = corr_indices.length; // number of index entries transferred, kept in the public field
+    	float [] fcorr_indices = new float [corr_indices.length]; // carries the same 32-bit patterns in a float array
+    	for (int i = 0; i < num_corr_tiles; i++) {
+    		fcorr_indices[i] = Float.intBitsToFloat(corr_indices[i]); // bit reinterpretation, not a numeric int->float conversion
+    	}
+        cuMemcpyHtoD(gpu_corr_indices, Pointer.to(fcorr_indices),  num_corr_tiles * Sizeof.FLOAT); // host -> device copy, 4 bytes per entry
+    }
+
+
    public void setConvolutionKernel(
    		float [] kernel,  // [tileY][tileX][color][..]
    		float [] kernel_offsets,
@@ -479,6 +528,8 @@ public class GPUTileProcessor {
    // need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
    public TpTask [] setFullFrameImages(
    		float                     target_disparity, // apply same disparity to all tiles
+    		int                       out_image, // from which tiles to generate image (currently 0/1)
+    		int                       corr_mask,  // which correlation pairs to generate (maybe later - reduce size from 15x15)
    		boolean                   use_master,
    		boolean                   use_aux,
 			final GeometryCorrection  geometryCorrection_main,
@@ -489,11 +540,19 @@ public class GPUTileProcessor {
        int tilesX =  IMG_WIDTH / DTT_SIZE;
        int tilesY =  IMG_HEIGHT / DTT_SIZE;
    	float [] target_disparities = new float [tilesX * tilesY];
+    	int [] out_images = new int [tilesX * tilesY];
+    	int [] corr_masks = new int [tilesX * tilesY];
    	if (target_disparity != 0.0) {
    		for (int i = 0; i <target_disparities.length; i++ ) target_disparities[i] = target_disparity;
    	}
+		for (int i = 0; i <out_images.length; i++ ) {
+			out_images[i] = out_image; //  0xf;  // all 4 images
+			corr_masks[i] = corr_mask; // 0x3f; // all 6 correlations
+		}
    	return setFullFrameImages(
        		target_disparities, // should be tilesX*tilesY long
+        		out_images, // int   []                  out_images, // from which tiles to generate image (currently 0/1)
+        		corr_masks, // int   []                  corr_mask,  // which correlation pairs to generate (maybe later - reduce size from 15x15)
        		use_master,
        		use_aux,
    			geometryCorrection_main,
@@ -505,6 +564,8 @@ public class GPUTileProcessor {

    public TpTask [] setFullFrameImages(
    		float []                  target_disparities, // should be tilesX*tilesY long
+    		int   []                  out_images, // from which tiles to generate image (currently 0/1)
+    		int   []                  corr_mask,  // which correlation pairs to generate (maybe later - reduce size from 15x15)
    		boolean                   use_master,
    		boolean                   use_aux,
 			final GeometryCorrection  geometryCorrection_main,
@@ -520,7 +581,13 @@ public class GPUTileProcessor {
    	int indx = 0;
    	for (int ty = 0; ty < tilesY; ty++) {
        	for (int tx = 0; tx < tilesX; tx++) {
-        		tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx], 1); // task == 1 for now
+//        		tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx], 1); // task == 1 for now
+// Only generate for non-empty tasks, use 1 empty task as a terminator?
+        		tp_tasks[indx] = new TpTask(tx,ty, target_disparities[indx],
+        				((out_images[indx] & 0x0f) << 0) |
+        				((corr_mask [indx] & 0x3f) << 4)
+        				); // task == 1 for now
+
        		indx++;
        	}
    	}
@@ -534,6 +601,94 @@ public class GPUTileProcessor {
    	return tp_tasks;
    }

+    /**
+     * Build the flat list of 2D correlations requested by the task array.
+     * For every task the pair mask is taken from the task bits starting at TASK_CORR_BITS
+     * (NUM_PAIRS bits wide); each set bit produces one output element.
+     * @param tp_tasks array of tasks that contain masks of the required pairs
+     * @return one element per requested pair: (tile_number << CORR_PAIR_SHIFT) | pair_number,
+     *         where tile_number = ty * (IMG_WIDTH / DTT_SIZE) + tx
+     */
+    public int [] getCorrTasks(
+    		TpTask [] tp_tasks) {
+    	final int tiles_per_row = IMG_WIDTH / DTT_SIZE;
+    	final int pairs_mask =    (1 << NUM_PAIRS) - 1;
+    	// Pass 1: count requested pairs. The mask keeps only the pair bits, so the
+    	// number of set bits equals the number of correlations for the tile.
+    	int total = 0;
+    	for (int nt = 0; nt < tp_tasks.length; nt++) {
+    		total += Integer.bitCount((tp_tasks[nt].task >> TASK_CORR_BITS) & pairs_mask);
+    	}
+    	// Pass 2: fill the result in task order, pairs in increasing bit order
+    	int [] corr_tasks = new int [total];
+    	int out = 0;
+    	for (int nt = 0; nt < tp_tasks.length; nt++) {
+    		TpTask task = tp_tasks[nt];
+    		int pairs = (task.task >> TASK_CORR_BITS) & pairs_mask;
+    		if (pairs == 0) {
+    			continue; // no correlations requested for this tile
+    		}
+    		int tile_bits = (task.ty * tiles_per_row + task.tx) << CORR_PAIR_SHIFT;
+    		for (int pair = 0; pair < NUM_PAIRS; pair++) {
+    			if (((pairs >> pair) & 1) != 0) {
+    				corr_tasks[out++] = tile_bits | pair;
+    			}
+    		}
+    	}
+    	return corr_tasks;
+    }
+
+    /**
+     * Get titles for the 2D correlation slices.
+     * NOTE(review): presumably the order matches the correlation pair numbers - confirm against the kernel.
+     * @return new array of 6 titles
+     */
+    public static String [] getCorrTitles() {
+    	String [] titles = {"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other"};
+    	return titles;
+    }
+    /**
+     * Arrange per-tile 2D correlations into NUM_PAIRS full-frame images for viewing.
+     * Each tile occupies a corr_size x corr_size cell (corr_size = rounded sqrt of corr2d[0].length);
+     * cells are separated (and surrounded) by 1-pixel lines of zeros. Cells without data stay NaN.
+     * @param tilesX  number of tiles in a row
+     * @param tilesY  number of tile rows
+     * @param indices for each corr2d element: (tile_number << CORR_PAIR_SHIFT) | pair_number
+     * @param corr2d  correlation data, one linescan-order array per element of indices
+     * @param wh      null or int[2] to receive {width, height} of the generated images
+     *                (not written when corr2d is null or empty)
+     * @return [NUM_PAIRS][width*height] images, or [NUM_PAIRS][0] if corr2d is null or empty
+     */
+    public static double [][] getCorr2DView(
+    		int tilesX,
+    		int tilesY,
+    		int [] indices,
+    		float [][] corr2d,
+    		int [] wh){ // if is [2] - return width, height
+    	if ((corr2d == null) || (corr2d.length == 0)) {
+    		return new double [NUM_PAIRS][0];
+    	}
+
+    	final int corr_size = (int)(Math.round(Math.sqrt(corr2d[0].length)));//  make smaller later?
+    	final int step =      corr_size + 1; // cell pitch including 1-pixel separator
+    	final int width =     tilesX * step + 1;
+    	final int height =    tilesY * step + 1;
+    	double [][] data = new double [NUM_PAIRS][];
+    	// Template: zeros on the separator grid, NaN inside every cell
+    	data[0] = new double[height*width];
+    	for (int ty = 0; ty < tilesY; ty++) {
+    		for (int i = 0; i < corr_size; i++) {
+    			int row_start = (ty * step + i + 1) * width;
+    			for (int tx = 0; tx < tilesX; tx++) {
+    				int from = row_start + tx * step + 1;
+    				java.util.Arrays.fill(data[0], from, from + corr_size, Double.NaN);
+    			}
+    		}
+    	}
+    	for (int np = 1; np < NUM_PAIRS; np++) {
+    		data[np] = data[0].clone();
+    	}
+    	// Copy available correlations into their cells
+    	final int pair_mask = (1 << CORR_PAIR_SHIFT) - 1;
+    	for (int n = 0; n < indices.length; n++) {
+    		int nt = indices[n] >> CORR_PAIR_SHIFT;
+    		int np = indices[n] & pair_mask;
+    		assert np < NUM_PAIRS : "invalid correllation pair";
+    		// index of the top-left pixel of the tile cell
+    		int origin = ((nt / tilesX) * step + 1) * width + ((nt % tilesX) * step + 1);
+    		for (int i = 0; i < corr_size; i++) {
+    			int dst = origin + i * width;
+    			int src = i * corr_size;
+    			for (int j = 0; j < corr_size; j++) {
+    				data[np][dst + j] = corr2d[n][src + j];
+    			}
+    		}
+    	}
+    	if (wh != null) {
+    		wh[0] = width;
+    		wh[1] = height;
+    	}
+    	return data;
+    }
+
+
 // All data is already copied to GPU memory
    public void execConverCorrectTiles() {
        if (GPU_CONVERT_CORRECT_TILES_kernel == null)
@@ -542,7 +697,7 @@ public class GPUTileProcessor {
            return;
        }
        // kernel parameters: pointer to pointers
-        int [] GridFullWarps =    {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1};
+        int [] GridFullWarps =    {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1}; // round up
        int [] ThreadsFullWarps = {THREADSX, TILES_PER_BLOCK, 1};
        Pointer kernelParameters = Pointer.to(
            Pointer.to(gpu_kernel_offsets),
@@ -550,7 +705,11 @@ public class GPUTileProcessor {
            Pointer.to(gpu_bayer),
            Pointer.to(gpu_tasks),
            Pointer.to(gpu_clt),
+/* 2020*///   Pointer.to(gpu_corrs),
+/* 2020*///   Pointer.to(gpu_corr_indices),              // corr indices (tile_num <<8 + pair_index
+/* 2020*///   Pointer.to(new int[] { num_corr_tiles }),  // total number of 2D correlations to calculate
            Pointer.to(new int[] { mclt_stride }),
+/* 2020*///   Pointer.to(new int[] { corr_stride }),
            Pointer.to(new int[] { num_task_tiles }),
            Pointer.to(new int[] { 7 }) // lpf_mask
        );
@@ -604,6 +763,68 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

+    /**
+     * Launch GPU_CORRELATE2D_kernel for num_corr_tiles correlations.
+     * Shows an ImageJ error message and returns without launching if the kernel is not available.
+     * The CUDA context is synchronized both before and after the launch.
+     * @param scales   per-color scales; at least one element is required (scales[0] is always read),
+     *                 at most the first 3 are used, missing ones are passed to the kernel as 0.0
+     * @param fat_zero fat zero value, passed to the kernel converted to float
+     */
+    public void execCorr2D(
+    		double [] scales,
+    		double fat_zero) {
+    	if (GPU_CORRELATE2D_kernel == null)
+    	{
+    		IJ.showMessage("Error", "No GPU kernel: GPU_CORRELATE2D_kernel");
+    		return;
+    	}
+    	int num_colors = scales.length;
+    	if (num_colors > 3) num_colors = 3;
+    	float fscale0 = (float) scales[0];
+    	float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
+    	float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
+		int [] GridFullWarps =    {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1}; // round up
+    	int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
+    	// Kernel parameters are positional - the order below must not be changed
+    	Pointer kernelParameters = Pointer.to(
+    			Pointer.to(gpu_clt),
+    			Pointer.to(new int[] { num_colors }),
+    			Pointer.to(new float[] {fscale0  }),
+    			Pointer.to(new float[] {fscale1  }),
+    			Pointer.to(new float[] {fscale2  }),
+    			Pointer.to(new float[] {(float) fat_zero }),
+    			Pointer.to(new int[] { num_corr_tiles }), // total number of 2D correlations to calculate
+    			Pointer.to(gpu_corr_indices),             // presumably the output of getCorrTasks() - TODO confirm
+    			Pointer.to(new int[] { corr_stride }),    // same value getCorr2D() uses (in floats) as the source pitch
+    			Pointer.to(gpu_corrs) // device buffer that getCorr2D() reads back
+    			);
+    	cuCtxSynchronize();
+    	// Call the kernel function
+    	cuLaunchKernel(GPU_CORRELATE2D_kernel,
+    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize();
+    }
+
+    /**
+     * Copy num_corr_tiles correlation tiles from gpu_corrs to the host.
+     * Device rows are corr_stride floats apart, only the first CORR_SIZE floats of each row are read.
+     * @return [num_corr_tiles][CORR_SIZE] correlation data, one row per correlation
+     */
+    public float [][] getCorr2D(){
+        float [] flat = new float [ num_corr_tiles * CORR_SIZE]; // tightly packed host copy
+        final int row_bytes = CORR_SIZE * Sizeof.FLOAT;
+
+        CUDA_MEMCPY2D d2h = new CUDA_MEMCPY2D();
+        // source: pitched device memory
+        d2h.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        d2h.srcDevice =     gpu_corrs;
+        d2h.srcPitch =      corr_stride * Sizeof.FLOAT;
+        // destination: packed host array
+        d2h.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
+        d2h.dstHost =       Pointer.to(flat);
+        d2h.dstPitch =      row_bytes;
+        // region: one row per correlation
+        d2h.WidthInBytes =  row_bytes;
+        d2h.Height =        num_corr_tiles;
+
+        cuMemcpy2D(d2h); // run copy
+
+        // split the flat array into per-correlation rows
+        float [][] result = new float [num_corr_tiles][];
+        for (int n = 0; n < result.length; n++) {
+        	result[n] = java.util.Arrays.copyOfRange(flat, n * CORR_SIZE, (n + 1) * CORR_SIZE);
+        }
+        return result;
+    }
+
+
    public float [][] getRBG (int ncam){
        int height = (IMG_HEIGHT + DTT_SIZE);
        int width =  (IMG_WIDTH + DTT_SIZE);
@@ -791,23 +1012,20 @@ public class GPUTileProcessor {
 	}

 	public void setLpfRbg(
-			float sigma_r,
-			float sigma_b,
-			float sigma_g)
+			float [][] lpf_rbg) // 3 or single 64-el. array(s)
 	{
-		int dct_size = DTT_SIZE;
-		DttRad2 dtt = new DttRad2(dct_size);

-		double [][] lpf_rbg = {
-				dtt.dttt_iiie(setCltLpf(sigma_r)),
-				dtt.dttt_iiie(setCltLpf(sigma_b)),
-				dtt.dttt_iiie(setCltLpf(sigma_g))};
-		int l = dct_size*dct_size;
+		int l = lpf_rbg[0].length; // 64
+
 		float []   lpf_flat = new float [3 * l];
 		for (int i = 0; i < 3;i++) {
-//			System.arraycopy(lpf_rbg[i], 0, lpf_flat, l* i, l);
+			int ii = i;
+			if (ii > lpf_rbg.length) {
+				ii = 0; // mono
+			}
 			for (int j = 0; j < l; j++) {
-				lpf_flat[j+i*l] = (float) (lpf_rbg[i][j]*2*dct_size);
+//				lpf_flat[j + ii*l] = (float) (lpf_rbg[i][j]*2*dct_size);
+				lpf_flat[j + ii*l] = lpf_rbg[i][j];
 			}
 		}

@@ -821,14 +1039,51 @@ public class GPUTileProcessor {
 		System.out.println("constantMemorySize: " + constantMemorySize);
        cuMemcpyHtoD(constantMemoryPointer, Pointer.to(lpf_flat), constantMemorySize);
 		System.out.println();
+	}

+	/**
+	 * Copy LPF coefficients to the module global "lpf_corr" on the GPU.
+	 * The number of bytes copied is the size of the GPU symbol reported by cuModuleGetGlobal,
+	 * not the length of the provided array.
+	 * NOTE(review): lpf_flat is presumably expected to hold at least that many bytes - confirm in callers.
+	 * @param lpf_flat LPF coefficients to upload
+	 */
+	public void setLpfCorr(
+			float [] lpf_flat)
+	{
+		final long [] symbol_size = { 0 };
+		final CUdeviceptr symbol_ptr = new CUdeviceptr();
+		cuModuleGetGlobal(symbol_ptr, symbol_size,  module, "lpf_corr");
+		final int num_bytes = (int) symbol_size[0];
+		System.out.println("constantMemoryPointer: " + symbol_ptr);
+		System.out.println("constantMemorySize: " + num_bytes);
+		cuMemcpyHtoD(symbol_ptr, Pointer.to(lpf_flat), num_bytes);
+		System.out.println();
+	}
+
+	/**
+	 * Calculate LPF for the given sigma as float data: setCltLpf(sigma) is passed through
+	 * DttRad2.dttt_iiie() and every element is multiplied by 2 * DTT_SIZE.
+	 * @param sigma LPF sigma, passed to setCltLpf()
+	 * @return float array of DTT_SIZE * DTT_SIZE elements
+	 */
+	public float [] floatSetCltLpfFd(
+			double   sigma) {
+		final int size = DTT_SIZE;
+		final double [] lpf_fd = (new DttRad2(size)).dttt_iiie(setCltLpf(sigma));
+		float [] result = new float [size*size];
+		for (int indx = 0; indx < result.length; indx++) {
+			double scaled = lpf_fd[indx]*2*size;
+			result[indx] = (float) scaled;
+		}
+		return result;
+	}
+
+	/**
+	 * Calculate LPF for the given sigma as double data: setCltLpf(sigma) is passed through
+	 * DttRad2.dttt_iiie() and every element is multiplied by 2 * DTT_SIZE.
+	 * Each value is rounded to float precision before it is stored.
+	 * NOTE(review): the float rounding is presumably intended to match floatSetCltLpfFd() - confirm.
+	 * @param sigma LPF sigma, passed to setCltLpf()
+	 * @return double array of DTT_SIZE * DTT_SIZE elements
+	 */
+	public double [] doubleSetCltLpfFd(
+			double   sigma) {
+		final int size = DTT_SIZE;
+		final double [] lpf_fd = (new DttRad2(size)).dttt_iiie(setCltLpf(sigma));
+		double [] result = new double [size*size];
+		for (int indx = 0; indx < result.length; indx++) {
+			float rounded = (float) (lpf_fd[indx]*2*size);
+			result[indx] = rounded;
+		}
+		return result;
 	}

 	public double [] setCltLpf(
 			double   sigma)
 	{
 		int dct_size = DTT_SIZE;
-
 		double [] lpf = new double [dct_size*dct_size];
 		int dct_len = dct_size * dct_size;
 		if (sigma == 0.0f) {

--- a/src/main/java/com/elphel/imagej/tileprocessor/Correlation2d.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/Correlation2d.java
@@ -194,6 +194,147 @@ public class Correlation2d {
    	  return this.transpose_all_diagonal;
      }

+      /**
+       * Multiply CLT data of two channels, OK with null inputs (missing colors for monochrome images)
+       * @param clt_data1 first operand FD CLT data[4][transform_len], may be null
+       * @param clt_data2 second operand FD CLT data[4][transform_len], may be null
+       * @param tcorr     null or array initialized to [4][transform_len] to receive the result
+       *                  (overwritten); a new array is allocated when null
+       * @return [4][transform_len] FD CLT data (tcorr if it was provided), or null if either operand is null
+       */
+      public double[][] correlateSingleColorFD(
+      		double [][] clt_data1,
+      		double [][] clt_data2,
+      		double [][] tcorr){ // null or initialized to [4][transform_len]
+
+      	// Both operands are required. The original tested clt_data1 twice, so a missing
+      	// second operand caused a NullPointerException instead of returning null.
+      	if ((clt_data1 == null) || (clt_data2 == null)) return null; // to work with missing colors for monochrome
+      	if (tcorr == null) tcorr = new double [4][transform_len];
+      	for (int i = 0; i < transform_len; i++) {
+      		for (int n = 0; n<4; n++){
+      			tcorr[n][i] = 0;
+      			for (int k=0; k<4; k++){
+      				// ZI[n][k] selects the quadrant of the first operand, its sign - the sign of the product
+      				if (ZI[n][k] < 0)
+      					tcorr[n][i] -=
+      							clt_data1[-ZI[n][k]][i] * clt_data2[k][i];
+      				else
+      					tcorr[n][i] +=
+      							clt_data1[ZI[n][k]][i] * clt_data2[k][i];
+      			}
+      		}
+      	}
+      	return tcorr;
+      }
+
+      /**
+       * Normalize 2D correlation in FD, LPF (if not null), convert to pixel domain and optionally trim.
+       * NOTE: tcorr is modified: its values are normalized (and filtered) in place and then each
+       * tcorr[quadrant] is replaced with the result of dtt.dttt_iie().
+       * @param tcorr FD representation of the correlation [4][transform_len]; if null, null is returned
+       * @param lpf LPF [transform_len] applied to each of the 4 quadrants, or null to skip filtering
+       * @param afat_zero2 absolute fat zero, same units as components squared values; its square is
+       *        added to the sum of squared components before taking the square root
+       * @param corr_radius if > 0 and < (transform_size - 1) - extract only the central
+       *        (2 * corr_radius + 1) square; any other value (including 0) returns the full
+       *        (2 * transform_size - 1) square tile
+       * @param debug_gpu print intermediate data to System.out (also passed to dtt.dttt_iie())
+       * @return 2D phase correlation in linescan order, or null if tcorr is null
+       */
+      public double[] normalizeConvertCorr(
+    		  double [][] tcorr, // null or initialized to [4][transform_len]
+    		  double []   lpf,
+    		  double      afat_zero2, // absolute fat zero, same units as components squared values
+    		  int corr_radius,
+    		  boolean debug_gpu){
+    	  if (tcorr == null) return null;
+    	  double afat_zero4 = afat_zero2*afat_zero2;
+
+    	  // normalize: divide all 4 components of each FD sample by sqrt(sum of squares + fat zero)
+    	  for (int i = 0; i < transform_len; i++) {
+    		  double s = afat_zero4;
+    		  for (int n = 0; n< 4; n++){
+    			  s += tcorr[n][i]*tcorr[n][i];
+    		  }
+    		  double k = 1.0/ Math.sqrt(s);
+    		  for (int n = 0; n< 4; n++){
+    			  tcorr[n][i]*= k;
+    		  }
+    	  }
+    	  if (debug_gpu) {
+    		  dbgPrintQuadrants(
+    				  "=== NORMALIZED CORRELATION , afat_zero2="+afat_zero2+", afat_zero4="+afat_zero4+" ===",
+    				  tcorr,
+    				  transform_size);
+    	  }
+    	  if (lpf != null) {
+    		  if (debug_gpu) {
+    			  System.out.println("=== LPF for CORRELATION ===");
+    			  dbgPrintTile(lpf, transform_size);
+    		  }
+    		  for (int n = 0; n<4; n++) {
+    			  for (int i = 0; i < transform_len; i++) {
+    				  tcorr[n][i] *= lpf[i];
+    			  }
+    		  }
+    	  }
+    	  if (debug_gpu) {
+    		  dbgPrintQuadrants("=== LPF-ed CORRELATION ===", tcorr, transform_size);
+    	  }
+    	  for (int quadrant = 0; quadrant < 4; quadrant++){
+    		  int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
+    		  tcorr[quadrant] = dtt.dttt_iie(tcorr[quadrant], mode, transform_size, debug_gpu); // not orthogonal, term[0] is NOT *= 1/sqrt(2)
+    	  }
+    	  if (debug_gpu) {
+    		  dbgPrintQuadrants("=== CONVERTED CORRELATION ===", tcorr, transform_size);
+    	  }
+
+    	  // convert from 4 quadrants to (2 * transform_size - 1) square centered tile (only composite)
+    	  double [] corr_pd =  dtt.corr_unfold_tile(tcorr,	transform_size);
+    	  if (debug_gpu) {
+    		  System.out.println("=== UNFOLDED CORRELATION ===");
+    		  dbgPrintTile(corr_pd, 2* transform_size -1);
+    	  }
+
+    	  if ((corr_radius <= 0) || (corr_radius >= (transform_size - 1))) {
+    		  return corr_pd; // no trimming requested (or possible)
+    	  }
+    	  int full_size = 2 * transform_size - 1;
+    	  int trimmed_size = 2 * corr_radius + 1;
+    	  int trim =  transform_size - 1 - corr_radius; // rows/columns removed on each side
+    	  double [] trimmed_pd = new double [trimmed_size * trimmed_size];
+    	  int ioffs = (full_size + 1)*trim; // index of the top-left kept pixel: trim rows down, trim columns right
+    	  for (int orow = 0; orow < trimmed_size; orow++) {
+    		  System.arraycopy(corr_pd, orow*full_size + ioffs, trimmed_pd, orow*trimmed_size, trimmed_size);
+    	  }
+    	  return trimmed_pd;
+      }
+
+      /**
+       * Debug helper: print a title and then the first 4 square tiles of the data, each preceded by its index
+       */
+      private static void dbgPrintQuadrants(String title, double [][] quadrants, int size) {
+    	  System.out.println(title);
+    	  for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
+    		  System.out.println("------dct_mode="+dct_mode);
+    		  dbgPrintTile(quadrants[dct_mode], size);
+    	  }
+      }
+
+      /**
+       * Debug helper: print a size x size tile stored in linescan order, one text line per row
+       */
+      private static void dbgPrintTile(double [] tile, int size) {
+    	  for (int i = 0; i < size; i++) {
+    		  for (int j = 0; j < size; j++) {
+    			  System.out.print(String.format("%10.5f ", tile[size * i + j]));
+    		  }
+    		  System.out.println();
+    	  }
+      }
+
+
    /**
     * Multiply CLT data of two channels, normalize amplitude, OK with null inputs (missing colors for monochrome images)
     * @param clt_data1 first operand FD CLT data[4][transform_len]
@@ -343,7 +484,7 @@ public class Correlation2d {
    		double              scale_value, // scale correlation value
    		double []           col_weights,
    		double              fat_zero) {
-    	double [][][][]     clt_data_tile = new double[clt_data.length][][][];
+    	double [][][][]     clt_data_tile = new double[clt_data.length][][][]; // [camera][color][quadrant][index]
    	for (int ncam = 0; ncam < clt_data.length; ncam++) if (clt_data[ncam] != null){
    		clt_data_tile[ncam] = new double[clt_data[ncam].length][][];
        	for (int ncol = 0; ncol < clt_data[ncam].length; ncol++) if ((clt_data[ncam][ncol] != null) && (clt_data[ncam][ncol][tileY] != null)){

--- a/src/main/java/com/elphel/imagej/tileprocessor/DttRad2.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/DttRad2.java
@@ -613,7 +613,61 @@ public class DttRad2 {
 		return y;
 	}

+	public double [] dttt_iie(double [] x, int mode, int n, boolean debug_gpu){

+		// 2D transform of an n x n tile (linescan order): rows are transformed first and stored
+		// transposed, then the rows of the transposed data are transformed. Bit 0 of mode selects
+		// dstiie_direct (set) or dctiie_direct (clear) for the first pass, bit 1 - for the second.
+		// The result is left in the transposed orientation. Input x is not modified.
+		final boolean sine_first =  (mode & 1) != 0;
+		final boolean sine_second = (mode & 2) != 0;
+		double [] result = new double [n*n];
+		double [] buf = new double[n];
+		// first (horizontal) pass
+		for (int row = 0; row < n; row++){
+			System.arraycopy(x, n*row, buf, 0, n);
+			if (sine_first) {
+				buf = dstiie_direct(buf);
+			} else {
+				buf = dctiie_direct(buf);
+			}
+			for (int col = 0; col < n; col++) {
+				result[col*n + row] = buf[col]; // transpose
+			}
+		}
+		if (debug_gpu) {
+			System.out.println("------after hor, mode="+mode);
+			for (int row = 0; row < n; row++) {
+				StringBuilder sb = new StringBuilder();
+				for (int col = 0; col < n; col++) {
+					sb.append(String.format("%10.5f ", result[n * row + col]));
+				}
+				System.out.println(sb.toString());
+			}
+		}
+
+
+		// second (vertical) pass
+		for (int row = 0; row < n; row++){
+			System.arraycopy(result, n*row, buf, 0, n);
+			if (sine_second) {
+				buf = dstiie_direct(buf);
+			} else {
+				buf = dctiie_direct(buf);
+			}
+			System.arraycopy(buf, 0, result, n*row, n);
+		}
+		if (debug_gpu) {
+			System.out.println("------after vert, mode="+mode);
+			for (int row = 0; row < n; row++) {
+				StringBuilder sb = new StringBuilder();
+				for (int col = 0; col < n; col++) {
+					sb.append(String.format("%10.5f ", result[n * row + col]));
+				}
+				System.out.println(sb.toString());
+			}
+		}
+		return result;
+	}
+
+
+/*
+    	  if (debug_gpu) {
+    		  System.out.println("=== CONVERTED CORRELATION ===");
+    		  for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
+    			  System.out.println("------dct_mode="+dct_mode);
+    			  for (int i = 0; i < transform_size; i++) {
+    				  for (int j = 0; j < transform_size; j++) {
+    					  System.out.print(String.format("%10.3f ", tcorr[dct_mode][transform_size * i + j]));
+    				  }
+    				  System.out.println();
+    			  }
+    		  }
+    	  }
+
+ */


 	public double [] dttt_iii(double [] x){
@@ -780,7 +834,7 @@ public class DttRad2 {
 	}


-	public double [] dctii_direct(double[] x){
+	public double [] dctii_direct(double[] x){ // orthogonal, term[0] *= 1/sqrt(2)
 		int n = x.length;
 		int t = ilog2(n)-1;
 		if (CII==null){
@@ -796,7 +850,7 @@ public class DttRad2 {
 		return y;
 	}

-	public double [] dctiie_direct(double[] x){
+	public double [] dctiie_direct(double[] x){ // not orthogonal
 		int n = x.length;
 		int t = ilog2(n)-1;
 		if (CIIe==null){
@@ -928,7 +982,7 @@ public class DttRad2 {
 		}
 	}

-	private void setup_CII(int maxN){
+	private void setup_CII(int maxN){ // orthogonal, term[0] *= 1/sqrt(2)
 		if (maxN > N) setup_arrays(maxN);
 		int l = ilog2(N);
 		if (!(CII==null) && (CII.length >= l)) return;
@@ -949,7 +1003,7 @@ public class DttRad2 {
 		}
 	}

-	private void setup_CIIe(int maxN){
+	private void setup_CIIe(int maxN){ // not orthogonal
 		if (maxN > N) setup_arrays(maxN);
 		int l = ilog2(N);
 		if (!(CIIe==null) && (CIIe.length >= l)) return;

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/MacroCorrelation.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/MacroCorrelation.java
@@ -307,6 +307,7 @@ public class MacroCorrelation {

 		//		  double [][][][] texture_tiles =   save_textures ? new double [tilesY][tilesX][][] : null; // ["RGBA".length()][];
 		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
 				this.mtp.isMonochrome(),
 				clt_parameters.getScaleStrength(this.mtp.isAux()));
 		image_dtt.clt_aberrations_quad_corr(
@@ -350,7 +351,7 @@ public class MacroCorrelation {
 				null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				null,     // clt_kernels,                  // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				clt_parameters.kernel_step,
-				clt_parameters.transform_size,
+///				clt_parameters.transform_size,
 				clt_parameters.clt_window,
 				shiftXY, //
 				0.0, // disparity_corr, // final double              disparity_corr, // disparity at infinity

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -580,7 +580,10 @@ public class QuadCLT {
 					  double [] kernel=      new double[kernelSize*kernelSize];
 					  int centered_len = (2*dtt_size-1) * (2*dtt_size-1);
 					  double [] kernel_centered = new double [centered_len + extra_items];
-					  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+					  ImageDtt image_dtt = new ImageDtt(
+							  clt_parameters.transform_size,
+							  isMonochrome(),
+							  clt_parameters.getScaleStrength(isAux()));
 					  int chn,tileY,tileX;
 					  DttRad2 dtt = new DttRad2(dtt_size);
 					  ShowDoubleFloatArrays sdfa_instance = null;
@@ -622,8 +625,8 @@ public class QuadCLT {
 								  kernel,          // double []   src_kernel, //
 								  kernel_centered, // double []   dst_kernel, // should be (2*dtt_size-1) * (2*dtt_size-1) + extra_items size - kernel and dx, dy to the nearest 1/2 pixels
 								                   // also actual full center shifts in sensor pixels
-								  kernelSize,      // int src_size, // 64
-								  dtt_size);       // 8
+								  kernelSize); // ,      // int src_size, // 64
+///								  dtt_size);       // 8
 						  if ((globalDebugLevel > 0) && (tileY == clt_parameters.tileY/2)  && (tileX == clt_parameters.tileX/2)) {
 							  int length=kernel_centered.length;
 							  int size=(int) Math.sqrt(length);
@@ -642,7 +645,7 @@ public class QuadCLT {
 							  image_dtt.clt_normalize_kernel( //
 									  kernel_centered, // double []   kernel, // should be (2*dtt_size-1) * (2*dtt_size-1) + extra_items size (last (2*dtt_size-1) are not modified)
 									  norm_sym_weights, // double []   window, // normalizes result kernel * window to have sum of elements == 1.0
-									  dtt_size,
+///									  dtt_size,
 									  (globalDebugLevel > 0) && (tileY == clt_parameters.tileY/2)  && (tileX == clt_parameters.tileX/2)); // 8
 							  if ((globalDebugLevel > 0) && (tileY == clt_parameters.tileY/2)  && (tileX == clt_parameters.tileX/2)) {
 								  int length=kernel_centered.length;
@@ -661,8 +664,8 @@ public class QuadCLT {
 						  }
 						  image_dtt.clt_symmetrize_kernel( //
 								  kernel_centered, // double []     kernel,      // should be (2*dtt_size-1) * (2*dtt_size-1) +4 size (last 4 are not modified)
-								  clt_kernels[chn][tileY][tileX], // 	double [][]   sym_kernels, // set of 4 SS, AS, SA, AA kdernels, each dtt_size * dtt_size (may have 5-th with center shift
-								  dtt_size); // 8
+								  clt_kernels[chn][tileY][tileX]); // , // 	double [][]   sym_kernels, // set of 4 SS, AS, SA, AA kdernels, each dtt_size * dtt_size (may have 5-th with center shift
+///								  dtt_size); // 8
 						  for (int i = 0; i < extra_items; i++){
 							  clt_kernels[chn][tileY][tileX][4][i] = kernel_centered [centered_len + i];
 						  }
@@ -756,7 +759,7 @@ public class QuadCLT {
 		  if (globalDebugLevel > 1) System.out.println("Threads done at "+IJ.d2s(0.000000001*(System.nanoTime()-startTime),3));
 		  System.out.println("1.Threads done at "+IJ.d2s(0.000000001*(System.nanoTime()-startTime),3));
 		  // Calculate differential offsets to interpolate for tiles between kernel centers
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(clt_parameters.transform_size,isMonochrome(),clt_parameters.getScaleStrength(isAux()));
 		  image_dtt.clt_fill_coord_corr(
 				  clt_parameters.kernel_step,  //  final int             kern_step, // distance between kernel centers, in pixels.
 				  clt_kernels,                 // final double [][][][] clt_data,
@@ -1716,7 +1719,10 @@ public class QuadCLT {
 			  sdfa_instance.showArrays(double_stack,  imp_src.getWidth(), imp_src.getHeight(), true, "BEFORE_CLT_PROC", rbg_titles);
 		  }
 		  if (this.correctionsParameters.deconvolve) { // process with DCT, otherwise use simple debayer
-			  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+			  ImageDtt image_dtt = new ImageDtt(
+					  clt_parameters.transform_size,
+					  isMonochrome(),
+					  clt_parameters.getScaleStrength(isAux()));
 			  for (int i =0 ; i < double_stack[0].length; i++){
 				  double_stack[2][i]*=0.5; // Scale blue twice to compensate less pixels than green
 			  }
@@ -1725,7 +1731,7 @@ public class QuadCLT {
 					  imp_src.getWidth(),           //	final int               width,
 					  clt_kernels[channel],         // final double [][][][][] clt_kernels, // [color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 					  clt_parameters.kernel_step,
-					  clt_parameters.transform_size,
+//					  clt_parameters.transform_size,
 					  clt_parameters.clt_window,
 					  clt_parameters.shift_x,       // final int               shiftX, // shift image horizontally (positive - right) - just for testing
 					  clt_parameters.shift_y,       // final int               shiftY, // shift image vertically (positive - down)
@@ -1760,7 +1766,7 @@ public class QuadCLT {
 						  image_dtt.clt_lpf(
 								  clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 								  clt_data[chn],
-								  clt_parameters.transform_size,
+///								  clt_parameters.transform_size,
 								  threadsMax,
 								  debugLevel);
 					  }
@@ -1768,8 +1774,8 @@ public class QuadCLT {
 /*
 			  }
 */
-			  int tilesY = imp_src.getHeight()/clt_parameters.transform_size;
-			  int tilesX = imp_src.getWidth()/clt_parameters.transform_size;
+			  int tilesY = imp_src.getHeight()/image_dtt.transform_size;
+			  int tilesX = imp_src.getWidth()/image_dtt.transform_size;
 			  if (debugLevel > 0){
 				  System.out.println("--tp.tilesX="+tilesX);
 				  System.out.println("--tp.tilesY="+tilesY);
@@ -1786,8 +1792,8 @@ public class QuadCLT {

 			        if (debugLevel > 0){
 			        	sdfa_instance.showArrays(clt,
-			        			tilesX*clt_parameters.transform_size,
-			        			tilesY*clt_parameters.transform_size,
+			        			tilesX*image_dtt.transform_size,
+			        			tilesY*image_dtt.transform_size,
 			        			true,
 			        			result.getTitle()+"-CLT");
 			        }
@@ -1796,7 +1802,7 @@ public class QuadCLT {
 			  for (int chn=0; chn<clt_data.length;chn++){
 				  iclt_data[chn] = image_dtt.iclt_2d(
 						  clt_data[chn],                  // scanline representation of dcd data, organized as dct_size x dct_size tiles
-						  clt_parameters.transform_size,  // final int
+//						  image_dtt.transform_size,  // final int
 						  clt_parameters.clt_window,      // window_type
 						  15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 						  0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -1806,8 +1812,8 @@ public class QuadCLT {
 			  }
 					  if (debugLevel > -1) sdfa_instance.showArrays(
 							  iclt_data,
-							  (tilesX + 1) * clt_parameters.transform_size,
-							  (tilesY + 1) * clt_parameters.transform_size,
+							  (tilesX + 1) * image_dtt.transform_size,
+							  (tilesY + 1) * image_dtt.transform_size,
 							  true,
 							  result.getTitle()+"-rbg_sigma");
 				/*
@@ -1815,8 +1821,8 @@ public class QuadCLT {
 			  }
 			 */
 			  if (debugLevel > 0) sdfa_instance.showArrays(iclt_data,
-					  (tilesX + 1) * clt_parameters.transform_size,
-					  (tilesY + 1) * clt_parameters.transform_size,
+					  (tilesX + 1) * image_dtt.transform_size,
+					  (tilesY + 1) * image_dtt.transform_size,
 					  true,
 					  result.getTitle()+"-ICLT-RGB");

@@ -1824,8 +1830,8 @@ public class QuadCLT {
 			  String [] sliceNames = {"red", "blue", "green"};
 			  stack = sdfa_instance.makeStack(
 					  iclt_data,
-					  (tilesX + 1) * clt_parameters.transform_size,
-					  (tilesY + 1) * clt_parameters.transform_size,
+					  (tilesX + 1) * image_dtt.transform_size,
+					  (tilesY + 1) * image_dtt.transform_size,
 					  sliceNames); // or use null to get chn-nn slice names


@@ -2318,7 +2324,10 @@ public class QuadCLT {
 			  sdfa_instance.showArrays(double_stack,  imp_src.getWidth(), imp_src.getHeight(), true, "BEFORE_CLT_PROC", rbg_titles);
 		  }
 		  if (this.correctionsParameters.deconvolve) { // process with DCT, otherwise use simple debayer
-			  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+			  ImageDtt image_dtt = new ImageDtt(
+					  clt_parameters.transform_size,
+					  isMonochrome(),
+					  clt_parameters.getScaleStrength(isAux()));
 			  for (int i =0 ; i < double_stack[0].length; i++){
 				  double_stack[2][i]*=0.5; // Scale blue twice to compensate less pixels than green
 			  }
@@ -2327,7 +2336,7 @@ public class QuadCLT {
 					  imp_src.getWidth(),           //	final int               width,
 					  clt_kernels[channel],         // final double [][][][][] clt_kernels, // [color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 					  clt_parameters.kernel_step,
-					  clt_parameters.transform_size,
+//					  image_dtt.transform_size,
 					  clt_parameters.clt_window,
 					  clt_parameters.shift_x,       // final int               shiftX, // shift image horizontally (positive - right) - just for testing
 					  clt_parameters.shift_y,       // final int               shiftY, // shift image vertically (positive - down)
@@ -2362,7 +2371,7 @@ public class QuadCLT {
 						  image_dtt.clt_lpf(
 								  clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 								  clt_data[chn],
-								  clt_parameters.transform_size,
+///								  image_dtt.transform_size,
 								  threadsMax,
 								  debugLevel);
 					  }
@@ -2370,8 +2379,8 @@ public class QuadCLT {
 /*
 			  }
 */
-			  int tilesY = imp_src.getHeight()/clt_parameters.transform_size;
-			  int tilesX = imp_src.getWidth()/clt_parameters.transform_size;
+			  int tilesY = imp_src.getHeight()/image_dtt.transform_size;
+			  int tilesX = imp_src.getWidth()/image_dtt.transform_size;
 			  if (debugLevel > 0){
 				  System.out.println("--tilesX="+tilesX);
 				  System.out.println("--tilesY="+tilesY);
@@ -2388,8 +2397,8 @@ public class QuadCLT {

 			        if (debugLevel > 0){
 			        	sdfa_instance.showArrays(clt,
-			        			tilesX*clt_parameters.transform_size,
-			        			tilesY*clt_parameters.transform_size,
+			        			tilesX*image_dtt.transform_size,
+			        			tilesY*image_dtt.transform_size,
 			        			true,
 			        			result.getTitle()+"-CLT");
 			        }
@@ -2398,7 +2407,7 @@ public class QuadCLT {
 			  for (int chn=0; chn<clt_data.length;chn++){
 				  iclt_data[chn] = image_dtt.iclt_2d(
 						  clt_data[chn],                  // scanline representation of dcd data, organized as dct_size x dct_size tiles
-						  clt_parameters.transform_size,  // final int
+///						  image_dtt.transform_size,  // final int
 						  clt_parameters.clt_window,      // window_type
 						  15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 						  0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -2410,8 +2419,8 @@ public class QuadCLT {
 //					  if (debugLevel > -1) System.out.println("Applyed LPF, sigma = "+dct_parameters.dbg_sigma);
 					  if (debugLevel > 0) sdfa_instance.showArrays(
 							  iclt_data,
-							  (tilesX + 1) * clt_parameters.transform_size,
-							  (tilesY + 1) * clt_parameters.transform_size,
+							  (tilesX + 1) * image_dtt.transform_size,
+							  (tilesY + 1) * image_dtt.transform_size,
 							  true,
 							  result.getTitle()+"-rbg_sigma");
 				/*
@@ -2419,8 +2428,8 @@ public class QuadCLT {
 			  }
 			 */
 			  if (debugLevel > 0) sdfa_instance.showArrays(iclt_data,
-					  (tilesX + 0) * clt_parameters.transform_size,
-					  (tilesY + 0) * clt_parameters.transform_size,
+					  (tilesX + 0) * image_dtt.transform_size,
+					  (tilesY + 0) * image_dtt.transform_size,
 					  true,
 					  result.getTitle()+"-ICLT-RGB");

@@ -2428,8 +2437,8 @@ public class QuadCLT {
 			  String [] sliceNames = {"red", "blue", "green"};
 			  stack = sdfa_instance.makeStack(
 					  iclt_data,
-					  (tilesX + 0) * clt_parameters.transform_size,
-					  (tilesY + 0) * clt_parameters.transform_size,
+					  (tilesX + 0) * image_dtt.transform_size,
+					  (tilesY + 0) * image_dtt.transform_size,
 					  sliceNames); // or use null to get chn-nn slice names


@@ -2883,7 +2892,10 @@ public class QuadCLT {
 //		  String [] rbg_titles = {"Red", "Blue", "Green"};
 		  ImageStack stack;
 		  // =================
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  for (int i = 0; i < double_stacks.length; i++){
 			  for (int j =0 ; j < double_stacks[i][0].length; j++){
 				  double_stacks[i][2][j]*=0.5; // Scale green 0.5 to compensate more pixels than R,B
@@ -2896,7 +2908,7 @@ public class QuadCLT {
 				  geometryCorrection,           // final GeometryCorrection  geometryCorrection,
 				  clt_kernels,                  // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+//				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  clt_parameters.shift_x,       // final int               shiftX, // shift image horizontally (positive - right) - just for testing
 				  clt_parameters.shift_y,       // final int               shiftY, // shift image vertically (positive - down)
@@ -2924,14 +2936,14 @@ public class QuadCLT {
 					  image_dtt.clt_lpf(
 							  clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 							  clt_data[iQuad][chn],
-							  clt_parameters.transform_size,
+///							  image_dtt.transform_size,
 							  threadsMax,
 							  debugLevel);
 				  }
 			  }

-			  int tilesY = imp_quad[iQuad].getHeight()/clt_parameters.transform_size;
-			  int tilesX = imp_quad[iQuad].getWidth()/clt_parameters.transform_size;
+			  int tilesY = imp_quad[iQuad].getHeight()/image_dtt.transform_size;
+			  int tilesX = imp_quad[iQuad].getWidth()/image_dtt.transform_size;
 			  if (debugLevel > 0){
 				  System.out.println("--tp.tilesX="+tilesX);
 				  System.out.println("--tp.tilesY="+tilesY);
@@ -2948,8 +2960,8 @@ public class QuadCLT {

 				  if (debugLevel > 0){
 					  sdfa_instance.showArrays(clt,
-							  tilesX*clt_parameters.transform_size,
-							  tilesY*clt_parameters.transform_size,
+							  tilesX*image_dtt.transform_size,
+							  tilesY*image_dtt.transform_size,
 							  true,
 							  results[iQuad].getTitle()+"-CLT");
 				  }
@@ -2958,7 +2970,7 @@ public class QuadCLT {
 			  for (int chn=0; chn<iclt_data.length;chn++){
 				  iclt_data[chn] = image_dtt.iclt_2d(
 						  clt_data[iQuad][chn],           // scanline representation of dcd data, organized as dct_size x dct_size tiles
-						  clt_parameters.transform_size,  // final int
+///						  image_dtt.transform_size,  // final int
 						  clt_parameters.clt_window,      // window_type
 						  15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 						  0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -2968,13 +2980,13 @@ public class QuadCLT {
 			  }
 			  if (debugLevel > 0) sdfa_instance.showArrays(
 					  iclt_data,
-					  (tilesX + 0) * clt_parameters.transform_size,
-					  (tilesY + 0) * clt_parameters.transform_size,
+					  (tilesX + 0) * image_dtt.transform_size,
+					  (tilesY + 0) * image_dtt.transform_size,
 					  true,
 					  results[iQuad].getTitle()+"-rbg_sigma");
 			  if (debugLevel > 0) sdfa_instance.showArrays(iclt_data,
-					  (tilesX + 0) * clt_parameters.transform_size,
-					  (tilesY + 0) * clt_parameters.transform_size,
+					  (tilesX + 0) * image_dtt.transform_size,
+					  (tilesY + 0) * image_dtt.transform_size,
 					  true,
 					  results[iQuad].getTitle()+"-ICLT-RGB");

@@ -2982,8 +2994,8 @@ public class QuadCLT {
 			  String [] sliceNames = {"red", "blue", "green"};
 			  stack = sdfa_instance.makeStack(
 					  iclt_data,
-					  (tilesX + 0) * clt_parameters.transform_size,
-					  (tilesY + 0) * clt_parameters.transform_size,
+					  (tilesX + 0) * image_dtt.transform_size,
+					  (tilesY + 0) * image_dtt.transform_size,
 					  sliceNames); // or use null to get chn-nn slice names

 			  if (debugLevel > -1){
@@ -4115,7 +4127,10 @@ public class QuadCLT {
 					  this.is_mono);
 		  }

-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  for (int i = 0; i < double_stacks.length; i++){
 			  if ( double_stacks[i].length > 2) {
 				  for (int j =0 ; j < double_stacks[i][0].length; j++){
@@ -4232,7 +4247,7 @@ public class QuadCLT {
 				  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity
@@ -4265,15 +4280,15 @@ public class QuadCLT {
 			  if (clt_parameters.show_nonoverlap){// not used in lwir
 				  texture_nonoverlap = image_dtt.combineRBGATiles(
 						  texture_tiles,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						  clt_parameters.transform_size,
+///						  image_dtt.transform_size,
 						  false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						  clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						  threadsMax,                    // maximal number of threads to launch
 						  debugLevel);
 				  sdfa_instance.showArrays(
 						  texture_nonoverlap,
-						  tilesX * (2 * clt_parameters.transform_size),
-						  tilesY * (2 * clt_parameters.transform_size),
+						  tilesX * (2 * image_dtt.transform_size),
+						  tilesY * (2 * image_dtt.transform_size),
 						  true,
 						  name+sAux() + "-TXTNOL-D"+clt_parameters.disparity,
 						  (clt_parameters.keep_weights?rbga_weights_titles:rbga_titles));
@@ -4283,7 +4298,7 @@ public class QuadCLT {
 				  int alpha_index = 3;
 				  texture_overlap = image_dtt.combineRBGATiles(
 						  texture_tiles,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						  clt_parameters.transform_size,
+///						  image_dtt.transform_size,
 						  true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						  clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						  threadsMax,                    // maximal number of threads to launch
@@ -4302,8 +4317,8 @@ public class QuadCLT {
 				  if (!batch_mode && clt_parameters.show_overlap) {// not used in lwir
 					  sdfa_instance.showArrays( // all but r-rms, b-rms
 							  texture_overlap,
-							  tilesX * clt_parameters.transform_size,
-							  tilesY * clt_parameters.transform_size,
+							  tilesX * image_dtt.transform_size,
+							  tilesY * image_dtt.transform_size,
 							  true,
 							  name+sAux() + "-TXTOL-D"+clt_parameters.disparity,
 							  (clt_parameters.keep_weights?rbga_weights_titles:rbga_titles));
@@ -4324,8 +4339,8 @@ public class QuadCLT {
 							  true, // boolean saveShowIntermediate, // save/show if set globally
 							  true, // boolean saveShowFinal,        // save/show result (color image?)
 							  ((clt_parameters.alpha1 > 0)? texture_rgba: texture_rgb),
-							  tilesX *  clt_parameters.transform_size,
-							  tilesY *  clt_parameters.transform_size,
+							  tilesX *  image_dtt.transform_size,
+							  tilesY *  image_dtt.transform_size,
 							  1.0,         // double scaleExposure, // is it needed?
 							  debugLevel );
 				  }
@@ -4482,7 +4497,7 @@ public class QuadCLT {
 				  for (int i = 0; i<corr_rslt.length; i++) {
 					  corr_rslt[i] = image_dtt.corr_dbg(
 							  clt_corr_combo[i],
-							  2*clt_parameters.transform_size - 1,
+							  2*image_dtt.transform_size - 1,
 							  clt_parameters.corr_border_contrast,
 							  threadsMax,
 							  debugLevel);
@@ -4490,8 +4505,8 @@ public class QuadCLT {
 // all zeros
 				  sdfa_instance.showArrays(
 						  corr_rslt,
-						  tilesX*(2*clt_parameters.transform_size),
-						  tilesY*(2*clt_parameters.transform_size),
+						  tilesX*(2*image_dtt.transform_size),
+						  tilesY*(2*image_dtt.transform_size),
 						  true,
 						  name+sAux()+"-CORR-D"+clt_parameters.disparity,
 						  titles );
@@ -4506,7 +4521,7 @@ public class QuadCLT {
 					  }
 					  double [][] corr_rslt_partial = image_dtt.corr_partial_dbg(
 							  clt_corr_partial,
-							  2*clt_parameters.transform_size - 1,	//final int corr_size,
+							  2*image_dtt.transform_size - 1,	//final int corr_size,
 							  4,	// final int pairs,
 							  4,    // final int colors,
 							  clt_parameters.corr_border_contrast,
@@ -4516,8 +4531,8 @@ public class QuadCLT {
 					  System.out.println("corr_rslt_partial.length = "+corr_rslt_partial.length+", titles.length = "+titles.length);
 					  sdfa_instance.showArrays( // out of boundary 15
 							  corr_rslt_partial,
-							  tilesX*(2*clt_parameters.transform_size),
-							  tilesY*(2*clt_parameters.transform_size),
+							  tilesX*(2*image_dtt.transform_size),
+							  tilesY*(2*image_dtt.transform_size),
 							  true,
 							  name+sAux()+"-PART_CORR-D"+clt_parameters.disparity);
 //							  titles);
@@ -4537,7 +4552,7 @@ public class QuadCLT {
 						  image_dtt.clt_lpf(
 								  clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 								  clt_data[iQuad][chn],
-								  clt_parameters.transform_size,
+///								  image_dtt.transform_size,
 								  threadsMax,
 								  debugLevel);
 					  }
@@ -4559,8 +4574,8 @@ public class QuadCLT {

 					  if (debugLevel > 0){
 						  sdfa_instance.showArrays(clt,
-								  tilesX*clt_parameters.transform_size,
-								  tilesY*clt_parameters.transform_size,
+								  tilesX*image_dtt.transform_size,
+								  tilesY*image_dtt.transform_size,
 								  true,
 								  results[iQuad].getTitle()+"-CLT-D"+clt_parameters.disparity);
 					  }
@@ -4570,7 +4585,7 @@ public class QuadCLT {
 				  for (int ncol=0; ncol<iclt_data[iQuad].length;ncol++) if (clt_data[iQuad][ncol] != null) {
 					  iclt_data[iQuad][ncol] = image_dtt.iclt_2d(
 							  clt_data[iQuad][ncol],           // scanline representation of dcd data, organized as dct_size x dct_size tiles
-							  clt_parameters.transform_size,  // final int
+///							  image_dtt.transform_size,  // final int
 							  clt_parameters.clt_window,      // window_type
 							  15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 							  0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -4581,8 +4596,8 @@ public class QuadCLT {
 				  if (clt_parameters.gen_chn_stacks) sdfa_instance.showArrays(
 //				  if (clt_parameters.gen_chn_stacks || true) sdfa_instance.showArrays(
 						  iclt_data[iQuad],
-						  (tilesX + 0) * clt_parameters.transform_size,
-						  (tilesY + 0) * clt_parameters.transform_size,
+						  (tilesX + 0) * image_dtt.transform_size,
+						  (tilesY + 0) * image_dtt.transform_size,
 						  true,
 						  results[iQuad].getTitle()+"-ICLT-RGB-D"+clt_parameters.disparity);
 			  } // end of generating shifted channel images
@@ -4626,8 +4641,8 @@ public class QuadCLT {
 							  !batch_mode, // true, // boolean saveShowIntermediate, // save/show if set globally
 							  false, // boolean saveShowFinal,        // save/show result (color image?)
 							  iclt_data[iQuad],
-							  tilesX *  clt_parameters.transform_size,
-							  tilesY *  clt_parameters.transform_size,
+							  tilesX *  image_dtt.transform_size,
+							  tilesY *  image_dtt.transform_size,
 							  scaleExposures[iQuad], // double scaleExposure, // is it needed?
 							  debugLevel );
 				  }
@@ -4862,16 +4877,6 @@ public class QuadCLT {
 			  final boolean    updateStatus,
 			  final int        debugLevel){
 		  final boolean      batch_mode = clt_parameters.batch_run; //disable any debug images
-//		  boolean advanced=  this.correctionsParameters.zcorrect || this.correctionsParameters.equirectangular;
-//		  boolean toRGB=     advanced? true: this.correctionsParameters.toRGB;
-
-
-//		  if (!batch_mode) return null;
-
-
-
-		  // may use this.StartTime to report intermediate steps execution times
-//		  String aux = isAux()?"-AUX":"";
 		  String name=this.correctionsParameters.getModelName((String) imp_quad[0].getProperty("name"));
 		  //		int channel= Integer.parseInt((String) imp_src.getProperty("channel"));
 		  String path= (String) imp_quad[0].getProperty("path");
@@ -4890,7 +4895,7 @@ public class QuadCLT {
 					  this.is_mono);
 		  }

-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(clt_parameters.transform_size,isMonochrome(),clt_parameters.getScaleStrength(isAux()));
 		  for (int i = 0; i < double_stacks.length; i++){
 			  if ( double_stacks[i].length > 2) {
 				  for (int j =0 ; j < double_stacks[i][0].length; j++){
@@ -4992,62 +4997,32 @@ public class QuadCLT {
 			  z_correction +=clt_parameters.z_corr_map.get(name);// not used in lwir
 		  }
 		  final double disparity_corr = (z_correction == 0) ? 0.0 : geometryCorrection.getDisparityFromZ(1.0/z_correction);
-//		  double [][][][][][] clt_data = image_dtt.clt_aberrations_quad_corr_min(
 		  double [][] lazy_eye_data = image_dtt.cltMeasureLazyEye(
 				  clt_parameters.img_dtt,       // final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
-//				  1,                            // final int  macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 				  tile_op,                      // per-tile operation bit codes
 				  disparity_array,              // final double            disparity,
 				  double_stacks,                // final double [][][]      imade_data, // first index - number of image in a quad
 				  saturation_imp,               // boolean [][] saturation_imp, // (near) saturated pixels or null
-				  // correlation results - final and partial
-//				  clt_corr_combo,               // [tp.tilesY][tp.tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
-//				  clt_corr_partial,             // [tp.tilesY][tp.tilesX][pair][color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 				  clt_mismatch,                 // [12][tp.tilesY * tp.tilesX] // transpose unapplied. null - do not calculate
 				  disparity_map,                // [2][tp.tilesY * tp.tilesX]
-//				  texture_tiles,                // [tp.tilesY][tp.tilesX]["RGBA".length()][];
 				  imp_quad[0].getWidth(),       // final int width,
 				  clt_parameters.getFatZero(isMonochrome()),      // add to denominator to modify phase correlation (same units as data1, data2). <0 - pure sum
-//				  clt_parameters.corr_sym,
-//				  clt_parameters.corr_offset,
 				  clt_parameters.corr_red,
 				  clt_parameters.corr_blue,
 				  clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
-//				  clt_parameters.corr_normalize, // normalize correlation results by rms
 				  min_corr_selected, // 0.0001; // minimal correlation value to consider valid
-//				  clt_parameters.max_corr_sigma,// 1.5;  // weights of points around global max to find fractional
-//				  clt_parameters.max_corr_radius,
-//				  clt_parameters.max_corr_double, // Double pass when masking center of mass to reduce preference for integer values
-//				  clt_parameters.corr_mode,     // Correlation mode: 0 - integer max, 1 - center of mass, 2 - polynomial
-//				  clt_parameters.min_shot,       // 10.0;  // Do not adjust for shot noise if lower than
-//				  clt_parameters.scale_shot,     // 3.0;   // scale when dividing by sqrt ( <0 - disable correction)
-//				  clt_parameters.diff_sigma,     // 5.0;//RMS difference from average to reduce weights (~ 1.0 - 1/255 full scale image)
-//				  clt_parameters.diff_threshold, // 5.0;   // RMS difference from average to discard channel (~ 1.0 - 1/255 full scale image)
-//				  clt_parameters.diff_gauss,     // true;  // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
-//				  clt_parameters.min_agree,      // 3.0;   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-//				  clt_parameters.dust_remove,    // Do not reduce average weight when only one image differes much from the average
-//				  clt_parameters.keep_weights,   // Add port weights to RGBA stack (debug feature)
 				  geometryCorrection,            // final GeometryCorrection  geometryCorrection,
 				  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity
-
-//				  (clt_parameters.fcorr_ignore? null: this.fine_corr),
-//				  clt_parameters.corr_magic_scale, // still not understood coefficient that reduces reported disparity value.  Seems to be around 0.85
-
 				  clt_parameters.shift_x,          // final int               shiftX, // shift image horizontally (positive - right) - just for testing
 				  clt_parameters.shift_y,          // final int               shiftY, // shift image vertically (positive - down)
 				  clt_parameters.tileStep,         // 	final int                 tileStep, // process tileStep x tileStep cluster of tiles when adjusting lazy eye parameters
-
 				  clt_parameters.tileX, // -1234, // clt_parameters.tileX,         // final int               debug_tileX,
 				  clt_parameters.tileY,         // final int               debug_tileY, -1234 will cause port coordinates debug images
-//				  (clt_parameters.dbg_mode & 64) != 0, // no fract shift
-//				  (clt_parameters.dbg_mode & 128) != 0, // no convolve
-				  //				  (clt_parameters.dbg_mode & 256) != 0, // transpose convolve
 				  threadsMax,
 				  debugLevel);

@@ -6087,7 +6062,10 @@ public class QuadCLT {
 					  this.is_mono);
 		  }
 		  // =================
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  for (int i = 0; i < double_stacks.length; i++){
 			  for (int j =0 ; j < double_stacks[i][0].length; j++){
 				  double_stacks[i][2][j]*=0.5; // Scale green 0.5 to compensate more pixels than R,B
@@ -6175,7 +6153,7 @@ public class QuadCLT {
 					  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 					  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 					  clt_parameters.kernel_step,
-					  clt_parameters.transform_size,
+///					  image_dtt.transform_size,
 					  clt_parameters.clt_window,
 					  shiftXY, //
 					  disparity_corr, // final double              disparity_corr, // disparity at infinity
@@ -9135,10 +9113,6 @@ public class QuadCLT {
 		  }


-
-
-
-
 		  for (int scanIndex = next_pass; scanIndex < tp.clt_3d_passes.size(); scanIndex++){
 			  if (debugLevel > 0){
 				  System.out.println("FPGA processing scan #"+scanIndex);
@@ -9464,10 +9438,13 @@ public class QuadCLT {
 		  if (num_bgnd < clt_parameters.min_bgnd_tiles){
 			  return null; // no background to generate // not used in lwir
 		  }
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  double [][] texture_overlap = image_dtt.combineRBGATiles(
 				  texture_tiles_bgnd, // texture_tiles,               // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 				  clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 				  threadsMax,                    // maximal number of threads to launch
@@ -9498,8 +9475,8 @@ public class QuadCLT {
 				  true, // boolean saveShowIntermediate, // save/show if set globally
 				  false, //true, // boolean saveShowFinal,        // save/show result (color image?)
 				  ((clt_parameters.alpha1 > 0)? texture_rgba: texture_rgb),
-				  tilesX *  clt_parameters.transform_size,
-				  tilesY *  clt_parameters.transform_size,
+				  tilesX *  image_dtt.transform_size,
+				  tilesY *  image_dtt.transform_size,
 				  1.0,         // double scaleExposure, // is it needed?
 				  debugLevel);
 		  // resize for backdrop here!
@@ -9562,14 +9539,18 @@ public class QuadCLT {
 			  System.out.println("getPassImage(): Empty image!");
 			  return null;
 		  }
-		  double [][]alphaFade = tp.getAlphaFade(clt_parameters.transform_size);
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
+		  double [][]alphaFade = tp.getAlphaFade(image_dtt.transform_size);
 		  if ((debugLevel > 0) && (scanIndex == 1)) { // not used in lwir
 			  String [] titles = new String[16];
 			  for (int i = 0; i<titles.length;i++)  titles[i]=""+i;
-			  sdfa_instance.showArrays(alphaFade, 2*clt_parameters.transform_size,2*clt_parameters.transform_size,true,"alphaFade",titles);
+			  sdfa_instance.showArrays(alphaFade, 2*image_dtt.transform_size,2*image_dtt.transform_size,true,"alphaFade",titles);
 		  }
 		  double [][][][] texture_tiles_cluster = new double[tilesY][tilesX][][];
-		  double [] alpha_zero = new double [4*clt_parameters.transform_size*clt_parameters.transform_size];
+		  double [] alpha_zero = new double [4*image_dtt.transform_size*image_dtt.transform_size];
 		  int alpha_index = 3;
 		  for (int i = 0; i < alpha_zero.length; i++) alpha_zero[i]=0.0;
 		  for (int tileY = 0; tileY < tilesY; tileY++){
@@ -9598,10 +9579,9 @@ public class QuadCLT {
 			  }
 		  }

-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
 		  double [][] texture_overlap = image_dtt.combineRBGATiles(
 				  texture_tiles_cluster, // texture_tiles,               // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 				  clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 				  threadsMax,                    // maximal number of threads to launch
@@ -9625,14 +9605,14 @@ public class QuadCLT {
 		  if (resize) {
 		  texture_rgbx = resizeGridTexture(
 				  texture_rgbx,
-				  clt_parameters.transform_size,
+				  image_dtt.transform_size,
 				  tilesX,
 				  tilesY,
 				  scan.getTextureBounds());
 		  }

-		  int width = resize ? (clt_parameters.transform_size * scan.getTextureBounds().width): (clt_parameters.transform_size * tilesX);
-		  int height = resize ? (clt_parameters.transform_size * scan.getTextureBounds().height): (clt_parameters.transform_size * tilesY);
+		  int width = resize ? (image_dtt.transform_size * scan.getTextureBounds().width): (image_dtt.transform_size * tilesX);
+		  int height = resize ? (image_dtt.transform_size * scan.getTextureBounds().height): (image_dtt.transform_size * tilesY);
 		  if ((width <= 0) || (height <= 0)) {
 			  System.out.println("***** BUG in getPassImage(): width="+width+", height="+height+", resize="+resize+" ****"); // not used in lwir
 		  }
@@ -9648,8 +9628,8 @@ public class QuadCLT {
 				  true, // boolean saveShowIntermediate, // save/show if set globally
 				  false, //true, // boolean saveShowFinal,        // save/show result (color image?)
 				  texture_rgbx,
-				  width, //tp.tilesX *  clt_parameters.transform_size,
-				  height, //tp.tilesY *  clt_parameters.transform_size,
+				  width, //tp.tilesX *  image_dtt.transform_size,
+				  height, //tp.tilesY *  image_dtt.transform_size,
 				  1.0,         // double scaleExposure, // is it needed?
 				  debugLevel);

@@ -9773,7 +9753,10 @@ public class QuadCLT {
 		  }

 		  double [][][][] texture_tiles =     new double [tilesY][tilesX][][]; // ["RGBA".length()][];
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  double z_correction =  clt_parameters.z_correction;
 		  if (clt_parameters.z_corr_map.containsKey(image_name)){ // not used in lwir
 			  z_correction +=clt_parameters.z_corr_map.get(image_name);
@@ -9794,7 +9777,7 @@ public class QuadCLT {
 				  //	Use it with disparity_maps[scan_step]?		  clt_mismatch,    // [tp.tilesY][tp.tilesX][pair]{dx,dy,weight}[(2*transform_size-1)*(2*transform_size-1)] // transpose unapplied. null - do not calculate
 				  disparity_map,    // [12][tp.tilesY * tp.tilesX]
 				  texture_tiles,        // [tp.tilesY][tp.tilesX]["RGBA".length()][];
-				  tilesX * clt_parameters.transform_size, // imp_quad[0].getWidth(),       // final int width,
+				  tilesX * image_dtt.transform_size, // imp_quad[0].getWidth(),       // final int width,
 				  clt_parameters.getFatZero(isMonochrome()),      // add to denominator to modify phase correlation (same units as data1, data2). <0 - pure sum
 				  clt_parameters.corr_sym,
 				  clt_parameters.corr_offset,
@@ -9819,7 +9802,7 @@ public class QuadCLT {
 				  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity
@@ -10008,8 +9991,10 @@ public class QuadCLT {
 		  }

 		  double [][][][] texture_tiles =   save_textures ? new double [tilesY][tilesX][][] : null; // ["RGBA".length()][];
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
-//		  final double disparity_corr = (clt_parameters.z_correction == 0) ? 0.0 : geometryCorrection.getDisparityFromZ(1.0/clt_parameters.z_correction);
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  double z_correction =  clt_parameters.z_correction;
 		  if (clt_parameters.z_corr_map.containsKey(image_name)){ // not used in lwir
 			  z_correction +=clt_parameters.z_corr_map.get(image_name);
@@ -10029,7 +10014,7 @@ public class QuadCLT {
 				  //	Use it with disparity_maps[scan_step]?		  clt_mismatch,    // [tp.tilesY][tp.tilesX][pair]{dx,dy,weight}[(2*transform_size-1)*(2*transform_size-1)] // transpose unapplied. null - do not calculate
 				  disparity_map,    // [12][tp.tilesY * tp.tilesX]
 				  texture_tiles,        // [tp.tilesY][tp.tilesX]["RGBA".length()][];
-				  tilesX * clt_parameters.transform_size, // imp_quad[0].getWidth(),       // final int width,
+				  tilesX * image_dtt.transform_size, // imp_quad[0].getWidth(),       // final int width,
 				  clt_parameters.getFatZero(isMonochrome()),      // add to denominator to modify phase correlation (same units as data1, data2). <0 - pure sum
 				  clt_parameters.corr_sym,
 				  clt_parameters.corr_offset,
@@ -10056,7 +10041,7 @@ public class QuadCLT {
 				  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity
@@ -10141,7 +10126,10 @@ public class QuadCLT {
 		  }

 		  double [][][][] texture_tiles =   save_textures ? new double [tilesY][tilesX][][] : null; // ["RGBA".length()][];
-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  double z_correction =  clt_parameters.z_correction;
 		  if (clt_parameters.z_corr_map.containsKey(image_name)){ // not used in lwir
 			  z_correction +=clt_parameters.z_corr_map.get(image_name);
@@ -10162,7 +10150,7 @@ public class QuadCLT {
 				  //	Use it with disparity_maps[scan_step]?		  clt_mismatch,    // [tp.tilesY][tp.tilesX][pair]{dx,dy,weight}[(2*transform_size-1)*(2*transform_size-1)] // transpose unapplied. null - do not calculate
 				  disparity_map,    // [12][tp.tilesY * tp.tilesX]
 				  texture_tiles,        // [tp.tilesY][tp.tilesX]["RGBA".length()][];
-				  tilesX * clt_parameters.transform_size, // imp_quad[0].getWidth(),       // final int width,
+				  tilesX * image_dtt.transform_size, // imp_quad[0].getWidth(),       // final int width,
 				  clt_parameters.getFatZero(isMonochrome()),      // add to denominator to modify phase correlation (same units as data1, data2). <0 - pure sum
 				  clt_parameters.corr_sym,
 				  clt_parameters.corr_offset,
@@ -10187,7 +10175,7 @@ public class QuadCLT {
 				  geometryCorrection_main, // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                  // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+///				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity
@@ -10313,7 +10301,10 @@ public class QuadCLT {
 			  shiftXY = shiftXY0;
 		  }

-		  ImageDtt image_dtt = new ImageDtt(isMonochrome(),clt_parameters.getScaleStrength(isAux()));
+		  ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  isMonochrome(),
+				  clt_parameters.getScaleStrength(isAux()));
 		  double z_correction =  clt_parameters.z_correction;
 		  if (clt_parameters.z_corr_map.containsKey(image_name)){ // not used in lwir
 			  z_correction +=clt_parameters.z_corr_map.get(image_name);
@@ -10335,7 +10326,7 @@ public class QuadCLT {
 				  null, // final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
 				  // values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
 				  null, // disparity_map,    // [12][tp.tilesY * tp.tilesX]
-				  tilesX * clt_parameters.transform_size, // imp_quad[0].getWidth(),       // final int width,
+				  tilesX * image_dtt.transform_size, // imp_quad[0].getWidth(),       // final int width,
 				  clt_parameters.getFatZero(isMonochrome()),      // add to denominator to modify phase correlation (same units as data1, data2). <0 - pure sum
 				  clt_parameters.corr_red,
 				  clt_parameters.corr_blue,
@@ -10345,7 +10336,7 @@ public class QuadCLT {
 				  null,                          // final GeometryCorrection  geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
 				  clt_kernels,                   // final double [][][][][][] clt_kernels, // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 				  clt_parameters.kernel_step,
-				  clt_parameters.transform_size,
+//				  image_dtt.transform_size,
 				  clt_parameters.clt_window,
 				  shiftXY, //
 				  disparity_corr, // final double              disparity_corr, // disparity at infinity

--- a/src/main/java/com/elphel/imagej/tileprocessor/TileProcessor.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TileProcessor.java
@@ -5806,7 +5806,10 @@ public class TileProcessor {
 		// show testure_tiles

 		double [][][][] texture_tiles = scan_prev.getTextureTiles();
-		ImageDtt image_dtt = new ImageDtt(isMonochrome(), clt_parameters.getScaleStrength(is_aux));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				isMonochrome(),
+				clt_parameters.getScaleStrength(is_aux));

 		double [][][]  dispStrength = st.getDisparityStrengths(
 				clt_parameters.stMeasSel); // int        stMeasSel) //            = 1;      // Select measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert)
@@ -5830,15 +5833,15 @@ public class TileProcessor {
 			if (!batch_mode && show_nonoverlap){
 				texture_nonoverlap = image_dtt.combineRBGATiles(
 						texture_tiles,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+///						image_dtt.transform_size,
 						false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
 						debugLevel);
 				sdfa_instance.showArrays(
 						texture_nonoverlap,
-						tilesX * (2 * clt_parameters.transform_size),
-						tilesY * (2 * clt_parameters.transform_size),
+						tilesX * (2 * image_dtt.transform_size),
+						tilesY * (2 * image_dtt.transform_size),
 						true,
 						name + "-TXTNOL-D",
 						(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -5849,7 +5852,7 @@ public class TileProcessor {
 				int alpha_index = 3;
 				texture_overlap = image_dtt.combineRBGATiles(
 						texture_tiles,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+///						image_dtt.transform_size,
 						true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
@@ -5868,8 +5871,8 @@ public class TileProcessor {
 				if (show_overlap) {
 					sdfa_instance.showArrays(
 							texture_overlap,
-							tilesX * clt_parameters.transform_size,
-							tilesY * clt_parameters.transform_size,
+							tilesX * image_dtt.transform_size,
+							tilesY * image_dtt.transform_size,
 							true,
 							name + "-TXTOL-D",
 							(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -7182,7 +7185,9 @@ public class TileProcessor {

 			CLTPass3d scan_prev = clt_3d_passes.get(clt_3d_passes.size() -1); // get last one
 			boolean [] these_tiles = scan_prev.getSelected();
-			DisparityProcessor dp = new DisparityProcessor(this, clt_parameters.transform_size * geometryCorrection.getScaleDzDx());
+			DisparityProcessor dp = new DisparityProcessor(
+					this,
+					clt_parameters.transform_size * geometryCorrection.getScaleDzDx());
 			boolean [] grown = these_tiles.clone();
 			growTiles(
 					2,          // grow tile selection by 1 over non-background tiles 1: 4 directions, 2 - 8 directions, 3 - 8 by 1, 4 by 1 more

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -347,6 +347,7 @@ public class TwoQuadCLT {
 	}

 	public void prepareFilesForGPUDebug(
+			String                                         save_prefix, // absolute path to the cuda project root
 			QuadCLT                                        quadCLT_main,
 			QuadCLT                                        quadCLT_aux,
 			CLTParameters       clt_parameters,
@@ -407,8 +408,9 @@ public class TwoQuadCLT {
 					saturation_imp_aux,            //output  // boolean [][]                              saturation_imp,
 					debugLevel); // int                                       debugLevel);

-			// Tempporarily processing individaully with the old code
+			// Temporarily processing individually with the old code
 			processCLTQuadCorrPairForGPU(
+					save_prefix,                // String save_prefix,
 					quadCLT_main,               // QuadCLT                                        quadCLT_main,
 					quadCLT_aux,                // QuadCLT                                        quadCLT_aux,
 					imp_srcs_main,              // ImagePlus []                                   imp_quad_main,
@@ -638,7 +640,10 @@ public class TwoQuadCLT {

 		double [][] disparity_bimap  = new double [ImageDtt.BIDISPARITY_TITLES.length][]; //[0] -residual disparity, [1] - orthogonal (just for debugging) last 4 - max pixel differences

-		ImageDtt image_dtt = new ImageDtt(quadCLT_main.isMonochrome(),clt_parameters.getScaleStrength(false));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_main.isMonochrome(),
+				clt_parameters.getScaleStrength(false));

 		double [][] ml_data = null;
 //		int [][] woi_tops = {quadCLT_main.woi_tops,quadCLT_aux.woi_tops};
@@ -694,30 +699,30 @@ public class TwoQuadCLT {
 			if (clt_parameters.show_nonoverlap){
 				texture_nonoverlap_main = image_dtt.combineRBGATiles(
 						texture_tiles_main,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
 						debugLevel);
 				sdfa_instance.showArrays(
 						texture_nonoverlap_main,
-						tilesX * (2 * clt_parameters.transform_size),
-						tilesY * (2 * clt_parameters.transform_size),
+						tilesX * (2 * image_dtt.transform_size),
+						tilesY * (2 * image_dtt.transform_size),
 						true,
 						name + "-TXTNOL-D"+clt_parameters.disparity+"-MAIN",
 						(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));

 				texture_nonoverlap_aux = image_dtt.combineRBGATiles(
 						texture_tiles_aux,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
 						debugLevel);
 				sdfa_instance.showArrays(
 						texture_nonoverlap_aux,
-						tilesX * (2 * clt_parameters.transform_size),
-						tilesY * (2 * clt_parameters.transform_size),
+						tilesX * (2 * image_dtt.transform_size),
+						tilesY * (2 * image_dtt.transform_size),
 						true,
 						name + "-TXTNOL-D"+clt_parameters.disparity+"-AUX",
 						(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -726,7 +731,7 @@ public class TwoQuadCLT {
 				int alpha_index = 3;
 				texture_overlap_main = image_dtt.combineRBGATiles(
 						texture_tiles_main,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
@@ -744,7 +749,7 @@ public class TwoQuadCLT {

 				texture_overlap_aux = image_dtt.combineRBGATiles(
 						texture_tiles_aux,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
@@ -763,8 +768,8 @@ public class TwoQuadCLT {
 				if (!batch_mode && clt_parameters.show_overlap) {
 					sdfa_instance.showArrays(
 							texture_overlap_main,
-							tilesX * clt_parameters.transform_size,
-							tilesY * clt_parameters.transform_size,
+							tilesX * image_dtt.transform_size,
+							tilesY * image_dtt.transform_size,
 							true,
 							name + "-TXTOL-D"+clt_parameters.disparity+"-MAIN",
 							(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -772,8 +777,8 @@ public class TwoQuadCLT {
 				if (!batch_mode && clt_parameters.show_overlap) {
 					sdfa_instance.showArrays(
 							texture_overlap_aux,
-							tilesX * clt_parameters.transform_size,
-							tilesY * clt_parameters.transform_size,
+							tilesX * image_dtt.transform_size,
+							tilesY * image_dtt.transform_size,
 							true,
 							name + "-TXTOL-D"+clt_parameters.disparity+"-AUX",
 							(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -797,8 +802,8 @@ public class TwoQuadCLT {
 							false, // true, // boolean saveShowIntermediate, // save/show if set globally
 							false, // true, // boolean saveShowFinal,        // save/show result (color image?)
 							((clt_parameters.alpha1 > 0)? texture_rgba_main: texture_rgb_main),
-							tilesX *  clt_parameters.transform_size,
-							tilesY *  clt_parameters.transform_size,
+							tilesX *  image_dtt.transform_size,
+							tilesY *  image_dtt.transform_size,
 							1.0,         // double scaleExposure, // is it needed?
 							debugLevel );
 					ImagePlus imp_texture_aux = quadCLT_aux.linearStackToColor(
@@ -812,8 +817,8 @@ public class TwoQuadCLT {
 							false, // true, // boolean saveShowIntermediate, // save/show if set globally
 							false, // true, // boolean saveShowFinal,        // save/show result (color image?)
 							((clt_parameters.alpha1 > 0)? texture_rgba_aux: texture_rgb_aux),
-							tilesX *  clt_parameters.transform_size,
-							tilesY *  clt_parameters.transform_size,
+							tilesX *  image_dtt.transform_size,
+							tilesY *  image_dtt.transform_size,
 							1.0,         // double scaleExposure, // is it needed?
 							debugLevel );
 					int width = imp_texture_main.getWidth();
@@ -870,7 +875,7 @@ public class TwoQuadCLT {
 				for (int i = 0; i<corr_rslt.length; i++) {
 					corr_rslt[i] = image_dtt.corr_dbg(
 							clt_corr_combo[i],
-							2*clt_parameters.transform_size - 1,
+							2*image_dtt.transform_size - 1,
 							clt_parameters.corr_border_contrast,
 							threadsMax,
 							debugLevel);
@@ -878,8 +883,8 @@ public class TwoQuadCLT {

 				sdfa_instance.showArrays(
 						corr_rslt,
-						tilesX*(2*clt_parameters.transform_size),
-						tilesY*(2*clt_parameters.transform_size),
+						tilesX*(2*image_dtt.transform_size),
+						tilesY*(2*image_dtt.transform_size),
 						true,
 						name + "-CORR-D"+clt_parameters.disparity,
 						titles );
@@ -904,7 +909,7 @@ public class TwoQuadCLT {
 						image_dtt.clt_lpf(
 								clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 								clt_bidata[iAux][iSubCam][chn],
-								clt_parameters.transform_size,
+//								image_dtt.transform_size,
 								threadsMax,
 								debugLevel);
 					}
@@ -926,8 +931,8 @@ public class TwoQuadCLT {

 					if (debugLevel > 0){
 						sdfa_instance.showArrays(clt,
-								tilesX*clt_parameters.transform_size,
-								tilesY*clt_parameters.transform_size,
+								tilesX*image_dtt.transform_size,
+								tilesY*image_dtt.transform_size,
 								true,
 								results[iQuadComb].getTitle()+"-CLT-D"+clt_parameters.disparity);
 					}
@@ -936,7 +941,7 @@ public class TwoQuadCLT {
 				for (int chn=0; chn<iclt_data.length;chn++){
 					iclt_data[chn] = image_dtt.iclt_2d(
 							clt_bidata[iAux][iSubCam][chn], // scanline representation of dcd data, organized as dct_size x dct_size tiles
-							clt_parameters.transform_size,  // final int
+//							image_dtt.transform_size,  // final int
 							clt_parameters.clt_window,      // window_type
 							15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 							0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -946,8 +951,8 @@ public class TwoQuadCLT {
 				}

 				if (clt_parameters.gen_chn_stacks) sdfa_instance.showArrays(iclt_data,
-						(tilesX + 0) * clt_parameters.transform_size,
-						(tilesY + 0) * clt_parameters.transform_size,
+						(tilesX + 0) * image_dtt.transform_size,
+						(tilesY + 0) * image_dtt.transform_size,
 						true,
 						results[iQuadComb].getTitle()+"-ICLT-RGB-D"+clt_parameters.disparity);
 				if (!clt_parameters.gen_chn_img) continue;
@@ -963,8 +968,8 @@ public class TwoQuadCLT {
 						!batch_mode, // true, // boolean saveShowIntermediate, // save/show if set globally
 						false, // boolean saveShowFinal,        // save/show result (color image?)
 						iclt_data,
-						tilesX *  clt_parameters.transform_size,
-						tilesY *  clt_parameters.transform_size,
+						tilesX *  image_dtt.transform_size,
+						tilesY *  image_dtt.transform_size,
 						scaleExposures[iAux][iSubCam], // double scaleExposure, // is it needed?
 						debugLevel );
 			} // end of generating shifted channel images
@@ -1162,7 +1167,12 @@ public class TwoQuadCLT {
 				bb.clear();
 				for (int i = 0; i <  image_data[chn][0].length; i++) {
 //					dos.writeFloat((float) (image_data[chn][0][i] + image_data[chn][1][i] + image_data[chn][2][i]));
-					bb.putFloat((float) (image_data[chn][0][i] + image_data[chn][1][i] + image_data[chn][2][i]));
+					double d = 0;
+					for (int c = 0; c < image_data[chn].length; c++) {
+						d += image_data[chn][c][i];
+					}
+//					bb.putFloat((float) (image_data[chn][0][i] + image_data[chn][1][i] + image_data[chn][2][i]));
+					bb.putFloat((float) d);
 				}
 				bb.flip();
 				channel.write(bb);
@@ -1270,6 +1280,7 @@ public class TwoQuadCLT {


 	public ImagePlus [] processCLTQuadCorrPairForGPU(
+			String                                         save_prefix,
 			QuadCLT                                        quadCLT_main,
 			QuadCLT                                        quadCLT_aux,
 			ImagePlus []                                   imp_quad_main,
@@ -1355,21 +1366,14 @@ public class TwoQuadCLT {
 			}

 		}
-
 		double [][] disparity_bimap  = new double [ImageDtt.BIDISPARITY_TITLES.length][]; //[0] -residual disparity, [1] - orthogonal (just for debugging) last 4 - max pixel differences
-
-		ImageDtt image_dtt = new ImageDtt(quadCLT_main.isMonochrome(),clt_parameters.getScaleStrength(false));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_main.isMonochrome(),
+				clt_parameters.getScaleStrength(false));
 		double [][] ml_data = null;
-//		int [][] woi_tops = {quadCLT_main.woi_tops,quadCLT_aux.woi_tops};
 		double [][][]       ers_delay = get_ers?(new double [2][][]):null;
-///		double [][][][][][] clt_kernels_main = quadCLT_main.getCLTKernels(); // [4][3][123][164]{[64],[64],[64],[64],[8]}
-///		double [][][][][][] clt_kernels_aux =  quadCLT_aux.getCLTKernels();
-
-
-		//[4][3][123][164][5][]
-///		double [][] dbg_kern = clt_kernels_main[0][0][0][0];
 		// here all data is ready (images, kernels) to try GPU code
-
 		float [][] main_bayer = new float [quadCLT_main.image_data.length][quadCLT_main.image_data[0][0].length];
 		float [][] dst_bayer =  new float [quadCLT_main.image_data.length][quadCLT_main.image_data[0][0].length];
 		for (int nc = 0; nc < main_bayer.length; nc++) {
@@ -1382,6 +1386,7 @@ public class TwoQuadCLT {

 		double [][][]       port_xy_main_dbg = new double [tilesX*tilesY][][];
 		double [][][]       port_xy_aux_dbg =  new double [tilesX*tilesY][][];
+//		double [][][]       corr2ddata =       new double [1][][];

 		final double [][][][][][][] clt_bidata = // new double[2][quad][nChn][tilesY][tilesX][][]; // first index - main/aux
 				image_dtt.clt_bi_quad_dbg (
@@ -1412,24 +1417,69 @@ public class TwoQuadCLT {
 						quadCLT_aux.getCLTKernels(),          // final double [][][][][][] clt_kernels_aux,  // [channel_in_quad][color][tileY][tileX][band][pixel] , size should match image (have 1 tile around)
 						clt_parameters.corr_magic_scale,      // final double              corr_magic_scale, // still not understood coefficient that reduces reported disparity value.  Seems to be around 0.85
 						true,                                 // 	final boolean             keep_clt_data,
-//						woi_tops,                             // final int [][]            woi_tops,
 						ers_delay,                            // final double [][][]       ers_delay,        // if not null - fill with tile center acquisition delay
 						threadsMax,                           // final int                 threadsMax,  // maximal number of threads to launch
 						debugLevel,                           // final int                 globalDebugLevel);
 						port_xy_main_dbg,                     // final double [][][]       port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development)
 						port_xy_aux_dbg);                      // final double [][][]       port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development)
-/*
-		if (debugLevel < 1000) {
+
+
+		// Create list of all correlation pairs
+		double [][][][][][] clt_data = clt_bidata[0];
+		int numTiles = tilesX * tilesY;
+		int numPairs = GPUTileProcessor.NUM_PAIRS;
+		int [] corr_indices = new int [numTiles * numPairs];
+		int indx=0;
+		for (int i = 0; i < numTiles; i++) {
+			for (int j = 0; j < numPairs; j++) {
+				corr_indices[indx++] = (i << GPUTileProcessor.CORR_PAIR_SHIFT) + j;
+			}
+		}
+		double [][] corrs2d = image_dtt.get2DCorrs(
+				clt_parameters,  // final CLTParameters       clt_parameters,
+				clt_data,        // final double [][][][][][] clt_data, // [channel_in_quad][color][tileY][tileX][band][pixel];
+				corr_indices,      // final int    []           pairs_list,
+				threadsMax,      // final int                 threadsMax,  // maximal number of threads to launch
+				debugLevel);     // final int                 debugLevel
+		float [][] fcorrs2d = new float [corrs2d.length][corrs2d[0].length];
+		// for compatibility with the actual GPU output
+		for (int n = 0; n < corrs2d.length; n++) {
+			for (int i = 0; i < corrs2d[0].length; i++) {
+				fcorrs2d[n][i] = (float) corrs2d[n][i];
+			}
+		}
+		int [] wh = new int[2];
+		double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
+	    		tilesX,
+	    		tilesY,
+	    		corr_indices,
+	    		fcorrs2d,
+	    		wh);
+		(new ShowDoubleFloatArrays()).showArrays(
+				dbg_corr,
+				wh[0],
+				wh[1],
+				true,
+				"CORR2D_CPU",
+				GPUTileProcessor.getCorrTitles());
+
+
+		if ((save_prefix != null) && !save_prefix.isEmpty()) { // save debug files only for a non-empty prefix; compare content, not references (!= "" is true for any non-interned empty String)
+
+			if (debugLevel < -1000) {
 				return null;
 			}

-		String kernel_dir = "/home/eyesis/workspace-python3/nvidia_dct8x8/clt/";
-//		boolean [][] what_to_save = {{false,false,true}, {false,false,true}};
+			String kernel_dir = save_prefix+"clt/";
+			File kdir = new File(kernel_dir);
+			kdir.mkdir();
+			//		boolean [][] what_to_save = {{false,false,true}, {false,false,true}};
 			boolean [][] what_to_save = {{true,true,true}, {true,true,true}};
 			try {
 				saveFloatKernels(
 						kernel_dir +"main", // String file_prefix,
-					(what_to_save[0][0]?clt_kernels_main:null), // double [][][][][][] clt_kernels, // null
+//						(what_to_save[0][0]?clt_kernels_main:null), // double [][][][][][] clt_kernels, // null
+						(what_to_save[0][0]?quadCLT_main.getCLTKernels():null), // double [][][][][][] clt_kernels, // null
 						(what_to_save[0][1]?quadCLT_main.image_data:null),
 						(what_to_save[0][2]?port_xy_main_dbg:null), // double [][][]       port_xy,
 						true);
@@ -1442,7 +1492,8 @@ public class TwoQuadCLT {
 			try {
 				saveFloatKernels(
 						kernel_dir +"aux", // String file_prefix,
-					(what_to_save[1][0]?clt_kernels_aux:null), // double [][][][][][] clt_kernels, // null
+//						(what_to_save[1][0]?clt_kernels_aux:null), // double [][][][][][] clt_kernels, // null
+						(what_to_save[1][0]?quadCLT_aux.getCLTKernels():null), // double [][][][][][] clt_kernels, // null
 						(what_to_save[1][1]?quadCLT_aux.image_data:null),
 						(what_to_save[1][2]?port_xy_aux_dbg:null), // double [][][]       port_xy,
 						true);
@@ -1452,15 +1503,14 @@ public class TwoQuadCLT {
 				e.printStackTrace();
 			} // boolean transpose);

-
-
-		if (debugLevel < 1000) {
+			if (debugLevel < -1000) {
 				return null;
 			}
 			if (ers_delay !=null) {
 				showERSDelay(ers_delay);
 			}
-*/
+
+		}
 		double [][] texture_nonoverlap_main = null;
 		double [][] texture_nonoverlap_aux = null;
 		double [][] texture_overlap_main = null;
@@ -1471,30 +1521,30 @@ public class TwoQuadCLT {
 			if (clt_parameters.show_nonoverlap){
 				texture_nonoverlap_main = image_dtt.combineRBGATiles(
 						texture_tiles_main,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
 						debugLevel);
 				sdfa_instance.showArrays(
 						texture_nonoverlap_main,
-						tilesX * (2 * clt_parameters.transform_size),
-						tilesY * (2 * clt_parameters.transform_size),
+						tilesX * (2 * image_dtt.transform_size),
+						tilesY * (2 * image_dtt.transform_size),
 						true,
 						name + "-TXTNOL-D"+clt_parameters.disparity+"-MAIN",
 						(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));

 				texture_nonoverlap_aux = image_dtt.combineRBGATiles(
 						texture_tiles_aux,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						false,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
 						debugLevel);
 				sdfa_instance.showArrays(
 						texture_nonoverlap_aux,
-						tilesX * (2 * clt_parameters.transform_size),
-						tilesY * (2 * clt_parameters.transform_size),
+						tilesX * (2 * image_dtt.transform_size),
+						tilesY * (2 * image_dtt.transform_size),
 						true,
 						name + "-TXTNOL-D"+clt_parameters.disparity+"-AUX",
 						(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -1503,7 +1553,7 @@ public class TwoQuadCLT {
 				int alpha_index = 3;
 				texture_overlap_main = image_dtt.combineRBGATiles(
 						texture_tiles_main,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
@@ -1521,7 +1571,7 @@ public class TwoQuadCLT {

 				texture_overlap_aux = image_dtt.combineRBGATiles(
 						texture_tiles_aux,                 // array [tp.tilesY][tp.tilesX][4][4*transform_size] or [tp.tilesY][tp.tilesX]{null}
-						clt_parameters.transform_size,
+//						image_dtt.transform_size,
 						true,                         // when false - output each tile as 16x16, true - overlap to make 8x8
 						clt_parameters.sharp_alpha,    // combining mode for alpha channel: false - treat as RGB, true - apply center 8x8 only
 						threadsMax,                    // maximal number of threads to launch
@@ -1540,8 +1590,8 @@ public class TwoQuadCLT {
 				if (!batch_mode && clt_parameters.show_overlap) {
 					sdfa_instance.showArrays(
 							texture_overlap_main,
-							tilesX * clt_parameters.transform_size,
-							tilesY * clt_parameters.transform_size,
+							tilesX * image_dtt.transform_size,
+							tilesY * image_dtt.transform_size,
 							true,
 							name + "-TXTOL-D"+clt_parameters.disparity+"-MAIN",
 							(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -1549,8 +1599,8 @@ public class TwoQuadCLT {
 				if (!batch_mode && clt_parameters.show_overlap) {
 					sdfa_instance.showArrays(
 							texture_overlap_aux,
-							tilesX * clt_parameters.transform_size,
-							tilesY * clt_parameters.transform_size,
+							tilesX * image_dtt.transform_size,
+							tilesY * image_dtt.transform_size,
 							true,
 							name + "-TXTOL-D"+clt_parameters.disparity+"-AUX",
 							(clt_parameters.keep_weights?rgba_weights_titles:rgba_titles));
@@ -1574,8 +1624,8 @@ public class TwoQuadCLT {
 							false, // true, // boolean saveShowIntermediate, // save/show if set globally
 							false, // true, // boolean saveShowFinal,        // save/show result (color image?)
 							((clt_parameters.alpha1 > 0)? texture_rgba_main: texture_rgb_main),
-							tilesX *  clt_parameters.transform_size,
-							tilesY *  clt_parameters.transform_size,
+							tilesX *  image_dtt.transform_size,
+							tilesY *  image_dtt.transform_size,
 							1.0,         // double scaleExposure, // is it needed?
 							debugLevel );
 					ImagePlus imp_texture_aux = quadCLT_aux.linearStackToColor(
@@ -1589,8 +1639,8 @@ public class TwoQuadCLT {
 							false, // true, // boolean saveShowIntermediate, // save/show if set globally
 							false, // true, // boolean saveShowFinal,        // save/show result (color image?)
 							((clt_parameters.alpha1 > 0)? texture_rgba_aux: texture_rgb_aux),
-							tilesX *  clt_parameters.transform_size,
-							tilesY *  clt_parameters.transform_size,
+							tilesX *  image_dtt.transform_size,
+							tilesY *  image_dtt.transform_size,
 							1.0,         // double scaleExposure, // is it needed?
 							debugLevel );
 					int width = imp_texture_main.getWidth();
@@ -1647,7 +1697,7 @@ public class TwoQuadCLT {
 				for (int i = 0; i<corr_rslt.length; i++) {
 					corr_rslt[i] = image_dtt.corr_dbg(
 							clt_corr_combo[i],
-							2*clt_parameters.transform_size - 1,
+							2*image_dtt.transform_size - 1,
 							clt_parameters.corr_border_contrast,
 							threadsMax,
 							debugLevel);
@@ -1655,8 +1705,8 @@ public class TwoQuadCLT {

 				sdfa_instance.showArrays(
 						corr_rslt,
-						tilesX*(2*clt_parameters.transform_size),
-						tilesY*(2*clt_parameters.transform_size),
+						tilesX*(2*image_dtt.transform_size),
+						tilesY*(2*image_dtt.transform_size),
 						true,
 						name + "-CORR-D"+clt_parameters.disparity,
 						titles );
@@ -1681,7 +1731,7 @@ public class TwoQuadCLT {
 						image_dtt.clt_lpf(
 								clt_parameters.getCorrSigma(image_dtt.isMonochrome()),
 								clt_bidata[iAux][iSubCam][chn],
-								clt_parameters.transform_size,
+//								image_dtt.transform_size,
 								threadsMax,
 								debug_lpf);
 					}
@@ -1703,8 +1753,8 @@ public class TwoQuadCLT {

 					if (debugLevel > 0){
 						sdfa_instance.showArrays(clt,
-								tilesX*clt_parameters.transform_size,
-								tilesY*clt_parameters.transform_size,
+								tilesX*image_dtt.transform_size,
+								tilesY*image_dtt.transform_size,
 								true,
 								results[iQuadComb].getTitle()+"-CLT-D"+clt_parameters.disparity);
 					}
@@ -1713,7 +1763,7 @@ public class TwoQuadCLT {
 				for (int chn=0; chn<iclt_data.length;chn++){
 					iclt_data[chn] = image_dtt.iclt_2d_debug_gpu(
 							clt_bidata[iAux][iSubCam][chn], // scanline representation of dcd data, organized as dct_size x dct_size tiles
-							clt_parameters.transform_size,  // final int
+//							image_dtt.transform_size,  // final int
 							clt_parameters.clt_window,      // window_type
 							15,                             // clt_parameters.iclt_mask,       //which of 4 to transform back
 							0,                              // clt_parameters.dbg_mode,        //which of 4 to transform back
@@ -1725,8 +1775,8 @@ public class TwoQuadCLT {
 				}

 				if (clt_parameters.gen_chn_stacks) sdfa_instance.showArrays(iclt_data,
-						(tilesX + 0) * clt_parameters.transform_size,
-						(tilesY + 0) * clt_parameters.transform_size,
+						(tilesX + 0) * image_dtt.transform_size,
+						(tilesY + 0) * image_dtt.transform_size,
 						true,
 						results[iQuadComb].getTitle()+"-ICLT-RGB-D"+clt_parameters.disparity);
 				if (!clt_parameters.gen_chn_img) continue;
@@ -1742,8 +1792,8 @@ public class TwoQuadCLT {
 						!batch_mode, // true, // boolean saveShowIntermediate, // save/show if set globally
 						false, // boolean saveShowFinal,        // save/show result (color image?)
 						iclt_data,
-						tilesX *  clt_parameters.transform_size,
-						tilesY *  clt_parameters.transform_size,
+						tilesX *  image_dtt.transform_size,
+						tilesY *  image_dtt.transform_size,
 						scaleExposures[iAux][iSubCam], // double scaleExposure, // is it needed?
 						debugLevel );
 			} // end of generating shifted channel images
@@ -1836,21 +1886,42 @@ public class TwoQuadCLT {
 			final int        threadsMax,  // maximal number of threads to launch
 			final boolean    updateStatus,
 			final int        debugLevel){
+// get fat_zero (absolute) and color scales
+		boolean is_mono = quadCLT_main.isMonochrome();
+		double    fat_zero = clt_parameters.getGpuFatZero(is_mono); //   30.0;
+		double [] scales = (is_mono) ? (new double [] {1.0}) :(new double [] {
+				clt_parameters.gpu_weight_r, // 0.25
+				clt_parameters.gpu_weight_b, // 0.25
+				1.0 - clt_parameters.gpu_weight_r - clt_parameters.gpu_weight_b}); // 0.5
+
+		ImageDtt image_dtt = new ImageDtt(
+				  clt_parameters.transform_size,
+				  is_mono,
+				  1.0);
+		float [][] lpf_rgb;
+		if (is_mono) {
+			lpf_rgb = new float[1][];
+			lpf_rgb[0] = image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_m);
+		} else {
+			lpf_rgb = new float[3][];
+			lpf_rgb[0] = image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_r);
+			lpf_rgb[1] = image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_b);
+			lpf_rgb[2] = image_dtt.floatGetCltLpfFd(clt_parameters.gpu_sigma_g);
+		}

 		gPUTileProcessor.setLpfRbg(
-				1.1f,  // float sigma_r,
-				1.1f,  // float sigma_b,
-				0.7f); // float sigma_g)
+				lpf_rgb);
+		float [] lpf_flat = image_dtt.floatGetCltLpfFd(clt_parameters.getGpuCorrSigma(is_mono));
+
+		gPUTileProcessor.setLpfCorr(
+				lpf_flat);


 		final boolean use_aux = false; // currently GPU is configured for a single quad camera

 		final boolean      batch_mode = clt_parameters.batch_run; //disable any debug images
-//		final boolean get_ers = !batch_mode;
-//		boolean infinity_corr = false;
-//		double [][] scaleExposures= {scaleExposures_main, scaleExposures_aux};
 		boolean toRGB=     quadCLT_main.correctionsParameters.toRGB;
-//		showDoubleFloatArrays sdfa_instance = new showDoubleFloatArrays(); // just for debugging? - TODO - move where it belongs
+
 		// may use this.StartTime to report intermediate steps execution times
 		String name=quadCLT_main.correctionsParameters.getModelName((String) imp_quad_main[0].getProperty("name"));
 		String path= (String) imp_quad_main[0].getProperty("path"); // Only for debug output
@@ -1888,6 +1959,8 @@ public class TwoQuadCLT {
 		// Set task clt_parameters.disparity
 		GPUTileProcessor.TpTask [] tp_tasks  = gPUTileProcessor.setFullFrameImages(
 	    		(float) clt_parameters.disparity,     // float                     target_disparity, // apply same disparity to all tiles
+	    		0xf, // int                       out_image, // from which tiles to generate image (currently 0/1)
+	    		0x3f, // int                       corr_mask,  // which correlation pairs to generate (maybe later - reduce size from 15x15)
 	    		!use_aux,                             // boolean                   use_master,
 	    		use_aux,                              // boolean                   use_aux,
 	    		quadCLT_main.getGeometryCorrection(), // final GeometryCorrection  geometryCorrection_main,
@@ -1895,44 +1968,55 @@ public class TwoQuadCLT {
 	    		null,                                 // final double [][][]       ers_delay,        // if not null - fill with tile center acquisition delay
 	    		threadsMax,                           // final int                 threadsMax,  // maximal number of threads to launch
 	    		debugLevel);                          // final int                 debugLevel)
+
 		gPUTileProcessor.setTasks(
 				tp_tasks, // TpTask [] tile_tasks,
 				use_aux); // boolean use_aux)

+
+		int [] corr_indices = gPUTileProcessor.getCorrTasks(
+				tp_tasks);
+		// corr_indices array of integers to be passed to GPU
+		gPUTileProcessor.setCorrIndices(corr_indices);
+
 		// All set, run kernel (correct and convert)
 		int NREPEAT = 1; // 00;
 		System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
 		long startGPU=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execConverCorrectTiles();
+
 		// run imclt;
-		long firstGPUTime= (System.nanoTime() - startGPU)/NREPEAT;
+		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execImcltRbg();
-		long runGPUTime = (System.nanoTime() - startGPU)/NREPEAT;
+		long endImcltTime = System.nanoTime();
+
+		long startCorr2d=System.nanoTime();   // System.nanoTime();
+
+		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execCorr2D(
+	    		scales,// double [] scales,
+	    		fat_zero); // double fat_zero);
+
+		long endCorr2d = System.nanoTime();
+
+		long endGPUTime = System.nanoTime();
+		long firstGPUTime= (startIMCLT- startGPU)/NREPEAT;
+		long runImcltTime = (endImcltTime - startIMCLT)/NREPEAT;
+		long runCorr2DTime = (endCorr2d - startCorr2d)/NREPEAT;
+		long runGPUTime = (endGPUTime - startGPU)/NREPEAT;
+		// run corr2d
+
 		System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
 		System.out.println("GPU run time ="+(runGPUTime * 1.0e-6)+"ms, (direct conversion: "+(firstGPUTime*1.0e-6)+"ms, imclt: "+
-				((runGPUTime - firstGPUTime)*1.0e-6)+"ms)");
-
+				(runImcltTime*1.0e-6)+"ms), corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
+		// get data back from GPU
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
 			iclt_fimg[ncam] = gPUTileProcessor.getRBG(ncam);
 		}
-		// get data back from GPU
-		String [] rgb_titles = {"red","blue","green"};
+
 		int out_width =  GPUTileProcessor.IMG_WIDTH +  GPUTileProcessor.DTT_SIZE;
 		int out_height = GPUTileProcessor.IMG_HEIGHT + GPUTileProcessor.DTT_SIZE;
-		/*
-		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
-			String title=name+"-RBG"+String.format("%02d", ncam);

-			(new ShowDoubleFloatArrays()).showArrays(
-					iclt_fimg[ncam],
-					out_width,
-					out_height,
-					true,
-					title,
-					rgb_titles);
-		}
-		 */
 		ImagePlus [] imps_RGB = new ImagePlus[iclt_fimg.length];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
 			String title=name+"-"+String.format("%02d", ncam);
@@ -1953,6 +2037,25 @@ public class TwoQuadCLT {
 					debugLevel );

 		}
+		float [][] corr2D = gPUTileProcessor.getCorr2D();
+// convert to 6-layer image		 using tasks
+		int tilesX =  GPUTileProcessor.IMG_WIDTH / GPUTileProcessor.DTT_SIZE;
+		int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
+		int [] wh = new int[2];
+		double [][] dbg_corr = gPUTileProcessor.getCorr2DView(
+	    		tilesX,
+	    		tilesY,
+	    		corr_indices,
+	    		corr2D,
+	    		wh);
+		(new ShowDoubleFloatArrays()).showArrays(
+				dbg_corr,
+				wh[0],
+				wh[1],
+				true,
+				"CORR2D",
+				gPUTileProcessor.getCorrTitles());
+
 		if (clt_parameters.gen_chn_img) {
 			// combine to a sliced color image
 			// assuming total number of images to be multiple of 4
@@ -2255,7 +2358,12 @@ public class TwoQuadCLT {
 					"MACRO-INPUT");
 		}

-		int macro_scale = clt_parameters.transform_size;
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_main.isMonochrome(),
+				clt_parameters.getScaleStrength(false));
+
+		int macro_scale = image_dtt.transform_size;
 		int mTilesX = tilesX/macro_scale;
 		int mTilesY = tilesY/macro_scale;
 		int [][] mtile_op = new int [mTilesY][mTilesX];
@@ -2266,7 +2374,6 @@ public class TwoQuadCLT {
 		}
 		double [][] mdisparity_array = new double [mTilesY][mTilesX]; // keep all zeros
 		double [][] mdisparity_bimap = new double [ImageDtt.BIDISPARITY_TITLES.length][];
-		ImageDtt image_dtt = new ImageDtt(quadCLT_main.isMonochrome(),clt_parameters.getScaleStrength(false));
 		image_dtt.clt_bi_macro(
 				clt_parameters,                       // final EyesisCorrectionParameters.CLTParameters       clt_parameters,
 				clt_parameters.getFatZero(image_dtt.isMonochrome()),              // final double              fatzero,         // May use correlation fat zero from 2 different parameters - fat_zero and rig.ml_fatzero
@@ -3155,7 +3262,10 @@ if (debugLevel > -100) return true; // temporarily !
 				}
 			}
 		}
-		ImageDtt image_dtt = new ImageDtt(quadCLT_main.isMonochrome(),clt_parameters.getScaleStrength(false));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_main.isMonochrome(),
+				clt_parameters.getScaleStrength(false));
 		image_dtt.clt_bi_quad (
 				clt_parameters,                       // final EyesisCorrectionParameters.CLTParameters       clt_parameters,
 				clt_parameters.getFatZero(image_dtt.isMonochrome()),              // final double              fatzero,         // May use correlation fat zero from 2 different parameters - fat_zero and rig.ml_fatzero
@@ -3175,7 +3285,7 @@ if (debugLevel > -100) return true; // temporarily !
 				null, // ml_data,                     // 	final double [][]         ml_data,         // data for ML - 10 layers - 4 center areas (3x3, 5x5,..) per camera-per direction, 1 - composite, and 1 with just 1 data (target disparity)
 				texture_tiles[0],                     // final double [][][][]     texture_tiles_main, // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
 				texture_tiles[1],                     // final double [][][][]     texture_tiles_aux,  // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
-				quadCLT_main.tp.getTilesX()*clt_parameters.transform_size, // final int                 width,
+				quadCLT_main.tp.getTilesX()*image_dtt.transform_size, // final int                 width,

 				quadCLT_main.getGeometryCorrection(), // final GeometryCorrection  geometryCorrection_main,
 				quadCLT_aux.getGeometryCorrection(),  // final GeometryCorrection  geometryCorrection_aux,
@@ -7232,7 +7342,10 @@ if (debugLevel > -100) return true; // temporarily !
 			final int                                threadsMax,  // maximal number of threads to launch
 			final boolean                            updateStatus,
 			final int                                debugLevel){
-		ImageDtt image_dtt = new ImageDtt(quadCLT_main.isMonochrome(),clt_parameters.getScaleStrength(false));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_main.isMonochrome(),
+				clt_parameters.getScaleStrength(false));

 		double [][] disparity_bimap  = new double [ImageDtt.BIDISPARITY_TITLES.length][]; //[0] -residual disparity, [1] - orthogonal (just for debugging) last 4 - max pixel differences

@@ -7255,7 +7368,7 @@ if (debugLevel > -100) return true; // temporarily !
 				ml_data,                              // 	final double [][]         ml_data,         // data for ML - 10 layers - 4 center areas (3x3, 5x5,..) per camera-per direction, 1 - composite, and 1 with just 1 data (target disparity)
 				null,                                 // final double [][][][]     texture_tiles_main, // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
 				null,                                 // final double [][][][]     texture_tiles_aux,  // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
-				quadCLT_main.tp.getTilesX()*clt_parameters.transform_size, // final int                 width,
+				quadCLT_main.tp.getTilesX()*image_dtt.transform_size, // final int                 width,

 				quadCLT_main.getGeometryCorrection(), // final GeometryCorrection  geometryCorrection_main,
 				quadCLT_aux.getGeometryCorrection(),  // final GeometryCorrection  geometryCorrection_aux,
@@ -7397,7 +7510,10 @@ if (debugLevel > -100) return true; // temporarily !
 			final int                                threadsMax,  // maximal number of threads to launch
 			final boolean                            updateStatus,
 			final int                                debugLevel){
-		ImageDtt image_dtt = new ImageDtt(quadCLT_aux.isMonochrome(),clt_parameters.getScaleStrength(true));
+		ImageDtt image_dtt = new ImageDtt(
+				clt_parameters.transform_size,
+				quadCLT_aux.isMonochrome(),
+				clt_parameters.getScaleStrength(true));
 		double [][] disparity_bimap  = new double [ImageDtt.BIDISPARITY_TITLES.length][]; //[0] -residual disparity, [1] - orthogonal (just for debugging) last 4 - max pixel differences
 		image_dtt.clt_bi_quad (
 				clt_parameters,                       // final EyesisCorrectionParameters.CLTParameters       clt_parameters,
@@ -7417,8 +7533,8 @@ if (debugLevel > -100) return true; // temporarily !
 				ml_data,                              // 	final double [][]         ml_data,         // data for ML - 10 layers - 4 center areas (3x3, 5x5,..) per camera-per direction, 1 - composite, and 1 with just 1 data (target disparity)
 				null,                                 // final double [][][][]     texture_tiles_main, // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
 				null,                                 // final double [][][][]     texture_tiles_aux,  // [tilesY][tilesX]["RGBA".length()][];  null - will skip images combining
-//				quadCLT_main.tp.getTilesX()*clt_parameters.transform_size, // final int                 width,
-				quadCLT_aux.tp.getTilesX()*clt_parameters.transform_size, // final int                 width,
+//				quadCLT_main.tp.getTilesX()*image_dtt.transform_size, // final int                 width,
+				quadCLT_aux.tp.getTilesX()*image_dtt.transform_size, // final int                 width,

 				null, // quadCLT_main.getGeometryCorrection(), // final GeometryCorrection  geometryCorrection_main,
 				quadCLT_aux.getGeometryCorrection(),  // final GeometryCorrection  geometryCorrection_aux,

--- a/src/main/resources/TileProcessor.cuh
+++ b/src/main/resources/TileProcessor.cuh
@@ -46,12 +46,19 @@
 #define KERNELS_HOR           164
 #define KERNELS_VERT          123
 #define NUM_CAMS                4
+#define NUM_PAIRS               6 // number of camera pairs that can be correlated (see pairs[][])
 #define NUM_COLORS              3
 #define KERNELS_LSTEP           4
 #define THREADS_PER_TILE        8
 #define TILES_PER_BLOCK         4
+#define CORR_THREADS_PER_TILE   8 // threads (threadIdx.x) cooperating on one correlation tile in correlate2D()
+#define CORR_TILES_PER_BLOCK    4 // correlation tiles (threadIdx.y) per thread block in correlate2D()
 #define IMCLT_THREADS_PER_TILE 16
 #define IMCLT_TILES_PER_BLOCK   4
+#define CORR_PAIR_SHIFT         8 // 8 lower bits - number of a pair, other bits tile number
+#define TASK_CORR_BITS          4
+#define CORR_OUT_RAD            7 // presumably the output radius (2*7+1 = 15 samples per side) - TODO confirm
+

 #endif
 //#define IMCLT14
@@ -106,6 +113,11 @@
 #define DTT_SIZE1        (DTT_SIZE + 1)
 #define DTT_SIZE2        (2 * DTT_SIZE)
 #define DTT_SIZE21       (DTT_SIZE2 + 1)
+#define DTT_SIZE4        (4 * DTT_SIZE)
+#define DTT_SIZE2M1      (DTT_SIZE2 - 1)
+
+// Use CORR_OUT_RAD for the correlation output
+

 #define BAYER_RED   0
 #define BAYER_BLUE  1
@@ -117,15 +129,16 @@
 //#define BAYER_BLUE_COL (1 - BAYER_RED_COL)


-#define DBG_TILE_X     174
-#define DBG_TILE_Y     118
+#define DBG_TILE_X     40
+#define DBG_TILE_Y     80

-//#define DBG_TILE     (DBG_TILE_Y * 324 + DBG_TILE_X)
+#define DBG_TILE     (DBG_TILE_Y * 324 + DBG_TILE_X)
 //#define DEBUG1 1
 //#define DEBUG2 1
 //#define DEBUG3 1
 //#define DEBUG4 1
 //#define DEBUG5 1
+#define DEBUG6 1
 //56494
 // struct tp_task
 //#define TASK_SIZE      12
@@ -311,6 +324,24 @@ __constant__ float lpf_data[3][64]={
 				0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
 				0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
 		}};
+__constant__ float lpf_corr[64]={ // default 8x8 LPF, multiplied into each quadrant of the correlation in correlate2D(); modify if needed
+				1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
+				0.87041007f, 0.75761368f, 0.57398049f, 0.37851747f, 0.21734206f, 0.10895863f, 0.04888546f, 0.02374977f,
+				0.65943687f, 0.57398049f, 0.43485698f, 0.28677101f, 0.16466189f, 0.08254883f, 0.03703642f, 0.01799322f,
+				0.43487258f, 0.37851747f, 0.28677101f, 0.18911416f, 0.10858801f, 0.05443770f, 0.02442406f, 0.01186582f,
+				0.24970076f, 0.21734206f, 0.16466189f, 0.10858801f, 0.06235047f, 0.03125774f, 0.01402412f, 0.00681327f,
+				0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
+				0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
+				0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
+		};
+
+__constant__ int pairs[6][2]={ // {first camera, second camera} for each pair number, indexed by corr_pair in correlate2D()
+		{0, 1},
+		{2, 3},
+		{0, 2},
+		{1, 3},
+		{0, 3},
+		{2, 1}};
 //#endif
 __device__ void convertCorrectTile(
 		struct CltExtra     * gpu_kernel_offsets, // [tileY][tileX][color]
@@ -333,32 +364,297 @@ __device__ void convertCorrectTile(
 	    float window_hor_sin  [2*DTT_SIZE],
 	    float window_vert_cos [2*DTT_SIZE]);

+__device__ void debug_print_lpf( // debug helper, called from correlate2D() with lpf_corr
+		float * lpf_tile);
+
+__device__ void debug_print_clt1( // debug helper, called from correlate2D() on the 4-quadrant CLT and correlation tiles
+		float * clt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+		const int color,
+		int mask);
+
+__device__ void debug_print_mclt(
+		float * mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+		const int color);
+__device__ void debug_print_corr_15x15( // debug helper, called from correlate2D() on the unfolded correlation tile
+		float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
+		const int color);
 // Fractional pixel shift (phase rotation), horizontal. In-place.
-__device__ void shiftTileHor(
+__device__ void shiftTileHor( // implemented, used
 		float * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float residual_shift                         );
 // Fractional pixel shift (phase rotation), vertical. In-place.
-__device__ void shiftTileVert(
+__device__ void shiftTileVert( // implemented, used
 		float *clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float residual_shift                         );
-__device__ void convolveTiles(
+__device__ void convolveTiles( // implemented, used
 		float* clt_tile, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
 		float* kernel); //      [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
-__device__ void imclt(
+__device__ void correlateAccumulateTiles( // called once per color from correlate2D()
+		float  scale,      //    scale correlation
+		float* clt_tile1,  //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
+		float* clt_tile2,  //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
+		float* corr_tile); //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+__device__ void resetCorrelation( // called in correlate2D() before the per-color loop
+		float* corr_tile); //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+__device__ void normalizeTileAmplitude( // called in correlate2D() after all colors are accumulated
+		float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+		float fat_zero);  // fat zero is absolute, scale it outside
+__device__ void corrUnfoldTile( // called in correlate2D() after the two DTT-II passes
+		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+		float* rslt);  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+__device__ void imclt(  // implemented, used // why is it twice?
 		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 		float * mclt_tile ); //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
-__device__ void imclt(
+__device__ void imclt(  // implemented, used // why is it twice?
 		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 		float * mclt_tile ); //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
-__device__ void imclt_plane(
+__device__ void imclt_plane( // not implemented, not used
 		int               color,
 		float           * gpu_clt,   // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, HEIGHT
 		const size_t      dstride);            // in floats (pixels)

+/**
+ * 2D phase correlation of camera pairs; each threadIdx.y of a block handles one (tile, pair).
+ * Per color: copy both cameras' CLT tiles to shared memory and accumulate their scaled
+ * correlation; then normalize the amplitude (fat_zero), multiply by lpf_corr, run the
+ * DCT-II/DST-II passes (dttii_shared_mem_nonortho) along columns and rows and unfold the
+ * 4 quadrants to a DTT_SIZE2M1 x DTT_SIZE2M1 tile stored at gpu_corrs + corr_stride * corr_num.
+ * Assumes blockDim.x == CORR_THREADS_PER_TILE (stride of the output copy loop) and
+ * blockDim.y <= CORR_TILES_PER_BLOCK (size of the shared arrays) - to be set by the host launch.
+ */
+extern "C"
+__global__ void correlate2D(
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][4*DTT_SIZE*DTT_SIZE] (4 quadrants per color, see offs below)
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float             fat_zero,           // here - absolute
+		size_t            num_corr_tiles,     // number of correlation tiles to process
+		int             * gpu_corr_indices,   // packed tile+pair
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs)          // correlation output data
+{
+	float scales[3] = {scale0, scale1, scale2};
+	int corr_in_block = threadIdx.y;
+	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
+	// NOTE(review): threads of out-of-range tiles exit here, before the __syncthreads() calls below,
+	// so the barriers are reached by only a part of the block - confirm this is safe on the target GPUs.
+	if (corr_num >= num_corr_tiles){
+		return; // nothing to do
+	}
+	// get number of pair and number of tile
+#define ALLTILES 1
+#ifdef ALLTILES
+	int corr_pair = corr_num % NUM_PAIRS;
+	int tile_num =  corr_num / NUM_PAIRS;
+#else
+	int corr_pair = gpu_corr_indices[corr_num];
+	int tile_num = corr_pair >> CORR_PAIR_SHIFT;
+#endif
+
+	corr_pair &= ((1 << CORR_PAIR_SHIFT) - 1); // keep only the pair number (CORR_PAIR_SHIFT lower bits)
+	if (corr_pair >= NUM_PAIRS){ // valid rows of pairs[] are 0 .. NUM_PAIRS-1
+		return; // BUG - should not happen
+	}
+	int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
+	int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
+    __syncthreads();// __syncwarp();
+    __shared__ float clt_tiles1  [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float clt_tiles2  [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float clt_corrs   [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float mlt_corrs   [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
+    // set clt_corr to all zeros
+    float * clt_corr =  ((float *) clt_corrs) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
+    float * mclt_corr = ((float *) mlt_corrs) +  corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
+    resetCorrelation(clt_corr);
+    for (int color = 0; color < colors; color++){
+        // copy clt (frequency domain data)
+        float * clt_tile1 = ((float *) clt_tiles1) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
+        float * clt_tile2 = ((float *) clt_tiles2) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
+        int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
+        float * gpu_tile1 = ((float *) gpu_clt[cam1]) + offs;
+        float * gpu_tile2 = ((float *) gpu_clt[cam2]) + offs;
+		float * clt_tile1i = clt_tile1 + threadIdx.x;
+		float * clt_tile2i = clt_tile2 + threadIdx.x;
+#pragma unroll
+		for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
+			*clt_tile1i= *gpu_tile1;
+			*clt_tile2i= *gpu_tile2;
+			clt_tile1i += DTT_SIZE1;
+			clt_tile2i += DTT_SIZE1;
+			gpu_tile1 += DTT_SIZE;
+			gpu_tile2 += DTT_SIZE;
+		}
+		__syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D tile = %d, pair=%d, color = %d   CAMERA1\n",tile_num, corr_pair,color);
+    	debug_print_clt1(clt_tile1, color,  0xf); //
+        printf("\ncorrelate2D tile = %d, pair=%d, color = %d   CAMERA22\n",tile_num, corr_pair,color);
+    	debug_print_clt1(clt_tile2, color,  0xf); //
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+		// each thread should get the same pointers here, offsets are inside
+        correlateAccumulateTiles(
+        		scales[color], // float  scale,     // scale correlation
+				clt_tile1, // float* clt_tile1, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
+				clt_tile2, // float* clt_tile2, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
+				clt_corr); // float* corr_tile) //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+    	__syncthreads();
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D, color = %d CORRELATION\n", color);
+    	debug_print_clt1(clt_corr, color,  0xf);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+    }
+    normalizeTileAmplitude(
+    		clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+			fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
+// Low Pass Filter from constant area (is it possible to replace?)
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
+    	debug_print_clt1(clt_corr, -1,  0xf);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D LPF\n");
+        debug_print_lpf(lpf_corr);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+
+
+    float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+    for (int q = 0; q < 4; q++){
+		float *lpf = lpf_corr + threadIdx.x;
+#pragma unroll
+    	for (int i = 0; i < DTT_SIZE; i++){
+    		(*clt) *= (*lpf);
+    		clt   += DTT_SIZE1;
+    		lpf   += DTT_SIZE;
+    	}
+    }
+    __syncthreads();// __syncwarp();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D CORRELATION LPF-ed\n");
+    	debug_print_clt1(clt_corr, -1,  0xf);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+
+// now new part - need to transform with DCT-II and make 15x15
+/*
+    //    quadrant 0 dct_ii hor, dct_ii vert,
+    //    quadrant 1 dct_ii hor, dst_ii vert,
+    //    quadrant 2 dst_ii hor, dct_ii vert,
+    //    quadrant 3 dst_ii hor, dst_ii vert,
+Java code:
+     	for (int quadrant = 0; quadrant < 4; quadrant++){
+    		int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
+    		tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
+    	}
+
+ */
+    // change to 16-32 threads?? in next iteration
+    // first pass: stride DTT_SIZE1, each thread transforms one column of every quadrant
+    for (int q = 0; q < 4; q++){
+    	int is_sin = (q >> 1) & 1;
+    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
+    }
+    __syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D AFTER VERTICAL (HORIZONTAL) PASS\n");
+    	debug_print_clt1(clt_corr, -1,  0xf);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+    // second pass: stride 1, each thread transforms one row of every quadrant
+    for (int q = 0; q < 4; q++){
+    	int is_sin = q & 1;
+    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, thread is row
+    }
+    __syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS\n");
+    	debug_print_clt1(clt_corr, -1,  0xf);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+    corrUnfoldTile(
+    		(float *) clt_corr,  // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+			(float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+
+    __syncthreads();
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D after UNFOLD\n");
+    	debug_print_corr_15x15(mclt_corr, -1);
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+    // copy 15x15 tile to main memory
+    size_t corr_tile_offset = corr_stride * corr_num; // size_t: no truncation of the float offset
+    float *mem_corr = gpu_corrs + corr_tile_offset;
+
+    // elements of the tile are interleaved between the CORR_THREADS_PER_TILE threads (threadIdx.x)
+#pragma unroll
+    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+    	mem_corr[offs] = mclt_corr[offs];
+    }
+    __syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+    if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+        printf("\ncorrelate2D after copy to main memory\n");
+    }
+     __syncthreads();// __syncwarp();
+#endif
+#endif
+
+}
+
+
 extern "C"
 __global__ void convert_correct_tiles(
-//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct paraeters
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 			float           ** gpu_kernel_offsets, // [NUM_CAMS],
 			float           ** gpu_kernels,        // [NUM_CAMS],
 			float           ** gpu_images,         // [NUM_CAMS],
@@ -367,9 +663,7 @@ __global__ void convert_correct_tiles(
 			size_t             dstride,            // in floats (pixels)
 			int                num_tiles,          // number of tiles in task
 			int                lpf_mask)           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
-
 {
-//	struct CltExtra* gpu_kernel_offsets = (struct CltExtra*) vgpu_kernel_offsets;
 	dim3 t = threadIdx;
 	int tile_in_block = threadIdx.y;
 	int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
@@ -379,8 +673,6 @@ __global__ void convert_correct_tiles(
 	__shared__ struct tp_task tt [TILES_PER_BLOCK];
 	// Copy task data to shared memory
 	tt[tile_in_block].task =          gpu_task -> task;
-//	tt[tile_in_block].tx =            gpu_task -> tx;
-//	tt[tile_in_block].ty =            gpu_task -> ty;
 	tt[tile_in_block].txy =           gpu_task -> txy;
 	int thread0 =  threadIdx.x & 1;
 	int thread12 = threadIdx.x >>1;
@@ -426,7 +718,6 @@ __global__ void convert_correct_tiles(
 					lpf_mask,                        // const int         lpf_mask,
 					tt[tile_in_block].xy[ncam][0],   // const float       centerX,
 					tt[tile_in_block].xy[ncam][1],   // const float       centerY,
-//					tt[tile_in_block].tx | (tt[tile_in_block].ty <<16), //  const int txy,
 					tt[tile_in_block].txy,           //  const int txy,
 					dstride,                         // size_t            dstride, // in floats (pixels)
 					(float * )(clt_tile [tile_in_block]),        // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
@@ -556,6 +847,191 @@ __device__ void convolveTiles(
 	}
 }

+/**
+ * Multiplies two CLT tiles element by element and adds the scaled products to corr_tile.
+ *
+ * Every tile element consists of four values, one per quadrant. The thread selected by
+ * threadIdx.x handles one row (row stride DTT_SIZE1) and visits DTT_SIZE columns of it.
+ * With a0..a3 taken from clt_tile1 and b0..b3 from clt_tile2 the accumulated terms are:
+ *   r0 =  a0*b0 + a1*b1 + a2*b2 + a3*b3
+ *   r1 = -a1*b0 + a0*b1 - a3*b2 + a2*b3
+ *   r2 = -a2*b0 - a3*b1 + a0*b2 + a1*b3
+ *   r3 =  a3*b0 - a2*b1 - a1*b2 + a0*b3
+ * corr_tile is only added to, never cleared here, so it has to hold valid data on entry.
+ */
+__device__ void correlateAccumulateTiles(
+		float  scale,     // scale correlation
+		float* clt_tile1, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
+		float* clt_tile2, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
+		float* corr_tile) //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+{
+	const int quad_len = DTT_SIZE1 * DTT_SIZE;        // distance (in floats) between quadrants
+	const int row_offs = threadIdx.x * DTT_SIZE1;     // start of this thread's row inside a quadrant
+//#pragma unroll
+	for (int col = 0; col < DTT_SIZE; col++) {
+		// offsets of the same element in quadrants 0..3
+		const int e0 = row_offs + col;
+		const int e1 = e0 + quad_len;
+		const int e2 = e1 + quad_len;
+		const int e3 = e2 + quad_len;
+
+		// all inputs of this element are read before any result is stored
+		const float a0 = clt_tile1[e0];
+		const float a1 = clt_tile1[e1];
+		const float a2 = clt_tile1[e2];
+		const float a3 = clt_tile1[e3];
+		const float b0 = clt_tile2[e0];
+		const float b1 = clt_tile2[e1];
+		const float b2 = clt_tile2[e2];
+		const float b3 = clt_tile2[e3];
+
+		// contribution of quadrant 0 of clt_tile2
+		float r0 =  a0 * b0;
+		float r1 = -a1 * b0;
+		float r2 = -a2 * b0;
+		float r3 =  a3 * b0;
+		// contribution of quadrant 1
+		r0 += a1 * b1;
+		r1 += a0 * b1;
+		r2 -= a3 * b1;
+		r3 -= a2 * b1;
+		// contribution of quadrant 2
+		r0 += a2 * b2;
+		r1 -= a3 * b2;
+		r2 += a0 * b2;
+		r3 -= a1 * b2;
+		// contribution of quadrant 3
+		r0 += a3 * b3;
+		r1 += a2 * b3;
+		r2 += a1 * b3;
+		r3 += a0 * b3;
+
+		corr_tile[e0] += scale * r0;
+		corr_tile[e1] += scale * r1;
+		corr_tile[e2] += scale * r2;
+		corr_tile[e3] += scale * r3;
+	}
+}
+
+/**
+ * Clears a correlation tile before accumulation.
+ *
+ * The thread selected by threadIdx.x zeroes the first DTT_SIZE elements of its own row in each
+ * of the four quadrants; the extra padding element of a DTT_SIZE1-long row is left untouched.
+ */
+__device__ void resetCorrelation(
+		float* corr_tile) //    [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
+{
+	const int quad_len = DTT_SIZE1 * DTT_SIZE;    // distance (in floats) between quadrants
+	const int row_offs = threadIdx.x * DTT_SIZE1; // start of this thread's row inside a quadrant
+//#pragma unroll
+	for (int col = 0; col < DTT_SIZE; col++) {
+		for (int quad = 0; quad < 4; quad++) {
+			corr_tile[quad * quad_len + row_offs + col] = 0;
+		}
+	}
+}
+
+/**
+ * Normalizes the amplitude of every element of a CLT tile in place.
+ *
+ * For each element the four quadrant values are divided by
+ * sqrt(fat_zero^2 + q0^2 + q1^2 + q2^2 + q3^2). The thread selected by threadIdx.x handles one
+ * row (row stride DTT_SIZE1) and DTT_SIZE columns of it.
+ * fat_zero is used as given (absolute); any scaling has to be applied by the caller.
+ */
+__device__ void normalizeTileAmplitude(
+		float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+		float fat_zero ) // fat zero is absolute, scale it outside
+{
+	const int quad_len = DTT_SIZE1 * DTT_SIZE;    // distance (in floats) between quadrants
+	const int row_offs = threadIdx.x * DTT_SIZE1; // start of this thread's row inside a quadrant
+	float * q0 = clt_tile + row_offs;
+	float * q1 = q0 + quad_len;
+	float * q2 = q1 + quad_len;
+	float * q3 = q2 + quad_len;
+#pragma unroll
+	for (int col = 0; col < DTT_SIZE; col++) {
+		const float v0 = q0[col];
+		const float v1 = q1[col];
+		const float v2 = q2[col];
+		const float v3 = q3[col];
+		const float s2 = fat_zero * fat_zero +
+				v0 * v0 +
+				v1 * v1 +
+				v2 * v2 +
+				v3 * v3;
+		const float inv_amp = rsqrtf(s2); // 1.0/sqrt(s2)
+		q0[col] = v0 * inv_amp;
+		q1[col] = v1 * inv_amp;
+		q2[col] = v2 * inv_amp;
+		q3[col] = v3 * inv_amp;
+	}
+}
+/*
+Converted from DttRad2.java:443
+	public  double [] corr_unfold_tile(
+		double [][]  qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
+		int          transform_size
+	)
+
+Unfolds the four quadrants of a tile into one DTT_SIZE2M1 x DTT_SIZE2M1 result.
+The thread selected by threadIdx.x (called "row" below) produces two result rows placed
+symmetrically around the element at index DTT_SIZE2M1 * DTT_SIZE - DTT_SIZE: the one "row" rows
+after it and the one "row" rows before it (the same single row for row == 0).
+Phase 1 uses quadrants 0 and 1 and is all that row 0 needs. Phase 2 (row > 0 only) adds the
+contribution of quadrants 2 and 3, read from the previous source row, and at the same time
+derives the mirrored result row from the partial sums left by phase 1.
+ */
+__device__ void corrUnfoldTile(
+		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+		float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+{
+	const float pix_scale = 0.25f;
+	const int   quad_len  = DTT_SIZE * DTT_SIZE1; // distance (in floats) between quadrants
+	float * q0 = qdata0;
+	float * q1 = q0 + quad_len;
+	float * q2 = q1 + quad_len;
+	float * q3 = q2 + quad_len;
+
+	const int row       = threadIdx.x;
+	const int center    = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // index the output is mirrored around
+	const int src_row   = row * DTT_SIZE1;                     // source rows are DTT_SIZE1 long
+	const int dst_plus  = center + row * DTT_SIZE2M1;          // middle of the result row after the center
+	const int dst_minus = center - row * DTT_SIZE2M1;          // middle of the mirrored row (same as dst_plus for row 0)
+
+	// Phase 1: quadrants 0 and 1. Final for row 0, partial sums for all other rows.
+	rslt[dst_plus]  = pix_scale * q0[src_row];
+	rslt[dst_minus] = rslt[dst_plus];
+	for (int j = 1; j < DTT_SIZE; j++) {
+		rslt[dst_plus + j] = pix_scale * (q0[src_row + j] + q1[src_row + j - 1]);
+		rslt[dst_plus - j] = pix_scale * (q0[src_row + j] - q1[src_row + j - 1]);
+	}
+
+	// Phase 2: quadrants 2 and 3, taken from the previous source row (does not exist for row 0).
+	if (row != 0) {
+		const int prev_row = src_row - DTT_SIZE1;
+		const float d = pix_scale * q2[prev_row];
+		rslt[dst_plus]  += d;
+		rslt[dst_minus] -= d;
+		for (int j = 1; j < DTT_SIZE; j++) {
+			const float d2 = pix_scale * q2[prev_row + j];
+			const float d3 = pix_scale * q3[prev_row + j - 1];
+			// The mirrored row is built from the phase-1 values still stored in the "plus" row,
+			// so it has to be written before the "plus" row itself is updated.
+			rslt[dst_minus + j] = rslt[dst_plus + j] - d2 - d3;
+			rslt[dst_minus - j] = rslt[dst_plus - j] - d2 + d3;
+			rslt[dst_plus + j] += d2 + d3;
+			rslt[dst_plus - j] += d2 - d3;
+		}
+	}
+}
+
+/**
+ * Debug helper: prints DTT_SIZE x DTT_SIZE consecutive floats from lpf_tile, one text line per
+ * group of DTT_SIZE values.
+ */
+__device__ void debug_print_lpf(
+		float * lpf_tile)
+{
+	for (int idx = 0; idx < DTT_SIZE * DTT_SIZE; idx++) {
+		printf ("%10.5f ", lpf_tile[idx]);
+		if ((idx % DTT_SIZE) == (DTT_SIZE - 1)) { // last value of a row - terminate the line
+			printf("\n");
+		}
+	}
+}
+
+
 __device__ void debug_print_clt1(
 		float * clt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 		const int color,
@@ -591,6 +1067,23 @@ __device__ void debug_print_mclt(
 	printf("\n");
 }

+/**
+ * Debug helper: prints a DTT_SIZE2M1 x DTT_SIZE2M1 tile stored as consecutive rows, followed by
+ * an empty line. A header with the color number is printed first unless color is negative.
+ */
+__device__ void debug_print_corr_15x15(
+		float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
+		const int color)
+{
+	if (color >= 0) {
+		printf("----------- Color = %d -----------\n",color);
+	}
+	const float * row_data = mclt_tile;
+	for (int row = 0; row < DTT_SIZE2M1; row++, row_data += DTT_SIZE2M1) {
+		for (int col = 0; col < DTT_SIZE2M1; col++) {
+			printf ("%10.5f ", row_data[col]);
+		}
+		printf("\n");
+	}
+	printf("\n");
+}
+
+
+
 __device__ void convertCorrectTile(
 		struct CltExtra     * gpu_kernel_offsets, // [tileY][tileX][color]
 		float               * gpu_kernels,        // [tileY][tileX][color]
@@ -1361,7 +1854,7 @@ __device__ void imclt_plane(

 //
 // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
-// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Shuld be zeroed before the
+// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame). Should be zeroed before the
 // first pass
 //__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
 __device__ void imclt(

--- a/src/main/resources/dtt8x8.cuh
+++ b/src/main/resources/dtt8x8.cuh
@@ -36,10 +36,10 @@
 * \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
 *        in GPU
 * This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
-* imlementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
+* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
 * results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
 * here it is used to convert convolution kernels and LPF to the frequency domain - done in
-* softwaer.
+* software.
 *
 * This file is compatible with both runtime and driver API, runtime is used for development
 * with Nvidia Nsight, driver API when calling these kernels from Java
@@ -84,23 +84,24 @@ __constant__ float SINN1[] = {0.195090f,0.555570f};
 __constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f};


-inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct);
-inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct);
-inline __device__ void dttiv_nodiverg(float * x,  int inc, int dst_not_dct);
-inline __device__ void dctiv_nodiverg(float * x0,  int inc);
-inline __device__ void dstiv_nodiverg(float * x0,  int inc);
+inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct); // does not scale y[0] (y[7] for DST) by 1/sqrt(2)
+inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dttiv_nodiverg  (float * x,   int inc, int dst_not_dct);   // not used
+inline __device__ void dctiv_nodiverg  (float * x0,  int inc);                    // used in TP
+inline __device__ void dstiv_nodiverg  (float * x0,  int inc);                    // used in TP

-inline __device__ void dct_ii8         ( float x[8], float y[8]); // x,y point to 8-element arrays each
-inline __device__ void dct_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each
-inline __device__ void dst_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each
-inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
-inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each
+inline __device__ void dct_ii8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
+inline __device__ void dct_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
+inline __device__ void dst_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
+inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
+inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used


 /**
 **************************************************************************
 *  Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
-*  Mostly for testing and profiling individual converions
+*  Mostly for testing and profiling individual conversions
 *
 * \param dst                        [OUT] - Coefficients as 8x8 tiles
 * \param src                         [IN] - Source image of floats
@@ -376,6 +377,88 @@ inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct)
 	}
 }

+inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct)
+{ /* In-place 8-point DCT-II (dst_not_dct == 0) or DST-II (dst_not_dct != 0), "non-orthogonal"
+   * variant.
+   *
+   * x0          - first of the 8 elements to transform; results overwrite the inputs
+   * inc         - distance (in floats) between consecutive elements
+   * dst_not_dct - non-zero selects DST-II, zero selects DCT-II
+   *
+   * All eight inputs are read before the first output is stored.
+   * The DST branch reuses the DCT network: odd-numbered inputs are negated on the way in and the
+   * eight results are stored in reverse order on the way out.
+   * The output written to x0 (DCT) / x7 (DST) is multiplied by 0.5f instead of SQRT1_8, i.e. the
+   * additional 1/sqrt(2) is not applied to it - see the "no 1/sqrt(2)" notes below.
+   * NOTE(review): this assumes SQRT1_8 is 1/sqrt(8); confirm against its definition.
+   * COSPI_1_8_SQRT2, COSPI_3_8_SQRT2, SQRT1_8, COSN1 and SINN1 are defined outside this function.
+   */
+	float *x1 = x0 + inc;
+	float *x2 = x1 + inc;
+	float *x3 = x2 + inc;
+	float *x4 = x3 + inc;
+	float *x5 = x4 + inc;
+	float *x6 = x5 + inc;
+	float *x7 = x6 + inc;
+	float u00, u01, u02, u03, u10, u11, u12, u13; // first butterfly stage: pairs (x0,x7), (x1,x6), (x2,x5), (x3,x4)
+	if (dst_not_dct) { // DSTII
+		// invert odd input samples (x1, x3, x5, x7 enter with the opposite sign compared to the DCTII branch)
+		u00= ( (*x0) - (*x7));
+		u10= ( (*x0) + (*x7));
+
+		u01= (-(*x1) + (*x6));
+		u11= (-(*x1) - (*x6));
+
+		u02= ( (*x2) - (*x5));
+		u12= ( (*x2) + (*x5));
+
+		u03= (-(*x3) + (*x4));
+		u13= (-(*x3) - (*x4));
+	} else { // DCTII: u0x are the pair sums, u1x the pair differences
+		u00= ( (*x0) + (*x7));
+		u10= ( (*x0) - (*x7));
+
+		u01= ( (*x1) + (*x6));
+		u11= ( (*x1) - (*x6));
+
+		u02= ( (*x2) + (*x5));
+		u12= ( (*x2) - (*x5));
+
+		u03= ( (*x3) + (*x4));
+		u13= ( (*x3) - (*x4));
+	}
+	//	_dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03); // written out inline below, operates on u00..u03
+
+		float w00= u00 + u03;
+		float w10= u00 - u03;
+
+		float w01= (u01 + u02);
+		float w11= (u01 - u02);
+
+		float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
+		float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
+	//	_dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13); // written out inline below, operates on u10..u13
+		float w20=            ( COSN1[0] * u10 + SINN1[0] * u13);
+		float w30=            (-SINN1[1] * u11 + COSN1[1] * u12);
+
+		float w21=            ( COSN1[1] * u11 + SINN1[1] * u12);
+		float w31=           -(-SINN1[0] * u10 + COSN1[0] * u13);
+		float v11 = w20 - w21 - w30 + w31;
+		float v12 = w20 - w21 + w30 - w31;
+
+	if (dst_not_dct) { // DSTII
+		// Invert output sequence (same eight values as in the DCTII branch, stored to x7..x0 instead of x0..x7)
+		*x0 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+		*x1 =   v03 *         SQRT1_8;
+
+		*x2 =   v12 *         SQRT1_8;
+		*x3 =   (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
+
+		*x4 =   v11 *         SQRT1_8;
+		*x5 =   v01 *         SQRT1_8;
+
+		*x6 =   (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
+		*x7 =   (w00 + w01) * 0.5f;    // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
+	} else {
+		*x0 =   (w00 + w01) * 0.5f;    // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
+		*x1 =   (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
+
+		*x2 =   v01 *         SQRT1_8;
+		*x3 =   v11 *         SQRT1_8;
+
+		*x4 =   (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
+		*x5 =   v12 *         SQRT1_8;
+
+		*x6 =   v03 *         SQRT1_8;
+		*x7 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+	}
+}

 inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct)
 {