Commit 08bccd0d authored by Andrey Filippov

Implementing moving objects masking for ERS fitting; this is a debug version for GPU geometry correction
parent fabadb74
......@@ -29,23 +29,70 @@ package com.elphel.imagej.common;
public class DoubleGaussianBlur {
/* the standard deviation of the Gaussian*/
// private static double sigma = 2.0;
/* whether sigma is given in units corresponding to the pixel scale (not pixels)*/
// private static boolean sigmaScaled = false;
/* The flags specifying the capabilities and needs */
// private int flags = DOES_ALL|SUPPORTS_MASKING|PARALLELIZE_STACKS|KEEP_PREVIEW;
// private ImagePlus imp; // The ImagePlus of the setup call, needed to get the spatial calibration
// private boolean hasScale = false; // whether the image has an x&y scale
private int nPasses = 1; // The number of passes (filter directions * color channels * stack slices)
// private int nChannels = 1; // The number of color channels
private int pass; // Current pass
/* Default constructor */
public DoubleGaussianBlur() {
}
public double [] blurWithNaN(
double[] pixels,
double [] in_weight, // or null
int width,
int height,
double sigmaX,
double sigmaY,
double accuracy
) {
double [] weight;
double [] blured = new double[pixels.length];
if (in_weight == null) {
weight = new double [pixels.length];
for (int i = 0; i < weight.length; i++) {
weight[i] = 1.0;
}
} else {
weight = in_weight.clone();
}
for (int i = 0; i < pixels.length; i++) {
if (Double.isNaN(pixels[i])) {
weight[i] = 0.0;
} else {
blured[i] = pixels[i] * weight[i];
}
}
blurDouble(
blured,
width,
height,
sigmaX,
sigmaY,
accuracy);
blurDouble(
weight,
width,
height,
sigmaX,
sigmaY,
accuracy);
for (int i = 0; i < pixels.length; i++) {
blured[i] /= weight[i];
}
return blured;
}
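// Illustrative sketch (not part of this commit): blurWithNaN() above is a normalized-convolution
// blur - NaN samples get zero weight, data*weight and the weight plane are blurred separately,
// and the final division restores correct amplitudes next to gaps (0/0 naturally yields NaN
// where every sample within reach was masked). The same idea in a minimal, self-contained 1-D
// form, using a 3-tap box filter instead of the Gaussian:
private static double [] boxBlurWithNaN1D(double [] data) {
double [] v = new double [data.length]; // data with NaNs replaced by 0
double [] w = new double [data.length]; // weight: 1.0 for valid samples, 0.0 for NaN
for (int i = 0; i < data.length; i++) {
if (!Double.isNaN(data[i])) {
v[i] = data[i];
w[i] = 1.0;
}
}
double [] out = new double [data.length];
for (int i = 0; i < data.length; i++) {
double sv = 0.0, sw = 0.0;
for (int k = -1; k <= 1; k++) { // 3-tap box "blur" of both planes
int j = i + k;
if ((j >= 0) && (j < data.length)) {
sv += v[j];
sw += w[j];
}
}
out[i] = (sw > 0.0) ? (sv / sw) : Double.NaN; // normalize, keep NaN where support is empty
}
return out;
}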
public void blurDouble(double[] pixels,
public void blurDouble(
double[] pixels,
int width,
int height,
double sigmaX,
......@@ -73,31 +120,31 @@ public class DoubleGaussianBlur {
double sigma,
double accuracy,
boolean xDirection
// int extraLines
) {
final int UPSCALE_K_RADIUS = 2; //number of pixels to add for upscaling
final double MIN_DOWNSCALED_SIGMA = 4.; //minimum standard deviation in the downscaled image
// float[] pixels = (float[])ip.getPixels();
// int width = ip.getWidth();
// int height = ip.getHeight();
// Rectangle roi = ip.getRoi();
int length = xDirection ? width : height; //number of points per line (line can be a row or column)
int pointInc = xDirection ? 1 : width; //increment of the pixels array index to the next point in a line
int lineInc = xDirection ? width : 1; //increment of the pixels array index to the next line
// int lineFrom = (xDirection ? roi.y : roi.x) - extraLines; //the first line to process
// if (lineFrom < 0) lineFrom = 0;
int lineFrom = 0; //the first line to process
// int lineTo = (xDirection ? roi.y+roi.height : roi.x+roi.width) + extraLines; //the last line+1 to process
// if (lineTo > (xDirection ? height:width)) lineTo = (xDirection ? height:width);
int lineTo = (xDirection ? height:width);
// int writeFrom = xDirection? roi.x : roi.y; //first point of a line that needs to be written
// int writeTo = xDirection ? roi.x+roi.width : roi.y+roi.height;
int writeFrom = 0; //first point of a line that needs to be written
int writeTo = xDirection ? width : height;
// int inc = Math.max((lineTo-lineFrom)/(100/(nPasses>0?nPasses:1)+1),20);
pass++;
if (pass>nPasses) pass =1;
// Thread thread = Thread.currentThread(); // needed to check for interrupted state
if (sigma > 2*MIN_DOWNSCALED_SIGMA + 0.5) {
/* large radius (sigma): scale down, then convolve, then scale up */
int reduceBy = (int)Math.floor(sigma/MIN_DOWNSCALED_SIGMA); //downscale by this factor
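// General property (not from this commit): Gaussian blurs compose by adding variances -
// blurring with sigma1 and then sigma2 equals a single blur with sqrt(sigma1^2 + sigma2^2).
// That is what lets the large-sigma path work at reduced resolution: the downscaled line is
// blurred with roughly sigma/reduceBy (slightly less, to account for the smoothing already
// introduced by the downscale/upscale kernels) and still matches the requested sigma overall.
// E.g. for sigma = 16 and reduceBy = 4 most of the work happens at about sigma 4 on a line
// downscaled 4x.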
......
......@@ -106,6 +106,7 @@ public class GPUTileProcessor {
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALCULATE_TILES_OFFSETS = "calculate_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALC_REVERSE_DISTORTION = "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction
// pass some defines to gpu source code with #ifdef JCUDA
......@@ -168,7 +169,8 @@ public class GPUTileProcessor {
private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction GPU_SET_TILES_OFFSETS_kernel = null;
// private CUfunction GPU_SET_TILES_OFFSETS_kernel = null;
private CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null;
private CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;
CUmodule module; // to access constants memory
......@@ -393,7 +395,8 @@ public class GPUTileProcessor {
GPU_TEXTURES_NAME,
GPU_RBGA_NAME,
GPU_ROT_DERIV,
GPU_SET_TILES_OFFSETS,
// GPU_SET_TILES_OFFSETS,
GPU_CALCULATE_TILES_OFFSETS,
GPU_CALC_REVERSE_DISTORTION
};
CUfunction[] functions = createFunctions(kernelSources,
......@@ -408,7 +411,8 @@ public class GPUTileProcessor {
GPU_TEXTURES_kernel= functions[5];
GPU_RBGA_kernel= functions[6];
GPU_ROT_DERIV_kernel = functions[7];
GPU_SET_TILES_OFFSETS_kernel = functions[8];
// GPU_SET_TILES_OFFSETS_kernel = functions[8];
GPU_CALCULATE_TILES_OFFSETS_kernel = functions[8];
GPU_CALC_REVERSE_DISTORTION_kernel = functions[9];
System.out.println("GPU kernel functions initialized");
......@@ -420,7 +424,8 @@ public class GPUTileProcessor {
System.out.println(GPU_TEXTURES_kernel.toString());
System.out.println(GPU_RBGA_kernel.toString());
System.out.println(GPU_ROT_DERIV_kernel.toString());
System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
// System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
System.out.println(GPU_CALCULATE_TILES_OFFSETS_kernel.toString());
System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());
// GPU data structures are now initialized through GpuQuad instances
......@@ -1603,7 +1608,8 @@ public class GPUTileProcessor {
/**
* Calculate tiles offsets (before each direct conversion run)
*/
public void execSetTilesOffsets() {
/*
public void execSetTilesOffsetsOld() {
execCalcReverseDistortions(); // will check if it is needed first
execRotDerivs(); // will check if it is needed first
if (GPU_SET_TILES_OFFSETS_kernel == null)
......@@ -1612,7 +1618,7 @@ public class GPUTileProcessor {
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {(num_task_tiles + TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1}; // round up
int [] GridFullWarps = {(num_task_tiles + 2 * TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1}; // round up
int [] ThreadsFullWarps = {num_cams, TILES_PER_BLOCK_GEOM, 1}; // 4,8,1
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
......@@ -1632,6 +1638,42 @@ public class GPUTileProcessor {
System.out.println("======execSetTilesOffsets()");
}
}
*/
public void execSetTilesOffsets() {
execCalcReverseDistortions(); // will check if it is needed first
execRotDerivs(); // will check if it is needed first
if (GPU_CALCULATE_TILES_OFFSETS_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CALCULATE_TILES_OFFSETS_kernel");
return;
}
if (gpu_debug_level > -1) {
System.out.println("num_task_tiles="+num_task_tiles);
}
// kernel parameters: pointer to pointers
// int [] GridFullWarps = {(num_task_tiles + 2 * TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1}; // round up
// int [] ThreadsFullWarps = {num_cams, TILES_PER_BLOCK_GEOM, 1}; // 4,8,1
int [] GridFullWarps = {1, 1, 1}; // round up
int [] ThreadsFullWarps = {1, 1, 1}; // 4,8,1
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }),// int num_tiles, // number of tiles in task list
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(gpu_correction_vector), // struct corr_vector * gpu_correction_vector,
Pointer.to(gpu_rByRDist), // float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer.to(gpu_rot_deriv)); // trot_deriv * gpu_rot_deriv);
cuCtxSynchronize();
cuLaunchKernel(GPU_CALCULATE_TILES_OFFSETS_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later
if (gpu_debug_level > -1) {
System.out.println("======execSetTilesOffsets()");
}
}
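// Note on the launch geometry (assumption, not visible from this Java diff): the old
// execSetTilesOffsetsOld() sized the launch on the host - one block per TILES_PER_BLOCK_GEOM
// task tiles, num_cams x TILES_PER_BLOCK_GEOM threads per block - while the new
// "calculate_tiles_offsets" kernel is launched as a single 1x1x1 grid/block and receives
// num_task_tiles as a parameter, so the per-tile dispatch presumably happens on the CUDA side
// (an internal loop or a dynamic-parallelism child launch). For reference, the conventional
// host-side round-up the old path used (hypothetical helper, illustrative only):
private static int geomGridBlocks(int numTaskTiles, int tilesPerBlockGeom) {
return (numTaskTiles + tilesPerBlockGeom - 1) / tilesPerBlockGeom; // ceil(numTaskTiles/tilesPerBlockGeom)
}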
/**
* Direct CLT conversion and aberration correction
......
package com.elphel.imagej.tileprocessor;
import com.elphel.imagej.common.DoubleGaussianBlur;
import com.elphel.imagej.common.ShowDoubleFloatArrays;
import Jama.Matrix;
......@@ -81,7 +82,7 @@ public class ExtrinsicAdjustment {
public int clustersX;
public int clustersY;
public double dbg_delta = 1.0E-5; // if not null - use delta instead of the derivatives in getJacobianTransposed
public double dbg_delta = 0; // 1.0E-5; // if not 0 - use delta instead of the derivatives in getJacobianTransposed
public double [] getOldNewRMS() {
double [] on_rms = new double[2];
......@@ -272,7 +273,7 @@ public class ExtrinsicAdjustment {
double min_fg_disp = 0.0; // 25.0; // minimal disparity to boost foreground objects
double min_rel_over = 0.25; // minimal relative disparity over average for a row to boost foreground objects
int min_num_fg = min_num_forced;
double fb_boost_fraction = 0.5;
double fg_boost_fraction = 0.5;
if (min_fg_disp > 0.0) {
boolean [] select_ers = selectERS(
measured_dsxy, // double [][] measured_dsxy,
......@@ -283,10 +284,62 @@ public class ExtrinsicAdjustment {
this.weights, // double [] weights, // will be updated
select_ers, // boolean [] fg,
min_num_fg, // int min_num_fg,
fb_boost_fraction); // double fg_boost_fraction)
fg_boost_fraction); // double fg_boost_fraction)
}
double max_ers_disparity = 175.0;
limitDisparity(
measured_dsxy, // double [][] measured_dsxy,
this.weights, // double [] weights, // will be updated
max_ers_disparity); // double max_disparity
// remove moving objects here
if (debugLevel > -10) { // temporary
dbgYminusFxWeight(
this.last_ymfx,
this.weights, // will use current weights
"Initial y-fX");
double [][] ers_tilt_az = getMovingObjects(
corr_vector, // GeometryCorrection.CorrVector corr_vector,
null ); // double [] fx
showMovingObjects(
"Moving_objects", // String title,
ers_tilt_az);
double moving_sigma = 1.0;
double [] ers_blured = showMovingObjects(
"Moving_objects_sigma"+moving_sigma, // String title,
ers_tilt_az,
moving_sigma, // double sigmaX,
0.01); // double accuracy
System.out.println("Moving objects");
double mov_avg = 0.0;
int clusters = clustersX * clustersY;
for (int cluster = 0; cluster < clusters; cluster++) {
if (!Double.isNaN(ers_blured[cluster])) mov_avg += ers_blured[cluster];
}
mov_avg /= clusters;
System.out.println("Average moving object detection value = "+mov_avg+
" (use to abandon detection if too high?)");
// double min_l2 = 1.0 * mov_avg;
double min_l2 = 1.0 * mov_avg;
boolean [] moving_maybe = extractMovingObjects(
"Moving_objects_filtering", // String title,
ers_blured, //double [] ers_blured, // has NaNs
min_l2, // double min_l2); // minimal value (can use average or fraction of it?
4, // int shrink, // prevents growing outer border (==4)
2); // int grow) { // located moving areas
int num_moving = 0;
for (int cluster = 0; cluster < clusters; cluster++) {
if (moving_maybe[cluster]) num_moving++;
}
System.out.println("Number of clusters to remove from LMA fitting = "+num_moving);
System.out.println();
blockSelectedClusters(
measured_dsxy, // double [][] measured_dsxy,
this.weights, // double [] weights, // will be updated
moving_maybe); // boolean [] prohibit
}
double lambda = 0.1;
double lambda_scale_good = 0.5;
......@@ -606,6 +659,78 @@ public class ExtrinsicAdjustment {
return;
}
private void limitDisparity(
double [][] measured_dsxy,
double [] weights,
double max_disparity
) {
double sw = 0.0;
double sw_near = 0.0;
int clusters = clustersX * clustersY;
for (int cluster = 0; cluster < clusters; cluster++) if (measured_dsxy[cluster] != null){
double disparity = measured_dsxy[cluster][ExtrinsicAdjustment.INDX_DISP];
if (disparity <= max_disparity) {
for (int i = 0; i < POINTS_SAMPLE; i++) {
sw_near += weights[cluster * POINTS_SAMPLE + i];
}
}
for (int i = 0; i < POINTS_SAMPLE; i++) {
sw += weights[cluster * POINTS_SAMPLE + i];
}
}
if (sw_near > 0.0) {
double wboost = sw/sw_near;
for (int cluster = 0; cluster < clusters; cluster++) if (measured_dsxy[cluster] != null){
double disparity = measured_dsxy[cluster][ExtrinsicAdjustment.INDX_DISP];
if (disparity <= max_disparity) {
for (int i = 0; i < POINTS_SAMPLE; i++) {
weights[cluster * POINTS_SAMPLE + i] *= wboost;
}
} else {
for (int i = 0; i < POINTS_SAMPLE; i++) {
weights[cluster * POINTS_SAMPLE + i] = 0;
}
}
}
}
}
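// Worked example (illustrative numbers): limitDisparity() zeroes the weights of clusters
// farther than max_disparity and multiplies the survivors by wboost = sw / sw_near, so the
// total weight is preserved. If the rejected far clusters held 20% of the weight, the
// remaining weights are scaled by 1.0 / 0.8 = 1.25 and the sum stays unchanged.
// blockSelectedClusters() below applies the same renormalization, driven by a boolean
// "prohibit" mask (the detected moving clusters) instead of a disparity threshold.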
private void blockSelectedClusters(
double [][] measured_dsxy,
double [] weights,
boolean [] prohibit
) {
double sw = 0.0;
double sw_en = 0.0;
int clusters = clustersX * clustersY;
for (int cluster = 0; cluster < clusters; cluster++) if (measured_dsxy[cluster] != null){
if (!prohibit[cluster]) {
for (int i = 0; i < POINTS_SAMPLE; i++) {
sw_en += weights[cluster * POINTS_SAMPLE + i];
}
}
for (int i = 0; i < POINTS_SAMPLE; i++) {
sw += weights[cluster * POINTS_SAMPLE + i];
}
}
if (sw_en > 0.0) {
double wboost = sw/sw_en;
for (int cluster = 0; cluster < clusters; cluster++) if (measured_dsxy[cluster] != null){
if (!prohibit[cluster]) {
for (int i = 0; i < POINTS_SAMPLE; i++) {
weights[cluster * POINTS_SAMPLE + i] *= wboost;
}
} else {
for (int i = 0; i < POINTS_SAMPLE; i++) {
weights[cluster * POINTS_SAMPLE + i] = 0;
}
}
}
}
}
private double [] getWeights(
......@@ -683,10 +808,14 @@ public class ExtrinsicAdjustment {
Matrix [][] deriv_rots = corr_vector.getRotDeriveMatrices();
double [] imu = corr_vector.getIMU(); // i)
double [] y_minus_fx = new double [clusters * POINTS_SAMPLE];
for (int cluster = 0; cluster < clusters; cluster++) if (measured_dsxy[cluster] != null){
if ((cluster == 1735 ) || (cluster==1736)){
System.out.print("");
}
for (int cluster = 0; cluster < clusters; cluster++) {
// if ((cluster == 1892) || (cluster == 1894) ||(cluster == 3205)) {
// System.out.println("getFx() cluster="+cluster);
// }
if (measured_dsxy[cluster] != null){
// if ((cluster == 1735 ) || (cluster==1736)){
// System.out.print("");
// }
double [] ddnd = geometryCorrection.getPortsDDNDAndDerivativesNew( // USED in lwir
geometryCorrection, // GeometryCorrection gc_main,
use_rig_offsets, // boolean use_rig_offsets,
......@@ -709,6 +838,7 @@ public class ExtrinsicAdjustment {
}
System.arraycopy(ddnd, 0, y_minus_fx, cluster*POINTS_SAMPLE, POINTS_SAMPLE);
}
}
return y_minus_fx;
}
......@@ -751,7 +881,7 @@ public class ExtrinsicAdjustment {
double delta,
boolean graphic) {
// delta *= 0.1;
int gap = 10; //pix
int gap = 10+0; //pix
int clusters = clustersX * clustersY;
int num_pars = getNumPars();
String [] titles = getSymNames();
......@@ -902,10 +1032,168 @@ public class ExtrinsicAdjustment {
titles);
}
private double [][] getMovingObjects(
GeometryCorrection.CorrVector corr_vector,
double [] fx // or null;
) {
int ers_tilt_index = GeometryCorrection.CorrVector.IMU_INDEX;
int ers_azimuth_index = GeometryCorrection.CorrVector.IMU_INDEX + 1;
int clusters = clustersX * clustersY;
double [][] ers_tilt_az = new double [clusters][];
double [][] jt = getJacobianTransposed(corr_vector);
for (int cluster = 0; cluster < clusters; cluster++) if ( measured_dsxy[cluster] != null) {
ers_tilt_az[cluster] = new double [2];
for (int i = 0; i < (POINTS_SAMPLE-1); i++) {
int indx = cluster * POINTS_SAMPLE + i + 1; // skipping disparity
double d = measured_dsxy[cluster][ExtrinsicAdjustment.INDX_DD0 + i];
if (fx != null) {
d += fx[indx];
}
ers_tilt_az[cluster][0] += d * jt[ers_tilt_index][indx];
ers_tilt_az[cluster][1] += d * jt[ers_azimuth_index][indx];
}
}
return ers_tilt_az;
}
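// Sketch (not part of this commit): getMovingObjects() projects each cluster's dd/nd residuals
// (the disparity sample is skipped) onto the transposed-Jacobian rows of the two ERS
// parameters (IMU tilt and azimuth), so clusters whose apparent motion is not explained by the
// global ERS model stand out as candidate moving objects. showMovingObjects() below reduces
// the pair to a scalar amplitude; a hypothetical helper matching that expression:
private static double ersAmplitude(double [] tiltAz) {
return Math.sqrt(0.5 * (tiltAz[0] * tiltAz[0] + tiltAz[1] * tiltAz[1]));
}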
private void showMovingObjects(
String title,
double [][] ers_tilt_az) {
showMovingObjects(
title,
ers_tilt_az,
0, // double sigma,
0);
}
private double [] showMovingObjects(
String title,
double [][] ers_tilt_az,
double sigma,
double accuracy
)
{
int clusters = clustersX * clustersY;
String [] titles = {"vert", "hor", "amplitude"};
double [][] dbg_img = new double [titles.length][clusters];
double max_reasonable = 100.0;
for (int cluster = 0; cluster < clusters; cluster++) {
if (ers_tilt_az[cluster] != null) {
dbg_img[0][cluster] = ers_tilt_az[cluster][0];
dbg_img[1][cluster] = ers_tilt_az[cluster][1];
dbg_img[2][cluster] = Math.sqrt(0.5*(ers_tilt_az[cluster][0]*ers_tilt_az[cluster][0] + ers_tilt_az[cluster][1]*ers_tilt_az[cluster][1]));
if (dbg_img[2][cluster] > max_reasonable) {
dbg_img[0][cluster] = Double.NaN;
dbg_img[1][cluster] = Double.NaN;
dbg_img[2][cluster] = Double.NaN;
}
} else {
dbg_img[0][cluster] = Double.NaN;
dbg_img[1][cluster] = Double.NaN;
dbg_img[2][cluster] = Double.NaN;
}
}
if (sigma > 0.0){
double [] cluster_weights = new double [clusters];
for (int cluster = 0; cluster < clusters; cluster++) {
for (int i = 0; i < POINTS_SAMPLE; i++) {
cluster_weights[cluster] += this.weights[cluster * POINTS_SAMPLE + i];
}
}
for (int i = 0; i < dbg_img.length; i++) {
dbg_img[i] = (new DoubleGaussianBlur()).blurWithNaN(
dbg_img[i], // double[] pixels,
cluster_weights, // null, // double [] in_weight, // or null
clustersX,// int width,
clustersY,// int height,
sigma, // double sigmaX,
sigma, // double sigmaY,
0.01); // double accuracy
}
}
(new ShowDoubleFloatArrays()).showArrays(
dbg_img,
clustersX,
clustersY,
true,
title,
titles);
return dbg_img[2];
}
public boolean [] extractMovingObjects(
String title,
double [] ers_blured, // has NaNs
double min_l2, // minimal value (can use average or fraction of it?
int shrink, // prevents growing outer border (==4)
int grow) { // located moving areas
int clusters = clustersX * clustersY;
TileNeibs tn = new TileNeibs(clustersX, clustersY);
boolean [] terrain = new boolean[clusters];
for (int i = 0; i < clusters; i++) {
terrain[i] = ers_blured[i] <= min_l2; // NaNs not included
}
// find largest terrain area
int [] iterrains = tn.enumerateClusters(
terrain, // boolean [] tiles,
true); //boolean ordered)
boolean [] terrain_main = new boolean[clusters];
for (int i = 0; i < clusters; i++) {
terrain_main[i] = (iterrains[i] == 1); // largest cluster
}
boolean [] filled = tn.fillConcave(terrain_main);
tn.shrinkSelection(
shrink, // int shrink,
filled, // boolean [] tiles,
null); // boolean [] prohibit)
boolean [] moving = filled.clone();
for (int i = 0; i < clusters; i++) {
moving[i] &= !terrain_main[i];
}
boolean [] grown = moving.clone();
tn.growSelection(
grow, // int grow, // grow tile selection by 1 over non-background tiles 1: 4 directions, 2 - 8 directions, 3 - 8 by 1, 4 by 1 more
grown, // boolean [] tiles,
null); // boolean [] prohibit)
if (title != null) {
String [] titles = {"terrain","clusters","main_terrain","filled","moving","grown"};
double [][] dbg_img = new double [titles.length][clusters];
for (int cluster = 0; cluster < clusters; cluster++) {
dbg_img[0][cluster] = terrain[cluster]? 1.0 : 0.0;
dbg_img[1][cluster] = iterrains[cluster];
dbg_img[2][cluster] = terrain_main[cluster]? 1.0 : 0.0;
dbg_img[3][cluster] = filled[cluster]? 1.0 : 0.0;
dbg_img[4][cluster] = moving[cluster]? 1.0 : 0.0;
dbg_img[5][cluster] = grown[cluster]? 1.0 : 0.0;
}
(new ShowDoubleFloatArrays()).showArrays(
dbg_img,
clustersX,
clustersY,
true,
title,
titles);
}
return grown;
}
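// Usage sketch (variable names and values are placeholders, mirroring the caller above):
// extractMovingObjects() thresholds the blurred ERS amplitude, keeps the largest
// below-threshold ("terrain") cluster, fills its concave holes, shrinks the filled area so the
// detection cannot grow into the outer border, marks what remains inside it but outside the
// terrain as "moving", and finally grows that selection:
//
// boolean [] moving_maybe = extractMovingObjects(
// null, // no debug images
// ers_blured, // blurred amplitude per cluster, NaN where nothing was measured
// mov_avg, // threshold - the caller currently uses the average amplitude
// 4, // shrink: keep clear of the outer border
// 2); // grow: dilate detected clusters before blocking them in the LMA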
private void dbgYminusFxWeight(
double [] fx,
double [] weights,
String title) {
if (fx == null) {
fx = new double [weights.length];
}
int gap = 10; //pix
int clusters = clustersX * clustersY;
String [] titles = {"Y", "-fX", "Y+fx", "Weight", "W*(Y+fx)", "Masked Y+fx"};
......@@ -979,23 +1267,33 @@ public class ExtrinsicAdjustment {
double [][] xy = getXYNondistorted(corr_vector, false);
double [][] dbg_img = new double [3][width*height];
for (int mode = 0; mode < 2 * NUM_SENSORS; mode++) {
double [][] moving_objects = new double [3][clusters];
for (int mode = 0; mode < 2 * NUM_SENSORS + 1; mode++) {
int x0 = (mode % 3) * (clustersX + gap);
int y0 = (mode / 3) * (clustersY + gap);
for (int cluster = 0; cluster < clusters; cluster++) {
int x = x0 + (cluster % clustersX);
int y = y0 + (cluster / clustersX);
int pix = x + y * width;
// int indx = cluster * POINTS_SAMPLE + mode;
if (measured_dsxy[cluster] != null){
if (mode < (2 * NUM_SENSORS)) {
dbg_img[0][pix] = measured_dsxy[cluster][ExtrinsicAdjustment.INDX_X0+mode];
dbg_img[1][pix] = xy[cluster][mode] - x0y0[cluster][mode];
dbg_img[2][pix] = xy[cluster][mode] - x0y0[cluster][mode] - measured_dsxy[cluster][ExtrinsicAdjustment.INDX_X0+mode];
dbg_img[2][pix] = measured_dsxy[cluster][ExtrinsicAdjustment.INDX_X0+mode] - (xy[cluster][mode] - x0y0[cluster][mode]);
moving_objects[0][cluster] += dbg_img[0][pix]*dbg_img[0][pix];
moving_objects[1][cluster] += dbg_img[1][pix]*dbg_img[1][pix];
moving_objects[2][cluster] += dbg_img[2][pix]*dbg_img[2][pix];
} else {
dbg_img[0][pix] = Math.sqrt(moving_objects[0][cluster]/(2 * NUM_SENSORS));
dbg_img[1][pix] = Math.sqrt(moving_objects[1][cluster]/(2 * NUM_SENSORS));
dbg_img[2][pix] = Math.sqrt(moving_objects[2][cluster]/(2 * NUM_SENSORS));
}
} else {
dbg_img[ 0][pix] = Double.NaN;
dbg_img[ 1][pix] = Double.NaN;
dbg_img[ 2][pix] = Double.NaN;
}
}
}
(new ShowDoubleFloatArrays()).showArrays(
......@@ -1005,6 +1303,7 @@ public class ExtrinsicAdjustment {
true,
title,
titles);
System.out.println("dbgXY() DONE");
}
......@@ -1176,15 +1475,53 @@ public class ExtrinsicAdjustment {
// this.last_jt); // double [][] jt) { // should be either [vector.length][samples.size()] or null - then only fx is calculated
this.last_jt = getJacobianTransposed(corr_vector); // new double [num_pars][num_points];
this.last_ymfx = getFx(corr_vector);
if (debug_level > 2) {
if (debug_level > -10) { // temporary
dbgYminusFxWeight(
this.last_ymfx,
this.weights,
"Initial y-fX");
// dbgYminusFx(this.last_ymfx, "Initial y-fX");
"Initial_y-fX_after_moving_objects");
/*
double [][] ers_tilt_az = getMovingObjects(
corr_vector, // GeometryCorrection.CorrVector corr_vector,
null ); // double [] fx
showMovingObjects(
"Moving_objects", // String title,
ers_tilt_az);
double moving_sigma = 1.0;
double [] ers_blured = showMovingObjects(
"Moving_objects_sigma"+moving_sigma, // String title,
ers_tilt_az,
moving_sigma, // double sigmaX,
0.01); // double accuracy
System.out.println("Moving objects");
double mov_avg = 0.0;
int clusters = clustersX * clustersY;
for (int cluster = 0; cluster < clusters; cluster++) {
if (!Double.isNaN(ers_blured[cluster])) mov_avg += ers_blured[cluster];
}
mov_avg /= clusters;
System.out.println("Average moving object detection value = "+mov_avg+
" (use to abandon detection if too high?)");
double min_l2 = 1.0 * mov_avg;
boolean [] moving_maybe = extractMovingObjects(
"Moving_objects_filtering", // String title,
ers_blured, //double [] ers_blured, // has NaNs
min_l2, // double min_l2); // minimal value (can use average or fraction of it?
4, // int shrink, // prevents growing outer border (==4)
2); // int grow) { // located moving areas
int num_moving = 0;
for (int cluster = 0; cluster < clusters; cluster++) {
if (moving_maybe[cluster]) num_moving++;
}
System.out.println("Number of clusters to remove from LMA fitting = "+num_moving);
System.out.println();
*/
}
if (last_ymfx == null) {
return null; // need to re-init/restart LMA
......
......@@ -655,6 +655,639 @@ public class ImageDtt extends ImageDttCPU {
// return clt_data;
}
public GPUTileProcessor.TpTask[][] clt_aberrations_quad_corr_GPU_test(
final ImageDttParameters imgdtt_params, // Now just extra correlation parameters, later will include, most others
final int macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
final int [][] tile_op, // [tilesY][tilesX] - what to do - 0 - nothing for this tile
final double [][] disparity_array, // [tilesY][tilesX] - individual per-tile expected disparity
final float [][][][] fcorr_td, // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
final float [][][][] fcorr_combo_td, // [4][tilesY][tilesX][pair][4*64] TD of combo corrs: quad, cross, hor, vert
// each of the top elements may be null to skip particular combo type
final float [][][][] fdisp_dist, // [tilesY][tilesX][cams][4], // disparity derivatives vectors or null
final float [][][][] fpxpy, // [tilesY][tilesX][cams][2], tile {pX,pY}
final float [][][][] fpxpy_test, // [tilesY][tilesX][cams][2], tile {pX,pY}
final double [][][][] clt_corr_combo, // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
// [type][tilesY][tilesX] should be set by caller
// types: 0 - selected correlation (product+offset), 1 - sum
final double [][][][][] clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
// [tilesY][tilesX] should be set by caller
// When clt_mismatch is non-zero, no far objects extraction will be attempted
final double [][] clt_mismatch, // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
// values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
final double [][] disparity_map, // [8][tilesY][tilesX], only [6][] is needed on input or null - do not calculate
// last 2 - contrast, avg ("geometric average")
final int disparity_modes, // bit mask of disparity_map slices to calculate/return
final double [][][][] texture_tiles, // compatible with the CPU ones
final float [][] texture_img, // null or [3][] (RGB) or [4][] RGBA
final Rectangle texture_woi_pix, // null or generated texture location/size in pixels
final float [][][] iclt_fimg, // will return quad images or null to skip, use quadCLT.linearStackToColor
// new parameters, will replace some other?
final double gpu_corr_scale, // 0.75; // reduce GPU-generated correlation values
final double gpu_fat_zero, // clt_parameters.getGpuFatZero(is_mono);absolute == 30.0
final double gpu_sigma_r, // 0.9, 1.1
final double gpu_sigma_b, // 0.9, 1.1
final double gpu_sigma_g, // 0.6, 0.7
final double gpu_sigma_m, // = 0.4; // 0.7;
final double gpu_sigma_rb_corr, // = 0.5; // apply LPF after accumulating R and B correlation before G, monochrome ? 1.0 : gpu_sigma_rb_corr;
final double gpu_sigma_corr, // = 0.9;gpu_sigma_corr_m
final int gpu_corr_rad, // = transform_size - 1 ?
final double corr_red, // +used
final double corr_blue,// +used
final double max_corr_radius, // 3.9;
final int corr_mode, // Correlation mode: 0 - integer max, 1 - center of mass, 2 - polynomial
final double min_shot, // +10.0; // Do not adjust for shot noise if lower than
final double scale_shot, // +3.0; // scale when dividing by sqrt ( <0 - disable correction)
final double diff_sigma, // +5.0;//RMS difference from average to reduce weights (~ 1.0 - 1/255 full scale image)
final double diff_threshold, // +5.0; // RMS difference from average to discard channel (~ 1.0 - 1/255 full scale image)
final boolean diff_gauss, // true; // when averaging images, use Gaussian around average as weight (false - sharp all/nothing)
final double min_agree, // +3.0; // minimal number of channels to agree on a point (real number to work with fuzzy averages)
final boolean dust_remove, // +Do not reduce average weight when only one image differs much from the average
final GeometryCorrection geometryCorrection, // for GPU TODO: combine geometry corrections if geometryCorrection_main is not null
final GeometryCorrection geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
final int window_type, // GPU: will not be used
final double disparity_corr, // disparity at infinity
final int debug_tileX,
final int debug_tileY,
final int threadsMax, // maximal number of threads to launch
final int globalDebugLevel)
{
if (this.gpuQuad == null) {
System.out.println("clt_aberrations_quad_corr_GPU_test(): this.gpuQuad is null, bailing out");
return null;
}
final boolean [][] saturation_imp = gpuQuad.quadCLT.saturation_imp; // boolean [][] saturation_imp, // (near) saturated pixels or null
//gpuQuad
final boolean debug_distort= globalDebugLevel > 0; ///false; // true;
final double [][] debug_offsets = new double[imgdtt_params.lma_dbg_offset.length][2];
for (int i = 0; i < debug_offsets.length; i++) for (int j = 0; j < debug_offsets[i].length; j++) {
debug_offsets[i][j] = imgdtt_params.lma_dbg_offset[i][j]*imgdtt_params.lma_dbg_scale;
}
final boolean macro_mode = macro_scale != 1; // correlate tile data instead of the pixel data
final int quad = 4; // number of subcameras
// final int numcol = 3; // number of colors // keep the same, just do not use [0] and [1], [2] - green
final int numcol = isMonochrome()?1:3;
final int width = gpuQuad.getImageWidth();
final int height = gpuQuad.getImageHeight();
final int tilesX=gpuQuad.getTilesX(); // width/transform_size;
final int tilesY=gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
//// final int nTilesInChn=tilesX*tilesY;
//// final double [][][][][][] clt_data = new double[quad][numcol][tilesY][tilesX][][];
final Thread[] threads = newThreadArray(threadsMax);
final AtomicInteger ai = new AtomicInteger(0);
final double [] col_weights= new double [numcol]; // colors are RBG
final double [][] dbg_distort = debug_distort? (new double [4*quad][tilesX*tilesY]) : null;
// not yet used with GPU
/**
final double [][] corr_wnd = Corr2dLMA.getCorrWnd(
transform_size,
imgdtt_params.lma_wnd);
final double [] corr_wnd_inv_limited = (imgdtt_params.lma_min_wnd <= 1.0)? new double [corr_wnd.length * corr_wnd[0].length]: null;
if (corr_wnd_inv_limited != null) {
double inv_pwr = imgdtt_params.lma_wnd_pwr - (imgdtt_params.lma_wnd - 1.0); // compensate for lma_wnd
for (int i = imgdtt_params.lma_hard_marg; i < (corr_wnd.length - imgdtt_params.lma_hard_marg); i++) {
for (int j = imgdtt_params.lma_hard_marg; j < (corr_wnd.length - imgdtt_params.lma_hard_marg); j++) {
corr_wnd_inv_limited[i * (corr_wnd.length) + j] = 1.0/Math.max(Math.pow(corr_wnd[i][j],inv_pwr), imgdtt_params.lma_min_wnd);
}
}
}
*/
// keep for now for mono, find out what do they mean for macro mode
if (macro_mode) { // all the same as they now mean different
//compensating Bayer correction
col_weights[0] = 0.25; // 1.0/3;
col_weights[1] = 0.25; // 1.0/3;
col_weights[2] = 0.5; // 1.0/3;
} else {
if (isMonochrome()) {
col_weights[2] = 1.0;// green color/mono
col_weights[0] = 0;
col_weights[1] = 0;
} else {
col_weights[2] = 1.0/(1.0 + corr_red + corr_blue); // green color
col_weights[0] = corr_red * col_weights[2];
col_weights[1] = corr_blue * col_weights[2];
}
}
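// Worked example (illustrative values, not from this commit): in the color branch the weights
// are normalized to sum to 1.0. With corr_red = 0.5 and corr_blue = 0.2: green =
// 1.0/(1.0 + 0.5 + 0.2) = 0.588, red = 0.5 * 0.588 = 0.294, blue = 0.2 * 0.588 = 0.118.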
final int corr_size = transform_size * 2 - 1;
if ((globalDebugLevel > -10) && (disparity_corr != 0.0)){
System.out.println(String.format("Using manual infinity disparity correction of %8.5f pixels",disparity_corr));
}
// reducing weight of on-axis correlation values to enhance detection of vertical/horizontal lines
// multiply correlation results inside the horizontal center strip 2*enhortho_width - 1 wide by enhortho_scale
final double [] enh_ortho_scale = new double [corr_size];
for (int i = 0; i < corr_size; i++){
if ((i < (transform_size - imgdtt_params.getEnhOrthoWidth(isAux()))) || (i > (transform_size - 2 + imgdtt_params.getEnhOrthoWidth(isAux())))) {
enh_ortho_scale[i] = 1.0;
} else {
enh_ortho_scale[i] = imgdtt_params.getEnhOrthoScale(isAux());
}
if (i == (transform_size-1)) enh_ortho_scale[i] = 0.0 ; // hardwired 0 in the center
enh_ortho_scale[i] *= Math.sin(Math.PI*(i+1.0)/(2*transform_size));
}
if (globalDebugLevel > 1){
System.out.println("getEnhOrthoWidth(isAux())="+ imgdtt_params.getEnhOrthoWidth(isAux())+" getEnhOrthoScale(isAux())="+ imgdtt_params.getEnhOrthoScale(isAux()));
for (int i = 0; i < corr_size; i++){
System.out.println(" enh_ortho_scale["+i+"]="+ enh_ortho_scale[i]);
}
}
// Create window to select center correlation strip using
// ortho_height - full width of non-zero elements
// ortho_eff_height - effective height (ratio of the weighted column sum to the center value)
int wcenter = transform_size - 1;
final double [] ortho_weights = new double [corr_size]; // [15]
for (int i = 0; i < corr_size; i++){
if ((i >= wcenter - imgdtt_params.ortho_height/2) && (i <= wcenter + imgdtt_params.ortho_height/2)) {
double dx = 1.0*(i-wcenter)/(imgdtt_params.ortho_height/2 + 1);
ortho_weights[i] = 0.5*(1.0+Math.cos(Math.PI*dx))/imgdtt_params.ortho_eff_height;
}
}
if (globalDebugLevel > 0){
System.out.println("ortho_height="+ imgdtt_params.ortho_height+" ortho_eff_height="+ imgdtt_params.ortho_eff_height);
for (int i = 0; i < corr_size; i++){
System.out.println(" ortho_weights["+i+"]="+ ortho_weights[i]);
}
}
if (globalDebugLevel > 0) {
System.out.println("clt_aberrations_quad_corr(): width="+width+" height="+height+" transform_size="+transform_size+
" debug_tileX="+debug_tileX+" debug_tileY="+debug_tileY+" globalDebugLevel="+globalDebugLevel);
}
// add optional initialization of debug layers here
boolean need_macro = false;
boolean need_corr = (clt_mismatch != null) || (fcorr_combo_td !=null) || (fcorr_td !=null) ; // (not the only reason)
// skipping DISPARITY_VARIATIONS_INDEX - it was not used
if (disparity_map != null){
for (int i = 0; i<disparity_map.length;i++) if ((disparity_modes & (1 << i)) != 0){
if ((i == OVEREXPOSED) && (saturation_imp == null)) {
continue;
}
disparity_map[i] = new double [tilesY*tilesX];
if ((i >= IMG_TONE_RGB) || ((i >= IMG_DIFF0_INDEX) && (i < (IMG_DIFF0_INDEX + 4)))) {
need_macro = true;
}
if (i <=DISPARITY_STRENGTH_INDEX) {
need_corr = true;
}
}
}
if (clt_mismatch != null){
for (int i = 0; i<clt_mismatch.length;i++){
clt_mismatch[i] = new double [tilesY*tilesX]; // will use only "center of mass" centers
}
}
DttRad2 dtt = new DttRad2(transform_size);
dtt.set_window(window_type);
final double [] lt_window = dtt.getWin2d(); // [256] - never used
final double [] lt_window2 = new double [lt_window.length]; // squared - never used
for (int i = 0; i < lt_window.length; i++) lt_window2[i] = lt_window[i] * lt_window[i];
if (globalDebugLevel > 1) {
ShowDoubleFloatArrays sdfa_instance = new ShowDoubleFloatArrays(); // just for debugging?
sdfa_instance.showArrays(lt_window, 2*transform_size, 2*transform_size, "lt_window");
}
if (globalDebugLevel > 0) {
System.out.println("macro_mode="+macro_mode);
}
final boolean use_main = geometryCorrection_main != null;
boolean [] used_corrs = new boolean[1];
final int all_pairs = imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
final GPUTileProcessor.TpTask[] tp_tasks = gpuQuad.setTpTask(
disparity_array, // final double [][] disparity_array, // [tilesY][tilesX] - individual per-tile expected disparity
disparity_corr, // final double disparity_corr,
used_corrs, // final boolean [] need_corrs, // should be initialized to boolean[1] or null
tile_op, // final int [][] tile_op, // [tilesY][tilesX] - what to do - 0 - nothing for this tile
all_pairs, // final int corr_mask, // <0 - use corr mask from the tile tile_op, >=0 - overwrite all with non-zero corr_mask_tp
threadsMax); // final int threadsMax, // maximal number of threads to launch
if (tp_tasks.length == 0) {
System.out.println("Empty tasks - nothing to do");
return null;
}
//texture_tiles
final boolean fneed_macro = need_macro;
final boolean fneed_corr = need_corr && used_corrs[0]; // *** tasks should include correlation
final float [][] lpf_rgb = new float[][] {
floatGetCltLpfFd(gpu_sigma_r),
floatGetCltLpfFd(gpu_sigma_b),
floatGetCltLpfFd(gpu_sigma_g),
floatGetCltLpfFd(gpu_sigma_m)
};
gpuQuad.setLpfRbg( // constants memory - same for all cameras
lpf_rgb,
globalDebugLevel > -1);
final float [] lpf_flat = floatGetCltLpfFd(gpu_sigma_corr);
gpuQuad.setLpfCorr(// constants memory - same for all cameras
"lpf_corr", // String const_name, // "lpf_corr"
lpf_flat,
globalDebugLevel > -1);
final float [] lpf_rb_flat = floatGetCltLpfFd(gpu_sigma_rb_corr);
gpuQuad.setLpfCorr(// constants memory - same for all cameras
"lpf_rb_corr", // String const_name, // "lpf_corr"
lpf_rb_flat,
globalDebugLevel > -1);
gpuQuad.setTasks( // copy tp_tasks to the GPU memory
tp_tasks, // TpTask [] tile_tasks,
use_main, // use_aux); // boolean use_aux)
imgdtt_params.gpu_verify); // boolean verify
gpuQuad.execSetTilesOffsets(); // prepare tiles offsets in GPU memory
GPUTileProcessor.TpTask[][] test_tasks = new GPUTileProcessor.TpTask[3][];
test_tasks[2] = tp_tasks;
if ((fdisp_dist != null) || (fpxpy != null)) {
final GPUTileProcessor.TpTask[] tp_tasks_full = gpuQuad.getTasks(use_main);
test_tasks[0] = tp_tasks_full;
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
@Override
public void run() {
for (int indx_tile = ai.getAndIncrement(); indx_tile < tp_tasks_full.length; indx_tile = ai.getAndIncrement()) {
GPUTileProcessor.TpTask task = tp_tasks_full[indx_tile];
if (fdisp_dist != null) {
fdisp_dist[task.getTileY()][task.getTileX()] = task.getDispDist();
}
if (fpxpy != null) {
fpxpy[task.getTileY()][task.getTileX()] = task.getXY(use_main); // boolean use_aux);
}
} // end of tile
}
};
}
startAndJoin(threads);
ai.set(0);
}
if (fpxpy_test != null) {
gpuQuad.execSetTilesOffsets(); // prepare tiles offsets in GPU memory
final GPUTileProcessor.TpTask[] tp_tasks_full = gpuQuad.getTasks(use_main); // reads the same
test_tasks[1] = tp_tasks_full;
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
@Override
public void run() {
for (int indx_tile = ai.getAndIncrement(); indx_tile < tp_tasks_full.length; indx_tile = ai.getAndIncrement()) {
GPUTileProcessor.TpTask task = tp_tasks_full[indx_tile];
if (fpxpy != null) {
fpxpy_test[task.getTileY()][task.getTileX()] = task.getXY(use_main); // boolean use_aux);
}
} // end of tile
}
};
}
startAndJoin(threads);
ai.set(0);
}
// GPUTileProcessor.TpTask[] tp_tasks_full1,
// GPUTileProcessor.TpTask[] tp_tasks_full2)
gpuQuad.execConvertDirect();
if (iclt_fimg != null) {
gpuQuad.execImcltRbgAll(isMonochrome()); // execute GPU kernel
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
iclt_fimg[ncam] = gpuQuad.getRBG(ncam); // retrieve data from GPU
}
} else {gpuQuad.execImcltRbgAll(isMonochrome());} // just for testing
// does it need texture tiles to be output?
if (texture_img != null) {
Rectangle woi = new Rectangle(); // will be filled out to match actual available image
gpuQuad.execRBGA(
col_weights, // double [] color_weights,
isLwir(), // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove); // boolean dust_remove,
float [][] rbga = gpuQuad.getRBGA(
(isMonochrome() ? 1 : 3), // int num_colors,
(texture_woi_pix != null)? texture_woi_pix : woi);
for (int ncol = 0; ncol < texture_img.length; ncol++) if (ncol < rbga.length) {
texture_img[ncol] = rbga[ncol];
}
}
// does it need macro data?
if (fneed_macro) {
//Generate non-overlapping (16x16) texture tiles, prepare
gpuQuad.execTextures(
col_weights, // double [] color_weights,
isLwir(), // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change - never used in GPU ?
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
false, // boolean calc_textures,
true); // boolean calc_extra)
float [][] extra = gpuQuad.getExtra();
int num_cams = gpuQuad.getNumCams();
for (int ncam = 0; ncam < num_cams; ncam++) {
int indx = ncam + IMG_DIFF0_INDEX;
if ((disparity_modes & (1 << indx)) != 0){
disparity_map[indx] = new double [extra[ncam].length];
for (int i = 0; i < extra[ncam].length; i++) {
disparity_map[indx][i] = extra[ncam][i];
}
}
}
for (int nc = 0; nc < (extra.length - num_cams); nc++) {
int sindx = nc + num_cams;
int indx = nc + IMG_TONE_RGB;
if ((disparity_modes & (1 << indx)) != 0){
disparity_map[indx] = new double [extra[sindx].length];
for (int i = 0; i < extra[sindx].length; i++) {
disparity_map[indx][i] = extra[sindx][i];
}
}
}
}
// does it need non-overlapping texture tiles
if (texture_tiles != null) { // compatible with the CPU ones
//Generate non-overlapping (16x16) texture tiles, prepare
gpuQuad.execTextures(
col_weights, // double [] color_weights,
isLwir(), // boolean is_lwir,
min_shot, // double min_shot, // 10.0
scale_shot, // double scale_shot, // 3.0
diff_sigma, // double diff_sigma, // pixel value/pixel change
diff_threshold, // double diff_threshold, // pixel value/pixel change - never used in GPU ?
min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
dust_remove, // boolean dust_remove, // Do not reduce average weight when only one image differs much from the average
true, // boolean calc_textures,
false); // boolean calc_extra)
int [] texture_indices = gpuQuad.getTextureIndices();
int num_src_slices = numcol + 1; // + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
float [] flat_textures = gpuQuad.getFlatTextures( // fatal error has been detected by the Java Runtime Environment:
texture_indices.length,
numcol, // int num_colors,
false); // clt_parameters.keep_weights); // boolean keep_weights);
gpuQuad.doubleTextures(
new Rectangle(0, 0, tilesX, tilesY), // Rectangle woi,
texture_tiles, // double [][][][] texture_tiles, // null or [tilesY][tilesX]
texture_indices, // int [] indices,
flat_textures, // float [][][] ftextures,
tilesX, // int full_width,
4, // rbga only /int num_slices
num_src_slices // int num_src_slices
);
}
// does it need correlations?
if (fneed_corr) {
//Generate 2D phase correlations from the CLT representation
gpuQuad.execCorr2D_TD(col_weights); // Get TD version of correlations (may be read out and saved)
final int [] corr_indices = gpuQuad.getCorrIndices();
if (fcorr_td != null) {
gpuQuad.getCorrTilesTd(fcorr_td); // generate time domain correlation pairs
}
gpuQuad.execCorr2D_normalize(
false, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final float [][] fcorr2D = gpuQuad.getCorr2D(gpu_corr_rad); // int corr_rad);
// Combine 4 ortho pairs
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x0f); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 0) && (fcorr_combo_td[0] != null)) {
gpuQuad.getCorrTilesComboTd(fcorr_combo_td[0]); // generate time domain correlation pairs for quad ortho combination
}
// normalize and convert to pixel domain
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final int [] corr_quad_indices = gpuQuad.getCorrComboIndices(); // get quad
final float [][] fcorr2D_quad = gpuQuad.getCorr2DCombo(gpu_corr_rad);
// Combine 2 diagonal pairs
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x30); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 1) && (fcorr_combo_td[1] != null)) {
gpuQuad.getCorrTilesComboTd(fcorr_combo_td[1]); // generate time domain correlation pairs for cross diagonal combination
}
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final float [][] fcorr2D_cross = gpuQuad.getCorr2DCombo(gpu_corr_rad);
// Combine 2 horizontal pairs
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x03); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 2) && (fcorr_combo_td[2] != null)) {
gpuQuad.getCorrTilesComboTd(fcorr_combo_td[2]); // generate time domain correlation pairs for horizontal combination
}
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final float [][] fcorr2D_hor = gpuQuad.getCorr2DCombo(gpu_corr_rad);
// Combine 2 vertical pairs
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x0c); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 3) && (fcorr_combo_td[3] != null)) {
gpuQuad.getCorrTilesComboTd(fcorr_combo_td[3]); // generate time domain correlation pairs for vertical combination
}
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final float [][] fcorr2D_vert = gpuQuad.getCorr2DCombo(gpu_corr_rad);
if (corr_indices.length > 0) {
/*
if (true) { // debugging only
int [] wh = new int[2];
double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
tilesX,
tilesY,
corr_indices,
fcorr2D,
wh);
(new ShowDoubleFloatArrays()).showArrays(
dbg_corr,
wh[0],
wh[1],
true,
"dbg-corr2D-initial", // name+"-CORR2D-D"+clt_parameters.disparity,
GPUTileProcessor.getCorrTitles());
}
*/
final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size
// assuming that the correlation pairs sets are the same for each tile that has correlations
// find this number
int nt0 = (corr_indices[0] >> GPUTileProcessor.CORR_NTILE_SHIFT);
int nc0 = 1;
for (int i = 1; (i < corr_indices.length) && ((corr_indices[i] >> GPUTileProcessor.CORR_NTILE_SHIFT) == nt0) ; i++) {
nc0++;
}
final int num_tile_corr = nc0; // normally 6
final int num_tiles = corr_indices.length / num_tile_corr;
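// Note (not part of this commit): each corr_indices entry packs the tile number and the pair
// number - tile = corr_indices[i] >> GPUTileProcessor.CORR_NTILE_SHIFT and
// pair = corr_indices[i] & GPUTileProcessor.CORR_PAIRS_MASK - exactly as decoded in the
// per-tile loop below. The scan above just counts how many consecutive entries share the first
// tile number, assuming every correlated tile carries the same set of pairs.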
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
@Override
public void run() {
// int tileY,tileX,tIndex;
// double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15)
Correlation2d corr2d = new Correlation2d(
imgdtt_params, // ImageDttParameters imgdtt_params,
transform_size, // int transform_size,
2.0, // double wndx_scale, // (wndy scale is always 1.0)
isMonochrome(), // boolean monochrome,
(globalDebugLevel > -1)); // boolean debug)
corr2d.createOrtoNotch(
imgdtt_params.getEnhOrthoWidth(isAux()), // double getEnhOrthoWidth(isAux()),
imgdtt_params.getEnhOrthoScale(isAux()), //double getEnhOrthoScale(isAux()),
(imgdtt_params.lma_debug_level > 1)); // boolean debug);
for (int indx_tile = ai.getAndIncrement(); indx_tile < num_tiles; indx_tile = ai.getAndIncrement()) {
// double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15)
// added quad and cross combos
double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS + 4][corr_length]; // 225-long (15x15)
int indx_corr = indx_tile * num_tile_corr;
int nt = (corr_indices[indx_corr] >> GPUTileProcessor.CORR_NTILE_SHIFT);
int tileX = nt % tilesX;
int tileY = nt / tilesX;
int tIndex = tileY * tilesX + tileX;
// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
int pair_mask = 0;
for (int indx_pair = 0; indx_pair < num_tile_corr; indx_pair++) {
int pair = corr_indices[indx_corr] & GPUTileProcessor.CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
assert pair < GPUTileProcessor.NUM_PAIRS : "invalid correlation pair";
pair_mask |= (1 << pair);
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D[indx_corr][i]; // from float to double
}
indx_corr++;
}
// add 4 combo layers : quad, cross, hor, vert
int pair = GPUTileProcessor.NUM_PAIRS; // 6
nt = (corr_quad_indices[indx_tile] >> GPUTileProcessor.CORR_NTILE_SHIFT); // corr_quad_indices - different sequence
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_quad[indx_tile][i]; // from float to double
}
// indices for cross are the same as for quad
pair++;
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_cross[indx_tile][i]; // from float to double
}
// indices for hor are the same as for quad
pair++;
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_hor[indx_tile][i]; // from float to double
}
// indices for vert are the same as for quad
pair++;
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_vert[indx_tile][i]; // from float to double
}
int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
int tile_lma_debug_level = ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
corr_common_GPU(
imgdtt_params, // final ImageDttParameters imgdtt_params,
clt_corr_partial, // final double [][][][][] clt_corr_partial,
used_pairs, // final int used_pairs,
disparity_map, // final double [][] disparity_map,
clt_mismatch, // final double [][] clt_mismatch,
saturation_imp, // final boolean [][] saturation_imp,
fneed_macro, // final boolean fneed_macro,
corr2d, // final Correlation2d corr2d,
corrs, // final double [][] corrs,
tileX, // final int tileX,
tileY, // final int tileY,
max_corr_radius, // final double max_corr_radius, // 3.9;
tile_lma_debug_level, // int tile_lma_debug_level,
debugTile, // boolean debugTile,
globalDebugLevel); // final int globalDebugLevel)
// double extra_disparity = 0.0; // used for textures: if allowed, shift images extra before trying to combine
/*
// Disabled for GPU
if (corr_mode == 0) extra_disparity = disparity_map[DISPARITY_INDEX_INT][tIndex];
else if (corr_mode == 1) extra_disparity = disparity_map[DISPARITY_INDEX_CM][tIndex];
else if (corr_mode == 2) extra_disparity = disparity_map[DISPARITY_INDEX_POLY][tIndex];
else if (corr_mode == 3) extra_disparity = disparity_map[DISPARITY_INDEX_HOR][tIndex]; // not used in lwir
else if (corr_mode == 4) extra_disparity = disparity_map[DISPARITY_INDEX_VERT][tIndex]; // not used in lwir
if (Double.isNaN(extra_disparity)) extra_disparity = 0; // used in lwir
*/
if ((disparity_map != null) && Double.isNaN(disparity_map[DISPARITY_STRENGTH_INDEX][tIndex])) {
System.out.println("BUG: 3. disparity_map[DISPARITY_STRENGTH_INDEX][tIndex] should not be NaN");
}
// only debug is left
// old (per-color correlation)
// removed
} // end of tile
}
};
}
startAndJoin(threads);
} else {
// no correlation tiles to process
}
}
if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
(new ShowDoubleFloatArrays()).showArrays(dbg_distort, tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
}
return test_tasks;
// GPUTileProcessor.TpTask[] tp_tasks_full1,
// GPUTileProcessor.TpTask[] tp_tasks_full2)
}
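// Note: the _test variant returns three task snapshots - test_tasks[0] read back after the
// first execSetTilesOffsets() (only when fdisp_dist or fpxpy is requested), test_tasks[1] after
// the second run (only when fpxpy_test != null), and test_tasks[2] the original host-side
// tp_tasks - so the caller can compare GPU-computed tile offsets between runs.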
public void clt_process_tl_correlations_GPU( // convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
final ImageDttParameters imgdtt_params, // Now just extra correlation parameters, later will include, most others
......
......@@ -48,6 +48,7 @@ import com.elphel.imagej.gpu.GPUTileProcessor;
import ij.ImagePlus;
import ij.ImageStack;
public class QuadCLT extends QuadCLTCPU {
int dbg_lev = 1;
private GPUTileProcessor.GpuQuad gpuQuad = null;
public QuadCLT(
String prefix,
......@@ -2314,12 +2315,75 @@ public class QuadCLT extends QuadCLTCPU {
threadsMax,
debugLevel - 2);
} else {// GPU
// First - measure and get fcorr_td, fdisp_dist, fpxpy
float [][][][] fcorr_td = new float[tilesY][tilesX][][];
float [][][][] fdisp_dist = new float[tilesY][tilesX][][];
float [][][][] fpxpy = new float[tilesY][tilesX][][];
float [][][][] fpxpy_test = new float[tilesY][tilesX][][];
float [][][][] fpxpy_test1 = new float[tilesY][tilesX][][];
float [][][][] fpxpy_test2 = new float[tilesY][tilesX][][];
final double gpu_fat_zero = clt_parameters.getGpuFatZero(isMonochrome());
image_dtt.clt_aberrations_quad_corr_GPU( // USED in LWIR
GPUTileProcessor.TpTask[][] test_tasks0 = image_dtt.clt_aberrations_quad_corr_GPU_test( // USED in LWIR
clt_parameters.img_dtt, // final ImageDttParameters imgdtt_params, // Now just extra correlation parameters, later will include, most others
1, // final int macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
tile_op, // per-tile operation bit codes
disparity_array, // clt_parameters.disparity, // final double disparity,
fcorr_td, // final float [][][][] corr_td, // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
null, // final float [][][][] corr_combo_td, // [4][tilesY][tilesX][pair][4*64] TD of combo corrs: quad, cross, hor, vert
// each of the top elements may be null to skip particular combo type
fdisp_dist, // final float [][][][] fdisp_dist, // [tilesY][tilesX][cams][4], // disparity derivatives vectors or null
fpxpy_test, // final float [][][][] fpxpy, // [tilesY][tilesX][cams][2], tile {pX,pY}
fpxpy_test1, // final float [][][][] fpxpy, // [tilesY][tilesX][cams][2], tile {pX,pY}
//// Uses quadCLT from gpuQuad
// correlation results - final and partial
null, // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
// [type][tilesY][tilesX] should be set by caller
// types: 0 - selected correlation (product+offset), 1 - sum
null, // clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
// [tilesY][tilesX] should be set by caller
// When clt_mismatch is non-zero, no far objects extraction will be attempted
null, // clt_mismatch, // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
// values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
null, // disparity_map, // [8][tilesY][tilesX], only [6][] is needed on input or null - do not calculate
// last 2 - contrast, avg ("geometric average")
0, // disparity_modes, // disparity_modes, // bit mask of disparity_map slices to calculate/return
null, // final double [][][][] texture_tiles, // compatible with the CPU ones
null, // texture_img, // texture_img, // null or [3][] (RGB) or [4][] RGBA
null, // texture_woi, // texture_woi, // null or generated texture location/size
null, // iclt_fimg, // will return quad images or null to skip, use quadCLT.linearStackToColor
clt_parameters.gpu_corr_scale, //.gpu_corr_scale, // gpu_corr_scale, // 0.75; // reduce GPU-generated correlation values
gpu_fat_zero, // final double gpu_fat_zero, // clt_parameters.getGpuFatZero(is_mono); absolute == 30.0
clt_parameters.gpu_sigma_r, // 0.9, 1.1
clt_parameters.gpu_sigma_b, // 0.9, 1.1
clt_parameters.gpu_sigma_g, // 0.6, 0.7
clt_parameters.gpu_sigma_m, // = 0.4; // 0.7;
(isMonochrome()? 1.0 : clt_parameters.gpu_sigma_rb_corr), // final double gpu_sigma_rb_corr, // = 0.5; // apply LPF after accumulating R and B correlation before G, monochrome ? 1.0 : gpu_sigma_rb_corr;
clt_parameters.gpu_sigma_corr, // = 0.9;gpu_sigma_corr_m
image_dtt.transform_size - 1, // clt_parameters.gpu_corr_rad, // = transform_size - 1 ?
clt_parameters.corr_red, // +used
clt_parameters.corr_blue, // +used
clt_parameters.max_corr_radius,// 3.9;
clt_parameters.corr_mode, // Correlation mode: 0 - integer max, 1 - center of mass, 2 - polynomial
clt_parameters.min_shot, // 10.0; // Do not adjust for shot noise if lower than
clt_parameters.scale_shot, // 3.0; // scale when dividing by sqrt ( <0 - disable correction)
clt_parameters.diff_sigma, // 5.0;//RMS difference from average to reduce weights (~ 1.0 - 1/255 full scale image)
clt_parameters.diff_threshold, // 5.0; // RMS difference from average to discard channel (~ 1.0 - 1/255 full scale image)
clt_parameters.diff_gauss, // true; // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
clt_parameters.min_agree, // 3.0; // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove, // Do not reduce average weight when only one image differs much from the average
geometryCorrection, // final GeometryCorrection geometryCorrection,
null, // final GeometryCorrection geometryCorrection_main, // if not null correct this camera (aux) to the coordinates of the main
clt_parameters.clt_window,
disparity_corr, // final double disparity_corr, // disparity at infinity
clt_parameters.tileX, // final int debug_tileX,
clt_parameters.tileY, // final int debug_tileY,
threadsMax,
debugLevel);
GPUTileProcessor.TpTask[][] test_tasks1 = image_dtt.clt_aberrations_quad_corr_GPU_test( // USED in LWIR
clt_parameters.img_dtt, // final ImageDttParameters imgdtt_params, // Now just extra correlation parameters, later will include, most others
1, // final int macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
tile_op, // per-tile operation bit codes
......@@ -2329,6 +2393,7 @@ public class QuadCLT extends QuadCLTCPU {
// each of the top elements may be null to skip particular combo type
fdisp_dist, // final float [][][][] fdisp_dist, // [tilesY][tilesX][cams][4], // disparity derivatives vectors or null
fpxpy, // final float [][][][] fpxpy, // [tilesY][tilesX][cams][2], tile {pX,pY}
fpxpy_test2, // final float [][][][] fpxpy_test, // [tilesY][tilesX][cams][2], tile {pX,pY}
//// Uses quadCLT from gpuQuad
// correlation results - final and partial
null, // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
......@@ -2374,6 +2439,153 @@ public class QuadCLT extends QuadCLTCPU {
clt_parameters.tileY, // final int debug_tileY,
threadsMax,
debugLevel);
if (dbg_lev > 0) {
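// Debug: pack per-tile fdisp_dist and fpxpy into flat [slice][tile] arrays (NaN where the tile
// was not processed) and compare the {pX,pY} results of the two test invocations above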
double [][] dbg_disp = new double [16][tilesX*tilesY];
double [][] dbg_pxy = new double [8] [tilesX*tilesY];
double [][] dbg_pxy_test = new double [8] [tilesX*tilesY];
double [][] dbg_pxy_diff = new double [8] [tilesX*tilesY];
for (int tY = 0; tY<tilesY; tY++) {
for (int tX = 0; tX<tilesX; tX++) {
int indx = tY * tilesX + tX;
if (fdisp_dist[tY][tX] != null) {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 4; i++) {
dbg_disp[ncam * 4 + i][indx] = fdisp_dist[tY][tX][ncam][i];
}
for (int i = 0; i < 2; i++) {
dbg_pxy[ncam * 2 + i][indx] = fpxpy[tY][tX][ncam][i];
}
}
} else {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 4; i++) {
dbg_disp[ncam * 4 + i][indx] = Double.NaN;
}
for (int i = 0; i < 2; i++) {
dbg_pxy[ncam * 2 + i][indx] = Double.NaN;
}
}
}
if (fpxpy_test[tY][tX] != null) {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 2; i++) {
dbg_pxy_test[ncam * 2 + i][indx] = fpxpy_test[tY][tX][ncam][i];
}
}
} else {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 2; i++) {
dbg_pxy_test[ncam * 2 + i][indx] = Double.NaN;
}
}
}
}
}
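// element-wise difference of the two {pX,pY} variants; for every non-zero mismatch locate the
// tile and its index in both task lists returned by the test invocations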
double d2 = 0.0;
for (int n = 0; n < dbg_pxy_diff.length; n++) {
for (int i = 0; i < dbg_pxy_diff[n].length; i++) {
dbg_pxy_diff[n][i] = dbg_pxy[n][i] - dbg_pxy_test[n][i];
if (!Double.isNaN(dbg_pxy_diff[n][i])) {
d2+=dbg_pxy_diff[n][i]*dbg_pxy_diff[n][i];
if (dbg_pxy_diff[n][i] != 0.0) {
System.out.println("dbg_pxy - dbg_pxy_test mismatch @ n ="+n+" tile="+i);
// find tasks
int ty = i / tilesX;
int tx = i % tilesX;
System.out.println("tx= "+tx+", ty = "+ty);
// find index in test_tasks0[0]
int t0_indx = -1;
for (int ti = 0; ti < test_tasks0[0].length; ti++) {
GPUTileProcessor.TpTask t0 = test_tasks0[0][ti];
if ((t0.tx == tx) && (t0.ty == ty)) {
t0_indx = ti;
break;
}
}
int t1_indx = -1;
for (int ti = 0; ti < test_tasks1[0].length; ti++) {
GPUTileProcessor.TpTask t1 = test_tasks1[0][ti];
if ((t1.tx == tx) && (t1.ty == ty)) {
t1_indx = ti;
break;
}
}
System.out.println("tx= "+tx+", ty = "+ty+ " t0_indx="+t0_indx+ " t1_indx="+t1_indx);
System.out.print("");
}
}
}
}
System.out.println("(dbg_pxy - dbg_pxy_test) squared = "+d2);
d2 = 0.0;
for (int tY = 0; tY<tilesY; tY++) {
for (int tX = 0; tX<tilesX; tX++) {
int indx = tY * tilesX + tX;
if ((fpxpy[tY][tX] != null) && (fpxpy_test2[tY][tX] != null)) { // guard both arrays dereferenced below
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 2; i++) {
double diff = fpxpy[tY][tX][ncam][i] - fpxpy_test2[tY][tX][ncam][i];
if (diff != 0.0) {
System.out.println("fpxpy - fpxpy_test2 mismatch @ tile="+indx+" n ="+ncam+" i="+i);
System.out.print("");
}
d2 += diff*diff;
}
}
}
}
}
System.out.println("(fpxpy - fpxpy_test2) squared = "+d2);
d2 = 0.0;
for (int tY = 0; tY<tilesY; tY++) {
for (int tX = 0; tX<tilesX; tX++) {
int indx = tY * tilesX + tX;
if ((fpxpy_test[tY][tX] != null) && (fpxpy_test1[tY][tX] != null)) { // guard the arrays dereferenced below
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 2; i++) {
double diff = fpxpy_test[tY][tX][ncam][i] - fpxpy_test1[tY][tX][ncam][i];
if (diff != 0.0) {
System.out.println("fpxpy_test - fpxpy_test1 mismatch @ tile="+indx+" n ="+ncam+" i="+i);
System.out.print("");
}
d2 += diff*diff;
}
}
}
}
}
System.out.println("(fpxpy_test - fpxpy_test1) squared = "+d2);
(new ShowDoubleFloatArrays()).showArrays(
dbg_pxy,
tilesX,
tilesY,
true,
"fpxpy");
(new ShowDoubleFloatArrays()).showArrays(
dbg_pxy_test,
tilesX,
tilesY,
true,
"fpxpy_test");
(new ShowDoubleFloatArrays()).showArrays(
dbg_pxy_diff,
tilesX,
tilesY,
true,
"fpxpy_diff");
(new ShowDoubleFloatArrays()).showArrays(
dbg_disp,
tilesX,
tilesY,
true,
"fdisp_dist");
}
lazy_eye_data = image_dtt.cltMeasureLazyEyeGPU ( // returns d,s lazy eye parameters
clt_parameters.img_dtt, // final ImageDttParameters imgdtt_params, // Now just extra correlation parameters, later will include, most others
......@@ -2391,6 +2603,51 @@ public class QuadCLT extends QuadCLTCPU {
clt_parameters.tileY, // final int debug_tileY,
threadsMax,
debugLevel);
if (dbg_lev > 0) {
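// Debug: pack and display fdisp_dist and fpxpy again after cltMeasureLazyEyeGPU() ("-after" versions)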
double [][] dbg_disp = new double [16][tilesX*tilesY];
double [][] dbg_pxy = new double [8] [tilesX*tilesY];
for (int tY = 0; tY<tilesY; tY++) {
for (int tX = 0; tX<tilesX; tX++) {
int indx = tY * tilesX + tX;
if (fdisp_dist[tY][tX] != null) {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 4; i++) {
dbg_disp[ncam * 4 + i][indx] = fdisp_dist[tY][tX][ncam][i];
}
for (int i = 0; i < 2; i++) {
dbg_pxy[ncam * 2 + i][indx] = fpxpy[tY][tX][ncam][i];
}
}
} else {
for (int ncam = 0; ncam < 4; ncam++) {
for (int i = 0; i < 4; i++) {
dbg_disp[ncam * 4 + i][indx] = Double.NaN;
}
for (int i = 0; i < 2; i++) {
dbg_pxy[ncam * 2 + i][indx] = Double.NaN;
}
}
}
}
}
(new ShowDoubleFloatArrays()).showArrays(
dbg_disp,
tilesX,
tilesY,
true,
"fdisp_dist-after");
(new ShowDoubleFloatArrays()).showArrays(
dbg_pxy,
tilesX,
tilesY,
true,
"fpxpy-after");
}
}
scan.setLazyEyeData(lazy_eye_data);
......@@ -2399,6 +2656,7 @@ public class QuadCLT extends QuadCLTCPU {
scan.resetProcessed();
return scan;
}
public void gpuResetCorrVector() {
if (getGPU() != null) {
getGPU().resetGeometryCorrectionVector();
......
......@@ -7904,7 +7904,7 @@ public class QuadCLTCPU {
CLTPass3d scan = tp.clt_3d_passes.get(combo_scan);
// for the second half of runs (always for single run) - limit infinity min/max
boolean debug_actual_LY_derivs = true; // debugLevel > 9;
boolean debug_actual_LY_derivs = debugLevel > 9; // true
if (debug_actual_LY_derivs) {
debugLYDerivatives(
......
......@@ -513,4 +513,41 @@ public class TileNeibs{
return num_total_removed;
}
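/**
 * Fill concave gaps in a boolean tile selection: a non-selected tile is added when at least
 * 4 of its neighbors are already selected, and each newly added tile is then grown
 * breadth-first (the 'front' list is used as a FIFO queue) with the same neighbor criterion.
 * The input array is not modified; a filled copy is returned.
 *
 * Usage sketch (illustrative only; 'selection' is a hypothetical boolean array sized to this
 * TileNeibs grid):
 *   boolean [] grown = tileNeibs.fillConcave(selection);
 */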
public boolean [] fillConcave(
boolean [] tiles // selected tiles; must be sized to the full tile grid of this TileNeibs instance
) {
boolean [] filled = tiles.clone();
int min_neibs = 4;
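// a not-yet-filled tile joins the selection when at least min_neibs of its neighbors are already filled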
ArrayList<Integer> front = new ArrayList<Integer>();
for (int start_indx = 0; start_indx < tiles.length; start_indx++) if (!filled[start_indx]) {
int nneib = 0;
for (int dir = 0; dir < dirs; dir++) {
int id = getNeibIndex(start_indx, dir);
if ((id >= 0) && filled[id]) nneib++;
}
if (nneib >= min_neibs) {
front.add(start_indx); // do not mark yet
filled[start_indx] = true;
while (!front.isEmpty()) {
int seed = front.remove(0);// get oldest element
for (int d = 0; d < dirs; d++){
int indx = getNeibIndex(seed, d);
if ((indx >=0) && !filled[indx]) {
nneib = 0;
for (int dir = 0; dir < dirs; dir++) {
int id = getNeibIndex(indx, dir);
if ((id >= 0) && filled[id]) nneib++;
}
if (nneib >= min_neibs) {
front.add(indx); // do not mark yet
filled[indx] = true;
}
}
}
}
}
}
return filled;
}
}