Motion blur correction for rendering only

93480b46 · Andrey Filippov · 7d6fb681 · 93480b46 · 93480b46 · 93480b46
Commit 93480b46 authored Aug 10, 2022 by Andrey Filippov
7 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -3842,19 +3842,21 @@ public class GpuQuad{ // quad camera description
 		final int tilesX =  img_width / GPUTileProcessor.DTT_SIZE;
 		final int tiles = pXpYD.length;
 		final Matrix [] corr_rots = geometryCorrection.getCorrVector().getRotMatrices(); // get array of per-sensor rotation matrices
-		final int quad_main = (geometryCorrection != null)? num_cams:0;
+		final int quad_main = num_cams; // (geometryCorrection != null)? num_cams:0;
 		final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(00);
 		final AtomicInteger aTiles = new AtomicInteger(0);
 		final TpTask[][] tp_tasks = new TpTask[2][tiles]; // aTiles.get()]; // [0] - main, [1] - shifted
 		final double mb_len_scale = -Math.log(1.0 - 1.0/mb_max_gain);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				@Override
 				public void run() {
 					for (int nTile = ai.getAndIncrement(); nTile < tiles; nTile = ai.getAndIncrement())
-						if ((pXpYD[nTile] != null) && (mb_vectors[nTile] != null) && ((selection == null) || selection[nTile])) {
+						if ((pXpYD[nTile] != null) &&
+								!Double.isNaN(mb_vectors[0][nTile]) &&
+								!Double.isNaN(mb_vectors[1][nTile]) &&
+								((selection == null) || selection[nTile])) {
 						int tileY = nTile / tilesX;
 						int tileX = nTile % tilesX;
 						TpTask tp_task =    new TpTask(num_cams, tileX, tileY);
@@ -3867,8 +3869,8 @@ public class GpuQuad{ // quad camera description
 						double [] centerXY = pXpYD[nTile];
 						tp_task.setCenterXY(centerXY); // this pair of coordinates will be used by GPU to set tp_task.xy and task.disp_dist!
 						// calculate offset for the secondary tile and weigh
-						double dx = mb_vectors[nTile][0];
+						double dx = mb_vectors[0][nTile];
-						double dy = mb_vectors[nTile][1];
+						double dy = mb_vectors[1][nTile];
 						double mb_len = Math.sqrt(dx*dx+dy*dy); // in pixels/s
 						dx /= mb_len; // unit vector
 						dy /= mb_len;
@@ -3887,7 +3889,6 @@ public class GpuQuad{ // quad camera description
 						double gain_sub = -gain * exp_offs;
 						tp_task.setScale(gain);
 						tp_task_sub.setScale(gain_sub);
 						boolean bad_margins = false;
 						if (calcPortsCoordinatesAndDerivatives) { // for non-GPU?
 							double [][] disp_dist = new double[quad_main][]; // used to correct 3D correlations (not yet used here)

--- a/src/main/java/com/elphel/imagej/tileprocessor/IntersceneLma.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/IntersceneLma.java
@@ -15,7 +15,7 @@ import javax.xml.bind.DatatypeConverter;
 import Jama.Matrix;
 public class IntersceneLma {
-	OpticalFlow opticalFlow = null;
+//	OpticalFlow opticalFlow = null;
 	QuadCLT [] scenesCLT =    null; // now will use just 2 - 0 -reference scene, 1 - scene.  
 	private double []         last_rms =        null; // {rms, rms_pure}, matching this.vector
 	private double []         good_or_bad_rms = null; // just for diagnostics, to read last (failed) rms
@@ -37,11 +37,11 @@ public class IntersceneLma {
 	private int               num_samples = 0;
 	private boolean           thread_invariant = true; // Do not use DoubleAdder, provide results not dependent on threads
 	public IntersceneLma(
-			OpticalFlow opticalFlow,
+//			OpticalFlow opticalFlow,
 			boolean thread_invariant
 			) {
 		this.thread_invariant = thread_invariant;
-		this.opticalFlow = opticalFlow;
+//		this.opticalFlow = opticalFlow;
 	}
 	public double [][]       getLastJT(){
@@ -549,7 +549,7 @@ public class IntersceneLma {
 	{
 		this.weights = new double [num_samples + parameters_vector.length];
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		double sum_weights;
 		if (thread_invariant) {
@@ -652,7 +652,7 @@ public class IntersceneLma {
 	private void normalizeWeights()
 	{
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		double full_weight, sum_weight_pure;
 		if (thread_invariant) {
@@ -763,7 +763,7 @@ public class IntersceneLma {
 				scene_atr, // double [] atr);
 				false)[0]; // boolean invert));
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
@@ -840,7 +840,7 @@ public class IntersceneLma {
 		final int num_pars2 = num_pars * num_pars;
 		final int nup_points = jt[0].length;
 		final double [][] wjtjl = new double [num_pars][num_pars];
-		final Thread[] threads = ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
@@ -876,7 +876,7 @@ public class IntersceneLma {
 			final double []   fx,
 			final double []   rms_fp // null or [2]
 			) {
-		final Thread[]      threads =     ImageDtt.newThreadArray(opticalFlow.threadsMax);
+		final Thread[]      threads =     ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai =          new AtomicInteger(0);
 		final double []     wymfw =       new double [fx.length];
 		double s_rms; 

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
@@ -4888,6 +4888,7 @@ public class OpticalFlow {
 	        							scenes_suffix,      // String        suffix,
 	        							ds_vantage[0],      // selected_disparity, // double []     ref_disparity,			
 	        							quadCLTs,           // QuadCLT []    quadCLTs,
+	        							threadsMax,         // int           threadsMax,
 	        							debugLevel);        // int           debugLevel);
 	        					if (save_mapped_mono_color[col_mode]) {	        	
 	        						quadCLTs[ref_index].saveImagePlusInModelDirectory(
@@ -5811,11 +5812,16 @@ public class OpticalFlow {
    		String         suffix_in,
    		double []      ref_disparity,			
    		QuadCLT []     quadCLTs,
+    		int            threadsMax,
    		int            debugLevel) {
    	double [] stereo_atr = ZERO3; // maybe later play with rotated camera
 		boolean um_mono =            clt_parameters.imp.um_mono;
 		double  um_sigma =           clt_parameters.imp.um_sigma;
 		double  um_weight =          clt_parameters.imp.um_weight;
+		boolean mb_en =       clt_parameters.imp.mb_en && (fov_tiles==null) && (mode3d > 0);
+		double  mb_tau =      clt_parameters.imp.mb_tau;      // 0.008; // time constant, sec
+		double  mb_max_gain = clt_parameters.imp.mb_max_gain; // 5.0;   // motion blur maximal gain (if more - move second point more than a pixel
    	final float fum_weight = (float)  um_weight; 
    	boolean merge_all = clt_parameters.imp.merge_all;
@@ -5836,6 +5842,15 @@ public class OpticalFlow {
        for (int i = 0; i < num_sens; i++) if (((sensor_mask >> i) & 1) != 0) channels[nch++] = i;
        ImageStack stack_scenes = null;
        int dbg_scene = -95;
+        double [][] ref_pXpYD = transformToScenePxPyD( // now should work with offset ref_scene
+        		fov_tiles,            // final Rectangle [] extra_woi,    // show larger than sensor WOI (or null)
+        		ref_disparity,        // final double []   disparity_ref, // invalid tiles - NaN in disparity
+				ZERO3,                // final double []   scene_xyz, // camera center in world coordinates
+				ZERO3,                // final double []   scene_atr, // camera orientation relative to world frame
+				quadCLTs[ref_index],  // final QuadCLT     scene_QuadClt,
+				quadCLTs[ref_index],  // final QuadCLT     reference_QuadClt, // now - may be null - for testing if scene is rotated ref
+				threadsMax);          // int               threadsMax)
 		for (int nscene =  0; nscene < quadCLTs.length ; nscene++) if (quadCLTs[nscene] != null){
 			if (nscene== dbg_scene) {
 				System.out.println("renderSceneSequence(): nscene = "+nscene);
@@ -5843,7 +5858,6 @@ public class OpticalFlow {
 			String ts = quadCLTs[nscene].getImageName();
 			double []   scene_xyz = ZERO3;
 			double []   scene_atr = ZERO3;
-//			if ((nscene != ref_index) && (mode3d >= 0)) {
 			if (nscene != ref_index) { // Check even for raw, so video frames will match in all modes 
 				scene_xyz = ers_reference.getSceneXYZ(ts);
 				scene_atr = ers_reference.getSceneATR(ts);
@@ -5871,7 +5885,80 @@ public class OpticalFlow {
 				scene_atr = combo_xyzatr[1];
 			}
 			int sm = merge_all? -1: sensor_mask;
-			ImagePlus imp_scene = QuadCLT.renderGPUFromDSI(
+			ImagePlus imp_scene = null;
+			double [][] dxyzatr_dt = null;
+			if (mb_en) {
+				get_velocities:
+				{
+					int nscene0 = nscene - 1;
+					if ((nscene0 < 0) ||
+							(quadCLTs[nscene0]== null)||
+							(ers_reference.getSceneXYZ(quadCLTs[nscene0].getImageName())== null) ||
+							(ers_reference.getSceneATR(quadCLTs[nscene0].getImageName())== null)) {
+						nscene0 = nscene;
+					}
+					int nscene1 = nscene + 1;
+					if ((nscene1 > ref_index) || (quadCLTs[nscene1]== null)) {
+						nscene1 = nscene;
+					}
+					if (nscene1 == nscene0) {
+						System.out.println("**** Isoloated scene!!! skipping... now may only happen for a ref_scene****");
+						break get_velocities;
+					}
+					double dt = quadCLTs[nscene1].getTimeStamp() - quadCLTs[nscene0].getTimeStamp();
+					String ts0 = quadCLTs[nscene0].getImageName();
+					String ts1 = quadCLTs[nscene1].getImageName();
+					double [] scene_xyz0 = ers_reference.getSceneXYZ(ts0);
+					double [] scene_atr0 = ers_reference.getSceneATR(ts0);
+					if (scene_xyz0 == null) {
+						System.out.println ("BUG: No egomotion data for timestamp "+ts0);
+						break get_velocities;
+					}
+					double [] scene_xyz1 = (nscene1== ref_index)? ZERO3:ers_reference.getSceneXYZ(ts1);
+					double [] scene_atr1 = (nscene1== ref_index)? ZERO3:ers_reference.getSceneATR(ts1);
+					dxyzatr_dt = new double[2][3];
+					for (int i = 0; i < 3; i++) {
+						dxyzatr_dt[0][i] = (scene_xyz1[i]-scene_xyz0[i])/dt;
+						dxyzatr_dt[1][i] = (scene_atr1[i]-scene_atr0[i])/dt;
+					}
+				}
+			}
+			if (mb_en && (dxyzatr_dt != null)) {
+				double [][] motion_blur = getMotionBlur(
+						quadCLTs[ref_index],   // QuadCLT        ref_scene,
+						quadCLTs[nscene],      // QuadCLT        scene,         // can be the same as ref_scene
+						ref_pXpYD,             // double [][]    ref_pXpYD,     // here it is scene, not reference!
+						scene_xyz,             // double []      camera_xyz,
+						scene_atr,             // double []      camera_atr,
+						dxyzatr_dt[0],         // double []      camera_xyz_dt,
+						dxyzatr_dt[1],         // double []      camera_atr_dt,
+						0,                     // int            shrink_gaps,  // will gaps, but not more that grow by this
+						debugLevel);           // int            debug_level)
+				imp_scene = QuadCLT.renderGPUFromDSI(
+						sm,                  // final int         sensor_mask,
+						merge_all,           // final boolean     merge_channels,
+						null,                // final Rectangle   full_woi_in,      // show larger than sensor WOI (or null)
+						clt_parameters,      // CLTParameters     clt_parameters,
+						ref_disparity,       // double []         disparity_ref,
+						// motion blur compensation 
+						mb_tau,              // double            mb_tau,      // 0.008; // time constant, sec
+						mb_max_gain,         // double            mb_max_gain, // 5.0;   // motion blur maximal gain (if more - move second point more than a pixel
+						motion_blur,         // double [][]       mb_vectors,  //
+						scene_xyz,           // final double []   scene_xyz, // camera center in world coordinates
+						scene_atr,           // final double []   scene_atr, // camera orientation relative to world frame
+						quadCLTs[nscene],    // final QuadCLT     scene,
+						quadCLTs[ref_index], // final QuadCLT     ref_scene, // now - may be null - for testing if scene is rotated ref
+						toRGB,               // final boolean     toRGB,
+						(toRGB? clt_parameters.imp.show_color_nan : clt_parameters.imp.show_mono_nan),
+						"", // String            suffix, no suffix here
+						QuadCLT.THREADS_MAX,          // int               threadsMax,
+						debugLevel);         // int         debugLevel)
+			} else {
+				imp_scene = QuadCLT.renderGPUFromDSI(
 						sm,                  // final int         sensor_mask,
 						merge_all,           // final boolean     merge_channels,
 						fov_tiles,           // testr, // null,                // final Rectangle   full_woi_in,      // show larger than sensor WOI (or null)
@@ -5887,6 +5974,7 @@ public class OpticalFlow {
 						"", // String            suffix, no suffix here
 						QuadCLT.THREADS_MAX,          // int               threadsMax,
 						debugLevel);         // int         debugLevel)
+			}
 			if (stack_scenes == null) {
 				stack_scenes = new ImageStack(imp_scene.getWidth(),imp_scene.getHeight());
 			}
@@ -13164,7 +13252,6 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 					iscale);           // int iscale) // 8
 		}
 		IntersceneLma intersceneLma = new IntersceneLma(
-				this, // OpticalFlow opticalFlow
 				clt_parameters.ilp.ilma_thread_invariant);
 		int nlma = 0;
 		int lmaResult = -1;
@@ -13507,11 +13594,15 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 					ZERO3, //, // dxyzatr_dt[nscene][0], // double []    ers_xyz_dt,
 					dxyzatr_dt[nscene][1]); // double []    ers_atr_dt)(ers_scene_original_xyz_dt);
 			if (dbg_mb_img != null) {
+				boolean show_corrected = false;
+				if (nscene == debug_scene) {
+					System.out.println("nscene = "+nscene);
+					System.out.println("nscene = "+nscene);
+				}
 				dbg_mb_img[nscene] = new double [tilesX*tilesY*2];
 				Arrays.fill(dbg_mb_img[nscene],Double.NaN);
 				double [] mb_scene_xyz = (nscene != ref_index)? ers_reference.getSceneXYZ(ts):ZERO3;
 				double [] mb_scene_atr = (nscene != ref_index)? ers_reference.getSceneATR(ts):ZERO3;
 				double [][] motion_blur = getMotionBlur(
 						quadCLTs[ref_index],   // QuadCLT        ref_scene,
 						quadCLTs[nscene],      // QuadCLT        scene,         // can be the same as ref_scene
@@ -13520,17 +13611,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 						mb_scene_atr,          // double []      camera_atr,
 						dxyzatr_dt[nscene][0], // double []      camera_xyz_dt,
 						dxyzatr_dt[nscene][1], // double []      camera_atr_dt,
+						-1,                    // int            shrink_gaps,  // will gaps, but not more that grow by this
 						debugLevel); // int            debug_level)
-				for (int nTile = 0; nTile < motion_blur.length; nTile++) if (motion_blur[nTile] != null) {
+				for (int nTile = 0; nTile < motion_blur[0].length; nTile++) {
 					int tx = nTile % tilesX;
 					int ty = nTile / tilesX;
-					dbg_mb_img[nscene][tx + tilesX * (ty*2 +0)] = motion_blur[nTile][0];
+					dbg_mb_img[nscene][tx + tilesX * (ty*2 +0)] = mb_tau * motion_blur[0][nTile];
-					dbg_mb_img[nscene][tx + tilesX * (ty*2 +1)] = motion_blur[nTile][1];
+					dbg_mb_img[nscene][tx + tilesX * (ty*2 +1)] = mb_tau * motion_blur[1][nTile];
-				}
-				boolean show_corrected = false;
-				if (nscene == debug_scene) {
-					System.out.println("nscene = "+nscene);
-					System.out.println("nscene = "+nscene);
 				}
 				while (show_corrected) {
 					ImagePlus imp_mbc = QuadCLT.renderGPUFromDSI(
@@ -13667,21 +13754,24 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 	 * @param camera_atr camera azimuth, tilt, roll relative to the reference
 	 * @param camera_xyz_dt camera linear velocities: x', y', z'
 	 * @param camera_atr_dt camera angular velocities: azimuth', tilt', roll'
+	 * @param shrink_gaps < 0 fill all gaps, 0 - do not fill gaps, >0 expand using growTiles, do not fill farther. 
 	 * @param debug_level debug level
-	 * @return per-tile array of {dx/dt, dy/dt} vectors, some may be null
+	 * @return per-tile array of [2][tiles] of dx/dt, dy/dt, some may be NaN
 	 */
-	public double [][] getMotionBlur(
+	public static double [][] getMotionBlur(
 			QuadCLT        ref_scene,
 			QuadCLT        scene,         // can be the same as ref_scene
-			double [][]    ref_pXpYD,
+			double [][]    ref_pXpYD,    // tilesX * tilesY
 			double []      camera_xyz,
 			double []      camera_atr,
 			double []      camera_xyz_dt,
 			double []      camera_atr_dt,
-//			boolean        fill_gaps,
+			int            shrink_gaps,  // will gaps, but not more that grow by this
 			int            debug_level)
 	{
+		int       num_passes = 100;
+		double    max_diff = 1E-4;
 		boolean[] param_select = new boolean[ErsCorrection.DP_NUM_PARS];
 		final int [] par_indices = new int[] {
 				ErsCorrection.DP_DSAZ,
@@ -13696,10 +13786,12 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 		final double [] camera_dt = new double[] {
 				camera_atr_dt[0], camera_atr_dt[1], camera_atr_dt[2],
 				camera_xyz_dt[0], camera_xyz_dt[1], camera_xyz_dt[2]};
-		final double [][] mb_vectors = new double [ref_pXpYD.length][];
+		final double [][] mb_vectors = new double [2][ref_pXpYD.length];
+		Arrays.fill(mb_vectors[0], Double.NaN);
+		Arrays.fill(mb_vectors[1], Double.NaN);
+		final int tilesX = ref_scene.tp.getTilesX();
+//		final int tilesY = ref_scene.tp.getTilesY();
 		IntersceneLma intersceneLma = new IntersceneLma(
-				this,          // OpticalFlow opticalFlow
 				false);        // clt_parameters.ilp.ilma_thread_invariant);
 		intersceneLma.prepareLMA(
 				camera_xyz,    // final double []   scene_xyz0,     // camera center in world coordinates (or null to use instance)
@@ -13713,28 +13805,170 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 				false,         // boolean           first_run,
 				debug_level);  // final int         debug_level)
 		final double [][] last_jt = intersceneLma. getLastJT(); // alternating x,y for each selected parameters
-		final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
+		int [] sensor_wh = ref_scene.getGeometryCorrection().getSensorWH();
+		final double width = sensor_wh[0];
+		final double height = sensor_wh[1];
+		final double min_disparity = -0.5;
+		final double max_disparity = 100.0;
+		final Thread[] threads = ImageDtt.newThreadArray(QuadCLT.THREADS_MAX);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				public void run() {
 					for (int nTile = ai.getAndIncrement(); nTile < ref_pXpYD.length; nTile = ai.getAndIncrement()) if (ref_pXpYD[nTile] != null){
-						mb_vectors[nTile]= new double[2];
+						if (    (ref_pXpYD[nTile][0] < 0) || (ref_pXpYD[nTile][0] >= width) ||
-						for (int i = 0; i < par_indices.length; i++) {
+								(ref_pXpYD[nTile][1] < 0) || (ref_pXpYD[nTile][1] >= height) ||
-							mb_vectors[nTile][0] += camera_dt[i] * last_jt[i][2*nTile + 0];
+								(ref_pXpYD[nTile][2] < min_disparity) || (ref_pXpYD[nTile][2] >= max_disparity)) {
-							mb_vectors[nTile][1] += camera_dt[i] * last_jt[i][2*nTile + 1];
+							continue;
 						}
-						if (Double.isNaN(mb_vectors[nTile][0]) || Double.isNaN(mb_vectors[nTile][1])) {
+						mb_vectors[0][nTile] = 0.0;
-							mb_vectors[nTile] = null;
+						mb_vectors[1][nTile] = 0.0;
+						for (int i = 0; i < par_indices.length; i++) {
+							mb_vectors[0][nTile] += camera_dt[i] * last_jt[i][2*nTile + 0];
+							mb_vectors[1][nTile] += camera_dt[i] * last_jt[i][2*nTile + 1];
 						}
 					}
 				}
 			};
 		}		      
 		ImageDtt.startAndJoin(threads);
+		if (shrink_gaps != 0) {
+			for (int dim = 0; dim < mb_vectors.length; dim++) {
+				mb_vectors[dim] =  fillGapsDouble(
+						mb_vectors[dim], // double []  data,
+						null, // boolean [] mask_in, // do not process if false (may be null)
+						tilesX, // int       width,
+						(shrink_gaps > 0) ? shrink_gaps: 0, // int       max_grow,
+						num_passes, // int       num_passes,
+						max_diff, // double    max_diff,
+						QuadCLT.THREADS_MAX,          // int               threadsMax,
+						debug_level); // int       debug_level)
+			}
+		}
 		return mb_vectors;
 	}
+	public static double[] fillGapsDouble(
+			double []  data,
+			boolean [] mask_in, // do not process if false (may be null)
+			int       width,
+			int       max_grow,
+			int       num_passes,
+			double    max_diff,
+			int       threadsMax,
+			int       debug_level)
+	{
+		final double    max_diff2 = max_diff * max_diff;
+        final double   diagonal_weight = 0.5 * Math.sqrt(2.0); // relative to ortho
+		double wdiag = 0.25 *diagonal_weight / (diagonal_weight + 1.0);
+		double wortho = 0.25 / (diagonal_weight + 1.0);
+		final double [] neibw = {wortho, wdiag, wortho, wdiag, wortho, wdiag, wortho, wdiag}; 
+		final int tiles = data.length;
+		final int height = tiles/width;
+		final double [] data_in = data.clone();
+		final double [] data_out = data.clone();
+		final boolean [] mask = (mask_in==null) ? new boolean[tiles]: mask_in.clone();
+		if (mask_in == null) {
+			if (max_grow == 0) {
+				Arrays.fill(mask,  true);
+			} else {
+				for (int i = 0; i < tiles; i++) {
+					mask[i] = !Double.isNaN(data[i]);
+				}
+				TileProcessor.growTiles(
+						max_grow, // grow,           // grow tile selection by 1 over non-background tiles 1: 4 directions, 2 - 8 directions, 3 - 8 by 1, 4 by 1 more
+						mask,     // tiles,
+						null,     // prohibit,
+						width,
+						height); 
+			}
+		}
+		final TileNeibs tn =  new TileNeibs(width, height);
+		final int [] tile_indices = new int [tiles];
+		final boolean [] floating =      new boolean[tiles]; // which tiles will change
+		final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		final AtomicInteger anum_gaps = new AtomicInteger(0);
+		final int dbg_tile = -3379;
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < tiles; nTile = ai.getAndIncrement()) {
+						if (mask[nTile] && Double.isNaN(data[nTile])){
+							int indx = anum_gaps.getAndIncrement();
+							tile_indices[indx] = nTile;
+							floating[nTile] = true;
+						}
+					}
+				}
+			};
+		}		      
+		ImageDtt.startAndJoin(threads);
+		ai.set(0);
+		final int num_gaps = anum_gaps.get(); 
+		if (num_gaps == 0) {
+			return data_in; // no gaps already
+		}
+		final boolean [] fill_all = {false};
+		DoubleAccumulator amax_diff =  new DoubleAccumulator (Double::max, Double.NEGATIVE_INFINITY);
+		for (int npass = 0; npass < num_passes; npass+= fill_all[0]? 1:0 ) { // do not limit initial passes
+			anum_gaps.set(0);
+			amax_diff.reset();
+			for (int ithread = 0; ithread < threads.length; ithread++) {
+				threads[ithread] = new Thread() {
+					public void run() {
+						for (int indx = ai.getAndIncrement(); indx < num_gaps; indx = ai.getAndIncrement()) {
+							int nTile = tile_indices[indx];
+							if ((debug_level >0) && (nTile == dbg_tile)) {
+								System.out.println("fillDisparityStrength() nTile="+nTile);
+							}
+							if (!fill_all[0] && !Double.isNaN(data_in[nTile])) {
+								continue; // fill only new
+							}
+							double swd = 0.0, sw = 0.0;
+							for (int dir = 0; dir < 8; dir++) {
+								int nt_neib = tn.getNeibIndex(nTile, dir);
+								if ((nt_neib >= 0) && !Double.isNaN(data_in[nt_neib])) {
+									sw += neibw[dir];
+									swd +=  neibw[dir] * data_in[nt_neib];
+								}
+							}
+							if (sw > 0) {
+								double new_val = swd/sw;
+								double d = new_val -  data_in[nTile];
+								double d2 = d * d;
+								amax_diff.accumulate(d2);
+								data_out[nTile] = new_val;
+							} else {
+								anum_gaps.getAndIncrement();	
+							}
+						}
+					}
+				};
+			}		      
+			ImageDtt.startAndJoin(threads);
+			ai.set(0);
+			System.arraycopy(data_out, 0, data_in, 0, tiles);
+			if ((debug_level > 0) && fill_all[0]) {
+				System.out.println("fillGapsDouble() num_gaps="+num_gaps+", npass="+npass+", change="+Math.sqrt(amax_diff.get())+" ("+max_diff+")");
+			}
+			if (fill_all[0] && (amax_diff.get() < max_diff2)) {
+				break; // all done
+			}
+			if (anum_gaps.get() == 0) { // no new tiles filled
+				fill_all[0] = true; 
+			}
+			if ((debug_level>0) && (npass == (num_passes-1))){
+				System.out.println("fillGapsDouble() LAST PASS ! npass="+npass+", change="+Math.sqrt(amax_diff.get())+" ("+max_diff+")");
+				System.out.println("fillGapsDouble() LAST PASS ! npass="+npass+", change="+Math.sqrt(amax_diff.get())+" ("+max_diff+")");
+				System.out.println("fillGapsDouble() LAST PASS ! npass="+npass+", change="+Math.sqrt(amax_diff.get())+" ("+max_diff+")");
+			}
+		} // for (int npass = 0; npass < num_passes; npass+= fill_all[0]? 1:0 )
+		return data_out;
+	}
 	public double[][]  adjustPairsLMAInterscene(
 			CLTParameters  clt_parameters,			
@@ -13754,7 +13988,6 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 		int        sensor_mask_inter = clt_parameters.imp.sensor_mask_inter ; //-1;
 		float [][][] facc_2d_img = new float [1][][];
 		IntersceneLma intersceneLma = new IntersceneLma(
-				this, // OpticalFlow opticalFlow
 				clt_parameters.ilp.ilma_thread_invariant);
 		int lmaResult = -1;
 		boolean last_run = false;
@@ -14029,7 +14262,6 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 					iscale);           // int iscale) // 8
 		}
 		IntersceneLma intersceneLma = new IntersceneLma(
-				this, // OpticalFlow opticalFlow
 				clt_parameters.ilp.ilma_thread_invariant);
 		for (int nlma = 0; nlma < clt_parameters.ilp.ilma_num_corr; nlma++) {
 			boolean last_run = nlma == ( clt_parameters.ilp.ilma_num_corr - 1);
@@ -14102,9 +14334,6 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 						macroTilesX); // int         width)	
 			}
-//			IntersceneLma intersceneLma = new IntersceneLma(
-//					this); // OpticalFlow opticalFlow
 			intersceneLma.prepareLMA(
 					camera_xyz0,                                    // final double []   scene_xyz0,     // camera center in world coordinates (or null to use instance)
 					camera_atr0,                                    // final double []   scene_atr0,     // camera orientation relative to world frame (or null to use instance)

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -2706,7 +2706,7 @@ public class QuadCLT extends QuadCLTCPU {
 			// motion blur compensation 
 			double            mb_tau,      // 0.008; // time constant, sec
 			double            mb_max_gain, // 5.0;   // motion blur maximal gain (if more - move second point more than a pixel
-			double [][]       mb_vectors,  //
+			double [][]       mb_vectors,  // now [2][ntiles];
 			final double []   scene_xyz, // camera center in world coordinates
 			final double []   scene_atr, // camera orientation relative to world frame
@@ -2740,14 +2740,14 @@ public class QuadCLT extends QuadCLTCPU {
 			for (int i = 0; i < dbg_img.length; i++) {
 				Arrays.fill(dbg_img[i], Double.NaN);
 			}
-			for (int nTile = 0; nTile < pXpYD.length; nTile++) if (pXpYD[nTile] != null){
+			for (int nTile = 0; nTile < pXpYD.length; nTile++){
+				if (pXpYD[nTile] != null) {
 					for (int i = 0; i < pXpYD[nTile].length; i++) {
 						dbg_img[i][nTile] = pXpYD[nTile][i];
 					}
-				if (mb_vectors[nTile]!=null) {
-					for (int i = 0; i <2; i++) {
-						dbg_img[3 + i][nTile] =  mb_tau * mb_vectors[nTile][i];
 				}
+				for (int i = 0; i <2; i++) {
+					dbg_img[3 + i][nTile] =  mb_tau * mb_vectors[i][nTile];
 				}
 			}
 			(new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
@@ -2804,8 +2804,8 @@ public class QuadCLT extends QuadCLTCPU {
 	    		full_woi_in.width * GPUTileProcessor.DTT_SIZE,
 	    		full_woi_in.height * GPUTileProcessor.DTT_SIZE};
 	    int                 erase_clt = show_nan ? 1:0;
-	    boolean test1 = true;
+//	    boolean test1 = true;
-	    if ((mb_vectors!=null) && test1) {
+	    if (mb_vectors!=null) {// && test1) {
 	    	image_dtt.setReferenceTDMotionBlur( // change to main?
 	    			erase_clt, //final int                 erase_clt,
 	    			wh, // null,                       // final int []              wh,               // null (use sensor dimensions) or pair {width, height} in pixels

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -862,6 +862,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -3118,7 +3119,7 @@ __global__ void convert_correct_tiles(
 	int thread0 =  threadIdx.x & 1; // 0,1
 	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change
-	float * tp = tp0 + tp_task_xy_offset + threadIdx.x;
+	float * tp = tp0 + TP_TASK_XY_OFFSET + threadIdx.x;
 	if (thread12 < num_cams) {
 		tt[tile_in_block].xy[thread12][thread0] = *(tp);        // gpu_task -> xy[thread12][thread0];
 	}
@@ -3135,7 +3136,9 @@ __global__ void convert_correct_tiles(
 	if (threadIdx.x == 0){ // only one thread calculates, others - wait
 		tt[tile_in_block].task = *(int *)     (tp0++);    // get first integer value
 		tt[tile_in_block].txy =  *(int *)     (tp0++);    // get second integer value
-		tt[tile_in_block].target_disparity = *(tp0++);    //
+		tt[tile_in_block].target_disparity = *(tp0);      //
+		tp0 +=3; // skip centerXY and previous increment (was tt[tile_in_block].target_disparity = *(tp0++);
+		tt[tile_in_block].scale =            *(tp0++);    // get scale to multiply before accumulating/saving
 	}
 	// float centerXY[2] is not used/copied here
@@ -3168,6 +3171,7 @@ __global__ void convert_correct_tiles(
 					tt[tile_in_block].xy[ncam][0],   // const float       centerX,
 					tt[tile_in_block].xy[ncam][1],   // const float       centerY,
 					tt[tile_in_block].txy,           // const int txy,
+					tt[tile_in_block].scale,         // const float           tscale,
 					dstride,                         // size_t            dstride, // in floats (pixels)
 					(float * )(clt_tile [tile_in_block]),        // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
 					(float * )(clt_kernels[tile_in_block]),      // float clt_tile    [num_colors][4][DTT_SIZE][DTT_SIZE],
@@ -4457,6 +4461,7 @@ __device__ void normalizeTileAmplitude(
 * @param centerX              full X-offset of the tile center, calculated from the geometry, distortions and disparity
 * @param centerY              full Y-offset of the tile center
 * @param txy                  integer value combining tile X (low 16 bits) and tile Y (high 16 bits)
+ * @param tscale               float value to scale result. 0 - set. >0 scale and set, <0 subtract
 * @param dstride              stride (in floats) for the input Bayer images
 * @param clt_tile             image tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
 * @param clt_kernels          kernel tile in shared memory [4][DTT_SIZE][DTT_SIZE1] (just allocated)
@@ -4482,6 +4487,7 @@ __device__ void convertCorrectTile(
 		const float           centerX,
 		const float           centerY,
 		const int             txy,
+		const float           tscale,
 		const size_t          dstride, // in floats (pixels)
 		float               * clt_tile, //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 		float               * clt_kernels, //      [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -5078,7 +5084,7 @@ __device__ void convertCorrectTile(
 #endif
+    if (tscale == 0) { // just set w/o scaling
 #pragma unroll
    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
    		// shared memory tiles use DTT_SIZE1
@@ -5086,10 +5092,24 @@ __device__ void convertCorrectTile(
    		clt_src   += DTT_SIZE1;
    		clt_dst   += DTT_SIZE;
    	}
+    } else if (tscale > 0) { // positive - scale and set. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst =  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
+    } else { // negative - scale and subtract from existing. For motion blur positive should be first
+#pragma unroll
+    	for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
+    		// shared memory tiles use DTT_SIZE1
+    		*clt_dst +=  *clt_src * tscale;
+    		clt_src   += DTT_SIZE1;
+    		clt_dst   += DTT_SIZE;
+    	}
+    }
    __syncthreads();// __syncwarp();
-    // just for testing perform imclt, save result to clt_kernels
-//#endif
 }

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -460,11 +460,11 @@ extern "C" __global__ void get_tiles_offsets(
 	// common code, calculated in parallel
 ///	int cxy = gpu_tasks[task_num].txy;
 ///	float disparity = gpu_tasks[task_num].target_disparity;
-	float disparity = * (gpu_ftasks +  task_size * task_num + 2);
+	float disparity = * (gpu_ftasks +  task_size * task_num + TP_TASK_DISPARITY_OFFSET);
-	float *centerXY =    gpu_ftasks +  task_size * task_num + tp_task_centerXY_offset;
+	float *centerXY =    gpu_ftasks +  task_size * task_num + TP_TASK_CENTERXY_OFFSET;
 	float px =  *(centerXY);
 	float py =  *(centerXY + 1);
-	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
+	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + TP_TASK_TXY_OFFSET);
 	int tileX = (cxy & 0xffff);
 	int tileY = (cxy >> 16);
@@ -705,7 +705,7 @@ extern "C" __global__ void get_tiles_offsets(
 ///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
 ///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
 ///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
-	float * disp_dist_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
+	float * disp_dist_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
 	*(disp_dist_p++) = disp_dist[0]; // global memory
 	*(disp_dist_p++) = disp_dist[1];
 	*(disp_dist_p++) = disp_dist[2];
@@ -768,7 +768,7 @@ extern "C" __global__ void get_tiles_offsets(
 //	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
 //	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	// .xy goes right after 3 commonn (tak, txy and target_disparity
-	float * tile_xy_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+	float * tile_xy_p = gpu_ftasks +  task_size * task_num + TP_TASK_XY_OFFSET + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
 	*(tile_xy_p++) = pXY[0]; // global memory
 	*(tile_xy_p++) = pXY[1]; // global memory
 }

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -64,13 +64,19 @@ struct tp_task {
 	float target_disparity;
 	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
 	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
-	float xy[NUM_CAMS][2];
+	// scale == 0 - old way, just set. scale > 0 - multiply and set, scale < 0 - multiply and add to existing (i.e. subtract)
+	float scale;                // multiply during direct conversion before accumulating in TD - used for motion blur correction
+	float xy       [NUM_CAMS][2];
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
 #define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
-#define tp_task_xy_offset 5
+#define TP_TASK_TASK_OFFSET      0
-#define tp_task_centerXY_offset 3
+#define TP_TASK_TXY_OFFSET       1
+#define TP_TASK_DISPARITY_OFFSET 2
+#define TP_TASK_CENTERXY_OFFSET  3
+#define TP_TASK_SCALE_OFFSET     5
+#define TP_TASK_XY_OFFSET        6
 struct corr_vector{
 	float tilt    [NUM_CAMS-1]; // 0..2