/**
 ** ERSTiltLMA - Extract ERS-related errors (affine distortions that do not match
 ** known ground plane tilts) by comparing multiple overlapping pairs. If there were
 ** no such errors, then combined affine transforms calculated from the local ground
 ** plane tilts should match affine transforms for image pairs. Additional condition - 
 ** minimization of the per-scene ERS correction transforms (weighted by the number
 ** of participating pairs and their overlap, and ???)
 **
 ** The ERS errors possibly come from a faulty absolute ERS correction that was not
 ** tested - only relative to a single reference scene. Maybe it can be improved in
 ** the future.
 **
 ** Copyright (C) 2024 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
 **  ERSTiltLMA.java is free software: you can redistribute it and/or modify
 **  it under the terms of the GNU General Public License as published by
 **  the Free Software Foundation, either version 3 of the License, or
 **  (at your option) any later version.
 **
 **  This program is distributed in the hope that it will be useful,
 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 **  GNU General Public License for more details.
 **
 **  You should have received a copy of the GNU General Public License
 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ** -----------------------------------------------------------------------------**
 **
 */
package com.elphel.imagej.orthomosaic;

import java.util.concurrent.atomic.AtomicInteger;

import com.elphel.imagej.tileprocessor.ImageDtt;

import Jama.Matrix;

public class ERSTiltLMA {
	public int []             indices; // per-scene values supplied to prepareLMA(), printed in the "scn" columns of the reports
	public int [][]           cpairs =          null; // [pair]{scene0, scene1} - local scene numbers (positions in indices[])
	public int                num_scenes = 0; // indices.length, set in prepareLMA()
	public int                num_pairs = 0;  // cpairs.length, set in prepareLMA()
	private double []         last_rms =        null; // {rms, rms_pure}, matching parameters_vector
	private double []         good_or_bad_rms = null; // just for diagnostics, to read last (failed) rms
	private double []         initial_rms =     null; // {rms, rms_pure}, first-calculated rms
	private double []         parameters_vector = null; // 2 values per scene: {pseudo_x, pseudo_y}
//	private double []         x_vector =        null; // not used. Save total weight
//	private double []         y_vector =        null;
//	private double            weight =          0;      // total weight
	private double            weight_pure =     0; // sum of the normalized weights of the pair samples
	private double []         weights; // normalized so sum is 1.0 for all - samples and extra regularization
	private double []         last_ymfx =       null; // weighted residuals returned by the last getYminusFxWeighted() call
	private double [][]       last_jt =         null; // [parameter][sample] derivatives filled by getFxDerivs()
	
	public  double [][][]     aff_pairs_inosr = null; // inverted pairs affines w/o scale and rotation
	public  double [][][][]   aff_tilts =       null; // [pair][scene(2)] affines converted from the tilts in prepareLMA()
	public  boolean           invert_q2a =      false; // passed to QuatUtils.pseudoAffineDiffAndDerivatives()
	// debug
	public  double [][][]     aff_pairs =       null; // pairwise image-based affines
	public  double [][]       diff_tilts =      null; // pairwise differential tilts
	
	public boolean            last3only =       false;  // true; // debug feature: compareJT() processes only the last 3 samples
	public double             scale_pairs =     1e3; // 1e4; // increase fX and derivatives for pairs
	public double             delta=            1e-7; // parameter increment for the numeric derivatives in compareJT()
	public double fx_scale = 1; // multiplier applied to (-fx) in getYminusFxWeighted()
	
	/**
	 * Get the current pseudo-tilt parameters of all scenes.
	 * @return [scene]{pseudo_x, pseudo_y} - freshly allocated copies of the
	 *         corresponding pairs of values in the parameters vector
	 */
	public double [][] getPseudoXY(){
		final double [][] rslt = new double [num_scenes][];
		int par_indx = 0; // index of the first of the two parameters of the current scene
		for (int i = 0; i < rslt.length; i++) {
			rslt[i] = new double [] {parameters_vector[par_indx], parameters_vector[par_indx + 1]};
			par_indx += 2;
		}
		return rslt;
	}
	
	/**
	 * Get the current pseudo-tilt parameters of the two scenes of a pair.
	 * @param npair pair number (index in cpairs)
	 * @return [scene(2)]{pseudo_x, pseudo_y} taken from the current parameters vector
	 */
	public double [][] getPseudoXY(int npair){
		final double [] current_vector = parameters_vector;
		return getPseudoXY(current_vector, npair);
	}
	/**
	 * Extract pseudo-tilt parameters of the two scenes of a pair from an arbitrary
	 * parameters vector (two consecutive values per scene).
	 * @param vector parameters vector to read from
	 * @param npair  pair number (index in cpairs)
	 * @return [scene(2)]{pseudo_x, pseudo_y}
	 */
	public double [][] getPseudoXY(double [] vector, int npair){
		final int par0 = 2 * cpairs[npair][0]; // first parameter of the first scene
		final double [] xy0 = {vector[par0], vector[par0 + 1]};
		final int par1 = 2 * cpairs[npair][1]; // first parameter of the second scene
		final double [] xy1 = {vector[par1], vector[par1 + 1]};
		return new double [][] {xy0, xy1};
	}
	
	/**
	 * Convert the current per-scene pseudo-tilt parameters to affine matrices
	 * using QuatUtils.pseudoTiltToAffine().
	 * @return [scene] affine for each scene
	 */
	public double [][][] getERSAffines(){
		final double [][] pxy = getPseudoXY(); // fresh {pseudo_x, pseudo_y} arrays per scene
		final double [][][] affines = new double [num_scenes][][];
		for (int i = 0; i < affines.length; i++) {
			affines[i] = QuatUtils.pseudoTiltToAffine(pxy[i]);
		}
		return affines;
	}
	
	/**
	 * Decompose the per-scene affines returned by getERSAffines().
	 * @return [scene] singular value decomposition of each scene affine
	 */
	public SingularValueDecomposition [] getSVD() {
		final double [][][] affines = getERSAffines();
		final SingularValueDecomposition [] rslt = new SingularValueDecomposition[num_scenes];
		int nscene = 0;
		while (nscene < num_scenes) {
			rslt[nscene] = SingularValueDecomposition.singularValueDecompose(affines[nscene]);
			nscene++;
		}
		return rslt;
	}
	
	/**
	 * Get the per-pair differential tilts (second scene tilt minus the first scene
	 * tilt) that were stored by prepareLMA(). The internal array is returned, not a copy.
	 * @return [pair]{tiltX,tiltY}, null before prepareLMA() is called
	 */
	public double [][] getDiffTilts(){
		return this.diff_tilts;
	}
	
	
	/**
	 * Print a tab-separated table (one row per scene) with the fitted pseudo-tilt
	 * parameters, their length, SVD representation of the scene affine and the
	 * four affine coefficients.
	 * @param degrees  passed to the SVD title/formatting methods
	 * @param percents not used by this method
	 */
	public void printSceneResults(boolean degrees, boolean percents) {
		final String hdr_fmt = "%4s\t%4s\t%11s\t%11s\t%11s\t%s\t%11s\t%11s\t%11s\t%11s";
		final String row_fmt = "%4d\t%4d\t%11.8f\t%11.8f\t%11.8f\t%s\t%11.8f\t%11.8f\t%11.8f\t%11.8f";
		final double [][] pxy = getPseudoXY();
		final double [][][] affines = getERSAffines();
		final SingularValueDecomposition [] svds = getSVD();
		final String svd_title = SingularValueDecomposition.titleString(degrees);
		System.out.println(String.format(hdr_fmt,
				"#", "scn", "px","py","r",svd_title,"aff[0][0]","aff[0][1]","aff[1][0]","aff[1][1]"));
		int nscene = 0;
		while (nscene < num_scenes) {
			final double x = pxy[nscene][0];
			final double y = pxy[nscene][1];
			final double len = Math.sqrt(x*x+y*y);
			final double [][] a = affines[nscene];
			final String svd_str = svds[nscene].toString(degrees, 1);
			System.out.println(String.format(row_fmt,
					nscene, indices[nscene], x, y, len, svd_str, a[0][0], a[0][1], a[1][0], a[1][1]));
			nscene++;
		}
	}
	/**
	 * Print a tab-separated table (one row per pair) with the differential tilt,
	 * its magnitude/direction/elevation, the image-based pair affine dW, the fit
	 * residual (fX of the pair divided by scale_pairs) and the SVD of the pair affine.
	 * @param degrees  print angles in degrees (radians if false)
	 * @param percents print tilts, dW and residuals multiplied by 100 with a '%' suffix
	 */
	public void printPairsResults(boolean degrees, boolean percents) {
		final double ang_scale = degrees? 180/Math.PI : 1.0;
		final double val_scale = percents? 100 : 1;
		final String svd_title = SingularValueDecomposition.titleString(degrees);
		// function values only (no derivatives) for the current parameters
		final double [] fx = getFxDerivs(
				parameters_vector,
				null,
				0);
		// All header variants share the same layout, only some column labels differ
		final String hdr_fmt = "%4s\t%4s\t%4s\t%11s\t%11s\t%11s\t%11s\t%11s\t%11s\t%11s\t%s";
		final String lbl_tilt_x =   percents ? "tiltX%" : "tiltX";
		final String lbl_tilt_y =   percents ? "tiltY%" : "tiltY";
		final String lbl_tilt =     percents ? "tilt%" : "tilt";
		final String lbl_dir =      degrees ? "tilt-dir\u00B0" : "tilt-dir";
		final String lbl_elev =     degrees ? "elevation\u00B0" : "elevation";
		final String lbl_dw =       percents ? "pair_\u0394w%" : "pair_\u0394w";
		final String lbl_residual = percents ? "residual%" : "residual";
		final String row_fmt;
		if (degrees) {
			if (percents) {
				row_fmt = "%4d\t%4d\t%4d\t%11.6f%%\t%11.6f%%\t%11.6f%%\t%11.6f\t%11.6f\t%11.6f%%\t%11.6f%%\t%s";
			} else {
				row_fmt = "%4d\t%4d\t%4d\t%11.8f\t%11.8f\t%11.8f\t%11.6f\t%11.6f\t%11.8f\t%11.8f\t%s";
			}
		} else {
			if (percents) {
				row_fmt = "%4d\t%4d\t%4d\t%11.6f%%\t%11.6f%%\t%11.6f%%\t%11.8f\t%11.8f\t%11.6f%%\t%11.6f%%\t%s";
			} else {
				row_fmt = "%4d\t%4d\t%4d\t%11.8f\t%11.8f\t%11.8f\t%11.8f\t%11.8f\t%11.8f\t%11.8f\t%s";
			}
		}
		System.out.println(String.format(hdr_fmt,
				"#","scn0","scn1",lbl_tilt_x,lbl_tilt_y,lbl_tilt,lbl_dir,lbl_elev,lbl_dw,lbl_residual,svd_title));

		for (int npair=0; npair < num_pairs; npair++) {
			final int scene0 = indices[cpairs[npair][0]];
			final int scene1 = indices[cpairs[npair][1]];
			final double tx = diff_tilts[npair][0];
			final double ty = diff_tilts[npair][1];
			final double tilt = Math.sqrt(tx*tx+ty*ty);
			final double tilt_dir = Math.atan2(ty,tx);
			final double tilt_ang = Math.atan(tilt);
			// image-based affine for a pair
			final SingularValueDecomposition svd_pair = SingularValueDecomposition.singularValueDecompose(aff_pairs[npair]);
			final String svd_str = svd_pair.toString(degrees, 1);
			final double pair_dw = svd_pair.getDW();
			final double residual = fx[npair] / scale_pairs; // undo the scaling applied in getFxDerivs()
			System.out.println(String.format(row_fmt,
					npair, scene0, scene1, val_scale*tx, val_scale*ty, val_scale*tilt, ang_scale*tilt_dir, ang_scale*tilt_ang,
					val_scale*pair_dw, val_scale*residual, svd_str));
		}
	}	
	
	/*
	 * Starting with tilt0 and tilt1 = tilt0 + tilt_diff (the differential
	 * tilt between the two scenes has better accuracy than the individual
	 * scenes' ground plane tilts as it depends less on the scene
	 * flatness - maybe compare the difference tilt to the difference of the
	 * individual ones to determine applicability of this method).
	 * Then affine transforms for each of the scenes relative to the local
	 * ground surface plane is:
	 * AFF0= AFF_ERS0 * AFF_TILT0 // (AFF_TILT0 calculated from the TILT0 only)
	 * and
	 * AFF1= AFF_ERS1 * AFF_TILT1
	 * AFF_DIFF = AFF_ERS1 * AFF_ERS0.inverse() - affine transform from scene0
	 * to scene 1 that should match independently measured (from image comparison)
	 * AFF_PAIR_NOROT (with rotation and scale removed)
	 * The AFF_ERR = AFF_DIFF * AFF_PAIR_NOROT.inverse() should have tilt minimized,
	 * and scale ignored.
	 * 
	 * AFF_ERS<0,1> are defined by direction angle (beta=-gamma as no rotation) and
	 * k=w2/w1 = w2 assuming w1==1 >= w2
	 * 
	 * So there will be 4 parameters per pair and (1-AFF_ERR.w2/AFF_ERR.w1) as output        
	 * 
	 */
	/**
	 * Initialize the LMA state: store per-pair data (image-based affines,
	 * differential tilts, inverted pair affines with rotation and scale removed,
	 * tilt-derived affines), zero the parameters vector (2 per scene) and build
	 * the weights vector (one sample per pair followed by two regularization
	 * samples per scene) normalized so that the sum of all weights is 1.0.
	 * @param indices        per-scene values, length defines the number of scenes
	 * @param cpairs         [pair]{scene0, scene1} local scene numbers
	 * @param weights_scenes per-scene regularization weights (applied to both scene parameters)
	 * @param weights_pairs  per-pair sample weights
	 * @param weight_pairs_k common multiplier for all pair weights
	 * @param tilts          [pair][scene(2)][tilt(2)]
	 * @param affine_pairs   [pair] image-based affine of each pair
	 * @param debug_level    not used by this method
	 * @return total number of samples (pairs + 2 * scenes)
	 */
	public int prepareLMA(
			int    []     indices, // should all be used
			int    [][]   cpairs,
			double []     weights_scenes, // sfm, number used?
			double []     weights_pairs, // from matching tilts(flatness) (and worst sfm, per-pair rmse)?
			double        weight_pairs_k,
			double [][][] tilts,      // [pair][scene(2)][tilt(2)]
			double [][][] affine_pairs,
			int           debug_level) {
		this.indices =         indices;
		this.cpairs =          cpairs;
		this.num_scenes =      indices.length;
		this.num_pairs =       cpairs.length;
		this.aff_pairs =       new double [num_pairs][][]; // kept only for reports
		this.diff_tilts =      new double [num_pairs][];
		this.aff_pairs_inosr = new double [num_pairs][][];
		this.aff_tilts =       new double [num_pairs][2][][];
		for (int npair = 0; npair < num_pairs; npair++) {
			final double [][] pair_tilts = tilts[npair];
			final double [][] pair_affine = affine_pairs[npair];
			this.aff_pairs[npair] = pair_affine;
			this.diff_tilts[npair] = new double [] {
					pair_tilts[1][0]-pair_tilts[0][0],
					pair_tilts[1][1]-pair_tilts[0][1]};
			// remove rotation and scale only, then invert
			final double [][] pair_affine_nors = SingularValueDecomposition.removeTiltRotScale(
					pair_affine, // A
					false,       // removeTilt
					true,        // removeRot
					true,        // removeScale
					false,       // removeOffset
					false);      // max_is_scale
			this.aff_pairs_inosr[npair] = QuatUtils.matInverse2x2(pair_affine_nors);
			// tilts are converted to affines once as they are not modified later
			this.aff_tilts[npair][0] = QuatUtils.tiltToAffine(pair_tilts[0]);
			this.aff_tilts[npair][1] = QuatUtils.tiltToAffine(pair_tilts[1]);
		}
		this.parameters_vector = new double [2 * num_scenes];
		this.weights = new double [num_pairs + 2 * num_scenes];
		double sw_pairs = 0;
		for (int npair = 0; npair < num_pairs; npair++) {
			weights[npair] = weights_pairs[npair] * weight_pairs_k;
			sw_pairs += weights[npair];
		}
		double sw_all = sw_pairs;
		for (int nscene = 0; nscene < num_scenes; nscene++) {
			final int windx = num_pairs + 2*nscene;
			weights[windx + 0] = weights_scenes[nscene];
			weights[windx + 1] = weights_scenes[nscene];
			sw_all += 2 * weights_scenes[nscene];
		}
		final double norm = 1.0/sw_all;
		for (int nw = 0; nw < weights.length; nw++) {
			weights[nw] *= norm;
		}
		this.weight_pure = sw_pairs * norm;
		this.last_jt = new double [parameters_vector.length][];
		return weights.length;
	}
	
	/**
	 * Calculate function values and (optionally) the transposed Jacobian for the
	 * specified parameters vector. The first num_pairs samples are per-pair errors
	 * (scaled difference between the two values returned for each pair by
	 * SingularValueDecomposition.getMinMaxEigenValues()), the remaining
	 * 2 * num_scenes samples are the parameters themselves (regularization).
	 * @param vector      parameters vector, 2 values per scene
	 * @param jt          null (no derivatives) or an array with [vector.length] rows;
	 *                    every row is re-allocated and filled: [parameter][sample]
	 * @param debug_level not used by this method
	 * @return function values, one per sample (same length as weights)
	 */
	private double [] getFxDerivs(
			final double []         vector,
			final double [][] jt, // should be null or initialized with [vector.length][]
			final int         debug_level)
	{
		final int num_samples = weights.length;
		final boolean need_derivs = (jt != null);
		final double [] fX = new double [num_samples];
		if (need_derivs) {
			for (int npar = 0; npar < jt.length; npar++) {
				jt[npar] = new double [num_samples];
			}
		}

		final Thread[] threads = ImageDtt.newThreadArray();
		final AtomicInteger ai = new AtomicInteger(0);
		// first cycle - per-pair errors
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					int npair;
					while ((npair = ai.getAndIncrement()) < num_pairs) {
						final double [][][] aff_err_derivs = QuatUtils.pseudoAffineDiffAndDerivatives(
								getPseudoXY(vector,npair), // [scene][direction]
								aff_tilts[npair],          // tilt-derived affines of both scenes
								aff_pairs_inosr[npair],    // inverted pair affine
								invert_q2a);
						// without derivatives only the first element (the error affine itself) is needed
						final double [][][] aff_in = need_derivs ? aff_err_derivs : new double [][][] {aff_err_derivs[0]};
						final double [][] WdW = SingularValueDecomposition.getMinMaxEigenValues(aff_in);
						fX[npair] = scale_pairs * (WdW[0][1] - WdW[0][0]);
						if (need_derivs) {
							final int [] pair = cpairs[npair];
							// k = 2 * (scene in pair) + (parameter of the scene), derivatives start from WdW[1]
							for (int k = 0; k < 4; k++) {
								final int npar = 2*pair[k >> 1] + (k & 1);
								jt[npar][npair] = scale_pairs* (WdW[k+1][1] - WdW[k+1][0]);
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		ai.set(0);
		// second cycle - regularization samples: the parameters themselves
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					int nscene;
					while ((nscene = ai.getAndIncrement()) < num_scenes) {
						for (int j = 0; j < 2; j++) {
							final int npar = 2*nscene + j;
							final int nsample = num_pairs + npar;
							fX[nsample] =  vector[npar];
							if (need_derivs) {
								jt[npar][nsample] = 1.0; // sample equals its own parameter
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		return fX;
	}
	
	
	/**
	 * Calculate the transposed Jacobian numerically, by central differences:
	 * each parameter is offset by +delta/2 and then by -delta/2 and the
	 * difference of the function values is divided by delta. Samples with
	 * non-positive weight are left zero.
	 * @param vector      parameters vector (not modified)
	 * @param delta       full parameter increment between the two evaluations
	 * @param debug_level passed to getFxDerivs()
	 * @return [parameter][sample] numeric derivatives
	 */
	private double [][] getFxDerivsDelta(
			double []         vector,
			final double      delta,
			final int         debug_level) {
		final int num_samples = weights.length;
		final double [][] jt_num =  new double [vector.length][num_samples];
		final double [] probe = vector.clone(); // working copy, one element is modified at a time
		for (int npar = 0; npar < vector.length; npar++) {
			probe[npar] = vector[npar] + 0.5*delta;
			final double [] fx_plus =  getFxDerivs(
					probe,
					null, // no derivatives
					debug_level);
			probe[npar] -= delta;
			final double [] fx_minus =  getFxDerivs(
					probe,
					null, // no derivatives
					debug_level);
			probe[npar] = vector[npar]; // restore exactly
			final double [] row = jt_num[npar];
			for (int nsample = 0; nsample < num_samples; nsample++) {
				if (weights[nsample] > 0) {
					row[nsample] = (fx_plus[nsample]-fx_minus[nsample])/delta;
				}
			}
		}
		return jt_num;
	}

	/**
	 * Debug feature: compare the analytical transposed Jacobian from getFxDerivs()
	 * with the numerical one from getFxDerivsDelta() and print them. For each
	 * processed sample with positive weight one line is printed: sample index,
	 * analytical derivatives, numerical derivatives and their differences (each
	 * group has one column per parameter). The last printed row (without the
	 * trailing new line) contains the maximal absolute difference per parameter.
	 * @param vector    parameters vector
	 * @param delta     parameter increment for the numerical derivatives
	 * @param last3only process only the last three samples (all available ones if
	 *                  there are fewer than three)
	 * @return maximal absolute difference between the analytical and numerical derivatives
	 */
	private double compareJT(
			double [] vector,
			double    delta,
			boolean   last3only) { // do not process samples - they are tested before
		final int num_pars = vector.length;
		double []  errors=new double [num_pars];
		double [][] jt =  new double [num_pars][];
		System.out.print("Parameters vector = [");
		for (int i = 0; i < num_pars; i++) {
			System.out.print(vector[i]);
			if (i < (num_pars -1)) System.out.print(", ");
		}
		System.out.println("]");
		getFxDerivs(
				vector,
				jt, // final double [][] jt, // should be null or initialized with [vector.length][]
				1); // debug_level);
		double [][] jt_delta =  getFxDerivsDelta(
				vector, // double []         vector,
				delta, // final double      delta,
				-1); // final int         debug_level)
		// clamp to 0: there may be fewer than 3 samples
		int start_index = last3only? Math.max(0, weights.length-3) : 0;
		for (int n = start_index; n < weights.length; n++) if (weights[n] > 0) {
			System.out.print(String.format("%3d",n));
			for (int i = 0; i < num_pars; i++) {
				System.out.print(String.format("\t%12.9f",jt[i][n]));
			}			
			for (int i = 0; i < num_pars; i++) {
				System.out.print(String.format("\t%12.9f",jt_delta[i][n]));
			}			
			for (int i = 0; i < num_pars; i++) {
				System.out.print(String.format("\t%12.9f",jt[i][n]-jt_delta[i][n]));
			}			
			System.out.println();
			for (int i = 0; i < num_pars; i++) {
				// absolute value: negative mismatches are errors too
				errors[i] = Math.max(errors[i], Math.abs(jt[i][n]-jt_delta[i][n]));
			}
		}
		// skip the columns of the analytical and numerical derivatives
		for (int i = 0; i < num_pars; i++) {
			System.out.print("\t\t");
		}			
		for (int i = 0; i < num_pars; i++) {
			System.out.print(String.format("\t%12.9f",errors[i]));
		}			
		double err=0;
		for (int i = 0; i < num_pars; i++) {
			err = Math.max(errors[i], err);
		}
		return err;
	}
	
	
	
	/**
	 * Calculate weighted residuals (there is no Y-vector, so the residual is
	 * fx_scale * (-fx)) and, optionally, RMS values. The pair samples are
	 * processed first to get the "pure" sum of squares, then the remaining
	 * (regularization) samples are accumulated on top of it.
	 * @param fx     function values, one per sample
	 * @param rms_fp null or double[2] that will be set to
	 *               {sqrt(weighted sum of squares of all samples),
	 *                sqrt(weighted sum of squares of pair samples / weight_pure)}
	 * @return weighted residuals, same length as fx
	 */
	private double [] getYminusFxWeighted(
			final double []   fx,
			final double []   rms_fp // null or [2]
			) {
		final Thread[]      threads =      ImageDtt.newThreadArray();
		final AtomicInteger ai_sample =    new AtomicInteger(0);
		final AtomicInteger ai_thread =    new AtomicInteger(0);
		final double []     weighted =     new double [fx.length];
		final double []     partial_sums = new double[threads.length]; // per-thread accumulators
		// first pass - samples of the pairs
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					final int nthread =  ai_thread.getAndIncrement();
					int n;
					while ((n = ai_sample.getAndIncrement()) < num_pairs) {
						final double d = fx_scale *(-fx[n]);
						final double wd = d * weights[n];
						weighted[n] = wd;
						partial_sums[nthread] += d * wd;
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		double s_rms_pure = 0;
		for (double s : partial_sums) {
			s_rms_pure += s;
		}
		// Each thread increments the counter once more before it sees the end of the range,
		// so the counter has to be set explicitly to the first regularization sample.
		ai_sample.set(num_pairs);
		ai_thread.set(0);
		// second pass - remaining samples, added to the same per-thread accumulators
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					final int nthread =  ai_thread.getAndIncrement();
					int n;
					while ((n = ai_sample.getAndIncrement()) < fx.length) {
						final double d = fx_scale *(-fx[n]);
						final double wd = d * weights[n];
						weighted[n] = wd;
						partial_sums[nthread] += d * wd;
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		
		double s_rms = 0; // accumulators now include both passes
		for (double s : partial_sums) {
			s_rms += s;
		}
		
		if (rms_fp != null) {
			rms_fp[0] = Math.sqrt(s_rms);
			rms_fp[1] = Math.sqrt(s_rms_pure/weight_pure);
		}
		return weighted;
	}
	
	/**
	 * Run LMA iterations until convergence, failure or iterations limit.
	 * Lambda is multiplied by lambda_scale_good after each successful step and
	 * by lambda_scale_bad after each failed one; iterations are aborted when
	 * lambda exceeds lambda_max. If the last step failed but the RMS is still
	 * better than the initial one the run is considered successful.
	 * @param lambda            initial value of lambda
	 * @param lambda_scale_good lambda multiplier after a successful step
	 * @param lambda_scale_bad  lambda multiplier after a failed step
	 * @param lambda_max        abort iterations when lambda exceeds this value
	 * @param rms_diff          relative RMS improvement below which the fit is considered converged
	 * @param num_iter          maximal number of steps
	 * @param last_run          enables failure report at lower debug levels
	 * @param dbg_prefix        not used (reserved for debug images)
	 * @param debug_level       debug level, higher values print more
	 * @return -1 on failure (including num_iter <= 0, when no step is made),
	 *         otherwise the value of the step counter when the iterations stopped
	 */
	public int runLma( // <0 - failed, >=0 iteration number (1 - immediately)
			double lambda,           // 0.1
			double lambda_scale_good,// 0.5
			double lambda_scale_bad, // 8.0
			double lambda_max,       // 100
			double rms_diff,         // 0.001
			int    num_iter,         // 20
			boolean last_run,
			String dbg_prefix,
			int    debug_level)
	{
		boolean [] rslt = {false,false};
		this.last_rms = null; // forces lmaStep() to (re)calculate the initial state
		int iter = 0;
		for (iter = 0; iter < num_iter; iter++) {
			rslt =  lmaStep(
					lambda,
					rms_diff,
					debug_level);
			if (rslt == null) {
				return -1; // false; // need to check
			}
			if (debug_level > 1) {
				System.out.println("LMA step"+String.format("%3d",iter)+": {"+rslt[0]+","+rslt[1]+"} full RMS= "+good_or_bad_rms[0]+
						" ("+initial_rms[0]+"), pure RMS="+good_or_bad_rms[1]+" ("+initial_rms[1]+") + lambda="+lambda);
			}
			if (rslt[1]) {
				break;
			}
			if (rslt[0]) { // good
				lambda *= lambda_scale_good;
			} else {
				lambda *= lambda_scale_bad;
				if (lambda > lambda_max) {
					break; // not used in lwir
				}
			}
		}
		if (this.last_rms == null) {
			// No step was made (num_iter <= 0), so there are no RMS values to compare or report
			if (debug_level > 0) {
				System.out.println("LMA: no steps were made, num_iter="+num_iter);
			}
			return -1;
		}
		if (rslt[0]) { // better
			if (iter >= num_iter) { // better, but num tries exceeded
				if (debug_level > 1) System.out.println("Step "+iter+": Improved, but number of steps exceeded maximal");
			} else {
				if (debug_level > 1) System.out.println("Step "+iter+": LMA: Success");
			}

		} else { // improved over initial ?
			if (last_rms[0] < initial_rms[0]) { // false for NaN
				rslt[0] = true;
				if (debug_level > 1) System.out.println("Step "+iter+": Failed to converge, but result improved over initial");
			} else {
				if (debug_level > 1) System.out.println("Step "+iter+": Failed to converge");
			}
		}
		if (debug_level > 0) {
			System.out.println("LMA: full RMS="+last_rms[0]+" ("+initial_rms[0]+"), pure RMS="+last_rms[1]+" ("+initial_rms[1]+") + lambda="+lambda);
		}
		if (debug_level > 2){ 
			System.out.println("iteration="+iter);
		}
		if ((debug_level > -2) && !rslt[0]) { // failed
			if ((debug_level > 1) || (iter == 1) || last_run) {
				System.out.println("LMA failed on iteration = "+iter);
			}
			System.out.println();
		}

		return rslt[0]? iter : -1;
	}
	
	
	
	/**
	 * Make a single LMA step. On the first call after last_rms was reset to null
	 * the function values, Jacobian, weighted residuals and initial RMS are
	 * calculated for the current parameters. Then the parameters increment is
	 * found from (JtWJ + lambda * diag(JtWJ)) * delta = Jt * W(y - fx) and the
	 * new vector is evaluated. If the full RMS improved the new vector is
	 * accepted, otherwise the state (Jacobian, residuals, RMS) is recalculated
	 * for the previous parameters.
	 * @param lambda      current damping parameter
	 * @param rms_diff    relative RMS improvement below which the fit is considered converged
	 * @param debug_level debug level, higher values print more (above 3 - compare
	 *                    analytical and numerical Jacobians)
	 * @return {improved, finished}: finished is set when improvement is smaller
	 *         than rms_diff or when the matrix can not be inverted; null - if
	 *         residuals are not available (need to re-init/restart LMA)
	 */
	private boolean [] lmaStep(
			double lambda,
			double rms_diff,
			int debug_level) {
		boolean [] rslt = {false,false};
		if (this.last_rms == null) { //first time, need to calculate all (vector is valid)
			last_rms = new double[2];
			if (debug_level > 1) {
				System.out.println("lmaStep(): first step");
			}
			double [] fx0 = getFxDerivs(
					parameters_vector, // double []         vector,
					last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
					debug_level);      // final int         debug_level)
			last_ymfx = getYminusFxWeighted(
					fx0,       // final double []   fx,
					last_rms); // final double []   rms_fp // null or [2]
			this.initial_rms = this.last_rms.clone();
			this.good_or_bad_rms = this.last_rms.clone();
			if (last_ymfx == null) {
				return null; // need to re-init/restart LMA
			}
		}
		if (debug_level > 3) {
			// Jacobian test, runs once per step (including the first one)
		 	double delta_err=compareJT(
		 			parameters_vector, // double [] vector,
					this.delta,        // double    delta,
					last3only);        // boolean   last3only); // do not process samples - they are tested before
			System.out.println("\nMaximal error = "+delta_err);
		}

		Matrix y_minus_fx_weighted = new Matrix(this.last_ymfx, this.last_ymfx.length);

		Matrix wjtjlambda = new Matrix(getWJtJlambda(
				lambda,
				this.last_jt));
		
		if (debug_level>2) {
			System.out.println("JtJ + lambda*diag(JtJ");
			wjtjlambda.print(18, 6);
		}
		Matrix jtjl_inv = null;
		try {
			jtjl_inv = wjtjlambda.inverse(); // throws RuntimeException if the matrix is singular
		} catch (RuntimeException e) {
			rslt[1] = true; // finished, not improved
			if (debug_level > 0) {
				System.out.println("Singular Matrix!");
			}

			return rslt;
		}
		if (debug_level>2) {
			System.out.println("(JtJ + lambda*diag(JtJ).inv()");
			jtjl_inv.print(18, 6);
		}
		// last_ymfx is already multiplied by weights
		Matrix jty = (new Matrix(this.last_jt)).times(y_minus_fx_weighted);
		if (debug_level>2) {
			System.out.println("Jt * (y-fx)");
			jty.print(18, 6);
		}
		
		Matrix mdelta = jtjl_inv.times(jty);
		if (debug_level>2) {
			System.out.println("mdelta");
			mdelta.print(18, 6);
		}

		double []  par_delta =  mdelta.getColumnPackedCopy();
		double []  new_vector = parameters_vector.clone();
		for (int i = 0; i < parameters_vector.length; i++) {
			new_vector[i] += par_delta[i];
		}
		
		// evaluate the candidate vector, last_jt now corresponds to new_vector
		double [] fx = getFxDerivs(
				new_vector,        // double []         vector,
				last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
				debug_level);      // final int         debug_level)
		double [] rms = new double[2];
		last_ymfx = getYminusFxWeighted(
				fx,   // final double []   fx,
				rms); // final double []   rms_fp // null or [2]
		if (last_ymfx == null) {
			return null; // need to re-init/restart LMA
		}

		this.good_or_bad_rms = rms.clone();
		if (rms[0] < this.last_rms[0]) { // improved
			rslt[0] = true;
			// finished if relative improvement is below the threshold
			rslt[1] = rms[0] >=(this.last_rms[0] * (1.0 - rms_diff));
			this.last_rms = rms.clone();
			this.parameters_vector = new_vector.clone();
		} else { // worsened (or NaN)
			rslt[0] = false;
			rslt[1] = false; // do not know, caller will decide
			// restore state: Jacobian, residuals and RMS for the previous (kept) parameters
			fx = getFxDerivs(
					parameters_vector, // double []         vector,
					last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
					debug_level);      // final int         debug_level)
			last_ymfx = getYminusFxWeighted(
					fx,             // final double []   fx,
					this.last_rms); // final double []   rms_fp // null or [2]
			if (last_ymfx == null) {
				return null; // need to re-init/restart LMA
			}
		}
		return rslt;
	}
	
	
	
	/**
	 * Calculate the symmetrical matrix Jt * W * J with the diagonal elements
	 * additionally multiplied by (1 + lambda).
	 * @param lambda damping parameter
	 * @param jt     transposed Jacobian [parameter][sample], should have at least one row
	 * @return [parameter][parameter] matrix
	 */
	private double [][] getWJtJlambda(
			final double      lambda,
			final double [][] jt)
	{
		final int num_pars = jt.length;
		final int num_points = jt[0].length;
		final double [][] wjtjl = new double [num_pars][num_pars];
		final Thread[] threads = ImageDtt.newThreadArray();
		final AtomicInteger ai = new AtomicInteger(0);
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					// Each thread takes whole rows, only the upper triangle is calculated
					// and mirrored, so no element is written by more than one thread.
					for (int i = ai.getAndIncrement(); i < num_pars; i = ai.getAndIncrement()) {
						for (int j = i; j < num_pars; j++) {
							double d = 0.0;
							for (int k = 0; k < num_points; k++) {
								d += weights[k]*jt[i][k]*jt[j][k];
							}
							wjtjl[i][j] = d;
							if (i == j) {
								wjtjl[i][j] += d * lambda;
							} else {
								wjtjl[j][i] = d;
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		return wjtjl;
	}
	
	
	/**
	 * Get RMS values matching the current parameters vector.
	 * The internal array is returned, not a copy.
	 * @return {rms, rms_pure} or null if no LMA step was made yet
	 */
	public double [] getRms() {
		return this.last_rms;
	}

	/**
	 * Get RMS values calculated on the first LMA step, before any parameter change.
	 * The internal array is returned, not a copy.
	 * @return {rms, rms_pure} or null if no LMA step was made yet
	 */
	public double [] getInitialRms() {
		return this.initial_rms;
	}	
	
	
	
	
}
