 /**
 ** OrientationSceneLMA - Fit multiple scenes orientations/scales when some
 ** pairwise  mutual orientations/scales are known as quaternions (not unit ones)
 **                into a set of output 3D vectors
 **
 ** Copyright (C) 2024 Elphel, Inc.
 **
 ** -----------------------------------------------------------------------------**
 **
 **  OrientationSceneLMA.java is free software: you can redistribute it and/or modify
 **  it under the terms of the GNU General Public License as published by
 **  the Free Software Foundation, either version 3 of the License, or
 **  (at your option) any later version.
 **
 **  This program is distributed in the hope that it will be useful,
 **  but WITHOUT ANY WARRANTY; without even the implied warranty of
 **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 **  GNU General Public License for more details.
 **
 **  You should have received a copy of the GNU General Public License
 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ** -----------------------------------------------------------------------------**
 **
 */
package com.elphel.imagej.orthomosaic;

import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;

import com.elphel.imagej.tileprocessor.ImageDtt;

import Jama.Matrix;

public class OrientationSceneLMA {
	public int []             indices;                    // scene indices as passed to prepareLMA(); its length defines num_scenes
	public int [][]           cpairs =          null;     // [pair]{scene1, scene2} - positions of the two scenes of each pair (parameters offset is 4 * position)
	public int                num_scenes = 0;             // number of scenes (indices.length)
	public int                num_pairs = 0;              // number of scene pairs (cpairs.length)
	private double []         last_rms =        null; // {rms, rms_pure}, matching this.parameters_vector
	private double []         good_or_bad_rms = null; // just for diagnostics, to read last (failed) rms
	private double []         initial_rms =     null; // {rms, rms_pure}, first-calculated rms
	private double []         parameters_vector = null;   // 4 values (quaternion) per scene, initialized to {1,0,0,0} in prepareLMA()
	private double [][]       qpairs = null;              // [pair][4] quaternions of the pairs, as passed to prepareLMA()
	private double            weight_pure =     0;        // 1 - pull: fraction of the weights assigned to the pairs samples
	private double []         weights; // normalized so sum is 1.0 for all - samples and extra regularization
	private double []         last_ymfx =       null;     // weighted (y - fx) from the last getYminusFxWeighted() call
	private double [][]       last_jt =         null;     // transposed Jacobian [parameter][sample] from the last getFxDerivs() call
	
	public boolean            last3only =       false;  // true; // debug feature: compareJT() prints only the last 3 samples
	public double             delta=            1e-5; // 7; // step for numeric (finite-difference) derivatives
	public boolean            debug_deriv = true;         // enable derivatives debug output in getFxDerivs_debug()
	public int                debug_width =     12;       // field width passed to the debug derivative printouts
	public int                debug_decimals =  9;        // number of decimals passed to the debug derivative printouts

	/**
	 * Split the current parameters vector into per-scene quaternions.
	 * @return new [num_scenes][4] array; row n is a copy of
	 *         parameters_vector[4*n] .. parameters_vector[4*n+3]
	 */
	public double [][] getOrientationQuaternions(){
		final double [][] scene_quats = new double [num_scenes][4];
		int offset = 0;
		for (double [] quat : scene_quats) {
			System.arraycopy(parameters_vector, offset, quat, 0, quat.length);
			offset += quat.length;
		}
		return scene_quats;
	}
	
	/**
	 * Initialize the fitting: store the pairs data, build the normalized weights vector
	 * and reset all scene quaternions to {1,0,0,0}.
	 * <p>
	 * The weights vector has 4 entries per pair ({scale, qn1, qn2, qn3} errors) followed by
	 * 4 regularization ("pull") entries ({scale, qn1, qn2, qn3} sums over all scenes).
	 * Pair weights are scaled so they sum to (1 - pull), regularization weights are
	 * proportional to {pull_scales, pull_tilts/2, pull_tilts/2, pull_rots} and sum to pull.
	 *
	 * @param indices       scene indices, only the length is used here (number of scenes)
	 * @param cpairs        [pair]{scene1, scene2} positions of the scenes of each pair
	 * @param weights_pairs per-pair weights or null to use 1.0 for every pair
	 * @param weight_rot    weight of pairs errors in qn3
	 * @param weight_tilt   weight of pairs errors in qn1, qn2 (split equally between them)
	 * @param weight_scale  weight of pairs (scale - 1.0) errors
	 * @param pull          0 <= pull < 1 - fraction of the total weight used for regularization;
	 *                      forced to 0 when all of pull_rots, pull_tilts, pull_scales sum to <= 0
	 * @param pull_rots     relative regularization weight of the sum of scenes qn3
	 * @param pull_tilts    relative regularization weight of the sums of scenes qn1, qn2
	 * @param pull_scales   relative regularization weight of the sum of scenes (scale - 1.0)
	 * @param qpairs        [pair][4] quaternions for pairs - orientation and scale (non-unit quaternions)
	 * @param debug_level   debug level (not used here)
	 * @return length of the weights vector (4 * number of pairs + 4)
	 */
	public int prepareLMA(
			int    []     indices, // should all be used
			int    [][]   cpairs,
			double []     weights_pairs, // from matching tilts(flatness) (and worst sfm, per-pair rmse)?
			double        weight_rot,    // >0 weight of pairs errors in qn3
			double        weight_tilt,   // >0 weight of pairs errors in qn1, qn2
			double        weight_scale,  // >0 weight in pairs scale-1.0 errors
			double        pull,          // 0 <= pull <1 - fraction of all RMS contributors
			double        pull_rots,     // >=0 weight of sum of rotations, may be 0, normalized by pull value 
			double        pull_tilts,    // >=0 weights of sum of qn1 and qn2 of scenes, normalized by pull value
			double        pull_scales,   // >=0 weights of scales of scenes, normalized by pull value
			double [][]   qpairs,        // [pair][4] quaternions for pairs - orientation and scale (non-unity quaternions) 
			int           debug_level) {
		this.indices =         indices;  // maybe not needed
		this.cpairs = cpairs;
		num_scenes = indices.length;
		num_pairs = cpairs.length;
		if (weights_pairs == null) {
			weights_pairs = new double[num_pairs];
			Arrays.fill(weights_pairs, 1.0);
		}
		weights = new double [num_pairs*4+4];
		double sum_weights = 0;
		// per-component weights in the order of the fx fragment: {scale, qn1, qn2, qn3}
		double [] weights4 = {weight_scale, weight_tilt/2, weight_tilt/2, weight_rot};
		double [] pull4 =    {pull_scales,  pull_tilts/2,  pull_tilts/2,  pull_rots};
		
		double sum_pull4 = 0;
		for (double w: pull4) sum_pull4+= w;
		if (sum_pull4 <= 0) {
			pull = 0; // no regularization component is enabled - give all the weight to the pairs
		}
		for (int npair = 0; npair < num_pairs; npair++) {
			for (int i = 0; i < 4; i++) {
				int indx = 4* npair+ i;
				double w = weights_pairs[npair] * weights4[i];
				weights[ indx] = w; 
				sum_weights += w;
			}
		}
		weight_pure = 1 - pull;
		double k1 = (1.0 - pull)/sum_weights;        // pair weights will sum to (1 - pull)
		double k2 = (pull>0) ? (pull/sum_pull4): 0;  // regularization weights will sum to pull
		for (int i = 0; i < 4*num_pairs; i++) {
			weights[i] *= k1;
		}
		for (int i = 0; i < 4; i++) {
			// Regularization weights must follow pull4 (the same values that defined sum_pull4),
			// not the pair-error weights4, otherwise pull_scales/pull_tilts/pull_rots have no effect.
			weights[4 * num_pairs + i] = k2*pull4[i];
		}
		parameters_vector = new double [4 * num_scenes];
		for (int i = 0; i < num_scenes; i++) {
			parameters_vector[4*i] = 1.0; // start each scene from quaternion {1,0,0,0}
		}
		last_jt = new double [parameters_vector.length][];
		this.qpairs = qpairs;
		return weights.length;
	}
	
	/**
	 * Calculate the model vector fX and (optionally) the transposed Jacobian for the
	 * provided parameters vector.
	 * <p>
	 * fX layout: for each pair 4 values {norm - 1, qn1, qn2, qn3} of the pair error
	 * quaternion returned by getPairErrQuaternion(), followed by 4 regularization values -
	 * sums over all scenes of {norm - 1, qn1, qn2, qn3} of the scene quaternion.
	 *
	 * @param vector      parameters vector, 4 values (quaternion) per scene
	 * @param jt          null (fX only) or an array [vector.length][]; each row is replaced
	 *                    by a new array [weights.length], jt[parameter][sample] = d fX[sample] / d vector[parameter]
	 * @param debug_level debug level (not used here)
	 * @return fX vector, same length as weights
	 */
	private double [] getFxDerivs(
			final double []         vector,
			final double [][] jt, // should be null or initialized with [vector.length][]
			final int         debug_level)
	{
		final double [] fX = new double [weights.length];
		if (jt != null) {
			for (int i = 0; i < jt.length; i++) {
				jt[i] = new double [weights.length];
			}
		}

		final Thread[] threads = ImageDtt.newThreadArray();
		final AtomicInteger ai = new AtomicInteger(0);
		final AtomicInteger ati = new AtomicInteger(0);
		// first cycle - per-pair errors: each pair writes its own 4 fX entries and its own jt columns
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					double [] qscene1 = new double[4];
					double [] qscene2 = new double[4];
					double [][][] derivs = (jt!=null) ? (new double [2][][]): null;
					double [][][] qderivs = (jt != null) ? (new double [2][][]) : null;
					
					for (int npair = ai.getAndIncrement(); npair < num_pairs; npair = ai.getAndIncrement()) {
						// offsets of the two scenes quaternions in the parameters vector
						int [] scene_ind = {4 * cpairs[npair][0], 4 * cpairs[npair][1]};
						System.arraycopy(vector, scene_ind[0], qscene1, 0, 4);
						System.arraycopy(vector, scene_ind[1], qscene2, 0, 4);
						double [] q2iq1q12 = getPairErrQuaternion( // qderivs here tested
								qscene1, // double [] qscene1,
								qscene2, // double [] qscene2,
								qpairs[npair], // double [] qpair12,
								qderivs); // double [][][] qderivs)

						double [] q2iq1q12_norm=QuatUtils.normalize(q2iq1q12); // use q1,q2,q3, maybe q1 and q2 (from DEM) lower weight than q3 (from images)
						double scale_diff = QuatUtils.norm(q2iq1q12)-1.0;
						double [] fx_frag = {scale_diff, q2iq1q12_norm[1], q2iq1q12_norm[2], q2iq1q12_norm[3]};

						System.arraycopy(
								fx_frag,
								0,
								fX,
								fx_frag.length*npair,
								fx_frag.length);
						if (jt!=null) {
							double [][] dq2iq1q12_norm = QuatUtils.dnormalize_dq(q2iq1q12);
							double [] dscale = QuatUtils.dscale_dq(q2iq1q12);
							dq2iq1q12_norm[0] = dscale; // replace first row (q_norm/dq) with dscale/dq
							derivs[0] = QuatUtils.matMult(dq2iq1q12_norm,qderivs[0]); // /dq1 (scene 1)
							derivs[1] = QuatUtils.matMult(dq2iq1q12_norm,qderivs[1]); // /dq2 (scene 2)
							
							// derivs[iscene] is indexed [output][parameter], jt is [parameter][sample] - transpose
							for (int iscene = 0; iscene <2; iscene++) {
								for (int npar = 0; npar <4; npar++) {
									for (int i = 0; i < 4; i++) {
										jt[scene_ind[iscene]+npar][fx_frag.length*npair+i] = derivs[iscene][i][npar];
									}
								}
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		ai.set(0);
		// second cycle - regularization (sums of scale diffs, q1,q2,q3 over all scenes)
		final double [][]   pull_threads = new double [threads.length][4]; // per-thread partial sums
		final int pull_index = num_pairs*4; // start of pull sums in fx
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					int nthread =  ati.getAndIncrement();
					double []   qscene = new double[4];
					for (int nscene = ai.getAndIncrement(); nscene < num_scenes; nscene = ai.getAndIncrement()) {
						System.arraycopy(vector, nscene*4, qscene, 0, 4);
						double [] qscene_norm=QuatUtils.normalize(qscene); // use q1,q2,q3, maybe q1 and q2 (from DEM) lower weight than q3 (from images)
						double scale_diff = QuatUtils.norm(qscene)-1.0;
						qscene_norm[0] = scale_diff; // {scale_diff, qn1, qn2, qn3};
						for (int i = 0; i < 4; i++) {
							pull_threads[nthread][i] += qscene_norm[i];
						}
						if (jt != null) {
							double [][] dnq_dq = QuatUtils.dnormalize_dq(qscene);
							double []   dscale = QuatUtils.dscale_dq(qscene);
							dnq_dq[0] = dscale; // row 0: d(scale)/dq, rows 1..3: d(qn1..qn3)/dq
							// dnq_dq is indexed [output][parameter] (same as in the first cycle), while jt is
							// [parameter][sample], so it has to be stored transposed. After row 0 is replaced
							// with dscale the matrix is no longer symmetric and a row-wise copy would be wrong.
							for (int npar = 0; npar <4; npar++) {
								for (int i = 0; i < 4; i++) {
									jt[nscene*4+npar][pull_index+i] = dnq_dq[i][npar];
								}
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		// sum partial sums from threads
		for (int nthread = 0; nthread < pull_threads.length; nthread++ ) {
			for (int i = 0; i < 4; i++) {
				fX[pull_index+i] += pull_threads[nthread][i];
			}
		}
		return fX;
	}

	
	/**
	 * Debug variant of getFxDerivs(): calculates the same fX vector and transposed Jacobian,
	 * additionally printing comparisons of analytical and finite-difference derivatives
	 * (with printDerivDelta()) when debug_deriv is set.
	 * <p>
	 * fX layout: for each pair 4 values {norm - 1, qn1, qn2, qn3} of the pair error
	 * quaternion, followed by 4 regularization values - sums over all scenes of
	 * {norm - 1, qn1, qn2, qn3} of the scene quaternion.
	 *
	 * @param vector      parameters vector, 4 values (quaternion) per scene
	 * @param jt          null (fX only) or an array [vector.length][]; each row is replaced
	 *                    by a new array [weights.length], jt[parameter][sample] = d fX[sample] / d vector[parameter]
	 * @param debug_level when above 10 (and debug_deriv is set) the verbose version of
	 *                    getPairErrQuaternion() is used
	 * @return fX vector, same length as weights
	 */
	public double [] getFxDerivs_debug(
			final double []         vector,
			final double [][] jt, // should be null or initialized with [vector.length][]
			final int         debug_level)
	{
		final double [] fX = new double [weights.length];
		if (jt != null) {
			for (int i = 0; i < jt.length; i++) {
				jt[i] = new double [weights.length];
			}
		}

		final Thread[] threads = ImageDtt.newThreadArray();
		final AtomicInteger ai = new AtomicInteger(0);
		final AtomicInteger ati = new AtomicInteger(0);
		// first cycle - per-pair errors: each pair writes its own 4 fX entries and its own jt columns
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					double [] qscene1 = new double[4];
					double [] qscene2 = new double[4];
					double [][][] derivs = (jt!=null) ? (new double [2][][]): null;
					double [][][] qderivs = (jt != null) ? (new double [2][][]) : null;
					
					for (int npair = ai.getAndIncrement(); npair < num_pairs; npair = ai.getAndIncrement()) {
						// offsets of the two scenes quaternions in the parameters vector
						int [] scene_ind = {4 * cpairs[npair][0], 4 * cpairs[npair][1]};
						System.arraycopy(vector, scene_ind[0], qscene1, 0, 4);
						System.arraycopy(vector, scene_ind[1], qscene2, 0, 4);
						double [] q2iq1q12;
						if (debug_deriv && (debug_level>10)) {
							q2iq1q12 = getPairErrQuaternion(
									qscene1, // double [] qscene1,
									qscene2, // double [] qscene2,
									qpairs[npair],  // double [] qpair12,
									qderivs,        // double [][][] qderivs)
									delta,          // double    delta,
									debug_width,    // int       fmt_width,
									debug_decimals, // int       fmt_decimals,
									npair);         // int       npair); 
						} else {
							q2iq1q12 = getPairErrQuaternion( // qderivs here tested
									qscene1, // double [] qscene1,
									qscene2, // double [] qscene2,
									qpairs[npair], // double [] qpair12,
									qderivs); // double [][][] qderivs)
						}
						
						double [] q2iq1q12_norm=QuatUtils.normalize(q2iq1q12); // use q1,q2,q3, maybe q1 and q2 (from DEM) lower weight than q3 (from images)
						double scale_diff = QuatUtils.norm(q2iq1q12)-1.0;
						double [] fx_frag = {scale_diff, q2iq1q12_norm[1], q2iq1q12_norm[2], q2iq1q12_norm[3]};

						System.arraycopy(
								fx_frag,
								0,
								fX,
								fx_frag.length*npair,
								fx_frag.length);
						if (jt!=null) {
							// numeric derivatives of the pair error quaternion, kept for the (disabled) printouts below
							double [][][] qderivs_delta = getPairErrQuaternionDelta(  // test only
									qscene1,       // double [] q1,
									qscene2,       // double [] q2,
									qpairs[npair],
									delta);        // double delta)
//							printDerivDelta("npair="+npair+" qderivs[0], delta="+delta, qderivs[0], qderivs_delta[0],debug_width,debug_decimals);
//							printDerivDelta("npair="+npair+" qderivs[1], delta="+delta, qderivs[1], qderivs_delta[1],debug_width,debug_decimals);
 
							double [][] dq2iq1q12_norm = QuatUtils.dnormalize_dq(q2iq1q12);
							double [] dscale = QuatUtils.dscale_dq(q2iq1q12);
							// NOTE(review): the numeric versions below are evaluated at q2iq1q12_norm while the
							// analytical ones use q2iq1q12 - confirm this is intended.
							if (debug_deriv) {
								double [][] dq2iq1q12_norm_delta=QuatUtils.dnormalize_dq(q2iq1q12_norm,delta);
								printDerivDelta("npair="+npair+" dq2iq1q12_norm, delta="+delta, dq2iq1q12_norm, dq2iq1q12_norm_delta,debug_width,debug_decimals);
								double [] dscale_delta = QuatUtils.dscale_dq(q2iq1q12_norm, delta);
								printDerivDelta("npair="+npair+" dscale_dq, delta="+delta, dscale, dscale_delta,debug_width,debug_decimals);
							}
							dq2iq1q12_norm[0] = dscale; // replace first row (q_norm/dq) with dscale/dq
							if (debug_deriv) {
								double [][] dq2iq1q12_norm_delta=QuatUtils.dnormalize_dq(q2iq1q12_norm,delta);
								double [] dscale_delta = QuatUtils.dscale_dq(q2iq1q12_norm, delta);
								dq2iq1q12_norm_delta[0] = dscale_delta; // replace first row (q_norm/dq) with dscale/dq
								printDerivDelta("npair="+npair+" dq2iq1q12_normD,delta="+delta, dq2iq1q12_norm, dq2iq1q12_norm_delta,debug_width,debug_decimals);
							}							
							derivs[0] = QuatUtils.matMult(dq2iq1q12_norm,qderivs[0]); // /dq1 (scene 1)
							derivs[1] = QuatUtils.matMult(dq2iq1q12_norm,qderivs[1]); // /dq2 (scene 2)
							
							// derivs[iscene] is indexed [output][parameter], jt is [parameter][sample] - transpose
							for (int iscene = 0; iscene <2; iscene++) {
								for (int npar = 0; npar <4; npar++) {
									for (int i = 0; i < 4; i++) {
										jt[scene_ind[iscene]+npar][fx_frag.length*npair+i] = derivs[iscene][i][npar];
									}
								}
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		ai.set(0);
		// second cycle - regularization (sums of scale diffs, q1,q2,q3 over all scenes)
		final double [][]   pull_threads = new double [threads.length][4]; // per-thread partial sums
		final int pull_index = num_pairs*4; // start of pull sums in fx
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				@Override
				public void run() {
					int nthread =  ati.getAndIncrement();
					double []   qscene = new double[4];
					for (int nscene = ai.getAndIncrement(); nscene < num_scenes; nscene = ai.getAndIncrement()) {
						System.arraycopy(vector, nscene*4, qscene, 0, 4);
						double [] qscene_norm=QuatUtils.normalize(qscene); // use q1,q2,q3, maybe q1 and q2 (from DEM) lower weight than q3 (from images)
						double scale_diff = QuatUtils.norm(qscene)-1.0;
						qscene_norm[0] = scale_diff; // {scale_diff, qn1, qn2, qn3};
						for (int i = 0; i < 4; i++) {
							pull_threads[nthread][i] += qscene_norm[i];
						}
						if (jt != null) {
							double [][] dnq_dq = QuatUtils.dnormalize_dq(qscene);
							double []   dscale = QuatUtils.dscale_dq(qscene);
							dnq_dq[0] = dscale; // row 0: d(scale)/dq, rows 1..3: d(qn1..qn3)/dq
							// dnq_dq is indexed [output][parameter] (same as in the first cycle), while jt is
							// [parameter][sample], so it has to be stored transposed. After row 0 is replaced
							// with dscale the matrix is no longer symmetric and a row-wise copy would be wrong.
							for (int npar = 0; npar <4; npar++) {
								for (int i = 0; i < 4; i++) {
									jt[nscene*4+npar][pull_index+i] = dnq_dq[i][npar];
								}
							}
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		// sum partial sums from threads
		for (int nthread = 0; nthread < pull_threads.length; nthread++ ) {
			for (int i = 0; i < 4; i++) {
				fX[pull_index+i] += pull_threads[nthread][i];
			}
		}
		return fX;
	}

	
	
	
	/**
	 * Numeric (central difference) approximation of the transposed Jacobian, used to
	 * verify the analytical one from getFxDerivs().
	 * @param vector      parameters vector (not modified)
	 * @param delta       full difference step: each parameter is shifted by +delta/2 and -delta/2
	 * @param debug_level passed through to getFxDerivs()
	 * @return [vector.length][weights.length] derivatives; entries for samples with
	 *         non-positive weight are left at 0
	 */
	private double [][] getFxDerivsDelta(
			double []         vector,
			final double      delta,
			final int         debug_level) {
		final int num_pars =    vector.length;
		final int num_samples = weights.length;
		final double [][] jt_num = new double [num_pars][num_samples];
		for (int npar = 0; npar < num_pars; npar++) {
			final double [] shifted = vector.clone();
			// positive half-step
			shifted[npar] += 0.5 * delta;
			final double [] fx_plus = getFxDerivs(shifted, null, debug_level);
			// negative half-step (one full step back from the positive one)
			shifted[npar] -= delta;
			final double [] fx_minus = getFxDerivs(shifted, null, debug_level);
			final double [] row = jt_num[npar];
			for (int ns = 0; ns < num_samples; ns++) {
				if (weights[ns] > 0) {
					row[ns] = (fx_plus[ns] - fx_minus[ns]) / delta;
				}
			}
		}
		return jt_num;
	}

	/**
	 * Compare the analytical transposed Jacobian (getFxDerivs()) with the numeric one
	 * (getFxDerivsDelta()) and print both, together with their difference, to System.out.
	 * One row is printed per sample with positive weight: analytical values for all
	 * parameters, then numeric ones, then differences. The final "err" row holds the
	 * per-parameter maximal absolute difference.
	 *
	 * @param vector    parameters vector
	 * @param delta     step for the numeric derivatives
	 * @param last3only process only the last 3 samples
	 * @return maximal absolute difference between analytical and numeric derivatives
	 */
	private double compareJT(
			double [] vector,
			double    delta,
			boolean   last3only) { // do not process samples - they are tested before
		double []  errors=new double [vector.length];
		double [][] jt =  new double [vector.length][];
		System.out.print("Parameters vector = [");
		for (int i = 0; i < vector.length; i++) {
			System.out.print(vector[i]);
			if (i < (vector.length -1)) System.out.print(", ");
		}
		System.out.println("]");
		getFxDerivs(
				vector,
				jt, // final double [][] jt, // should be null or initialized with [vector.length][]
				1); // debug_level);
		double [][] jt_delta =  getFxDerivsDelta(
				vector, // double []         vector,
				delta, // final double      delta,
				-1); // final int         debug_level)
		int start_index = last3only? (weights.length-3) : 0;
		for (int n = start_index; n < weights.length; n++) if (weights[n] > 0) {
			System.out.print(String.format("%3d",n));
			for (int i = 0; i < vector.length; i++) {
				System.out.print(String.format("\t%12.9f",jt[i][n]));
			}			
			for (int i = 0; i < vector.length; i++) {
				System.out.print(String.format("\t%12.9f",jt_delta[i][n]));
			}			
			for (int i = 0; i < vector.length; i++) {
				System.out.print(String.format("\t%12.9f",jt[i][n]-jt_delta[i][n]));
			}			
			System.out.println();
			for (int i = 0; i < vector.length; i++) {
				// use the magnitude: a signed difference would hide all negative mismatches
				errors[i] = Math.max(errors[i], Math.abs(jt[i][n]-jt_delta[i][n]));
			}
		}
		System.out.print(String.format("%3s","err"));
		// skip the analytical and numeric columns to align errors with the differences
		for (int i = 0; i < 2* vector.length; i++) {
			System.out.print(String.format("\t%12s",""));
		}			
		for (int i = 0; i < vector.length; i++) {
			System.out.print(String.format("\t%12.9f",errors[i]));
		}			
		double err=0;
		for (int i = 0; i < vector.length; i++) {
			err = Math.max(errors[i], err);
		}
		return err;
	}

	
	/**
	 * Calculate the weighted (y - fx) vector, where y is zero for every sample, and
	 * optionally the RMS values.
	 * <p>
	 * Runs in two multithreaded passes: the first covers the 4*num_pairs pair samples,
	 * the second covers the remaining (regularization) samples.
	 *
	 * @param fx     model values as returned by getFxDerivs()
	 * @param rms_fp null or an array of at least 2 elements to receive
	 *               {sqrt(sum over all samples of w*d*d),
	 *                sqrt((sum over pair samples of w*d*d) / weight_pure)}
	 * @return vector of -fx[n] * weights[n], same length as fx
	 */
	private double [] getYminusFxWeighted(
			final double []   fx,
			final double []   rms_fp // null or [2]
			) {
		final Thread[]      threads =     ImageDtt.newThreadArray();
		final AtomicInteger ai =          new AtomicInteger(0);
		final AtomicInteger ati =         new AtomicInteger(0);
		final double []     wymfw =       new double [fx.length];
		double [] swd2 = new double[threads.length]; // per-thread partial sums of w*d*d
		for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
			threads[ithread] = new Thread() {
				public void run() {
					int nthread =  ati.getAndIncrement();
					for (int n = ai.getAndIncrement(); n < 4*num_pairs; n = ai.getAndIncrement()) {
						double d = -fx[n]; // - fx[n]; // +y_vector[i]
						double wd = d * weights[n];
						wymfw[n] = wd;
						swd2[nthread] += d * wd;
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		// pair samples only ("pure" part)
		double s_rms_pure = 0;
		for (int n = 0; n < swd2.length; n++) {
			s_rms_pure += swd2[n];
		}
//		System.out.println("ai.get()="+ai.get());
		// important to set - after first cycle ai is left 16(number of threads) larger than number of cycles!
		// It is so, because it first increments, then tests if (n < num_pairs)
		ai.set(4*num_pairs);
		ati.set(0);
		// second pass: regularization samples; swd2 is not cleared, so it keeps accumulating on top of the pairs sums
		for (int ithread = 0; ithread < threads.length; ithread++) {
			threads[ithread] = new Thread() {
				public void run() {
					int nthread =  ati.getAndIncrement();
					for (int n = ai.getAndIncrement(); n < fx.length; n = ai.getAndIncrement()) {
						double d = -fx[n]; // - fx[n]; // +y_vector[i]
						double wd = d * weights[n];
						wymfw[n] = wd;
						swd2[nthread] += d * wd;
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(threads);
		
		// re-sum swd2: now it covers both pair and regularization samples
		double s_rms = 0; // start from scratch
		for (int n = 0; n < swd2.length; n++) {
			s_rms += swd2[n];
		}
		
		if (rms_fp != null) {
			rms_fp[0] = Math.sqrt(s_rms);
			rms_fp[1] = Math.sqrt(s_rms_pure/weight_pure); // pair weights sum was scaled by weight_pure = (1 - pull)
		}
		return wymfw;
	}
	
	
	/**
	 * Run the Levenberg-Marquardt iterations, updating parameters_vector.
	 * <p>
	 * After each successful step lambda is multiplied by lambda_scale_good, after each
	 * failed one - by lambda_scale_bad. Iterations stop when lmaStep() reports the end
	 * (improvement below rms_diff or a singular matrix), when lambda exceeds lambda_max,
	 * or after num_iter steps.
	 *
	 * @param lambda            initial lambda (e.g. 0.1)
	 * @param lambda_scale_good lambda multiplier after an improved step (e.g. 0.5)
	 * @param lambda_scale_bad  lambda multiplier after a worsened step (e.g. 8.0)
	 * @param lambda_max        give up when lambda grows above this value (e.g. 100)
	 * @param rms_diff          relative RMS improvement that is considered as convergence (e.g. 0.001)
	 * @param num_iter          maximal number of steps (e.g. 20)
	 * @param last_run          enables the failure message at debug levels above -2
	 * @param dbg_prefix        debug prefix (currently not used)
	 * @param debug_level       debug level
	 * @return negative if failed (including the case when no step was run),
	 *         otherwise the iteration index on exit
	 */
	public int runLma( // <0 - failed, >=0 iteration number (1 - immediately)
			double lambda,           // 0.1
			double lambda_scale_good,// 0.5
			double lambda_scale_bad, // 8.0
			double lambda_max,       // 100
			double rms_diff,         // 0.001
			int    num_iter,         // 20
			boolean last_run,
			String dbg_prefix,
			int    debug_level)
	{
		boolean [] rslt = {false,false};
		this.last_rms = null; // makes the first lmaStep() calculate fx, Jacobian and initial RMS
		int iter = 0;
		for (iter = 0; iter < num_iter; iter++) {
			rslt =  lmaStep(
					lambda,
					rms_diff,
					debug_level);
			if (rslt == null) {
				return -1; // false; // need to check
			}
			if (debug_level > 1) {
				System.out.println("LMA step"+String.format("%3d",iter)+": {"+rslt[0]+","+rslt[1]+"} full RMS= "+good_or_bad_rms[0]+
						" ("+initial_rms[0]+"), pure RMS="+good_or_bad_rms[1]+" ("+initial_rms[1]+") + lambda="+lambda);
			}
			if (rslt[1]) {
				break;
			}
			if (rslt[0]) { // good
				lambda *= lambda_scale_good;
			} else {
				lambda *= lambda_scale_bad;
				if (lambda > lambda_max) {
					break; // not used in lwir
				}
			}
		}
		if (last_rms == null) {
			// No step was executed (num_iter <= 0): there are no RMS values to compare or report.
			return -1;
		}
		if (rslt[0]) { // better
			if (iter >= num_iter) { // better, but num tries exceeded
				if (debug_level > 1) System.out.println("Step "+iter+": Improved, but number of steps exceeded maximal");
			} else {
				if (debug_level > 1) System.out.println("Step "+iter+": LMA: Success");
			}

		} else { // improved over initial ?
			if (last_rms[0] < initial_rms[0]) { // false for NaN
				rslt[0] = true;
				if (debug_level > 1) System.out.println("Step "+iter+": Failed to converge, but result improved over initial");
			} else {
				if (debug_level > 1) System.out.println("Step "+iter+": Failed to converge");
			}
		}
		if (debug_level > 0) {
			System.out.println("LMA: full RMS="+last_rms[0]+" ("+initial_rms[0]+"), pure RMS="+last_rms[1]+" ("+initial_rms[1]+") + lambda="+lambda);
		}
		if (debug_level > 2){ 
			System.out.println("iteration="+iter);
		}
		if ((debug_level > -2) && !rslt[0]) { // failed
			if ((debug_level > 1) || (iter == 1) || last_run) {
				System.out.println("LMA failed on iteration = "+iter);
			}
			System.out.println();
		}

		return rslt[0]? iter : -1;
	}
	
	/**
	 * Perform a single LMA step: solve for the parameters increment using the current
	 * last_jt and last_ymfx, evaluate the new vector and either accept it (RMS improved)
	 * or restore last_jt/last_ymfx/last_rms for the old vector.
	 * On the first call after last_rms was reset to null, fx, Jacobian and the initial RMS
	 * are calculated first.
	 *
	 * @param lambda      current lambda; diagonal of the weighted JtJ is multiplied by (1 + lambda)
	 * @param rms_diff    relative RMS improvement below which the step is reported as final
	 * @param debug_level debug level
	 * @return {improved, finished} or null if last_ymfx is null (getYminusFxWeighted() as
	 *         visible in this file never returns null). "finished" is set when the matrix
	 *         could not be inverted or when the improvement is smaller than rms_diff.
	 */
	private boolean [] lmaStep(
			double lambda,
			double rms_diff,
			int debug_level) {
		boolean [] rslt = {false,false};
		// maybe the following if() branch is not needed - already done in prepareLMA !
		if (this.last_rms == null) { //first time, need to calculate all (vector is valid)
			last_rms = new double[2];
			if (debug_level > 1) {
				System.out.println("lmaStep(): first step");
			}
			double [] fx = getFxDerivs(
					parameters_vector, // double []         vector,
					last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
					debug_level);      // final int         debug_level)
			last_ymfx = getYminusFxWeighted(
					fx, // final double []   fx,
					last_rms); // final double []   rms_fp // null or [2]
			this.initial_rms = this.last_rms.clone();
			this.good_or_bad_rms = this.last_rms.clone();

			if (debug_level > -1) { // temporary
				/*
				dbgYminusFxWeight(
						this.last_ymfx,
						this.weights,
						"Initial_y-fX_after_moving_objects");
                */
			}
			if (last_ymfx == null) {
				return null; // need to re-init/restart LMA
			}
			// TODO: Restore/implement
			// NOTE(review): with debug_level > 3 compareJT() runs here and again right after this branch
			if (debug_level > 3) {
				double    delta = this.delta;
			 	double delta_err=compareJT(
			 			parameters_vector, // double [] vector,
						delta,             // double    delta,
						last3only);        // boolean   last3only); // do not process samples - they are tested before
				System.out.println("\nMaximal error = "+delta_err);
			}
		}
		if (debug_level > 3) { // 0) {
			double    delta =  this.delta; // 1E-3;
		 	double delta_err=compareJT(
		 			parameters_vector, // double [] vector,
					delta,             // double    delta,
					last3only);        // boolean   last3only); // do not process samples - they are tested before
			System.out.println("\nMaximal error = "+delta_err);
		}
		
		
		// column vector of the weighted (y - fx)
		Matrix y_minus_fx_weighted = new Matrix(this.last_ymfx, this.last_ymfx.length);

		Matrix wjtjlambda = new Matrix(getWJtJlambda(
				lambda, // *10, // temporary
				this.last_jt)); // double [][] jt) // null
		
		if (debug_level>2) {
			System.out.println("JtJ + lambda*diag(JtJ");
			wjtjlambda.print(18, 6);
		}
		Matrix jtjl_inv = null;
		try {
			jtjl_inv = wjtjlambda.inverse(); // check for errors
		} catch (RuntimeException e) {
			// inversion failed: report "finished" without improvement, the caller stops iterating
			rslt[1] = true;
			if (debug_level > 0) {
				System.out.println("Singular Matrix!");
			}

			return rslt;
		}
		if (debug_level>2) {
			System.out.println("(JtJ + lambda*diag(JtJ).inv()");
			jtjl_inv.print(18, 6);
		}
//last_jt has NaNs
		Matrix jty = (new Matrix(this.last_jt)).times(y_minus_fx_weighted);
		if (debug_level>2) {
			System.out.println("Jt * (y-fx)");
			jty.print(18, 6);
		}
		
		
		// parameters increment
		Matrix mdelta = jtjl_inv.times(jty);
		if (debug_level>2) {
			System.out.println("mdelta");
			mdelta.print(18, 6);
		}

		double scale = 1.0;
		double []  delta =      mdelta.getColumnPackedCopy();
		double []  new_vector = parameters_vector.clone();
		for (int i = 0; i < parameters_vector.length; i++) {
			new_vector[i] += scale * delta[i];
		}
		
		// evaluate the candidate vector; this overwrites last_jt and last_ymfx
		double [] fx = getFxDerivs(
				new_vector, // double []         vector,
				last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
				debug_level);      // final int         debug_level)
		double [] rms = new double[2];
		last_ymfx = getYminusFxWeighted(
//				vector_XYS, // final double [][] vector_XYS,
				fx, // final double []   fx,
				rms); // final double []   rms_fp // null or [2]
		if (debug_level > 2) {
			/*
			dbgYminusFx(this.last_ymfx, "next y-fX");
			dbgXY(new_vector, "XY-correction");
			*/
		}

		if (last_ymfx == null) {
			return null; // need to re-init/restart LMA
		}

		this.good_or_bad_rms = rms.clone();
		if (rms[0] < this.last_rms[0]) { // improved
			rslt[0] = true;
			// finished if the relative improvement is below rms_diff
			rslt[1] = rms[0] >=(this.last_rms[0] * (1.0 - rms_diff));
			this.last_rms = rms.clone();

			this.parameters_vector = new_vector.clone();
			if (debug_level > 2) {
				// print vectors in some format
				/*
				System.out.print("delta: "+corr_delta.toString()+"\n");
				System.out.print("New vector: "+new_vector.toString()+"\n");
				System.out.println();
				*/
			}
		} else { // worsened
			rslt[0] = false;
			rslt[1] = false; // do not know, caller will decide
			// restore state: recalculate last_jt, last_ymfx and last_rms for the unchanged parameters_vector
			fx = getFxDerivs(
					parameters_vector, // double []         vector,
					last_jt,           // final double [][] jt, // should be null or initialized with [vector.length][]
					debug_level);      // final int         debug_level)
			last_ymfx = getYminusFxWeighted(
					fx, // final double []   fx,
					this.last_rms); // final double []   rms_fp // null or [2]
			if (last_ymfx == null) {
				return null; // need to re-init/restart LMA
			}
			if (debug_level > 2) {
				/*
				 dbgJacobians(
							corr_vector, // GeometryCorrection.CorrVector corr_vector,
							1E-5, // double delta,
							true); //boolean graphic)
							*/
			}
		}
		return rslt;
	}
	
	
	/**
	 * Build the weighted normal matrix Jt*W*J with the diagonal multiplied by (1 + lambda).
	 * Only the upper triangle is calculated (multithreaded), the lower one is mirrored.
	 * @param lambda LMA damping factor applied to the diagonal elements
	 * @param jt     transposed Jacobian [parameter][sample]
	 * @return symmetric [parameters][parameters] matrix
	 */
	private double [][] getWJtJlambda(
			final double      lambda,
			final double [][] jt)
	{
		final int npars =    jt.length;
		final int ncells =   npars * npars;
		final int nsamples = jt[0].length;
		final double [][] result = new double [npars][npars];
		final Thread[] workers = ImageDtt.newThreadArray();
		final AtomicInteger next_cell = new AtomicInteger(0);
		for (int iworker = 0; iworker < workers.length; iworker++) {
			workers[iworker] = new Thread() {
				public void run() {
					int cell;
					while ((cell = next_cell.getAndIncrement()) < ncells) {
						final int row = cell / npars;
						final int col = cell % npars;
						if (col < row) {
							continue; // lower triangle is filled by mirroring the upper one
						}
						double acc = 0.0;
						for (int k = 0; k < nsamples; k++) {
							acc += weights[k]*jt[row][k]*jt[col][k];
						}
						if (row == col) {
							result[row][col] = acc + acc * lambda;
						} else {
							result[row][col] = acc;
							result[col][row] = acc;
						}
					}
				}
			};
		}		      
		ImageDtt.startAndJoin(workers);
		return result;
	}
	
	
	/**
	 * Get the RMS values matching the current parameters vector.
	 * @return {rms, rms_pure} (the internal array, not a copy) or null if no LMA step was run
	 */
	public double [] getRms() {
		return last_rms;
		
	}

	/**
	 * Get the RMS values calculated before the first LMA step.
	 * @return {rms, rms_pure} (the internal array, not a copy) or null if no LMA step was run
	 */
	public double [] getInitialRms() {
		return initial_rms;
	}	
	
	
	/**
	 * Calculate the mismatch quaternion between two scenes and the known quaternion of
	 * their pair: (invert(qscene2) * qscene1) * qpair12, using QuatUtils.invert() and
	 * QuatUtils.multiply(). Optionally provides derivatives of the result by the
	 * components of each scene quaternion.
	 * This variant uses the two-argument QuatUtils.d_pq_dp()/d_pq_dq().
	 * @param qscene1 quaternion of the first scene (rotation + scale)
	 * @param qscene2 quaternion of the second scene (rotation + scale)
	 * @param qpair12 known quaternion of the pair
	 * @param qderivs null or an array of at least 2 elements; on return qderivs[0] holds
	 *                the derivatives by qscene1 and qderivs[1] - by qscene2
	 * @return mismatch quaternion
	 */
	
	public static double [] getPairErrQuaternion0(
			double [] qscene1,
			double [] qscene2,
			double [] qpair12,
			double [][][] qderivs) {
		final double [] inv2 =    QuatUtils.invert(qscene2);
		final double [] inv2_q1 = QuatUtils.multiply(inv2, qscene1);
		final double [] mismatch = QuatUtils.multiply(inv2_q1, qpair12);
		if (qderivs == null) {
			return mismatch; // derivatives not requested
		}
		// chain through inversion and both products for scene 2
		final double [][] d_inv2 =        QuatUtils.d_invert_dq(qscene2);
		final double [][] d_inv2q1_by2 =  QuatUtils.matMult(QuatUtils.d_pq_dp(inv2, qscene1), d_inv2);
		final double [][] d_err_by2 =     QuatUtils.matMult(QuatUtils.d_pq_dp(inv2_q1, qpair12), d_inv2q1_by2);
		// scene 1 enters only as the right factor of the first product
		final double [][] d_inv2q1_by1 =  QuatUtils.d_pq_dq(inv2, qscene1);
		final double [][] d_err_by1 =     QuatUtils.matMult(QuatUtils.d_pq_dp(inv2_q1, qpair12), d_inv2q1_by1);
		qderivs[0] = d_err_by1;
		qderivs[1] = d_err_by2;
		return mismatch;
	}
	
	/**
	 * Calculate the pair mismatch quaternion (invert(q2) * q1) * q12 and, optionally,
	 * its derivatives by the components of q1 and q2. Uses the single-argument
	 * QuatUtils.d_pq_dp()/d_pq_dq().
	 * @param q1      quaternion of the first scene
	 * @param q2      quaternion of the second scene
	 * @param q12     known quaternion of the pair
	 * @param qderivs null or an array of at least 2 elements; on return qderivs[0] holds
	 *                the derivatives by q1 and qderivs[1] - by q2
	 * @return mismatch quaternion
	 */
	public static double [] getPairErrQuaternion(
			double [] q1,
			double [] q2,
			double [] q12,
			double [][][] qderivs) {
		final double [] inv2 =     QuatUtils.invert(q2);
		final double [] inv2_q1 =  QuatUtils.multiply(inv2, q1);
		final double [] mismatch = QuatUtils.multiply(inv2_q1, q12);
		if (qderivs == null) {
			return mismatch; // derivatives not requested
		}
		final double [][] d_inv2 =       QuatUtils.d_invert_dq(q2);
		// by q2: inversion, then left factor of both products
		final double [][] d_inv2q1_by2 = QuatUtils.matMult(QuatUtils.d_pq_dp(q1), d_inv2);
		final double [][] d_err_by2 =    QuatUtils.matMult(QuatUtils.d_pq_dp(q12), d_inv2q1_by2);
		// by q1: right factor of the first product, then left factor of the second
		final double [][] d_inv2q1_by1 = QuatUtils.d_pq_dq(inv2);
		final double [][] d_err_by1 =    QuatUtils.matMult(QuatUtils.d_pq_dp(q12), d_inv2q1_by1);
		qderivs[0] = d_err_by1;
		qderivs[1] = d_err_by2;
		return mismatch;
	}
	
	/**
	 * Test-only reduced version without the known pair quaternion: calculates
	 * invert(q2) * q1 and, optionally, its derivatives by the components of q1 and q2.
	 * @param q1      quaternion of the first scene
	 * @param q2      quaternion of the second scene
	 * @param qderivs null or an array of at least 2 elements; on return qderivs[0] holds
	 *                the derivatives by q1 and qderivs[1] - by q2
	 * @return invert(q2) * q1
	 */
	public static double [] getPairErrQuaternion( // test only
			double [] q1,
			double [] q2,
			double [][][] qderivs) {
		final double [] inv2 =    QuatUtils.invert(q2);
		final double [] product = QuatUtils.multiply(inv2, q1);
		if (qderivs == null) {
			return product; // derivatives not requested
		}
		final double [][] d_inv2 = QuatUtils.d_invert_dq(q2);
		final double [][] d_by2 =  QuatUtils.matMult(QuatUtils.d_pq_dp(q1), d_inv2);
		final double [][] d_by1 =  QuatUtils.d_pq_dq(inv2);
		qderivs[0] = d_by1;
		qderivs[1] = d_by2;
		return product;
	}
	
	/**
	 * Test-only numeric (central difference) derivatives of the 3-argument
	 * getPairErrQuaternion(q1, q2, null) by each component of q1 and q2.
	 * @param q1    quaternion of the first scene (not modified)
	 * @param q2    quaternion of the second scene (not modified)
	 * @param delta full difference step: each component is shifted by +delta/2 and -delta/2
	 * @return [2][4][4] array: [scene (0 - q1, 1 - q2)][output component][input component]
	 */
	public static double [][][] getPairErrQuaternionDelta(  // test only
			double [] q1,
			double [] q2,
			double delta) {
		final double [][][] num_derivs = new double [2][4][4];
		for (int which = 0; which < 2; which++) {
			for (int comp = 0; comp < 4; comp++) {
				// fresh copies, so only one component of one quaternion is shifted at a time
				final double [][] shifted = {q1.clone(), q2.clone()};
				shifted[which][comp] += 0.5*delta;
				final double [] plus = getPairErrQuaternion(
						shifted[0],
						shifted[1],
						null);
				shifted[which][comp] -= delta;
				final double [] minus = getPairErrQuaternion(
						shifted[0],
						shifted[1],
						null);
				for (int out = 0; out < 4; out++) {
					num_derivs[which][out][comp] = (plus[out] - minus[out])/delta;
				}
			}
		}
		return num_derivs;
	}
	
	

	/**
	 * Console self-test of the pair error quaternion derivatives. Runs the
	 * debug variant of getPairErrQuaternion() (it prints comparisons of its
	 * intermediate derivatives), then getPairErrQuaternionDelta() and prints
	 * analytical versus numerical derivatives for each of the two scenes.
	 * The same fixed input is processed 10 times.
	 */
	public static void testGetPairErrQuaternion () {
		final double delta =        1e-5;
		final int    fmt_width =    12;
		final int    fmt_decimals = 9;
		final double [] q1 =  {0.9995, 0.01, 0.015, 0.02};
		final double [] q2 =  {0.999,  0.02, 0.01, 0.015};
		final double [] q12 = {1.1,   0.1, 0.15, -0.2};
		final double [][][] qderivs = new double [2][4][4];
		final String [] titles = {
				" scene1, delta="+delta,
				" scene2, delta="+delta};
		for (int npass = 0; npass < 10; npass++) {
			// analytical derivatives, intermediate comparisons are printed inside
			getPairErrQuaternion(
					q1,
					q2,
					q12,
					qderivs,
					delta,
					fmt_width,
					fmt_decimals,
					0); // npair
			// numerical derivatives of the same function
			final double [][][] qderivs_delta = getPairErrQuaternionDelta(
					q1,
					q2,
					q12,
					delta);
			for (int nscene = 0; nscene < titles.length; nscene++) {
				printDerivDelta(titles[nscene], qderivs[nscene], qderivs_delta[nscene], fmt_width, fmt_decimals);
			}
			System.out.println();
		}
	}
	
	
	/**
	 * Console self-test of the scale/direction error derivatives. Calculates
	 * analytical derivatives with getPairScaleDirError(..., qderivs), numerical
	 * ones with getPairScaleDirError(..., delta) and prints both (and their
	 * difference) for each of the two scenes. The same fixed input is
	 * processed 10 times.
	 */
	public static void testGetPairPairScaleDirError () {
		final double delta =        1e-5;
		final int    fmt_width =    12;
		final int    fmt_decimals = 9;
		final double [] q1 =  {0.9995, 0.01, 0.015, 0.02};
		final double [] q2 =  {0.999,  0.02, 0.01, 0.015};
		final double [] q12 = {1.1,   0.1, 0.15, -0.2};
		final double [][][] qderivs = new double [2][4][4];
		final String [] titles = {
				" scene1, delta="+delta,
				" scene2, delta="+delta};
		for (int npass = 0; npass < 10; npass++) {
			// analytical derivatives (returned error vector itself is not needed here)
			getPairScaleDirError(
					q1,
					q2,
					q12,
					qderivs);
			// numerical derivatives of the same function
			final double [][][] qderivs_delta = getPairScaleDirError(
					q1,
					q2,
					q12,
					delta);
			for (int nscene = 0; nscene < titles.length; nscene++) {
				printDerivDelta(titles[nscene], qderivs[nscene], qderivs_delta[nscene], fmt_width, fmt_decimals);
			}
			System.out.println();
		}
	}
	
	
	
	
	
	
	/**
	 * Debug variant of the pair error quaternion calculation. Multiplies the
	 * inverted qscene2 by qscene1 and then by qpair12 (QuatUtils.invert(),
	 * QuatUtils.multiply()). When derivatives are requested, each intermediate
	 * analytical derivative matrix is printed next to its counterpart that uses
	 * the delta-based QuatUtils overload for the innermost factor.
	 *
	 * @param qscene1      first scene quaternion
	 * @param qscene2      second scene quaternion, inverted before use
	 * @param qpair12      pairwise quaternion, the last factor of the product
	 * @param qderivs      null to skip derivatives (nothing is printed then);
	 *                     otherwise element [0] is replaced with the derivatives
	 *                     of the result by qscene1, element [1] - by qscene2
	 * @param delta        step passed to the delta-based QuatUtils overloads
	 * @param fmt_width    field width for the printed numbers
	 * @param fmt_decimals number of decimals for the printed numbers
	 * @param npair        pair index, used only in the printed labels
	 * @return product of the inverted qscene2, qscene1 and qpair12
	 */
	public static double [] getPairErrQuaternion(
			double [] qscene1,
			double [] qscene2,
			double [] qpair12,
			double [][][] qderivs,
			double    delta,
			int       fmt_width,
			int       fmt_decimals,
			int       npair) { //
		final double [] q2_inv =    QuatUtils.invert(qscene2);
		final double [] q2inv_q1 =  QuatUtils.multiply(q2_inv, qscene1);
		final double [] err_quat =  QuatUtils.multiply(q2inv_q1, qpair12);
		if (qderivs == null) {
			return err_quat; // no derivatives requested, no debug output
		}
		final String tag = "npair="+npair; // common start of all printed labels

		// inversion of the second scene quaternion
		final double [][] dinv_dq2 =       QuatUtils.d_invert_dq(qscene2);
		final double [][] dinv_dq2_delta = QuatUtils.d_invert_dq(qscene2,delta);
		printDerivDelta(tag+" diqscene2_dq2, delta="+delta, dinv_dq2, dinv_dq2_delta,fmt_width,fmt_decimals);

		// first product by the second scene (through the inversion)
		final double [][] dprod1_dq2 =       QuatUtils.matMult(QuatUtils.d_pq_dp(q2_inv,qscene1), dinv_dq2);
		final double [][] dprod1_dq2_delta = QuatUtils.matMult(QuatUtils.d_pq_dp(q2_inv,qscene1, delta), dinv_dq2);
		printDerivDelta(tag+" diq2q1_dq2, delta="+delta, dprod1_dq2, dprod1_dq2_delta,fmt_width,fmt_decimals);

		// full product by the second scene
		final double [][] derr_dq2 =       QuatUtils.matMult(QuatUtils.d_pq_dp(q2inv_q1,qpair12),dprod1_dq2);
		final double [][] derr_dq2_delta = QuatUtils.matMult(QuatUtils.d_pq_dp(q2inv_q1,qpair12,delta),dprod1_dq2);
		printDerivDelta(tag+" dq2iq1q12_dq2, delta="+delta, derr_dq2, derr_dq2_delta,fmt_width,fmt_decimals);

		// first product by the first scene
		final double [][] dprod1_dq1 =       QuatUtils.d_pq_dq(q2_inv,qscene1);
		final double [][] dprod1_dq1_delta = QuatUtils.d_pq_dq(q2_inv,qscene1,delta);
		printDerivDelta(tag+" diq2q1_dq1, delta="+delta, dprod1_dq1, dprod1_dq1_delta,fmt_width,fmt_decimals);

		// full product by the first scene
		final double [][] derr_dq1 =       QuatUtils.matMult(QuatUtils.d_pq_dp(q2inv_q1,qpair12),dprod1_dq1);
		final double [][] derr_dq1_delta = QuatUtils.matMult(QuatUtils.d_pq_dp(q2inv_q1,qpair12,delta),dprod1_dq1);
		printDerivDelta(tag+" dq2iq1q12_dq1, delta="+delta, derr_dq1, derr_dq1_delta,fmt_width,fmt_decimals);

		qderivs[0] = derr_dq1;
		qderivs[1] = derr_dq2;
		return err_quat;
	}

	
	/**
	 * Numerical derivatives of getPairErrQuaternion(qscene1, qscene2, qpair12, null)
	 * by the components of both scene quaternions, calculated as central
	 * differences. Each of the 4 components of each scene quaternion is offset by
	 * +delta/2 and then by -delta/2; qpair12 is never perturbed.
	 *
	 * @param qscene1 first scene quaternion (not modified, a copy is perturbed)
	 * @param qscene2 second scene quaternion (not modified, a copy is perturbed)
	 * @param qpair12 pairwise quaternion, passed through unchanged
	 * @param delta   full width of the central difference interval
	 * @return array indexed as [nscene][output component][perturbed component],
	 *         where nscene is 0 for qscene1 and 1 for qscene2
	 */
	public static double [][][] getPairErrQuaternionDelta(
			double [] qscene1,
			double [] qscene2,
			double [] qpair12,
			double delta) {
		final double [][][] num_derivs = new double [2][4][4];
		for (int nscene = 0; nscene < 2; nscene++) {
			for (int npar = 0; npar < 4; npar++) {
				// fresh copies for each parameter, so only one component is perturbed
				final double [][] scenes = {qscene1.clone(), qscene2.clone()};
				scenes[nscene][npar] += 0.5*delta; // nominal + delta/2
				final double [] q_plus =  getPairErrQuaternion(scenes[0], scenes[1], qpair12, null);
				scenes[nscene][npar] -= delta;     // nominal - delta/2
				final double [] q_minus = getPairErrQuaternion(scenes[0], scenes[1], qpair12, null);
				for (int ncomp = 0; ncomp < 4; ncomp++) {
					num_derivs[nscene][ncomp][npar] = (q_plus[ncomp] - q_minus[ncomp])/delta;
				}
			}
		}
		return num_derivs;
	}
	
	
	
	/**
	 * Converts the pair error quaternion, obtained from
	 * getPairErrQuaternion(qscene1, qscene2, qpair12, ...), to a 4-element error
	 * vector: element 0 is the quaternion norm (QuatUtils.norm()) minus 1.0,
	 * elements 1..3 are components 1..3 of the normalized quaternion
	 * (QuatUtils.normalize()).
	 * <p>
	 * The local flag {@code truncate} is fixed to false; if enabled, the raw
	 * error quaternion and its derivatives would be returned instead.
	 *
	 * @param qscene1 first scene quaternion
	 * @param qscene2 second scene quaternion
	 * @param qpair12 pairwise quaternion
	 * @param derivs  null to skip derivatives; otherwise element [0] is replaced
	 *                with the derivatives of the error vector by qscene1 and
	 *                element [1] - by qscene2
	 * @return {scale difference, normalized q[1], normalized q[2], normalized q[3]}
	 */
	public static double [] getPairScaleDirError(
			double [] qscene1,
			double [] qscene2,
			double [] qpair12,
			double [][][] derivs) {
		boolean truncate = false; // true;
		final boolean need_derivs = (derivs != null);
		final double [][][] qderivs = need_derivs ? (new double [2][][]) : null;
		final double [] err_quat = getPairErrQuaternion(
				qscene1,
				qscene2,
				qpair12,
				qderivs);
		// use q1,q2,q3, maybe q1 and q2 (from DEM) lower weight than q3 (from images)
		final double [] err_unit =  QuatUtils.normalize(err_quat);
		final double    scale_err = QuatUtils.norm(err_quat)-1.0;
		if (truncate) {
			// pass the error quaternion and its derivatives through unchanged
			if (need_derivs) {
				derivs[0] = qderivs[0];
				derivs[1] = qderivs[1];
			}
			return err_quat;
		}
		if (need_derivs) {
			final double [][] derr_dquat = QuatUtils.dnormalize_dq(err_quat);
			// first output is the scale error, not the normalized q[0]: replace the first row
			derr_dquat[0] = QuatUtils.dscale_dq(err_quat);
			for (int nscene = 0; nscene < 2; nscene++) { // 0: by qscene1, 1: by qscene2
				derivs[nscene] = QuatUtils.matMult(derr_dquat, qderivs[nscene]);
			}
		}
		return new double [] {scale_err, err_unit[1], err_unit[2], err_unit[3]};
	}

	
	
	
	/**
	 * Numerical derivatives of getPairScaleDirError(qscene1, qscene2, qpair12, null)
	 * by the components of both scene quaternions, calculated as central
	 * differences. Each of the 4 components of each scene quaternion is offset by
	 * +delta/2 and then by -delta/2; qpair12 is never perturbed.
	 *
	 * @param qscene1 first scene quaternion (not modified, a copy is perturbed)
	 * @param qscene2 second scene quaternion (not modified, a copy is perturbed)
	 * @param qpair12 pairwise quaternion, passed through unchanged
	 * @param delta   full width of the central difference interval
	 * @return array indexed as [nscene][error component][perturbed component],
	 *         where nscene is 0 for qscene1 and 1 for qscene2
	 */
	public static double [][][] getPairScaleDirError(
			double [] qscene1,
			double [] qscene2,
			double [] qpair12,
			double delta) {
		final double [][][] num_derivs = new double [2][4][4];
		for (int nscene = 0; nscene < 2; nscene++) {
			for (int npar = 0; npar < 4; npar++) {
				// fresh copies for each parameter, so only one component is perturbed
				final double [][] scenes = {qscene1.clone(), qscene2.clone()};
				scenes[nscene][npar] += 0.5*delta; // nominal + delta/2
				final double [] err_plus =  getPairScaleDirError(scenes[0], scenes[1], qpair12, null);
				scenes[nscene][npar] -= delta;     // nominal - delta/2
				final double [] err_minus = getPairScaleDirError(scenes[0], scenes[1], qpair12, null);
				for (int ncomp = 0; ncomp < 4; ncomp++) {
					num_derivs[nscene][ncomp][npar] = (err_plus[ncomp] - err_minus[ncomp])/delta;
				}
			}
		}
		return num_derivs;
	}
	
	
	
	
	/**
	 * Prints a comparison of two equally indexed vectors to System.out: a title
	 * line with the maximal absolute difference, followed by three tab-separated
	 * rows labeled "deriv ", "delta " and " diff " (element-wise deriv - delta).
	 *
	 * @param name     title, printed before ", max_err = ..."
	 * @param deriv    first vector (its length defines the number of columns)
	 * @param delta    second vector, at least as long as deriv
	 * @param width    field width of each printed number
	 * @param decimals number of decimals of each printed number
	 */
	public static void printDerivDelta(String name, double [] deriv, double [] delta, int width, int decimals) {
		final int len = deriv.length;
		final double [] diff = new double [len];
		final String fmt ="\t%"+width+"."+decimals+"f";
		double max_err = 0;
		for (int i = 0; i < len; i++) {
			diff[i] = deriv[i]-delta[i];
			max_err = Math.max(max_err, Math.abs(diff[i]));
		}
		System.out.println (name+", max_err = "+max_err);
		// all three rows share the same layout: label, then formatted values
		final String []   labels = {"deriv ", "delta ", " diff "};
		final double [][] rows =   {deriv,    delta,    diff};
		for (int nrow = 0; nrow < rows.length; nrow++) {
			System.out.print (labels[nrow]);
			for (int i = 0; i < len; i++) {
				System.out.print(String.format(fmt, rows[nrow][i]));
			}
			System.out.println();
		}
	}

	/**
	 * Matrix variant of the comparison printout: flattens both matrices row by
	 * row (the number of columns is taken from the first row of deriv2) and
	 * delegates to the vector version of printDerivDelta().
	 *
	 * @param name     title passed to the vector version
	 * @param deriv2   first matrix, must have at least one row
	 * @param delta2   second matrix, at least the same size as deriv2
	 * @param width    field width of each printed number
	 * @param decimals number of decimals of each printed number
	 */
	public static void printDerivDelta(String name, double [][] deriv2, double [][] delta2, int width, int decimals) {
		final int ncols = deriv2[0].length;
		final int nrows = deriv2.length;
		final double [] deriv_flat = new double [nrows*ncols];
		final double [] delta_flat = new double [deriv_flat.length];
		for (int nrow = 0; nrow < nrows; nrow++) {
			final int offset = nrow*ncols; // start of this row in the flattened arrays
			System.arraycopy(deriv2[nrow], 0, deriv_flat, offset, ncols);
			System.arraycopy(delta2[nrow], 0, delta_flat, offset, ncols);
		}
		printDerivDelta(name, deriv_flat, delta_flat, width, decimals);
	}
	
	
}
