Found multithreaded Cholesky almost 10 times faster than single-threaded

for n=2300, 16 threads

Found multithreaded Cholesky almost 10 times faster than single-threaded
for n=2300, 16 threads
3cedfc7a · Andrey Filippov · 663a964a · 3cedfc7a · 3cedfc7a · 3cedfc7a
Commit 3cedfc7a authored Mar 17, 2025 by Andrey Filippov
5 changed files
--- a/src/main/java/com/elphel/imagej/common/CholeskyBlock.java
+++ b/src/main/java/com/elphel/imagej/common/CholeskyBlock.java
+package com.elphel.imagej.common;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.elphel.imagej.tileprocessor.ImageDtt;
+
+import Jama.Matrix;
+
+public class CholeskyBlock {
+	public final int m;  // tile size
+	public final int np; // number of elements in row/col
+	public final int n;  // number of tiles in  row/col
+	public final int nf; // number of full rows/cols
+	public final double [] A;
+	public final double [] L;
+	
+	public CholeskyBlock (
+			double [][] A_in,
+			int size) {
+		m = size;
+		np = A_in.length;
+		nf = np / m;
+		n = ((nf * m) < np)? (nf + 1) : nf; 
+		A = new double [n*n*m*m];
+		L = new double [A.length];
+		setup_ATriangle(A_in);
+	}
+	
+	private void setup_ATriangle(double [][] A_in) {
+		for (int tile_row = 0; tile_row < nf; tile_row++) {
+			for (int tile_col = 0; tile_col < tile_row; tile_col++) {
+				int indx = indx_IJ(tile_row, tile_col);
+				for (int k = 0; k < m; k++) {
+					System.arraycopy(
+							A_in[tile_row*m +k],
+							tile_col * m,
+							A,
+							indx + m * k,
+							m);
+				}
+			}
+			// copy diagonal
+			int indx = indx_IJ(tile_row, tile_row);
+			for (int k = 0; k < m; k++) {
+				System.arraycopy(
+						A_in[tile_row*m +k],
+						tile_row * m,
+						A,
+						indx + m * k,
+						k+1);
+			}
+		}
+		if (n > nf) { // if there are small tiles below and to the right
+			int tile_row = nf;
+			int m1 = np - m * nf;
+			for (int tile_col = 0; tile_col < nf; tile_col++) {
+				int indx = indx_IJ(tile_row, tile_col);
+				for (int k = 0; k < m1; k++) {
+					System.arraycopy(
+							A_in[tile_row*m +k],
+							tile_col * m,
+							A,
+							indx + m * k,
+							m);
+				}
+				
+			}
+			// copy diagonal
+			int indx = indx_IJ(tile_row, tile_row);
+			for (int k = 0; k < m1; k++) {
+				System.arraycopy(
+						A_in[tile_row*m +k],
+						tile_row * m,
+						A,
+						indx + m1 * k,
+						k+1);
+			}
+		}
+		
+	}
+	
+	public Matrix getL() {
+		return new Matrix(get_LTriangle(),np,np);
+	}
+	
+	
+	public double [][] get_LTriangle() {
+		double [][] L_out = new double[np][np];
+		for (int tile_row = 0; tile_row < nf; tile_row++) {
+			for (int tile_col = 0; tile_col < tile_row; tile_col++) {
+				int indx = indx_IJ(tile_row, tile_col);
+				for (int k = 0; k < m; k++) {
+					System.arraycopy(
+							L,
+							indx + m * k,
+							L_out[tile_row*m +k],
+							tile_col * m,
+							m);
+				}
+			}
+			// copy diagonal
+			int indx = indx_IJ(tile_row, tile_row);
+			for (int k = 0; k < m; k++) {
+				System.arraycopy(
+						L,
+						indx + m * k,
+						L_out[tile_row*m +k],
+						tile_row * m,
+						k+1);
+			}
+		}
+		if (n > nf) { // if there are small tiles below and to the right
+			int tile_row = nf;
+			int m1 = np - m * nf;
+			for (int tile_col = 0; tile_col < nf; tile_col++) {
+				int indx = indx_IJ(tile_row, tile_col);
+				for (int k = 0; k < m1; k++) {
+					System.arraycopy(
+							L,
+							indx + m * k,
+							L_out[tile_row*m +k],
+							tile_col * m,
+							m);
+				}
+				
+			}
+			// copy diagonal
+			int indx = indx_IJ(tile_row, tile_row);
+			for (int k = 0; k < m1; k++) {
+				System.arraycopy(
+						L,
+						indx + m1 * k,
+						L_out[tile_row*m +k],
+						tile_row * m,
+						k+1);
+			}
+		}
+		return L_out;
+		
+	}
+	
+	
+	
+	
+	/**
+	 * Get index of the top-left tile corner
+	 * @param i tile row
+	 * @param j tile column
+	 * @return index in A and L arrays
+	 */
+	public int indx_IJ(int i, int j) {
+		return j * (m * np) + ((j >= nf) ? (np-nf*m): m) * m * i;
+	}
+
+	/**
+	 * Get a new diagonal square submatrix 
+	 * @param arr A or L packed array (line-scan order for each tile column)
+	 * @param i tile index on the diagonal
+	 * @return new array with tile data (m x m, or smaller for the bottom right) 
+	 */
+	public double [][] getDiagSquare(
+			double [] arr,
+			int i){
+		int l =  (i >= nf) ? (np-nf*m): m;
+		double [][] a_diag = new double[l][l];
+		return getDiagSquare (arr, a_diag, i);
+	}
+
+	/**
+	 * Get a new diagonal square submatrix 
+	 * @param arr A or L packed array (line-scan order for each tile column)
+	 * @param a_diag - preallocated array (should be smaller if needed) 
+	 * @param i tile index on the diagonal
+	 * @return new array with tile data (m x m, or smaller for the bottom right) 
+	 */
+	public double [][] getDiagSquare(
+			double []   arr,
+			double [][] a_diag,
+			int         i){
+		int indx = indx_IJ(i,i);
+		int l = a_diag.length;
+		for (int k = 0; k < l; k++) {
+			int kl = k*l;
+			System.arraycopy( arr, indx+kl, a_diag[k], 0, l);
+		}
+		return a_diag;
+	}
+
+
+	/**
+	 * Get a new diagonal square submatrix, copy only lower triangle including diagonal  
+	 * @param arr A or L packed array (line-scan order for each tile column)
+	 * @param i tile index on the diagonal
+	 * @return new array with tile data (m x m, or smaller for the bottom right) 
+	 */
+	public double [][] getDiagTriangle(
+			double [] arr,
+			int i){
+		int l =  (i >= nf) ? (np-nf*m): m;
+		double [][] a_diag = new double[l][l];
+		return getDiagLTriangle (arr, a_diag, i);
+	}
+
+	/**
+	 * Get a new diagonal square submatrix, copy only lower triangle including diagonal 
+	 * @param arr A or L packed array (line-scan order for each tile column)
+	 * @param a_diag - preallocated array (should be smaller if needed) 
+	 * @param i tile index on the diagonal
+	 * @return new array with tile data (m x m, or smaller for the bottom right) 
+	 */
+	public double [][] getDiagLTriangle(
+			double [] arr,
+			double [][] a_diag,
+			int i){
+		int indx = indx_IJ(i,i);
+		int l = a_diag.length;
+		for (int k = 0; k < l; k++) {
+			int kl = k*l;
+			System.arraycopy( arr, indx+kl, a_diag[k], 0, k+1);
+		}
+		return a_diag;
+	}
+	
+	/**
+	 * Save calculated tile L lower diagonal matrix to a packed array
+	 * @param arr    A or L packed array (line-scan order for each tile column)
+	 * @param l_diag lower triangular array with Cholesky decomposition
+	 * @param i tile index on a diagonal
+	 */
+	public void putDiagLTriangle(
+			double []   arr,
+			double [][] l_diag,
+			int         i) {
+		int indx = indx_IJ(i,i);
+		int l = l_diag.length;
+		for (int k = 0; k < l; k++) {
+			int kl = k*l;
+			System.arraycopy(l_diag[k], 0, arr, indx+kl,  k+1);
+		}
+	}
+	
+	
+	public void setL21(
+			int       i,   // i > j,
+			int       j) { // j <nf
+		int indx_diag = indx_IJ(j,j);
+		int indx_ij =   indx_IJ(i,j);
+		int h = (i < nf) ? m : (np-nf*m);
+		// prepare solving Lx = b, copy tile A -> L
+		System.arraycopy(A, indx_ij, L, indx_ij, m * h);
+		for (int l_row = 0; l_row < m; l_row++) {
+			for (int x_col= 0; x_col < h; x_col++) { // b-vector
+				int lindx = indx_ij + m * x_col + l_row;
+				double ls = L[lindx];
+				for (int l_col = 0; l_col < l_row; l_col++) {
+					ls -= L[indx_ij + m * x_col + l_col] * L[indx_diag + m* l_row+l_col];
+				}
+				L[lindx] = ls/L[indx_diag + (m + 1)* l_row];
+			}
+		}
+		return;
+	}
+	
+	public void setA22(
+			int diag,// < col,  < row
+			int row, // >= col
+			int col) {
+		int h = (row < nf) ? m : (np-nf*m);
+		int indx_a =    indx_IJ(row,col);
+		int indx_lrow = indx_IJ(row,diag);
+		if (row == col) {
+			for (int i = 0; i < h; i++) {
+				for (int j = 0; j <= i; j++) {
+					for (int k = 0; k < h; k++) {
+						A[indx_a + i * m + j] -= L[indx_lrow + i * h + k] * L[indx_lrow + j * h + k];
+					}
+				}
+			}
+		} else {
+			int indx_lcol = indx_IJ(col,diag);
+			for (int i = 0; i < h; i++) {
+				for (int j = 0; j < m; j++) {
+					for (int k = 0; k < m; k++) {
+						A[indx_a + i * m + j] -= L[indx_lrow + i * m + k] * L[indx_lcol + j * m + k];
+					}
+				}
+			}
+		}
+		return;
+	}
+	
+	public void choleskyBlockMulti() {
+		final Thread[] threads =       ImageDtt.newThreadArray();
+		final AtomicInteger ai =       new AtomicInteger(0);
+		final double [][] Am = new double[m][m];
+		final double [][] Ah = (n > nf) ? new double[(np-nf*m)][(np-nf*m)] : null;
+		final double [][] A1 = (np < m) ? Ah : Am; // smaller than a tile
+		final double [][] Lm = new double[m][m];
+		final double [][] Lh = (n > nf) ? new double[(np-nf*m)][(np-nf*m)] : null;
+		final double [][] L1 = (np < m) ? Lh : Lm;  // smaller than a tile
+		
+		// Extract top-left tile (only lower triangle)
+		getDiagLTriangle(
+				A,   // double [] arr,
+				A1,  // double [][] a_diag,
+				0);  // int i)
+		// Decompose, get L (lower triangle)
+		cholesky_single (
+				A1,  // double [][] a,
+				L1); // double [][] l)
+		// Save it to L-array:
+		putDiagLTriangle(
+				L,   // double []   arr,
+				L1,  // double [][] l_diag,
+				0);  // int         i)
+		// Calculate first column under diagonal (L21) - maybe use m
+		ai.set(1); // start from the second tile row
+		for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+			threads[ithread] = new Thread() {
+				public void run() {
+					for (int tile_row = ai.getAndIncrement(); tile_row < n; tile_row = ai.getAndIncrement()) {
+						setL21(tile_row, 0);
+					}
+				}
+			};
+		}		      
+		ImageDtt.startAndJoin(threads);
+
+		for (int tile_diag = 1; tile_diag < n; tile_diag++) {
+			final int ftile_diag = tile_diag;
+			// Calculate A in one tile column of the remaining A2'
+			// start with diagonal (top) tile, and calculate its Cholesky
+			// In parallel, calculate A for all tiles in that column below diagonal
+			ai.set(ftile_diag);
+			for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+				threads[ithread] = new Thread() {
+					public void run() {
+						for (int nRow= ai.getAndIncrement(); nRow < n; nRow = ai.getAndIncrement()) {
+							setA22(
+									ftile_diag-1, // int diag,// < col,  < row
+									nRow,         // int row, // >= col
+									ftile_diag);  // int col) 							
+							if (nRow == ftile_diag) {
+								double [][] At = (ftile_diag < nf) ? Am: Ah; // full or reduced array
+								double [][] Lt = (ftile_diag < nf) ? Lm: Lh; // full or reduced array
+								// Extract top-left tile (only lower triangle)
+								getDiagLTriangle(
+										A,            // double [] arr,
+										At,           // double [][] a_diag,
+										ftile_diag);  // int i)
+								// Decompose, get L (lower triangle)
+								cholesky_single (
+										At,           // double [][] a,
+										Lt);          // double [][] l)
+								// Save it to L-array:
+								putDiagLTriangle(
+										L,            // double []   arr,
+										Lt,           // double [][] l_diag,
+										ftile_diag);  // int         i)
+								/*
+								// Calculate first column under diagonal (L21) - maybe use m
+								for (int tr = ftile_diag; tr < n; tr++) {
+									setL21(tr, ftile_diag);
+								}
+								*/
+							}
+						}
+					}
+				};
+			}		      
+			ImageDtt.startAndJoin(threads);
+			if (ftile_diag < (n-1)) {
+				// Now in parallel calculate L in column ftile_diag under diagonal and
+				// finish A2' to the right of ftile_diag column
+				final int left_rows = n - ftile_diag - 1;
+				final int num_tiles = left_rows * (left_rows + 1) / 2 + 1;
+
+				ai.set(0);
+				for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+					threads[ithread] = new Thread() {
+						public void run() {
+							for (int ntile = ai.getAndIncrement(); ntile < num_tiles; ntile = ai.getAndIncrement()) {
+								if (ntile == 0) {
+									// Calculate first column of L under diagonal (L21) - maybe use m
+
+									for (int tr = ftile_diag + 1; tr < n; tr++) {
+										setL21(
+												tr,          // row > column 
+												ftile_diag); // column
+									}
+								} else  {
+									int nrow = (int) Math.floor(-0.5 + 0.5* Math.sqrt(1 + 8 * (ntile-1)));
+									int ncol = (ntile-1) - (nrow * (nrow + 1) /2);
+									int row = ftile_diag + nrow + 1;
+									int col = ftile_diag + ncol + 1;
+									setA22(
+											ftile_diag-1, // int diag,// < col,  < row
+											row, // int row, // >= col
+											col); // int col) 							
+								}
+							}
+						}
+					};
+				}		      
+				ImageDtt.startAndJoin(threads);
+			}
+		}
+	}
+	
+	
+	// Cholesky-Banachiewicz Algorithm ?
+	// Single-threaded
+	public static double [][] cholesky_single (
+			double [][] a,
+			double [][] l) {
+		  int n = a.length;
+//		  double [][] l = new double[n][n];
+		  for (int j = 0; j < n; j++) {
+			  Arrays.fill(l[j], 0.0);
+		  }
+	      // Main loop.
+	      for (int j = 0; j < n; j++) {
+	         double[] Lrowj = l[j];
+	         double d = 0.0;
+	         for (int k = 0; k < j; k++) {
+	            double[] Lrowk = l[k];
+	            double s = 0.0;
+	            for (int i = 0; i < k; i++) {
+	               s += Lrowk[i]*Lrowj[i];
+	            }
+	            Lrowj[k] = s = (a[j][k] - s)/l[k][k];
+	            d = d + s*s;
+	         }
+	         d = a[j][j] - d;
+	         l[j][j] = Math.sqrt(Math.max(d,0.0));
+	      }
+	      return l;
+	}
+	
+	
+	
+	
+
+}
--- a/src/main/java/com/elphel/imagej/common/CholeskyLDLTMulti.java
+++ b/src/main/java/com/elphel/imagej/common/CholeskyLDLTMulti.java
@@ -7,6 +7,7 @@ import com.elphel.imagej.tileprocessor.ImageDtt;

 import Jama.CholeskyDecomposition;
 import Jama.Matrix;
+import ij.ImagePlus;

 public class CholeskyLDLTMulti {
 	public final int n;
@@ -14,178 +15,149 @@ public class CholeskyLDLTMulti {
 	public int solve_step = 10;
 	public double thread_scale = 2.5; // split jobs in thread_scale * num_treads chunks
 	
-	//	public double [][] L;
-	public CholeskyLDLTMulti(Matrix matA, boolean b) {
-		boolean debug = true;
-		int dbg_j = 100; // 1520;
+	
+
+	public CholeskyLDLTMulti(Matrix matA) {
 		n = matA.getRowDimension();
-		L = matA.getArray(); // will be modified, copy externally if needed
-		final Thread[] threads = ImageDtt.newThreadArray();
-		final AtomicInteger ai = new AtomicInteger(0);
-		for (int j = 0; j < n; j++) {
-			if (j == dbg_j) {
-				System.out.println("CholeskyLDLTMulti(): j=dbg_j="+j);
-			}
-			final int fj=j;
-		    for (int k = 0; k < j; k++) {
-		    	final int fk=k;
-		        L[j][j] -= L[j][k]*L[j][k]; // diagonal
-		        final double Ljk = L[j][k];
-//		        for (int i = j+1; i < n; i++) {
-//	            	l[i][j] -= l[i][k]*l[j][k]; // not including diagonal
-//	        	}        
-		        // parallel
-		        ai.set(j+1);
-				for (int ithread = 0; ithread < threads.length; ithread++) {
-					threads[ithread] = new Thread() {
-						public void run() {
-							for (int i = ai.getAndIncrement(); i < n; i = ai.getAndIncrement()) {
-								L[i][fj] -= L[i][fk]*Ljk;
-							}
-						}
-					};
-				}		      
-				ImageDtt.startAndJoin(threads);
-		    }
-		    // parallel
-//		    for (int i = j+1; i < n; i++) {
-//	        	l[i][j] /= l[j][j];
-//	    	}
-		    final double Ljj = L[j][j];
-	        ai.set(j+1);
-			for (int ithread = 0; ithread < threads.length; ithread++) {
-				threads[ithread] = new Thread() {
-					public void run() {
-						for (int i = ai.getAndIncrement(); i < n; i = ai.getAndIncrement()) {
-							L[i][fj] /= Ljj;
-						}
-					}
-				};
-			}		      
-			ImageDtt.startAndJoin(threads);
-			if (debug) {
-				System.out.println("CholeskyLDLTMulti(): j="+j);
-			}
-		}
-		return;
+		L = CholeskyLDLTMulti_multi(matA);
 	}
 	
-	// L overwites strict lower A, d overwrites diagonal A
-	public CholeskyLDLTMulti(Matrix matA, int[] ii) {
-//		boolean debug = true;
-		int dbg_j = 100; // 1520;
+	public CholeskyLDLTMulti(Matrix matA, int mode) {
 		n = matA.getRowDimension();
-		// not needed
-		L = matA.getArray(); // will be modified, copy externally if needed
-		final double [][] a = matA.getArray(); 
-		final Thread[] threads = ImageDtt.newThreadArray();
-		final AtomicInteger ai = new AtomicInteger(0);
+		switch (mode) {
+		case 0: L = CholeskyLDLTMulti_single(matA); break;
+		case 1: L = CholeskyLDLTMulti_multi(matA); break;
+		case 2: L = CholeskyLDLTMulti_fast(matA); break;
+		default:L = CholeskyLDLTMulti_single(matA);
+		}
+	}	
+
+	public double[][] CholeskyLDLTMulti_multi(Matrix matA) {
+		final double [][] a =          matA.getArray();
+		final Thread[] threads =       ImageDtt.newThreadArray();
+		final AtomicInteger ai =       new AtomicInteger(0);
+		final AtomicInteger ati =      new AtomicInteger(0);
+		final double [] ajj_threaded = new double[threads.length];
+
+		final int threads_chunk = (int) (n / (thread_scale * threads.length));
 		for (int j = 0; j < n; j++) {
 			final int fj = j;
-			if (j == dbg_j) {
-				System.out.println("CholeskyLDLTMulti(): j=dbg_j="+j);
-			}
-			/*
-			if (j > 0) {
+			final double [] aj = a[j];
+			double ajj = aj[j];
+			if (j > threads_chunk) { // use multithreaded
+				Arrays.fill(ajj_threaded, 0.0);
 				ai.set(0);
-				for (int ithread = 0; ithread < threads.length; ithread++) {
+				ati.set(0);
+				for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
 					threads[ithread] = new Thread() {
 						public void run() {
-							for (int i = ai.getAndIncrement(); i < fj; i = ai.getAndIncrement()) {
-								a[fj][fj] -= a[i][i]*a[fj][i]*a[fj][i]; // squared
+							int nthread =  ati.getAndIncrement();
+							for (int i0 = ai.getAndAdd(threads_chunk); i0 < fj; i0 = ai.getAndAdd(threads_chunk)) {
+								int i1 = Math.min(fj, i0+threads_chunk);
+								for (int i = i0; i < i1; i++) {
+									double aji = aj[i];
+									ajj_threaded[nthread] += a[i][i]*aji*aji; // squared
+								}
 							}
 						}
 					};
 				}		      
 				ImageDtt.startAndJoin(threads);
+				for (int nthread = 0; nthread < ajj_threaded.length; nthread++) {
+					ajj -= ajj_threaded[nthread];
+				}
+			} else {
+				for (int i = 0; i < j; i++) {  // use aj, a, i
+					double aji = aj[i];
+					ajj -= a[i][i]*aji*aji; // squared
+				}
 			}
-			*/
-			for (int i = 0; i < j; i++) {
-				a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
+			aj[j] = ajj;
+			// TODO: above run single-threaded for small, then multithreaded with post-accumulation of ajj
+			if ((n - j+1) > threads_chunk) { // use multithreaded
+				ai.set(j+1);
+	    		for (int ithread = 0; ithread < threads.length; ithread++) {
+	    			threads[ithread] = new Thread() {
+	    				public void run() {
+	    					for (int k0 = ai.getAndAdd(threads_chunk); k0 < n; k0 = ai.getAndAdd(threads_chunk)) {
+	    						int k1 = Math.min(n, k0+threads_chunk);
+	    						for (int k = k0; k < k1; k++) {
+	    							double [] ak = a[k];
+	    							for (int i = 0; i < fj; i++) { // ak, aj, i, 
+	    								ak[fj] -= a[i][i] * ak[i] * aj[i];
+	    							}
+	    							ak[fj] /= aj[fj];
+	    						}
+	    					}
+	    				}
+	    			};
+	    		}		      
+	    		ImageDtt.startAndJoin(threads);
+			} else { // use single-threaded
+			    for (int k = j+1; k < n; k++) {
+					final double [] ak = a[k];
+			    	for (int i = 0; i < j; i++) { // ak, aj, i, 
+			    		ak[j] -= a[i][i]* ak[i] * aj[i];
+			    	}
+					ak[j] /= ajj;
+			    }
 			}
-	
-/*  for (int i = 0; i < j; i++) {
-        a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
-    } */
-		    for (int k = j+1; k < n; k++) {
-		    	final int fk = k;
-		    	/*
-		    	if (j > 0) {
-		    		ai.set(0);
-		    		for (int ithread = 0; ithread < threads.length; ithread++) {
-		    			threads[ithread] = new Thread() {
-		    				public void run() {
-		    					for (int i = ai.getAndIncrement(); i < fj; i = ai.getAndIncrement()) {
-		    						a[fk][fj] -= a[i][i]* a[fk][i] * a[fj][i];
-		    					}
-		    				}
-		    			};
-		    		}		      
-		    		ImageDtt.startAndJoin(threads);
-		    	}
-		    	*/
-		    	for (int i = 0; i < j; i++) {
-		    		a[k][j] -= a[i][i]* a[k][i] * a[j][i];
-		    	}
-				a[k][j] /= a[j][j];
-		    }
-/*    for (int k = j+1; k < n; k++) {
-        for (int i = 0; i < j; i++) {
-            a[k][j] -= a[i][i]* a[k][i] * a[j][i];
-        }
-        a[k][j] /= a[j][j];
-    } */
 		}
-		return;
+//		L = a;
+		return a;
 	}
+
 	
 	
-	public CholeskyLDLTMulti(Matrix matA, int ii) { // single-threaded
-		n = matA.getRowDimension();
-		// not needed
-//		L = matA.getArray(); // will be modified, copy externally if needed
-		final double [][] a = matA.getArray();
-		for (int j = 0; j < n; j++) {
-			for (int i = 0; i < j; i++) {
-				a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
-			}
-		    for (int k = j+1; k < n; k++) {
-		    	for (int i = 0; i < j; i++) {
-		    		a[k][j] -= a[i][i]* a[k][i] * a[j][i];
-		    	}
-				a[k][j] /= a[j][j];
-		    }
-		}
-		L = a;
-		return;
-	}
-
-	public CholeskyLDLTMulti(Matrix matA) {
-		n = matA.getRowDimension();
+	public double[][] CholeskyLDLTMulti_fast(Matrix matA) {
 		final double [][] a =          matA.getArray();
 		final Thread[] threads =       ImageDtt.newThreadArray();
 		final AtomicInteger ai =       new AtomicInteger(0);
 		final AtomicInteger ati =      new AtomicInteger(0);
 		final double [] ajj_threaded = new double[threads.length];
-
 		final int threads_chunk = (int) (n / (thread_scale * threads.length));
+		
+		final Integer [] slots_order = new Integer [threads.length]; // per-thread slot index
+		final int [] slot_j  = new int [threads.length];     // slot synchronized to this j (including)
+		final double [][][] slots_a = new double [threads.length][n][n]; // copies of L in progress
+		Arrays.fill(slot_j, -1);
+//		Arrays.setAll(slots_order, i->i);
+		
+//		for (int i = 0; i < slots_order.length; i++) {
+//			slots_order[i] = i;
+//		}
 		for (int j = 0; j < n; j++) {
 			final int fj = j;
+			Arrays.setAll(slots_order, i->i); //https://stackoverflow.com/questions/68331291/how-do-i-reorder-another-array-based-on-a-sorted-array-java
+			Arrays.sort(slots_order, (la, lb) -> slot_j[lb] - slot_j[la]); // decreasing order
 			final double [] aj = a[j];
 			double ajj = aj[j];
-			if (j > threads_chunk) { // use multithreaded
+			if (j > threads_chunk) { // use multithreaded TEMPORARY MAKE IT ALWAYS SINGLE
 				Arrays.fill(ajj_threaded, 0.0);
 				ai.set(0);
 				ati.set(0);
 				for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
 					threads[ithread] = new Thread() {
 						public void run() {
-							int nthread =  ati.getAndIncrement();
+							int ithread =  ati.getAndIncrement();
+							double [][] local_a;
 							for (int i0 = ai.getAndAdd(threads_chunk); i0 < fj; i0 = ai.getAndAdd(threads_chunk)) {
+								int slot = slots_order[ithread];
+								local_a = slots_a[slot];
+								int local_j = slot_j[slot];
+								if (local_j < fj-1) { // catch up if needed
+									for (int col = local_j+1; col < fj; col++){
+										for (int row = local_j; row < n; row++) {
+											local_a[row][col] = a[row][col]; // copying from global to local memory
+										}
+									}
+									slot_j[slot] = fj-1;
+								}
 								int i1 = Math.min(fj, i0+threads_chunk);
+								double [] local_aj = local_a[fj];
 								for (int i = i0; i < i1; i++) {
-									double aji = aj[i];
-									ajj_threaded[nthread] += a[i][i]*aji*aji; // squared
+									double aji = local_aj[i];
+									ajj_threaded[slot] += local_a[i][i]*aji*aji; // squared
 								}
 							}
 						}
@@ -202,20 +174,45 @@ public class CholeskyLDLTMulti {
 				}
 			}
 			aj[j] = ajj;
+			final double fajj = ajj; // use in all threads, a[j][j] is also valid
 			// TODO: above run single-threaded for small, then multithreaded with post-accumulation of ajj
 			if ((n - j+1) > threads_chunk) { // use multithreaded
+				// re-evaluate slots, they could change after diagonal calculations
+				Arrays.setAll(slots_order, i->i); //https://stackoverflow.com/questions/68331291/how-do-i-reorder-another-array-based-on-a-sorted-array-java
+				Arrays.sort(slots_order, (la, lb) -> slot_j[lb] - slot_j[la]); // decreasing order
 				ai.set(j+1);
+				ati.set(0);
 	    		for (int ithread = 0; ithread < threads.length; ithread++) {
 	    			threads[ithread] = new Thread() {
 	    				public void run() {
+							int ithread =  ati.getAndIncrement();
+							double [][] local_a;
 	    					for (int k0 = ai.getAndAdd(threads_chunk); k0 < n; k0 = ai.getAndAdd(threads_chunk)) {
+								int slot = slots_order[ithread];
+								local_a = slots_a[slot];
+								int local_j = slot_j[slot];
+								if (local_j < fj-1) { // catch up if needed
+									for (int col = local_j+1; col < fj; col++){
+										for (int row =col; row < n; row++) {
+											local_a[row][col] = a[row][col]; // copying from global to local memory
+										}
+									}
+									slot_j[slot] = fj-1;
+								}
+								double [] local_aj = local_a[fj];
 	    						int k1 = Math.min(n, k0+threads_chunk);
 	    						for (int k = k0; k < k1; k++) {
-	    							double [] ak = a[k];
+	    							double [] local_ak = local_a[k];
+	    							double akj = local_ak[fj];
 	    							for (int i = 0; i < fj; i++) { // ak, aj, i, 
-	    								ak[fj] -= a[i][i] * ak[i] * aj[i];
+//	    								ak[fj] -= a[i][i] * ak[i] * aj[i];
+//	    								local_ak[fj] -= local_a[i][i] * local_ak[i] * local_aj[i];
+	    								akj -= local_a[i][i] * local_ak[i] * local_aj[i];
+	    								
 	    							}
-	    							ak[fj] /= aj[fj];
+//	    							ak[fj] /= aj[fj];
+	    							a[k][fj] = local_ak[fj] / fajj; // write to global memory
+	    							a[k][fj] = akj / fajj; // write to global memory
 	    						}
 	    					}
 	    				}
@@ -232,44 +229,34 @@ public class CholeskyLDLTMulti {
 			    }
 			}
 		}
-		L = a;
-		return;
+//		L = a;
+		return a;
 	}
 	
-	
-	public CholeskyLDLTMulti(Matrix matA, long ll) { // single-threaded
-		n = matA.getRowDimension();
+	public double [][] CholeskyLDLTMulti_single(Matrix matA) { // single-threaded
+//		n = matA.getRowDimension();
 		// not needed
+//		L = matA.getArray(); // will be modified, copy externally if needed
 		final double [][] a = matA.getArray();
 		for (int j = 0; j < n; j++) {
-			final double [] aj = a[j];
-			double ajj = aj[j];
-			for (int i = 0; i < j; i++) {  // use aj, a, i
-				double aji = aj[i];
-//				a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
-				ajj -= a[i][i]*aji*aji; // squared
-				aj[j] -= a[i][i]*aji*aji; // squared
+			for (int i = 0; i < j; i++) {
+				a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
 			}
-			aj[j] = ajj;
 		    for (int k = j+1; k < n; k++) {
-				final double [] ak = a[k];
-		    	for (int i = 0; i < j; i++) { // ak, aj, i, 
-//		    		a[k][j] -= a[i][i]* a[k][i] * a[j][i]; // use digonal
-		    		ak[j] -= a[i][i]* ak[i] * aj[i];
+		    	for (int i = 0; i < j; i++) {
+		    		a[k][j] -= a[i][i]* a[k][i] * a[j][i];
 		    	}
-//				a[k][j] /= a[j][j];
-//				ak[j] /= aj[j];
-				ak[j] /= ajj;
+				a[k][j] /= a[j][j];
 		    }
 		}
-		L = a;
-		return;
+//		L = a;
+		return a;
 	}
+
+
 	
 	
-	
-	
-	public Matrix solve0 (Matrix B) { // multithreased
+	public Matrix solve0 (Matrix B) { // multithreaded
 		if (B.getRowDimension() != n) {
 			throw new IllegalArgumentException("Matrix row dimensions must agree.");
 		}
@@ -407,18 +394,56 @@ public class CholeskyLDLTMulti {
 		return new Matrix(x,n);
 	}
 		
+	public static void testCholesky(ImagePlus imp_src) {
+		float [] fpixels = (float[]) imp_src.getProcessor().getPixels();
+		int      width = imp_src.getWidth();
+		int      height = imp_src.getHeight();
+		int n = height;
+		double [][] a = new double [n][n];
+		double [][] b = new double [n][1];
+		for (int i = 0; i < n; i++) {
+			for (int j= 0; j < n; j++) {
+				a[i][j] = fpixels[i*width+j];
+			}
+			b[i][0] = fpixels[i*width+n];
+		}
+		testCholesky(
+				new Matrix(a), // Matrix wjtjlambda,
+				new Matrix(b)); // Matrix jty)
+	}
 	public static Matrix[] testCholesky(
 			Matrix wjtjlambda,
 			Matrix jty) {
 		Matrix wjtjlambda_copy0 = new Matrix(wjtjlambda.getArrayCopy());
-		CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda_copy0);
-		
+		Matrix wjtjlambda_copy1 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy2 = new Matrix(wjtjlambda.getArrayCopy());
 		Matrix wjtjlambda_copy = new Matrix(wjtjlambda.getArrayCopy());
-		CholeskyLDLTMulti choleskyLDLTMulti = new CholeskyLDLTMulti(wjtjlambda_copy);
+		double [] starts = new double[7];
+		double start_time = (((double) System.nanoTime()) * 1E-9);
 		
+		CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda_copy);
+		starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLDLTMulti choleskyLDLTMulti_single = new CholeskyLDLTMulti(wjtjlambda_copy0,0);
+		starts[1] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLDLTMulti choleskyLDLTMulti_multi = new CholeskyLDLTMulti(wjtjlambda_copy1,1);
+		starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLDLTMulti choleskyLDLTMulti_fast = new CholeskyLDLTMulti(wjtjlambda_copy2,2);
+		starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLDLTMulti choleskyLDLTMulti = choleskyLDLTMulti_fast;
 		Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
+		starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
 		Matrix mdelta_cholesky_multi0 = choleskyLDLTMulti.solve0(jty);
+		starts[5] = (((double) System.nanoTime()) * 1E-9) - start_time;
 		Matrix mdelta_cholesky_multi =  choleskyLDLTMulti.solve(jty);
+		starts[6] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		System.out.println("testCholesky(): choleskyDecomposition:    "+(starts[0])+" sec");
+		System.out.println("testCholesky(): choleskyLDLTMulti_single: "+(starts[1]-starts[0])+" sec");
+		System.out.println("testCholesky(): choleskyLDLTMulti_multi:  "+(starts[2]-starts[1])+" sec");
+		System.out.println("testCholesky(): choleskyLDLTMulti_fast:   "+(starts[3]-starts[2])+" sec");
+		
+		System.out.println("testCholesky(): mdelta_cholesky:          "+(starts[4]-starts[3])+" sec");
+		System.out.println("testCholesky(): mdelta_cholesky_multi0:   "+(starts[5]-starts[4])+" sec");
+		System.out.println("testCholesky(): mdelta_cholesky_multi:    "+(starts[6]-starts[5])+" sec");
 		return new Matrix[] {mdelta_cholesky, mdelta_cholesky_multi0, mdelta_cholesky_multi};
 	}
 	

--- a/src/main/java/com/elphel/imagej/common/CholeskyLLTMulti.java
+++ b/src/main/java/com/elphel/imagej/common/CholeskyLLTMulti.java
+package com.elphel.imagej.common;
+
+import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import com.elphel.imagej.tileprocessor.ImageDtt;
+
+import Jama.CholeskyDecomposition;
+import Jama.Matrix;
+import ij.ImagePlus;
+
+public class CholeskyLLTMulti {
+	public final int n;
+	public final double [][] L;
+	public int solve_step = 1; //0;
+	public double thread_scale = 10; // 2.5; // split jobs in thread_scale * num_treads chunks
+
+	public Matrix getL () {
+		return new Matrix(L,n,n);
+	}
+
+	public CholeskyLLTMulti(Matrix matA, int mode) {
+		n = matA.getRowDimension();
+		L = new double[n][n];
+		switch (mode) {
+		case 0: CholeskyLLTMulti_single(matA); break;
+		case 1: CholeskyLLTMulti_multi(matA);  break;
+		case 2: CholeskyLLTMulti_fast(matA);   break;
+		default:CholeskyLLTMulti_single(matA);
+		}
+	}	
+	
+	// Cholesky-Banachiewicz Algorithm ?
+	public double [][] CholeskyLLTMulti_single (Matrix mA) {
+	      double[][] a = mA.getArray();
+	      // Main loop.
+	      for (int j = 0; j < n; j++) {
+	         double[] Lrowj = L[j];
+	         double d = 0.0;
+	         for (int k = 0; k < j; k++) {
+	            double[] Lrowk = L[k];
+	            double s = 0.0;
+	            for (int i = 0; i < k; i++) {
+	               s += Lrowk[i]*Lrowj[i];
+	            }
+	            Lrowj[k] = s = (a[j][k] - s)/L[k][k];
+	            d = d + s*s;
+	         }
+	         d = a[j][j] - d;
+	         L[j][j] = Math.sqrt(Math.max(d,0.0));
+//	         for (int k = j+1; k < n; k++) {
+//	            L[j][k] = 0.0;
+//	         }
+	      }
+	      return L;
+	}
+
+	public double [][] CholeskyLLTMulti_multi (Matrix mA) {
+		double[][] a = mA.getArray();
+		final Thread[] threads =       ImageDtt.newThreadArray();
+		final AtomicInteger ai =       new AtomicInteger(0);
+		final AtomicInteger ati =      new AtomicInteger(0);
+		final int threads_chunk = (int) (n / (thread_scale * threads.length));
+		final double [] threads_d = new double [threads.length];
+
+		// Main loop.
+		for (int j = 0; j < n; j++) {
+			final int fj = j;
+			final double[] Lrowj = L[j];
+			double d = 0.0;
+			if (j > threads_chunk) {
+				ai.set(0);
+				ati.set(0);
+				Arrays.fill(threads_d, 0.0);
+				for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+					threads[ithread] = new Thread() {
+						public void run() {
+							int nthread =  ati.getAndIncrement();
+							for (int k0 = ai.getAndAdd(threads_chunk); k0 < fj; k0 = ai.getAndAdd(threads_chunk)) {
+								int k1 = Math.min(fj, k0+threads_chunk);
+								for (int k = k0; k < k1; k++) {
+									double[] Lrowk = L[k];
+									double s = 0.0;
+									for (int i = 0; i < k; i++) {
+										s += Lrowk[i]*Lrowj[i];
+									}
+									s = (a[fj][k] - s)/L[k][k];
+									Lrowj[k] = s;
+									threads_d[nthread] += s * s;
+								}
+							}
+						}
+					};
+				}		      
+				ImageDtt.startAndJoin(threads);
+				for (double dt:threads_d) d+=dt;
+			} else {
+				for (int k = 0; k < j; k++) {
+					double[] Lrowk = L[k];
+					double s = 0.0;
+					for (int i = 0; i < k; i++) {
+						s += Lrowk[i]*Lrowj[i];
+					}
+					Lrowj[k] = s = (a[j][k] - s)/L[k][k];
+					d = d + s*s;
+				}
+			}
+			d = a[j][j] - d;
+			L[j][j] = Math.sqrt(Math.max(d,0.0));
+//			for (int k = j+1; k < n; k++) {
+//				L[j][k] = 0.0;
+//			}
+		}
+		return a;
+	}
+	public double [][] CholeskyLLTMulti_fast (Matrix mA) {
+		double[][] a = mA.getArray();
+		final Thread[] threads =       ImageDtt.newThreadArray();
+		final AtomicInteger ai =       new AtomicInteger(0);
+		final AtomicInteger ati =      new AtomicInteger(0);
+		final int threads_chunk = (int) (n / (thread_scale * threads.length));
+		final double [] threads_d = new double [threads.length];
+
+		// Main loop.
+		for (int j = 0; j < n; j++) {
+			final int fj = j;
+			final double[] Lrowj = L[j];
+			double d = 0.0;
+			// WRONG! parallel only for the inner-most (i), because k should progress consequently
+			if (j > threads_chunk) {
+				ai.set(0);
+				ati.set(0);
+				Arrays.fill(threads_d, 0.0);
+				for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+					threads[ithread] = new Thread() {
+						public void run() {
+							int nthread =  ati.getAndIncrement();
+//							double[] localLrowj = Lrowj.clone(); 
+							for (int k0 = ai.getAndAdd(threads_chunk); k0 < fj; k0 = ai.getAndAdd(threads_chunk)) {
+								int k1 = Math.min(fj, k0+threads_chunk);
+								for (int k = k0; k < k1; k++) {
+									double[] Lrowk = L[k];
+									double s = 0.0;
+									for (int i = 0; i < k; i++) {
+										s += Lrowk[i]*Lrowj[i];
+									}
+									s = (a[fj][k] - s)/L[k][k];
+									Lrowj[k] = s;     // write ***
+									threads_d[nthread] += s * s;
+								}
+							}
+						}
+					};
+				}		      
+				ImageDtt.startAndJoin(threads);
+				for (double dt:threads_d) d+=dt;
+			} else {
+				for (int k = 0; k < j; k++) {
+					double[] Lrowk = L[k];
+					double s = 0.0;
+					for (int i = 0; i < k; i++) {
+						s += Lrowk[i]*Lrowj[i];
+					}
+					Lrowj[k] = s = (a[j][k] - s)/L[k][k];
+					d = d + s*s;
+				}
+			}
+			d = a[j][j] - d;
+			L[j][j] = Math.sqrt(Math.max(d,0.0)); // write ***
+//			for (int k = j+1; k < n; k++) {
+//				L[j][k] = 0.0;
+//			}
+		}
+		return a;
+	}
+
+
+
+	public Matrix solve (Matrix B, int mode) {
+		switch (mode) {
+		case 0: return solve_single(B); 
+		//		case 1: return solve_multi (B);
+		//		case 2: return solve_fast (B);
+		default:return solve_single(B);
+		}
+
+	}
+
+	/** Solve A*X = B
+	   @param  B   A Matrix with as many rows as A and any number of columns.
+	   @return     X so that L*L'*X = B
+	   @exception  IllegalArgumentException  Matrix row dimensions must agree.
+	   @exception  RuntimeException  Matrix is not symmetric positive definite.
+	 */
+
+	public Matrix solve_single (Matrix B) {
+		if (B.getRowDimension() != n) {
+			throw new IllegalArgumentException("Matrix row dimensions must agree.");
+		}
+		// Copy right hand side.
+		double[] x = B.getColumnPackedCopy (); //  (for single-column)
+		// Solve L*Y = B;
+		for (int k = 0; k < n; k++) {
+			for (int i = 0; i < k ; i++) {
+				x[k] -= x[i]*L[k][i];
+			}
+			x[k] /= L[k][k];
+		}
+		// Solve L'*X = Y;
+		for (int k = n-1; k >= 0; k--) {
+			for (int i = k+1; i < n ; i++) {
+				x[k] -= x[i]*L[i][k];
+			}
+			x[k] /= L[k][k];
+		}
+		return new Matrix(x,n);
+	}
+
+	public static Matrix solve_single (Matrix B, Matrix L_mat) {
+		double [][] L = L_mat.getArray();
+		return solve_single (B, L);
+	}
+	
+	public static Matrix solve_single (Matrix B, double [][] L) {
+		int n = L.length;
+		if (B.getRowDimension() != n) {
+			throw new IllegalArgumentException("Matrix row dimensions must agree.");
+		}
+		// Copy right hand side.
+		double[] x = B.getColumnPackedCopy (); //  (for single-column)
+		// Solve L*Y = B;
+		for (int k = 0; k < n; k++) {
+			for (int i = 0; i < k ; i++) {
+				x[k] -= x[i]*L[k][i];
+			}
+			x[k] /= L[k][k];
+		}
+		// Solve L'*X = Y;
+		for (int k = n-1; k >= 0; k--) {
+			for (int i = k+1; i < n ; i++) {
+				x[k] -= x[i]*L[i][k];
+			}
+			x[k] /= L[k][k];
+		}
+		return new Matrix(x,n);
+	}
+	
+	public static void testCholesky(ImagePlus imp_src) {
+		float [] fpixels = (float[]) imp_src.getProcessor().getPixels();
+		int      width = imp_src.getWidth();
+		int      height = imp_src.getHeight();
+		int n = height;
+		double [][] a = new double [n][n];
+		double [][] b = new double [n][1];
+		for (int i = 0; i < n; i++) {
+			for (int j= 0; j < n; j++) {
+				a[i][j] = fpixels[i*width+j];
+			}
+			b[i][0] = fpixels[i*width+n];
+		}
+		testCholesky(
+				new Matrix(a), // Matrix wjtjlambda,
+				new Matrix(b), // Matrix jty)
+				imp_src.getTitle()); // String title);
+	}
+
+	public static Matrix[] testCholesky(
+			Matrix wjtjlambda,
+			Matrix jty,
+			String title) {
+		int block_size = 70; //  64; // 60; // 70; // 80; // 120; // 150; // 200; // 100; // 4; // 10; // 100; // 28;
+		String dbg_title=title+"ch_diff_choleskyBlock-choleskyDecomposition-"+block_size;
+		Matrix wjtjlambda_copy0 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy1 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy2 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy = new Matrix(wjtjlambda.getArrayCopy());
+		double [] starts = new double[7];
+		double start_time = (((double) System.nanoTime()) * 1E-9);
+
+		CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda_copy);
+		starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLLTMulti choleskyLLTMulti_single = new CholeskyLLTMulti(wjtjlambda_copy0,0);
+		starts[1] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyBlock choleskyBlock  = new CholeskyBlock(wjtjlambda_copy1.getArray(),block_size);		
+		starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		choleskyBlock.choleskyBlockMulti();
+		starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		double [][] LTriangle = choleskyBlock.get_LTriangle();
+		Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
+		starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		Matrix mdelta_cholesky_single = CholeskyLLTMulti.solve_single(jty, choleskyLLTMulti_single.getL());
+		starts[5] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		Matrix mdelta_cholesky_multi =  CholeskyLLTMulti.solve_single(jty, LTriangle);
+		starts[6] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		System.out.println("testCholesky(): block_size=             "+block_size);
+		System.out.println("testCholesky(): choleskyDecomposition:  "+(starts[0])+" sec");
+		System.out.println("testCholesky(): choleskyLLTMulti_single:"+(starts[1]-starts[0])+" sec");
+		
+		System.out.println("testCholesky(): CholeskyBlock(): "+       (starts[2]-starts[1])+" sec");
+		System.out.println("testCholesky(): choleskyBlockMulti():   "+(starts[3]-starts[2])+" sec");
+		System.out.println("testCholesky(): get_LTriangle():        "+(starts[4]-starts[3])+" sec");
+		System.out.println("testCholesky(): solve_single(,single):  "+(starts[5]-starts[4])+" sec");
+		System.out.println("testCholesky(): solve_single(,block):   "+(starts[6]-starts[5])+" sec");
+		System.out.println("testCholesky(): title=                  "+block_size);
+		System.out.println("testCholesky(): dbg_title=              "+block_size);
+
+		Matrix ch_diff =      choleskyLLTMulti_single.getL().minus(choleskyDecomposition.getL());
+		Matrix ch_diff_fast = choleskyBlock.getL().minus(choleskyDecomposition.getL());
+		double [][] dbg_img = {
+				choleskyLLTMulti_single.getL().getRowPackedCopy(),
+				choleskyBlock.getL().getRowPackedCopy(),
+				choleskyDecomposition.getL().getRowPackedCopy(),
+				ch_diff.getRowPackedCopy(),
+				ch_diff_fast.getRowPackedCopy()};
+		String[] dbg_titles = {"choleskyLLTMulti_single","choleskyBlock","choleskyDecomposition",
+				"choleskyLLTMulti_single-choleskyDecomposition","choleskyBlock-choleskyDecomposition"};
+		ShowDoubleFloatArrays.showArrays(
+				dbg_img, // double[] pixels,
+				ch_diff.getRowDimension(), // int width,
+				ch_diff.getRowDimension(), // int height,
+				true,
+				dbg_title, // String title)
+				dbg_titles);
+		return new Matrix[] {mdelta_cholesky, mdelta_cholesky_single, mdelta_cholesky_multi};
+	}
+
+	public static Matrix[] testCholesky0(
+			Matrix wjtjlambda,
+			Matrix jty) {
+		int block_size = 128;
+		Matrix wjtjlambda_copy0 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy1 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy2 = new Matrix(wjtjlambda.getArrayCopy());
+		Matrix wjtjlambda_copy = new Matrix(wjtjlambda.getArrayCopy());
+		double [] starts = new double[8];
+		double start_time = (((double) System.nanoTime()) * 1E-9);
+
+		CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda_copy);
+		starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLLTMulti choleskyLLTMulti_single = new CholeskyLLTMulti(wjtjlambda_copy0,0);
+		starts[1] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLLTMulti choleskyLLTMulti_multi = new CholeskyLLTMulti(wjtjlambda_copy1,1);
+		starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLLTMulti choleskyLLTMulti_fast = new CholeskyLLTMulti(wjtjlambda_copy2,2);
+		starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		CholeskyLLTMulti choleskyLLTMulti = choleskyLLTMulti_multi; // fast;
+		Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
+		starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		Matrix mdelta_cholesky_single = choleskyLLTMulti.solve(jty, 0);
+		starts[5] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		Matrix mdelta_cholesky_multi =  choleskyLLTMulti.solve(jty, 1);
+		starts[6] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		Matrix mdelta_cholesky_fast =  choleskyLLTMulti.solve(jty, 2);
+		starts[7] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		System.out.println("testCholesky(): choleskyDecomposition:  "+(starts[0])+" sec");
+		System.out.println("testCholesky(): choleskyLLTMulti_single:"+(starts[1]-starts[0])+" sec");
+		
+		System.out.println("testCholesky(): choleskyLLTMulti_multi: "+(starts[2]-starts[1])+" sec");
+		System.out.println("testCholesky(): choleskyLLTMulti_fast:  "+(starts[3]-starts[2])+" sec");
+
+		System.out.println("testCholesky(): mdelta_cholesky:        "+(starts[4]-starts[3])+" sec");
+		System.out.println("testCholesky(): mdelta_cholesky_single: "+(starts[5]-starts[4])+" sec");
+		System.out.println("testCholesky(): mdelta_cholesky_multi:  "+(starts[6]-starts[5])+" sec");
+		System.out.println("testCholesky(): mdelta_cholesky_fast:   "+(starts[7]-starts[6])+" sec");
+		Matrix ch_diff =      choleskyLLTMulti.getL().minus(choleskyDecomposition.getL());
+		Matrix ch_diff_fast = choleskyLLTMulti_fast.getL().minus(choleskyDecomposition.getL());
+		double [][] dbg_img = {
+				choleskyLLTMulti.getL().getRowPackedCopy(),
+				choleskyLLTMulti_fast.getL().getRowPackedCopy(),
+				choleskyDecomposition.getL().getRowPackedCopy(),
+				ch_diff.getRowPackedCopy(),
+				ch_diff_fast.getRowPackedCopy()};
+		String[] dbg_titles = {"choleskyLLTMulti","choleskyLLTMulti_fast","choleskyDecomposition",
+				"choleskyLLTMulti-choleskyDecomposition","choleskyLLTMulti_fast-choleskyDecomposition"};
+		ShowDoubleFloatArrays.showArrays(
+				dbg_img, // double[] pixels,
+				ch_diff.getRowDimension(), // int width,
+				ch_diff.getRowDimension(), // int height,
+				true,
+				"ch_diff_choleskyLLTMulti-choleskyDecomposition", // String title)
+				dbg_titles);
+		
+		return new Matrix[] {mdelta_cholesky, mdelta_cholesky_single, mdelta_cholesky_multi, mdelta_cholesky_fast};
+	}
+	
+}
--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
@@ -96,6 +96,8 @@ import com.elphel.imagej.cameras.CLTParameters;
 import com.elphel.imagej.cameras.ColorProcParameters;
 import com.elphel.imagej.cameras.EyesisCorrectionParameters;
 import com.elphel.imagej.cameras.ThermalColor;
+import com.elphel.imagej.common.CholeskyLDLTMulti;
+import com.elphel.imagej.common.CholeskyLLTMulti;
 import com.elphel.imagej.common.DoubleFHT;
 import com.elphel.imagej.common.DoubleGaussianBlur;
 import com.elphel.imagej.common.GenericJTabbedDialog;
@@ -865,6 +867,8 @@ public class Eyesis_Correction implements PlugIn, ActionListener {
 			addButton("Vegetation LMA", panelOrange, color_process);
 			addButton("Combine LMA Segments", panelOrange, color_process);
 			addButton("Generate LWIR target", panelOrange, color_process);
+			addButton("Test LDLT Cholesky", panelOrange, color_process);
+			addButton("Test LLT Cholesky",  panelOrange, color_process);
 			plugInFrame.add(panelOrange);
 		}
 		plugInFrame.pack();
@@ -5807,8 +5811,20 @@ public class Eyesis_Correction implements PlugIn, ActionListener {
 					SYNC_COMMAND, // // SyncCommand  SYNC_COMMAND,
 					CLT_PARAMETERS, //CLTParameters    clt_parameters,
 					true); //boolean combine_segments);			
-		} else if (label.equals("Generate LWIR target")) {
-			VegetationSegment.generateSineTarget();	
+		} else if (label.equals("Test LDLT Cholesky")) {
+			ImagePlus imp_sel = WindowManager.getCurrentImage();
+			if (imp_sel == null) {
+				IJ.showMessage("Error", "No images selected");
+				return;
+			}
+			CholeskyLDLTMulti.testCholesky(imp_sel); // ImagePlus imp.);	 
+		} else if (label.equals("Test LLT Cholesky")) {
+			ImagePlus imp_sel = WindowManager.getCurrentImage();
+			if (imp_sel == null) {
+				IJ.showMessage("Error", "No images selected");
+				return;
+			}
+			CholeskyLLTMulti.testCholesky(imp_sel); // ImagePlus imp.);	 
 		}
 		//
 	}

--- a/src/main/java/com/elphel/imagej/vegetation/VegetationLMA.java
+++ b/src/main/java/com/elphel/imagej/vegetation/VegetationLMA.java
@@ -1907,9 +1907,32 @@ public class VegetationLMA {
 		Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
 		Matrix mdelta_cholesky_multi = choleskyLDLTMulti.solve(jty);
 		 */
-		CholeskyLDLTMulti.testCholesky(
-				wjtjlambda, // Matrix wjtjlambda,
-				jty);
+		boolean save_cholesky = true;
+		boolean debug_cholesky = true;
+		if (save_cholesky) {
+			int dbg_n = wjtjlambda.getRowDimension();
+			double [] dbg_a0 = wjtjlambda.getRowPackedCopy();
+			double [] dbg_a = new double[dbg_n*(dbg_n+1)];
+			for (int i = 0; i < dbg_n; i++) {
+				System.arraycopy(
+						dbg_a0,
+						i*dbg_n,
+						dbg_a,
+						i*(dbg_n+1),
+						dbg_n);
+				dbg_a[(i+1)*(dbg_n+1)-1] = jty.get(i,0);
+			}
+			ShowDoubleFloatArrays.showArrays(
+					dbg_a, // double[] pixels,
+					dbg_n+1, // int width,
+					dbg_n, // int height,
+					"spd_a_"+debug_iter); // String title)
+		}
+		if (debug_cholesky) {
+			CholeskyLDLTMulti.testCholesky(
+					wjtjlambda, // Matrix wjtjlambda,
+					jty);
+		}
 		if (debug_level>2) {
 			System.out.println("mdelta");
 			mdelta.print(18, 6);