package com.elphel.imagej.common;
/**
** CholeskyBlock - multithreaded Cholesky decomposition and solution
**
** Copyright (C) 2025 Elphel, Inc.
**
** Using publication with Block Cholesky description:
** Chen, Jianping, et al. "Block algorithm and its implementation for Cholesky factorization."
** ICCGI 2013 (2013): 245.
**
** Data (A and L) are stored as 1D arrays, in columns of specified width (m), in line-scan
** order in each column (right, then down), columns themselves - to the right.
** This makes m x m "tiles" compact in terms of cache, for my computer with the 8-core processor
** and N=2258 matrices the optimal m=70. Multithreaded methods operate on tiles.
** When matrix size is not multiple of m, the last (bottom-right) block has a remainder size,
** other bottom and rightmost are rectangular, all other are m x m.
**
** For the tested processor and matrix size the multithreaded improvement is ~9x, for the
** solve() method ~5x compared to a standard Jama single-threaded   CholeskyDecomposition class.
** The first run in multithreaded mode takes longer to execute than the subsequent runs.
** End of this file contains measured timing.  
**
** -----------------------------------------------------------------------------**
**
**  CholeskyBlock.java is free software: you can redistribute it and/or modify
**  it under the terms of the GNU General Public License as published by
**  the Free Software Foundation, either version 3 of the License, or
**  (at your option) any later version.
**
**  This program is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details.
**
**  You should have received a copy of the GNU General Public License
**  along with this program.  If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
*/
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import Jama.Matrix;

public class CholeskyBlock {
	// These 3 parameters are public and may be set before using the constructor.
	// They are read at construction time (dflt_m, dflt_threads) or at solve() time (dflt_solve_threads).
	public static int dflt_m =            70; // tile size if not specified during initialization
	public static int dflt_threads =     100; // maximal number of threads to use for decomposition (further limited by the number of processors)
	public static int dflt_solve_threads =16; //  4; // maximal number of threads to use for solve()
	
	public static boolean debug = false; // true; // when set, solve() prints its timing to System.out
	private final int m;  // tile size
	private final int np; // number of elements in row/col
	private final int n;  // number of tiles in  row/col
	private final int nf; // number of full rows/cols
	private       int decomp_threads; // thread limit passed to newThreadArray() for the decomposition
	private final double [] A; // tiled copy of the lower triangle of the source matrix, updated in place during decomposition
	private final double [] L; // tiled lower-triangular Cholesky factor, same layout as A
	private final double [] B; // right-hand side vector; overwritten with Y and then with the solution X by solve()
	/**
	 * Create Cholesky factorization from SPD matrix mA
	 * @param mA - SPD Matrix instance
	 */
	public CholeskyBlock (Matrix mA) {
		m = dflt_m;
		decomp_threads = dflt_threads;
		np = mA.getRowDimension();
		nf = np / m;
		n = ((nf * m) < np)? (nf + 1) : nf; 
		A = new double [np*np]; //  [n*n*m*m];
		L = new double [A.length];
		B = new double [np];
		setup_ATriangle(mA.getArray());
		choleskyBlockMulti(); 
		
	}
	
	/**
	 * Factorize an SPD matrix given as a 2D square array, using the default tile size
	 * (dflt_m) and the default decomposition thread limit (dflt_threads).
	 * @param A_in - 2D array corresponding to SPD
	 */
	public CholeskyBlock (
			double [][] A_in) {
		// same initialization sequence as the three-argument constructor, with both defaults
		this(A_in, dflt_m, dflt_threads);
	}
	
	/**
	 * Factorize an SPD matrix given as a 2D square array with an explicit tile size and
	 * the default decomposition thread limit (dflt_threads).
	 * @param A_in - 2D array corresponding to SPD
	 * @param size block size (tested optimal 70 easier debugging with 100)
	 */
	public CholeskyBlock (
			double [][] A_in,
			int size) {
		// same initialization sequence as the three-argument constructor, default thread limit
		this(A_in, size, dflt_threads);
	}

	/**
	 * Create Cholesky factorization from 2D square array representing SPD matrix.
	 * The factorization runs inside the constructor.
	 * @param A_in - 2D array corresponding to SPD (only the lower triangle with the diagonal is read)
	 * @param size block size (tested optimal 70 easier debugging with 100), must be positive
	 * @param decomp_threads specify number of decomposition threads; values below 1 are treated as 1
	 * @throws IllegalArgumentException if size is not positive
	 */
	public CholeskyBlock (
			double [][] A_in,
			int size,
			int decomp_threads) {
		if (size < 1) {
			// size == 0 would otherwise fail with an opaque division by zero, negative sizes produce nonsense geometry
			throw new IllegalArgumentException("Tile size must be positive, got " + size);
		}
		m = size;
		// All tile work is delegated to the worker threads: with no workers L would silently stay unfilled
		this.decomp_threads = Math.max(1, decomp_threads);
		np = A_in.length;
		nf = np / m;
		n = ((nf * m) < np)? (nf + 1) : nf; // extra partial tile row/column when np is not a multiple of m
		A = new double [np*np];
		L = new double [A.length];
		B = new double [np];
		setup_ATriangle(A_in);
		choleskyBlockMulti();
	}
	
	
	/**
	 * Fill the internal 1D array A (tile columns, line-scan inside each column) from a square SPD array.
	 * Tiles left of the diagonal are copied completely, diagonal tiles - only up to (and including)
	 * the diagonal element of each line. Tiles above the diagonal are not written.
	 * @param A_in square SPD array (only lower triangle with the diagonal are used)
	 */
	private void setup_ATriangle(double [][] A_in) {
		final int tail = np - m * nf; // height of the last (partial) tile row, used only when n > nf
		for (int tr = 0; tr < n; tr++) {
			final int rows =      (tr < nf) ? m : tail; // number of lines in this tile row
			final int first_row = tr * m;               // first matrix row (and diagonal column) of this tile row
			// tiles to the left of the diagonal: always m elements wide
			for (int tc = 0; tc < tr; tc++) {
				int dst = indx_IJ(tr, tc);
				for (int k = 0; k < rows; k++, dst += m) {
					System.arraycopy(A_in[first_row + k], tc * m, A, dst, m);
				}
			}
			// diagonal tile: line stride equals the tile size, copy up to the diagonal element
			int dst = indx_IJ(tr, tr);
			for (int k = 0; k < rows; k++, dst += rows) {
				System.arraycopy(A_in[first_row + k], first_row, A, dst, k + 1);
			}
		}
	}
	
	/**
	 * Convert the internal tiled representation of the Cholesky factor to a Jama Matrix.
	 * @return lower triangular Cholesky Matrix (np x np)
	 */
	public Matrix getL() {
		final double [][] l_rows = get_LTriangle();
		return new Matrix(l_rows, np, np);
	}
	
	/**
	 * Unpack the internal tiled L array into a newly allocated 2D square array.
	 * Only the lower triangle (with the diagonal) is copied, everything above it stays zero.
	 * @return 2d square array containing Cholesky low-left matrix data
	 */
	private double [][] get_LTriangle() {
		final double [][] result = new double[np][np];
		final int tail = np - m * nf; // height of the last (partial) tile row, used only when n > nf
		for (int tr = 0; tr < n; tr++) {
			final int rows =      (tr < nf) ? m : tail; // number of lines in this tile row
			final int first_row = tr * m;               // first matrix row (and diagonal column) of this tile row
			// tiles to the left of the diagonal: always m elements wide
			for (int tc = 0; tc < tr; tc++) {
				int src = indx_IJ(tr, tc);
				for (int k = 0; k < rows; k++, src += m) {
					System.arraycopy(L, src, result[first_row + k], tc * m, m);
				}
			}
			// diagonal tile: line stride equals the tile size, copy up to the diagonal element
			int src = indx_IJ(tr, tr);
			for (int k = 0; k < rows; k++, src += rows) {
				System.arraycopy(L, src, result[first_row + k], first_row, k + 1);
			}
		}
		return result;
	}
	
	
	
	/**
	 * Load the right-hand side into the internal B/Y/X vector (the same storage holds B, then Y,
	 * then the solution X while solve() runs). Only the first column of the argument is read.
	 * @param mb right-side single-column Matrix (not modified)
	 */
	public void setB(Matrix mb) {
		final double [][] rows = mb.getArray();
		int i = 0;
		while (i < np) {
			B[i] = rows[i][0];
			i++;
		}
	}
	
	/**
	 * Wrap the current contents of the internal B/Y/X vector in a single-column Matrix.
	 * After solve() this vector holds the solution X.
	 * @return a single-column Matrix containing the solution
	 */
	public Matrix getX() {
		final Matrix x = new Matrix (B, np);
		return x;
	}
	
	/**
	 * Get index of the top-left tile corner in the 1D A and L arrays.
	 * Each tile column occupies m*np elements; inside a column tiles follow each other
	 * with a step of (column width) * m.
	 * @param i tile row
	 * @param j tile column
	 * @return index in A and L arrays
	 */
	private int indx_IJ(int i, int j) {
		final int col_width = (j < nf) ? m : (np - nf * m); // the last, partial column is narrower
		final int col_start = j * (m * np);
		return col_start + col_width * m * i;
	}
	
	/**
	 * Calculate one non-diagonal tile of the block-Cholesky factorization: the A tile is copied to L
	 * and each of its lines is then replaced by the forward-substitution solution against
	 * the (already factorized) diagonal L tile of the same tile column.
	 * @param i - tile row
	 * @param j - tile column < i (tile row)
	 */
	private void setL21(
			int       i,   // i > j,
			int       j) { // j <nf
		final int diag0 = indx_IJ(j, j);
		final int tile0 = indx_IJ(i, j);
		final int rows =  (i >= nf) ? (np - nf * m) : m; // bottom tile row may be shorter
		// start from the A data, the solution is then built in place in L
		System.arraycopy(A, tile0, L, tile0, m * rows);
		// every tile line is an independent right-hand side
		for (int r = 0; r < rows; r++) {
			final int row0 = tile0 + m * r;
			for (int c = 0; c < m; c++) {
				final int d_row0 = diag0 + m * c; // line c of the diagonal tile
				double acc = L[row0 + c];
				for (int k = 0; k < c; k++) {
					acc -= L[row0 + k] * L[d_row0 + k];
				}
				L[row0 + c] = acc / L[d_row0 + c];
			}
		}
	}
	
	/**
	 * Update one tile of the remaining A-matrix (lower-left triangle of tiles, including diagonal ones):
	 * subtract the product of the L tile (row, diag) and the transposed L tile (col, diag).
	 * For a diagonal tile only elements up to the diagonal are updated.
	 * @param diag row/column of the diagonal tile that defines the remainder of the A matrix (to the right
	 *        and below the diagonal tile
	 * @param row row of the transformed A tile (> diag)
	 * @param col column of the transformed A tile (> diag, <= row)
	 */
	private void setA22(
			int diag,// < col,  < row
			int row, // >= col
			int col) {
		final int rows =       (row < nf) ? m : (np - nf * m);
		final int a0 =         indx_IJ(row, col);
		final int l_row0 =     indx_IJ(row, diag);
		final boolean on_diag = (row == col);
		// a diagonal tile multiplies the L tile by itself and is stored with its own size as line stride
		final int l_col0 =     on_diag ? l_row0 : indx_IJ(col, diag);
		final int stride =     on_diag ? rows : m;
		for (int i = 0; i < rows; i++) {
			final int last_j = on_diag ? i : (m - 1);
			for (int j = 0; j <= last_j; j++) {
				final int a_indx = a0 + i * stride + j;
				double acc = A[a_indx];
				for (int k = 0; k < m; k++) {
					acc -= L[l_row0 + i * m + k] * L[l_col0 + j * m + k];
				}
				A[a_indx] = acc;
			}
		}
	}
	
	/**
	 * Run multithreaded Cholesky factorization, convert internal representation of the A-data to L-data.
	 * <p>
	 * Sequence of operations (each parallel stage is finished with startAndJoin() before the next starts):
	 * <ol>
	 * <li>Factorize the top-left diagonal tile (single thread), then calculate all L tiles
	 *     below it (tile column 0) in parallel.</li>
	 * <li>For each next diagonal tile: in parallel update the A tiles of that tile column using the
	 *     previous L tile column; the thread that updated the diagonal tile factorizes it right away.</li>
	 * <li>Unless it was the last tile column: in parallel calculate the L tiles under the new diagonal
	 *     tile (one work item) and update the remaining A tiles to the right of this column using the
	 *     previous L tile column (one work item per tile).</li>
	 * </ol>
	 * Work items are distributed between threads through a shared AtomicInteger counter.
	 */
	private void choleskyBlockMulti() {
		final Thread[] threads =       newThreadArray(decomp_threads); // 1);
		final AtomicInteger ai =       new AtomicInteger(0); // shared work item counter
		cholesky_single(0);
		// Calculate first column under diagonal (L21) - maybe use m
		ai.set(1); // start from the second tile row
		for (int ithread = 0; ithread < threads.length; ithread++) { // each thread takes next tile row until none left
			threads[ithread] = new Thread() {
				public void run() {
					for (int tile_row = ai.getAndIncrement(); tile_row < n; tile_row = ai.getAndIncrement()) {
						setL21(tile_row, 0);
					}
				}
			};
		}		      
		startAndJoin(threads);
		for (int tile_diag = 1; tile_diag < n; tile_diag++) {
			final int ftile_diag = tile_diag; // final copy to be used inside the anonymous threads
			// Calculate A in one tile column of the remaining A2'
			// start with diagonal (top) tile, and calculate its Cholesky
			// In parallel, calculate A for all tiles in that column below diagonal
//			if (tile_diag == (n-1)) {
//				System.out.println("choleskyBlockMulti() last pass n= "+n+", tile_diag="+tile_diag);
//			}
			ai.set(ftile_diag); // work items are tile rows, starting from the diagonal one
			for (int ithread = 0; ithread < threads.length; ithread++) { // each thread takes next tile row until none left
				threads[ithread] = new Thread() {
					public void run() {
						for (int nRow= ai.getAndIncrement(); nRow < n; nRow = ai.getAndIncrement()) {
							setA22(
									ftile_diag-1, // int diag,// < col,  < row
									nRow,         // int row, // >= col
									ftile_diag);  // int col) 							
							if (nRow == ftile_diag) {
								// the diagonal A tile is final now, factorize it in the same thread
								cholesky_single(ftile_diag);
							}
						}
					}
				};
			}		      
			startAndJoin(threads);
			if (ftile_diag < (n-1)) {
				// Now in parallel calculate L in column ftile_diag under diagonal and
				// finish A2' to the right of ftile_diag column
				final int left_rows = n - ftile_diag - 1; // tile rows/columns after the current diagonal tile
				// triangle of A tiles to update plus one item (number 0) for the whole L column
				final int num_tiles = left_rows * (left_rows + 1) / 2 + 1;

				ai.set(0);
				for (int ithread = 0; ithread < threads.length; ithread++) { // each thread takes next work item until none left
					threads[ithread] = new Thread() {
						public void run() {
							for (int ntile = ai.getAndIncrement(); ntile < num_tiles; ntile = ai.getAndIncrement()) {
								if (ntile == 0) {
									// Calculate first column of L under diagonal (L21) - maybe use m
									// (all tiles of this column are processed sequentially by the same thread)

									for (int tr = ftile_diag + 1; tr < n; tr++) {
										setL21(
												tr,          // row > column 
												ftile_diag); // column
									}
								} else  {
									// convert linear index (ntile-1) to row/column in the lower triangle (inverse triangular number)
									int nrow = (int) Math.floor(-0.5 + 0.5* Math.sqrt(1 + 8 * (ntile-1)));
									int ncol = (ntile-1) - (nrow * (nrow + 1) /2);
									int row = ftile_diag + nrow + 1;
									int col = ftile_diag + ncol + 1;
									setA22(
											ftile_diag-1, // int diag,// < col,  < row
											row, // int row, // >= col
											col); // int col) 							
								}
							}
						}
					};
				}		      
				startAndJoin(threads);
			}
		}
		return;
	}
	/**
	 * A single-threaded, single-tile Cholesky factorization converting diagonal A-tile to
	 * the corresponding diagonal L-tile. The L tile is cleared first, so elements above
	 * the diagonal are zero. A negative value under the square root is replaced with zero.
	 * @param diag row/column of the tile to convert.
	 */
	private void cholesky_single(int diag) {
		int h = (diag < nf) ? m : (np-nf*m); // the last diagonal tile may be smaller
		int indx = indx_IJ(diag, diag);
		// toIndex of Arrays.fill() is exclusive: clear all h*h elements (also valid for an empty tile)
		Arrays.fill(L, indx, indx + h*h, 0.0);
		for (int j = 0; j < h; j++) {
			int indx_j = indx+ j * h;   // start of the tile line j
			int indx_jj = indx_j + j;   // diagonal element of the line j
			double d = 0.0;             // sum of squares of the already calculated elements of the line j
			for (int k = 0; k < j; k++) {
				int indx_k = indx+ k * h;
				double s = 0.0;
				for (int i = 0; i < k; i++) {
					s += L[indx_k + i] * L[indx_j + i];
				}
				s = (A[indx_j + k]-s)/L[indx_k+k];
				L[indx_j + k] = s;
				d = d + s*s;
			}
			d = A[indx_jj] - d;
			L[indx_jj] = Math.sqrt(Math.max(d,0.0));
		}
	}

	
	/**
	 * Forward substitution for one diagonal tile: solve L * Y = B using the segment of B
	 * that corresponds to this tile. The segment is overwritten with Y.
	 * @param diag number of the diagonal tile (last may be smaller)
	 */
	private void solveY(int diag) {
		final int size = (diag >= nf) ? (np - nf * m) : m;
		final int l0 =   indx_IJ(diag, diag);
		final int b0 =   diag * m; // start of B/x
		for (int r = 0; r < size; r++) {
			final int l_r = l0 + size * r; // start of the L tile line r
			double acc = B[b0 + r];
			for (int c = 0; c < r; c++) {
				acc -= B[b0 + c] * L[l_r + c];
			}
			B[b0 + r] = acc / L[l_r + r];
		}
	}

	/**
	 * Back substitution for one diagonal tile: solve Lt * X = Y using the segment of B
	 * that corresponds to this tile. The segment (holding Y) is overwritten with X.
	 * @param diag number of the diagonal tile
	 *        last may be smaller.
	 */
	private void solveX(int diag) {
		final int size = (diag >= nf) ? (np - nf * m) : m;
		final int l0 =   indx_IJ(diag, diag);
		final int b0 =   diag * m; // start of B/x
		int r = size - 1;
		while (r >= 0) {
			double acc = B[b0 + r];
			// column r of the L tile is the line r of its transpose
			for (int c = r + 1; c < size; c++) {
				acc -= B[b0 + c] * L[l0 + size * c + r];
			}
			B[b0 + r] = acc / L[l0 + size * r + r];
			r--;
		}
	}
	
	/**
	 * Forward-substitution update: subtract from the B segment of tile row i the product of
	 * the L tile (i,j) and the already solved Y segment of tile row j.
	 * @param i tile row to subtract from in B (i > j)
	 * @param j tile row of the solved segment, also the tile column of the L tile used
	 */
	private void subYCol(
			int       i,   // i > j,
			int       j) { // j < i
		final int l0 =   indx_IJ(i, j);
		final int src =  j * m; // start of B/x to use
		final int dst =  i * m; // start of B/x to modify
		final int rows = (i >= nf) ? (np - nf * m) : m;
		int l_line = l0; // start of the current L tile line
		for (int r = 0; r < rows; r++, l_line += m) {
			double acc = B[dst + r];
			for (int c = 0; c < m; c++) {
				acc -= L[l_line + c] * B[src + c];
			}
			B[dst + r] = acc;
		}
	}

	/**
	 * Back-substitution update: subtract from the B segment of tile row i the product of
	 * the transposed L tile (j,i) and the already solved X segment of tile row j.
	 * @param i tile row to subtract from in B (i < j)
	 * @param j tile row of the solved segment, also the tile row of the L tile used
	 */
	private void subXCol(
			int       i,   // i < j,
			int       j) { // j > i
		final int l0 =   indx_IJ(j, i);
		final int src =  j * m; // start of B/x to use
		final int dst =  i * m; // start of B/x to modify
		final int rows = (j >= nf) ? (np - nf * m) : m; // number of lines in the L tile
		for (int c = 0; c < m; c++) {
			double acc = B[dst + c];
			// walk down the column c of the L tile
			for (int r = 0, l_indx = l0 + c; r < rows; r++, l_indx += m) {
				acc -= L[l_indx] * B[src + r];
			}
			B[dst + c] = acc;
		}
	}
	/**
	 * Solve L * Lt * X = B for a single-column right-hand side using the existing decomposition.
	 * The internal B/Y/X vector is overwritten in the process.
	 * @param b a single-column matrix B that L * Lt * X = B
	 * @return A single-column solution X
	 */
	public Matrix solve (Matrix b) {
		setB(b);   // load the right-hand side
		solve();   // B -> Y -> X in place
		final Matrix x = getX();
		return x;
	}
	
	/**
	 * Solve in place for the internally stored right-hand side (see setB()) using the default
	 * thread limit dflt_solve_threads; the B data is replaced with the X data.
	 */
	public void solve() {
		final int thread_limit = dflt_solve_threads;
		solve(thread_limit);
	}
	
	/**
	 * Solve with existing Cholesky decomposition for an internal representation of matrix B,
	 * replace B data with X data, limit number of threads to use.
	 * <p>
	 * Two stages, each run by its own set of threads: forward substitution (L * Y = B) and
	 * back substitution (Lt * X = Y). In both stages the work is split into tiles: a diagonal
	 * tile solve (solveY()/solveX()) and off-diagonal updates (subYCol()/subXCol()). The threads
	 * pick tiles dynamically using the atomic bookkeeping arrays declared below. The second stage
	 * reuses the same bookkeeping with mirrored tile numbers (row1 -> n - 1 - row1).
	 * <p>
	 * A thread terminates as soon as a full scan finds no tile it can process, so the number of
	 * running threads goes down towards the end of each stage.
	 * @param solve_threads maximal number of threads to use
	 */
	public void solve(int solve_threads) {
		final Thread[] threads =             newThreadArray(solve_threads);
		final AtomicInteger ai =             new AtomicInteger(0); // used only to number tiles in debug mode
		AtomicBoolean [][] tile_started =    new AtomicBoolean[n][n]; // tile is claimed by some thread (only col <= row are allocated)
		AtomicBoolean [][] tile_done =       new AtomicBoolean[n][n]; // tile processing is finished (only col <= row are allocated)
		AtomicInteger []   row_len_started = new AtomicInteger[n]; // per row: where to start looking for a not yet claimed tile
		AtomicInteger []   row_len_done =    new AtomicInteger[n]; // per row: number of finished off-diagonal tiles, compared to row number before the diagonal solve
		AtomicBoolean []   row_busy =       new AtomicBoolean[n]; // per row: some thread is scanning/updating this row
		AtomicInteger rows_done =            new AtomicInteger(0); // this number of tile rows solved
		for (int row = 0; row < n; row++) {
			for (int col = 0; col <= row; col++) {
				tile_started[row][col] = new AtomicBoolean(false);
				tile_done[row][col] =    new AtomicBoolean(false);
			}
			row_len_started[row] = new AtomicInteger(0);
			row_len_done[row] =    new AtomicInteger(0);
			row_busy[row] =        new AtomicBoolean(false);
		}
		double [] starts = new double[5]; // time stamps relative to start_time, element 1 is not used
		double start_time = (((double) System.nanoTime()) * 1E-9);
		// The first diagonal tile has no dependencies, solve it before starting the threads
		solveY(0); // int diag)
		tile_started[0][0].set(true);
		tile_done[0][0].set(true);
		rows_done.set(1);
		row_len_started[0].set(1);
		row_len_done[0].set(1);
		starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
		int [][] debug_order = debug? new int[n][n]:null; // order in which tiles were processed (filled here, not printed by this method)
		if (debug) debug_order[0][0] = ai.getAndIncrement()+1;
		for (int ithread = 0; ithread < threads.length; ithread++) { // forward substitution threads
			threads[ithread] = new Thread() {
				public void run() {
					while (rows_done.get() < n) {
						got_tile:{
							for (int row1 = rows_done.get(); row1 < n; row1++) {
								// is it ready for diagonal solve?
								// (all off-diagonal tiles of this row are finished and nobody claimed the diagonal one)
								if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
									solveY(row1);
									// no race is possible? - the last off-diagonal tile of a row is only started when
									// rows_done reached that row (see loop condition below), so rows get here in order
									rows_done.set(row1+1);
									if (debug) debug_order[row1][row1] = ai.getAndIncrement()+1;
									break got_tile;
								}
								if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
									// only columns with already solved segments (col1 < rows_done) can be used
									for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
										if (!tile_started[row1][col1].getAndSet(true)) {
											row_len_started[row1].getAndAccumulate(col1+1, Math::max);										
											subYCol( row1,  // int       i,   // i > j,
													col1); // 	int      j)
											row_busy[row1].set(false); // release row
											if (debug) debug_order[row1][col1] = ai.getAndIncrement()+1;
											tile_done[row1][col1].set(true);
											// verify if all below down to row_len_done are finished, set row_len_done to max
											for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
												if (tile_done[row1][col2].get()) {
													row_len_done[row1].getAndAccumulate(col2+1, Math::max);
												}
											}
											break got_tile; // processed a tile, rescan starting from the current rows_done
										}
									}
									row_busy[row1].set(false); // release row
								}
							}
							// This break leaves the while loop and so terminates the thread
							break; // could not find a tile to process (number of parallels go down) -  reduce number of remaining threads
						}
					}
				}
			};
		}		      
		startAndJoin(threads);
		// Solve L'X = Y
		// Reset the bookkeeping, in this stage tile row/column numbers are counted from the bottom-right
		rows_done.set(0);// this number of tile rows solved
		for (int row = 0; row < n; row++) {
			for (int col = 0; col <= row; col++) {
				tile_started[row][col].set(false);
				tile_done[row][col].set(false);
			}
			row_len_started[row].set(0);
			row_len_done[row].set(0);
		}
		starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
		// The last diagonal tile has no dependencies in back substitution, solve it before starting the threads
		solveX(n-1); // int diag)
		starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
		tile_started[0][0].set(true);
		tile_done[0][0].set(true);
		rows_done.set(1);
		row_len_started[0].set(1);
		row_len_done[0].set(1);
		for (int ithread = 0; ithread < threads.length; ithread++) { // back substitution threads
			threads[ithread] = new Thread() {
				public void run() {
					while (rows_done.get() < n) {
						got_tile:{
							for (int row1 = rows_done.get(); row1 < n; row1++) {
								// is it ready for diagonal solve?
								if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
									solveX(n - 1 - row1); // mirrored tile number
									rows_done.set(row1+1); // no race is possible?
									break got_tile;
								}
								if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
									for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
										if (!tile_started[row1][col1].getAndSet(true)) {
											row_len_started[row1].getAndAccumulate(col1+1, Math::max);										
											// mirrored tile numbers: here the first argument is smaller than the second
											subXCol(n - 1 - row1,  // int       i,   // i < j,
													n - 1 - col1); // 	int      j)
											row_busy[row1].set(false); // release row
											tile_done[row1][col1].set(true);
											// verify if all below down to row_len_done are finished, set row_len_done to max
											for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
												if (tile_done[row1][col2].get()) {
													row_len_done[row1].getAndAccumulate(col2+1, Math::max);
												}
											}
											break got_tile; // processed a tile, rescan starting from the current rows_done
										}
									}
									row_busy[row1].set(false); // release row
								}
							}
							// This break leaves the while loop and so terminates the thread
							break; // could not find a tile to process (number of parallels go down) -  reduce number of remaining threads
						}
					}
				}
			};
		}		      
		startAndJoin(threads);
		starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
		if (debug) {
			// report duration of each stage
			System.out.println("\ncholeskyBlock.solve(): ==== number of threads = "+threads.length+ "====");
			System.out.println(String.format("choleskyBlock.solve(): solveY(0)       %12.9f sec",(starts[0])));
			System.out.println(String.format("choleskyBlock.solve(): solveY_other    %12.9f sec- including 1-st column",(starts[2] - starts[0])));
			System.out.println(String.format("choleskyBlock.solve(): solveX(n-1)     %12.9f sec",(starts[3] - starts[2])));
			System.out.println(String.format("choleskyBlock.solve(): solveX_other    %12.9f sec",(starts[4] - starts[3])));
		}
		return;
	}

	// for comparison/verification
	/**
	 * Reference single-threaded solution (forward, then back substitution) with a Cholesky
	 * factor given as a plain 2D array; follows the Jama implementation and is used for comparison.
	 * @param B A single-column Matrix for the right side of the equation A * X = L * Lt * X = B
	 * @param L Cholesky decomposition lower-left Matrix
	 * @return a single-column solution X.
	 * @throws IllegalArgumentException if the number of rows in B differs from the size of L
	 */
	public static Matrix solve_single (Matrix B, double [][] L) {
		final int size = L.length;
		if (size != B.getRowDimension()) {
			throw new IllegalArgumentException("Matrix row dimensions must agree.");
		}
		// Work on a copy of the right hand side (single column)
		final double[] v = B.getColumnPackedCopy ();
		// Forward substitution: L*Y = B
		for (int r = 0; r < size; r++) {
			final double [] l_r = L[r];
			double acc = v[r];
			for (int c = 0; c < r; c++) {
				acc -= v[c] * l_r[c];
			}
			v[r] = acc / l_r[r];
		}
		// Back substitution: L'*X = Y
		int r = size - 1;
		while (r >= 0) {
			double acc = v[r];
			for (int c = r + 1; c < size; c++) {
				acc -= v[c] * L[c][r];
			}
			v[r] = acc / L[r][r];
			r--;
		}
		return new Matrix(v, size);
	}
	
	/*
	 * Multithreading methods from Stephan Preibisch's Multithreading.java class. See:
	 * http://repo.or.cz/w/trakem2.git?a=blob;f=mpi/fruitfly/general/MultiThreading.java;hb=HEAD
	 */
	/**
	 * Create a Thread[] array as large as the number of processors available,
	 * but not longer than 100 elements.
	 * @return an array of threads (elements are not initialized)
	 */
	public static Thread[] newThreadArray() {
		final int max_threads = 100;
		return newThreadArray(max_threads);
	}
	/**
	 * Create a Thread[] array as large as the number of processors available, limited by maxCPUs.
	 * The array always has at least one element: callers delegate all their work to these
	 * threads, so an empty array would make them silently skip the work.
	 * @param maxCPUs limit number of threads (values below 1 are treated as 1)
	 * @return an array of threads (elements are not initialized)
	 */
	public static Thread[] newThreadArray(int maxCPUs) { // USED in lwir
		final int available = Runtime.getRuntime().availableProcessors();
		final int count = Math.max(1, Math.min(available, maxCPUs));
		return new Thread[count];
	}
	/**
	 * Start all given threads and wait on each of them until all are done.
	 * If the calling thread is interrupted while waiting, its interrupt status is restored
	 * and a RuntimeException wrapping the InterruptedException is thrown; threads that are
	 * already started keep running.
	 * @param threads to start
	 * @throws RuntimeException if the calling thread is interrupted while waiting
	 */
	public static void startAndJoin(Thread[] threads) // USED in lwir
	{
		for (Thread thread : threads)
		{
			thread.setPriority(Thread.NORM_PRIORITY);
			thread.start();
		}

		try
		{
			for (Thread thread : threads)
			{
				thread.join();
			}
		} catch (InterruptedException ie)
		{
			// join() cleared the interrupt status, restore it so that callers can still see the interruption
			Thread.currentThread().interrupt();
			throw new RuntimeException(ie);
		}
	}

	
	/* Results
    First run, next MULTITHREADED ones are faster. Ran on an 8-core x86, 16 threads
	choleskyBlockMulti() last pass n= 33, tile_diag=32
	testCholesky(): matrix size=            2258
	testCholesky(): block_size=             70
	testCholesky(): choleskyDecomposition:  1.7590559809468687 sec   Jama CholeskyDecomposition, single-threaded
	testCholesky(): CholeskyBlock():        0.34954120498150587 sec  FIRST run is faster than single-threaded, but slower than next
	testCholesky(): cholesky.solve():       0.030882437014952302 sec Jama CholeskyDecomposition.solve()
	testCholesky(): block.solve():          0.051133116940036416 sec FIRST run is slower than single-threaded!
	testCholesky(): title=                  spd_a_2.tif
	testCholesky(): dbg_title=              spd_a_2.tifch_diff_choleskyBlock-choleskyDecomposition-70

	DEBUG_LEVEL = 1, CLT_PARAMETERS.lwir.getDebugLevel() = 0 LOG_LEVEL=ERROR LOG_LEVEL_SET=false
	choleskyBlockMulti() last pass n= 33, tile_diag=32
	testCholesky(): matrix size=            2258
	testCholesky(): block_size=             70
	testCholesky(): choleskyDecomposition:  1.7251908951438963 sec
	testCholesky(): CholeskyBlock():        0.19312086096033454 sec
	testCholesky(): cholesky.solve():       0.03081041993573308 sec
	testCholesky(): block.solve():          0.005443983944132924 sec
	testCholesky(): title=                  spd_a_2.tif
	testCholesky(): dbg_title=              spd_a_2.tifch_diff_choleskyBlock-choleskyDecomposition-70
 */

	
}
