Fixed multi-threaded solution

b09025fb · Andrey Filippov · e27934db · b09025fb
Commit b09025fb authored Mar 22, 2025 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 172 additions and 231 deletions

CholeskyBlock.java src/main/java/com/elphel/imagej/common/CholeskyBlock.java +172 -231

No files found.
--- a/src/main/java/com/elphel/imagej/common/CholeskyBlock.java
+++ b/src/main/java/com/elphel/imagej/common/CholeskyBlock.java
@@ -20,27 +20,61 @@ package com.elphel.imagej.common;
 **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 ** -----------------------------------------------------------------------------**
 */
-
 import java.util.Arrays;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicInteger;
-
 import com.elphel.imagej.tileprocessor.ImageDtt;
-
 import Jama.Matrix;

 public class CholeskyBlock {
+	public static int dflt_m =            70; // tile size if not specified during initialization
+	public static int dflt_threads =     100; // maximal number of threads to use for decomposition
+	public static int dflt_solve_threads =16; //  4; // maximal number of threads to use for solve()
+	public static boolean debug = false; // true; // false;
 	public final int m;  // tile size
 	public final int np; // number of elements in row/col
 	public final int n;  // number of tiles in  row/col
 	public final int nf; // number of full rows/cols
+	public       int decomp_threads;
 	public final double [] A;
 	public final double [] L;
 	public final double [] B;
+
+	public CholeskyBlock (
+			double [][] A_in) {
+		m = dflt_m;
+		decomp_threads = dflt_threads;
+		np = A_in.length;
+		nf = np / m;
+		n = ((nf * m) < np)? (nf + 1) : nf; 
+		A = new double [np*np]; //  [n*n*m*m];
+		L = new double [A.length];
+		B = new double [np];
+		setup_ATriangle(A_in);
+		choleskyBlockMulti(); 
+	}
 	
 	public CholeskyBlock (
 			double [][] A_in,
 			int size) {
 		m = size;
+		decomp_threads = dflt_threads;
+		np = A_in.length;
+		nf = np / m;
+		n = ((nf * m) < np)? (nf + 1) : nf; 
+		A = new double [np*np]; //  [n*n*m*m];
+		L = new double [A.length];
+		B = new double [np];
+		setup_ATriangle(A_in);
+		choleskyBlockMulti(); 
+	}
+
+	public CholeskyBlock (
+			double [][] A_in,
+			int size,
+			int decomp_threads) {
+		m = size;
+		this.decomp_threads = decomp_threads;
 		np = A_in.length;
 		nf = np / m;
 		n = ((nf * m) < np)? (nf + 1) : nf; 
@@ -183,22 +217,6 @@ public class CholeskyBlock {
 		return j * (m * np) + ((j >= nf) ? (np-nf*m): m) * m * i;
 	}

-	/**
-	 * Tile bottom-right corner, counting i - up, j - to the left.
-	 * Full tile are on the bottom and right (partial - on the top
-	 * and left
-	 * @param i tile row from bottom
-	 * @param j tile column from right
-	 * @return index of the bottom-right corner of the tile (the largest!)
-	 */
-	public int rindx_IJ(int i, int j) { // upside down
-//		return (j * (m * np)) + (np-1 -i)  * ((j >= nf) ?(np-nf*m): m);
-//		return (np*np-1) - indx_IJ(i, j);
-		return np*np-1 - (j * (m * np) + ((j >= nf) ? (np-nf*m): m) * m * i);
-	}
-
-
-	
 	public void setL21(
 			int       i,   // i > j,
 			int       j) { // j <nf
@@ -249,7 +267,7 @@ public class CholeskyBlock {
 	}
 	
 	public void choleskyBlockMulti() {
-		final Thread[] threads =       ImageDtt.newThreadArray();
+		final Thread[] threads =       ImageDtt.newThreadArray(decomp_threads); // 1);
 		final AtomicInteger ai =       new AtomicInteger(0);
 		cholesky_single(0);
 		// Calculate first column under diagonal (L21) - maybe use m
@@ -330,8 +348,7 @@ public class CholeskyBlock {
 	}
 	
 	
-	// Cholesky-Banachiewicz Algorithm ?
-	// Single-threaded
+	// Single-threaded, used for a single-tile Cholesky deconstruction

 	public void cholesky_single(int diag) {
 		int h = (diag < nf) ? m : (np-nf*m);			
@@ -370,10 +387,8 @@ public class CholeskyBlock {
 			double x = B[xindx+k];
 			for (int i = 0; i < k ; i++) {
 				x -= B[xindx + i] * L[lindx + h*k + i];
-//				x[k] -= x[i]*L[k][i];
 			}
 			B[xindx+k] = x/L[lindx + (h+1)*k];
-//			x[k] /= L[k][k];
 		}
 		return;
 	}
@@ -447,24 +462,6 @@ public class CholeskyBlock {
 	}
 	
 	
-	
-	public void subXCol0(
-			int       i,   // i > j,
-			int       j) { // j < i
-		int lindx = rindx_IJ(i,j);
-		int xsrc = np - 1 - j * m; // start of B/x to use
-		int xdst = np - 1 - i * m; // start of B/x to modify
-		int h = (i < nf) ? m : (np-nf*m);
-		for (int k = 0; k < h; k++) {
-			double x = B[xdst - k];
-			for (int l = 0; l < m; l++) {
-				x -= L[lindx - l * m - k] * B[xsrc - l];
-			}
-			B[xdst - k] = x;
-		}
-		return;
-	}
-	
 	public Matrix solve (Matrix b) {
 		setB(b);
 		solve();
@@ -472,61 +469,147 @@ public class CholeskyBlock {
 	}
 	
 	public void solve() {
-		final Thread[] threads =       ImageDtt.newThreadArray();
-		final AtomicInteger ai =       new AtomicInteger(0);
-		// L, B are set up
-		// convert top fragment of B->Y using top-left tile of L
-		solveY(0); // int diag)	
-		for (int tile_diag = 0; tile_diag < (n -1); tile_diag++) {
-			final int ftile_diag = tile_diag;
-			// subtract vertical column of tiles for top (previous) segment of B->Y from the remaining
-			// segments. After subtracting from the top segment, immediately calculate Y for it, all
-			// other segments subtract in parallel
-			ai.set(tile_diag+1); // start from the second tile segment
-			for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
-				threads[ithread] = new Thread() {
-					public void run() {
-						for (int tile_seg = ai.getAndIncrement(); tile_seg < n; tile_seg = ai.getAndIncrement()) {
-							 subYCol(
-									 tile_seg, // int       i,   // i > j,
-									 ftile_diag); // 	int       j)
-							 if (tile_seg == (ftile_diag+1)) {
-								 solveY(tile_seg); // int diag)	
-							 }
+		solve(dflt_solve_threads); // this.solve_threads);
+	}
+	
+	public void solve(int solve_threads) {
+		final Thread[] threads =             ImageDtt.newThreadArray(solve_threads);
+		final AtomicInteger ai =             new AtomicInteger(0);
+		AtomicBoolean [][] tile_started =    new AtomicBoolean[n][n]; 
+		AtomicBoolean [][] tile_done =       new AtomicBoolean[n][n];
+		AtomicInteger []   row_len_started = new AtomicInteger[n]; //
+		AtomicInteger []   row_len_done =    new AtomicInteger[n]; //
+		AtomicBoolean []   row_busy =       new AtomicBoolean[n];
+		AtomicInteger rows_done =            new AtomicInteger(0); // this number of tile rows solved
+		for (int row = 0; row < n; row++) {
+			for (int col = 0; col <= row; col++) {
+				tile_started[row][col] = new AtomicBoolean(false);
+				tile_done[row][col] =    new AtomicBoolean(false);
+			}
+			row_len_started[row] = new AtomicInteger(0);
+			row_len_done[row] =    new AtomicInteger(0);
+			row_busy[row] =        new AtomicBoolean(false);
+		}
+		double [] starts = new double[5];
+		double start_time = (((double) System.nanoTime()) * 1E-9);
+		solveY(0); // int diag)
+		tile_started[0][0].set(true);
+		tile_done[0][0].set(true);
+		rows_done.set(1);
+		row_len_started[0].set(1);
+		row_len_done[0].set(1);
+		starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		int [][] debug_order = debug? new int[n][n]:null;
+		if (debug) debug_order[0][0] = ai.getAndIncrement()+1;
+		for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+			threads[ithread] = new Thread() {
+				public void run() {
+					while (rows_done.get() < n) {
+						got_tile:{
+							for (int row1 = rows_done.get(); row1 < n; row1++) {
+								// is it ready for diagonal solve?
+								if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
+									solveY(row1);
+									rows_done.set(row1+1); // no race is possible?
+									if (debug) debug_order[row1][row1] = ai.getAndIncrement()+1;
+									break got_tile;
+								}
+								if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
+									for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
+										if (!tile_started[row1][col1].getAndSet(true)) {
+											row_len_started[row1].getAndAccumulate(col1+1, Math::max);										
+											subYCol( row1,  // int       i,   // i > j,
+													col1); // 	int      j)
+											row_busy[row1].set(false); // release row
+											if (debug) debug_order[row1][col1] = ai.getAndIncrement()+1;
+											tile_done[row1][col1].set(true);
+											// verify if all below down to row_len_done are finished, set row_len_done to max
+											for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
+												if (tile_done[row1][col2].get()) {
+													row_len_done[row1].getAndAccumulate(col2+1, Math::max);
+												}
+											}
+											break got_tile;
+										}
+									}
+									row_busy[row1].set(false); // release row
+								}
+							}
+							break; // could not find a tile to process (number of parallels go down) -  reduce number of remaining threads
 						}
 					}
-				};
-			}		      
-			ImageDtt.startAndJoin(threads);
-		}
+				}
+			};
+		}		      
+		ImageDtt.startAndJoin(threads);
 		// Solve L'X = Y
-		solveX(n-1); // int diag)	
-		for (int tile_diag = n-1; tile_diag > 0; tile_diag--) {
-			final int ftile_diag = tile_diag;
-			// subtract vertical column of tiles for top (previous) segment of X->Y from the remaining
-			// segments. After subtracting from the top segment, immediately calculate Y for it, all
-			// other segments subtract in parallel
-			ai.set(tile_diag - 1); // start from the second tile segment
-			for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
-				threads[ithread] = new Thread() {
-					public void run() {
-						for (int tile_seg = ai.getAndDecrement(); tile_seg >= 0; tile_seg = ai.getAndDecrement()) {
-							 subXCol(
-									 tile_seg, // int       i,   // i > j,
-									 ftile_diag); // 	int       j)
-							 if (tile_seg == (ftile_diag - 1)) {
-								 solveX(tile_seg);   // int diag)	
-							 }
+		rows_done.set(0);// this number of tile rows solved
+		for (int row = 0; row < n; row++) {
+			for (int col = 0; col <= row; col++) {
+				tile_started[row][col].set(false);
+				tile_done[row][col].set(false);
+			}
+			row_len_started[row].set(0);
+			row_len_done[row].set(0);
+		}
+		starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		solveX(n-1); // int diag)
+		starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		tile_started[0][0].set(true);
+		tile_done[0][0].set(true);
+		rows_done.set(1);
+		row_len_started[0].set(1);
+		row_len_done[0].set(1);
+		for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
+			threads[ithread] = new Thread() {
+				public void run() {
+					while (rows_done.get() < n) {
+						got_tile:{
+							for (int row1 = rows_done.get(); row1 < n; row1++) {
+								// is it ready for diagonal solve?
+								if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
+									solveX(n - 1 - row1);
+									rows_done.set(row1+1); // no race is possible?
+									break got_tile;
+								}
+								if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
+									for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
+										if (!tile_started[row1][col1].getAndSet(true)) {
+											row_len_started[row1].getAndAccumulate(col1+1, Math::max);										
+											subXCol(n - 1 - row1,  // int       i,   // i > j,
+													n - 1 - col1); // 	int      j)
+											row_busy[row1].set(false); // release row
+											tile_done[row1][col1].set(true);
+											// verify if all below down to row_len_done are finished, set row_len_done to max
+											for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
+												if (tile_done[row1][col2].get()) {
+													row_len_done[row1].getAndAccumulate(col2+1, Math::max);
+												}
+											}
+											break got_tile;
+										}
+									}
+									row_busy[row1].set(false); // release row
+								}
+							}
+							break; // could not find a tile to process (number of parallels go down) -  reduce number of remaining threads
 						}
 					}
-				};
-			}		      
-			ImageDtt.startAndJoin(threads);
+				}
+			};
+		}		      
+		ImageDtt.startAndJoin(threads);
+		starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
+		if (debug) {
+			System.out.println("\ncholeskyBlock.solve1(): ==== number of threads = "+threads.length+ "====");
+			System.out.println(String.format("choleskyBlock.solve1(): solveY(0)       %12.9f sec",(starts[0])));
+			System.out.println(String.format("choleskyBlock.solve1(): solveY_other    %12.9f sec- including 1-st column",(starts[2] - starts[0])));
+			System.out.println(String.format("choleskyBlock.solve1(): solveX(n-1)     %12.9f sec",(starts[3] - starts[2])));
+			System.out.println(String.format("choleskyBlock.solve1(): solveX_other    %12.9f sec",(starts[4] - starts[3])));
 		}
 		return;
 	}
-	
-	
+
 	public static Matrix solve (Matrix B, double [][] L) {
 		int n = L.length;
 		if (B.getRowDimension() != n) {
@@ -551,146 +634,4 @@ public class CholeskyBlock {
 		return new Matrix(x,n);
 	}
 	
-/*
-	public void setL21(
-			int       i,   // i > j,
-			int       j) { // j <nf
-		int indx_diag = indx_IJ(j,j);
-		int indx_ij =   indx_IJ(i,j);
-		int h = (i < nf) ? m : (np-nf*m);
-		// prepare solving Lx = b, copy tile A -> L
-		System.arraycopy(A, indx_ij, L, indx_ij, m * h);
-		for (int l_row = 0; l_row < m; l_row++) { // was <m <h!
-			for (int x_col= 0; x_col < h; x_col++) { // b-vector
-				int lindx = indx_ij + m * x_col + l_row;
-				double ls = L[lindx];
-				for (int l_col = 0; l_col < l_row; l_col++) {
-					ls -= L[indx_ij + m * x_col + l_col] * L[indx_diag + m* l_row+l_col];
-				}
-				L[lindx] = ls/L[indx_diag + (m + 1)* l_row];
-			}
-		}
-		return;
-	}
-
- */
-	
-	// ======================== unused =====================
-	
-	
-	public static double [][] cholesky_single (
-			double [][] a,
-			double [][] l) {
-		  int n = a.length;
-		  for (int j = 0; j < n; j++) {
-			  Arrays.fill(l[j], 0.0);
-		  }
-	      // Main loop.
-	      for (int j = 0; j < n; j++) {
-	         double[] Lrowj = l[j];
-	         double d = 0.0;
-	         for (int k = 0; k < j; k++) {
-	            double[] Lrowk = l[k];
-	            double s = 0.0;
-	            for (int i = 0; i < k; i++) {
-	               s += Lrowk[i]*Lrowj[i];
-	            }
-	            Lrowj[k] = s = (a[j][k] - s)/l[k][k];
-	            d = d + s*s;
-	         }
-	         d = a[j][j] - d;
-	         l[j][j] = Math.sqrt(Math.max(d,0.0));
-	      }
-	      return l;
-	}
-
-	/**
-	 * Get a new diagonal square submatrix, copy only lower triangle including diagonal  
-	 * @param arr A or L packed array (line-scan order for each tile column)
-	 * @param i tile index on the diagonal
-	 * @return new array with tile data (m x m, or smaller for the bottom right) 
-	 */
-	public double [][] getDiagTriangle(
-			double [] arr,
-			int i){
-		int l =  (i >= nf) ? (np-nf*m): m;
-		double [][] a_diag = new double[l][l];
-		return getDiagLTriangle (arr, a_diag, i);
-	}
-
-	/**
-	 * Get a new diagonal square submatrix, copy only lower triangle including diagonal 
-	 * @param arr A or L packed array (line-scan order for each tile column)
-	 * @param a_diag - preallocated array (should be smaller if needed) 
-	 * @param i tile index on the diagonal
-	 * @return new array with tile data (m x m, or smaller for the bottom right) 
-	 */
-	public double [][] getDiagLTriangle(
-			double [] arr,
-			double [][] a_diag,
-			int i){
-		int indx = indx_IJ(i,i);
-		int l = a_diag.length;
-		for (int k = 0; k < l; k++) {
-			int kl = k*l;
-			System.arraycopy( arr, indx+kl, a_diag[k], 0, k+1);
-		}
-		return a_diag;
-	}
-	
-	/**
-	 * Save calculated tile L lower diagonal matrix to a packed array
-	 * @param arr    A or L packed array (line-scan order for each tile column)
-	 * @param l_diag lower triangular array with Cholesky decomposition
-	 * @param i tile index on a diagonal
-	 */
-	public void putDiagLTriangle(
-			double []   arr,
-			double [][] l_diag,
-			int         i) {
-		int indx = indx_IJ(i,i);
-		int l = l_diag.length;
-		for (int k = 0; k < l; k++) {
-			int kl = k*l;
-			System.arraycopy(l_diag[k], 0, arr, indx+kl,  k+1);
-		}
-	}
-	
-	/**
-	 * Get a new diagonal square submatrix 
-	 * @param arr A or L packed array (line-scan order for each tile column)
-	 * @param i tile index on the diagonal
-	 * @return new array with tile data (m x m, or smaller for the bottom right) 
-	 */
-	public double [][] getDiagSquare(
-			double [] arr,
-			int i){
-		int l =  (i >= nf) ? (np-nf*m): m;
-		double [][] a_diag = new double[l][l];
-		return getDiagSquare (arr, a_diag, i);
-	}
-
-	/**
-	 * Get a new diagonal square submatrix 
-	 * @param arr A or L packed array (line-scan order for each tile column)
-	 * @param a_diag - preallocated array (should be smaller if needed) 
-	 * @param i tile index on the diagonal
-	 * @return new array with tile data (m x m, or smaller for the bottom right) 
-	 */
-	public double [][] getDiagSquare(
-			double []   arr,
-			double [][] a_diag,
-			int         i){
-		int indx = indx_IJ(i,i);
-		int l = a_diag.length;
-		for (int k = 0; k < l; k++) {
-			int kl = k*l;
-			System.arraycopy( arr, indx+kl, a_diag[k], 0, l);
-		}
-		return a_diag;
-	}
-	
-	
-	
-	
 }