Commit b09025fb authored by Andrey Filippov's avatar Andrey Filippov

Fixed multi-threaded solution

parent e27934db
......@@ -20,27 +20,61 @@ package com.elphel.imagej.common;
** along with this program. If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------**
*/
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import com.elphel.imagej.tileprocessor.ImageDtt;
import Jama.Matrix;
public class CholeskyBlock {
public static int dflt_m = 70; // tile size if not specified during initialization
public static int dflt_threads = 100; // maximal number of threads to use for decomposition
public static int dflt_solve_threads =16; // 4; // maximal number of threads to use for solve()
public static boolean debug = false; // true; // false;
public final int m; // tile size
public final int np; // number of elements in row/col
public final int n; // number of tiles in row/col
public final int nf; // number of full rows/cols
public int decomp_threads;
public final double [] A;
public final double [] L;
public final double [] B;
public CholeskyBlock (
double [][] A_in) {
m = dflt_m;
decomp_threads = dflt_threads;
np = A_in.length;
nf = np / m;
n = ((nf * m) < np)? (nf + 1) : nf;
A = new double [np*np]; // [n*n*m*m];
L = new double [A.length];
B = new double [np];
setup_ATriangle(A_in);
choleskyBlockMulti();
}
public CholeskyBlock (
double [][] A_in,
int size) {
m = size;
decomp_threads = dflt_threads;
np = A_in.length;
nf = np / m;
n = ((nf * m) < np)? (nf + 1) : nf;
A = new double [np*np]; // [n*n*m*m];
L = new double [A.length];
B = new double [np];
setup_ATriangle(A_in);
choleskyBlockMulti();
}
public CholeskyBlock (
double [][] A_in,
int size,
int decomp_threads) {
m = size;
this.decomp_threads = decomp_threads;
np = A_in.length;
nf = np / m;
n = ((nf * m) < np)? (nf + 1) : nf;
......@@ -183,22 +217,6 @@ public class CholeskyBlock {
return j * (m * np) + ((j >= nf) ? (np-nf*m): m) * m * i;
}
/**
* Tile bottom-right corner, counting i - up, j - to the left.
* Full tile are on the bottom and right (partial - on the top
* and left
* @param i tile row from bottom
* @param j tile column from right
* @return index of the bottom-right corner of the tile (the largest!)
*/
public int rindx_IJ(int i, int j) { // upside down
// return (j * (m * np)) + (np-1 -i) * ((j >= nf) ?(np-nf*m): m);
// return (np*np-1) - indx_IJ(i, j);
return np*np-1 - (j * (m * np) + ((j >= nf) ? (np-nf*m): m) * m * i);
}
public void setL21(
int i, // i > j,
int j) { // j <nf
......@@ -249,7 +267,7 @@ public class CholeskyBlock {
}
public void choleskyBlockMulti() {
final Thread[] threads = ImageDtt.newThreadArray();
final Thread[] threads = ImageDtt.newThreadArray(decomp_threads); // 1);
final AtomicInteger ai = new AtomicInteger(0);
cholesky_single(0);
// Calculate first column under diagonal (L21) - maybe use m
......@@ -330,8 +348,7 @@ public class CholeskyBlock {
}
// Cholesky-Banachiewicz Algorithm ?
// Single-threaded
// Single-threaded, used for a single-tile Cholesky deconstruction
public void cholesky_single(int diag) {
int h = (diag < nf) ? m : (np-nf*m);
......@@ -370,10 +387,8 @@ public class CholeskyBlock {
double x = B[xindx+k];
for (int i = 0; i < k ; i++) {
x -= B[xindx + i] * L[lindx + h*k + i];
// x[k] -= x[i]*L[k][i];
}
B[xindx+k] = x/L[lindx + (h+1)*k];
// x[k] /= L[k][k];
}
return;
}
......@@ -447,24 +462,6 @@ public class CholeskyBlock {
}
public void subXCol0(
int i, // i > j,
int j) { // j < i
int lindx = rindx_IJ(i,j);
int xsrc = np - 1 - j * m; // start of B/x to use
int xdst = np - 1 - i * m; // start of B/x to modify
int h = (i < nf) ? m : (np-nf*m);
for (int k = 0; k < h; k++) {
double x = B[xdst - k];
for (int l = 0; l < m; l++) {
x -= L[lindx - l * m - k] * B[xsrc - l];
}
B[xdst - k] = x;
}
return;
}
public Matrix solve (Matrix b) {
setB(b);
solve();
......@@ -472,61 +469,147 @@ public class CholeskyBlock {
}
public void solve() {
final Thread[] threads = ImageDtt.newThreadArray();
final AtomicInteger ai = new AtomicInteger(0);
// L, B are set up
// convert top fragment of B->Y using top-left tile of L
solveY(0); // int diag)
for (int tile_diag = 0; tile_diag < (n -1); tile_diag++) {
final int ftile_diag = tile_diag;
// subtract vertical column of tiles for top (previous) segment of B->Y from the remaining
// segments. After subtracting from the top segment, immediately calculate Y for it, all
// other segments subtract in parallel
ai.set(tile_diag+1); // start from the second tile segment
for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
threads[ithread] = new Thread() {
public void run() {
for (int tile_seg = ai.getAndIncrement(); tile_seg < n; tile_seg = ai.getAndIncrement()) {
subYCol(
tile_seg, // int i, // i > j,
ftile_diag); // int j)
if (tile_seg == (ftile_diag+1)) {
solveY(tile_seg); // int diag)
}
solve(dflt_solve_threads); // this.solve_threads);
}
public void solve(int solve_threads) {
final Thread[] threads = ImageDtt.newThreadArray(solve_threads);
final AtomicInteger ai = new AtomicInteger(0);
AtomicBoolean [][] tile_started = new AtomicBoolean[n][n];
AtomicBoolean [][] tile_done = new AtomicBoolean[n][n];
AtomicInteger [] row_len_started = new AtomicInteger[n]; //
AtomicInteger [] row_len_done = new AtomicInteger[n]; //
AtomicBoolean [] row_busy = new AtomicBoolean[n];
AtomicInteger rows_done = new AtomicInteger(0); // this number of tile rows solved
for (int row = 0; row < n; row++) {
for (int col = 0; col <= row; col++) {
tile_started[row][col] = new AtomicBoolean(false);
tile_done[row][col] = new AtomicBoolean(false);
}
row_len_started[row] = new AtomicInteger(0);
row_len_done[row] = new AtomicInteger(0);
row_busy[row] = new AtomicBoolean(false);
}
double [] starts = new double[5];
double start_time = (((double) System.nanoTime()) * 1E-9);
solveY(0); // int diag)
tile_started[0][0].set(true);
tile_done[0][0].set(true);
rows_done.set(1);
row_len_started[0].set(1);
row_len_done[0].set(1);
starts[0] = (((double) System.nanoTime()) * 1E-9) - start_time;
int [][] debug_order = debug? new int[n][n]:null;
if (debug) debug_order[0][0] = ai.getAndIncrement()+1;
for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
threads[ithread] = new Thread() {
public void run() {
while (rows_done.get() < n) {
got_tile:{
for (int row1 = rows_done.get(); row1 < n; row1++) {
// is it ready for diagonal solve?
if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
solveY(row1);
rows_done.set(row1+1); // no race is possible?
if (debug) debug_order[row1][row1] = ai.getAndIncrement()+1;
break got_tile;
}
if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
if (!tile_started[row1][col1].getAndSet(true)) {
row_len_started[row1].getAndAccumulate(col1+1, Math::max);
subYCol( row1, // int i, // i > j,
col1); // int j)
row_busy[row1].set(false); // release row
if (debug) debug_order[row1][col1] = ai.getAndIncrement()+1;
tile_done[row1][col1].set(true);
// verify if all below down to row_len_done are finished, set row_len_done to max
for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
if (tile_done[row1][col2].get()) {
row_len_done[row1].getAndAccumulate(col2+1, Math::max);
}
}
break got_tile;
}
}
row_busy[row1].set(false); // release row
}
}
break; // could not find a tile to process (number of parallels go down) - reduce number of remaining threads
}
}
};
}
ImageDtt.startAndJoin(threads);
}
}
};
}
ImageDtt.startAndJoin(threads);
// Solve L'X = Y
solveX(n-1); // int diag)
for (int tile_diag = n-1; tile_diag > 0; tile_diag--) {
final int ftile_diag = tile_diag;
// subtract vertical column of tiles for top (previous) segment of X->Y from the remaining
// segments. After subtracting from the top segment, immediately calculate Y for it, all
// other segments subtract in parallel
ai.set(tile_diag - 1); // start from the second tile segment
for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
threads[ithread] = new Thread() {
public void run() {
for (int tile_seg = ai.getAndDecrement(); tile_seg >= 0; tile_seg = ai.getAndDecrement()) {
subXCol(
tile_seg, // int i, // i > j,
ftile_diag); // int j)
if (tile_seg == (ftile_diag - 1)) {
solveX(tile_seg); // int diag)
}
rows_done.set(0);// this number of tile rows solved
for (int row = 0; row < n; row++) {
for (int col = 0; col <= row; col++) {
tile_started[row][col].set(false);
tile_done[row][col].set(false);
}
row_len_started[row].set(0);
row_len_done[row].set(0);
}
starts[2] = (((double) System.nanoTime()) * 1E-9) - start_time;
solveX(n-1); // int diag)
starts[3] = (((double) System.nanoTime()) * 1E-9) - start_time;
tile_started[0][0].set(true);
tile_done[0][0].set(true);
rows_done.set(1);
row_len_started[0].set(1);
row_len_done[0].set(1);
for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
threads[ithread] = new Thread() {
public void run() {
while (rows_done.get() < n) {
got_tile:{
for (int row1 = rows_done.get(); row1 < n; row1++) {
// is it ready for diagonal solve?
if ((row_len_done[row1].get() == row1) && !tile_started[row1][row1].getAndSet(true)) {
solveX(n - 1 - row1);
rows_done.set(row1+1); // no race is possible?
break got_tile;
}
if (!row_busy[row1].getAndSet(true)) { // do not write to the same row, avoid races
for (int col1 = row_len_started[row1].get(); (col1 < row1) && (col1 < rows_done.get()) ; col1++) {
if (!tile_started[row1][col1].getAndSet(true)) {
row_len_started[row1].getAndAccumulate(col1+1, Math::max);
subXCol(n - 1 - row1, // int i, // i > j,
n - 1 - col1); // int j)
row_busy[row1].set(false); // release row
tile_done[row1][col1].set(true);
// verify if all below down to row_len_done are finished, set row_len_done to max
for (int col2 = row_len_done[row1].get(); col2 < row1; col2++) {
if (tile_done[row1][col2].get()) {
row_len_done[row1].getAndAccumulate(col2+1, Math::max);
}
}
break got_tile;
}
}
row_busy[row1].set(false); // release row
}
}
break; // could not find a tile to process (number of parallels go down) - reduce number of remaining threads
}
}
};
}
ImageDtt.startAndJoin(threads);
}
};
}
ImageDtt.startAndJoin(threads);
starts[4] = (((double) System.nanoTime()) * 1E-9) - start_time;
if (debug) {
System.out.println("\ncholeskyBlock.solve1(): ==== number of threads = "+threads.length+ "====");
System.out.println(String.format("choleskyBlock.solve1(): solveY(0) %12.9f sec",(starts[0])));
System.out.println(String.format("choleskyBlock.solve1(): solveY_other %12.9f sec- including 1-st column",(starts[2] - starts[0])));
System.out.println(String.format("choleskyBlock.solve1(): solveX(n-1) %12.9f sec",(starts[3] - starts[2])));
System.out.println(String.format("choleskyBlock.solve1(): solveX_other %12.9f sec",(starts[4] - starts[3])));
}
return;
}
public static Matrix solve (Matrix B, double [][] L) {
int n = L.length;
if (B.getRowDimension() != n) {
......@@ -551,146 +634,4 @@ public class CholeskyBlock {
return new Matrix(x,n);
}
/*
public void setL21(
int i, // i > j,
int j) { // j <nf
int indx_diag = indx_IJ(j,j);
int indx_ij = indx_IJ(i,j);
int h = (i < nf) ? m : (np-nf*m);
// prepare solving Lx = b, copy tile A -> L
System.arraycopy(A, indx_ij, L, indx_ij, m * h);
for (int l_row = 0; l_row < m; l_row++) { // was <m <h!
for (int x_col= 0; x_col < h; x_col++) { // b-vector
int lindx = indx_ij + m * x_col + l_row;
double ls = L[lindx];
for (int l_col = 0; l_col < l_row; l_col++) {
ls -= L[indx_ij + m * x_col + l_col] * L[indx_diag + m* l_row+l_col];
}
L[lindx] = ls/L[indx_diag + (m + 1)* l_row];
}
}
return;
}
*/
// ======================== unused =====================
public static double [][] cholesky_single (
double [][] a,
double [][] l) {
int n = a.length;
for (int j = 0; j < n; j++) {
Arrays.fill(l[j], 0.0);
}
// Main loop.
for (int j = 0; j < n; j++) {
double[] Lrowj = l[j];
double d = 0.0;
for (int k = 0; k < j; k++) {
double[] Lrowk = l[k];
double s = 0.0;
for (int i = 0; i < k; i++) {
s += Lrowk[i]*Lrowj[i];
}
Lrowj[k] = s = (a[j][k] - s)/l[k][k];
d = d + s*s;
}
d = a[j][j] - d;
l[j][j] = Math.sqrt(Math.max(d,0.0));
}
return l;
}
/**
* Get a new diagonal square submatrix, copy only lower triangle including diagonal
* @param arr A or L packed array (line-scan order for each tile column)
* @param i tile index on the diagonal
* @return new array with tile data (m x m, or smaller for the bottom right)
*/
public double [][] getDiagTriangle(
double [] arr,
int i){
int l = (i >= nf) ? (np-nf*m): m;
double [][] a_diag = new double[l][l];
return getDiagLTriangle (arr, a_diag, i);
}
/**
* Get a new diagonal square submatrix, copy only lower triangle including diagonal
* @param arr A or L packed array (line-scan order for each tile column)
* @param a_diag - preallocated array (should be smaller if needed)
* @param i tile index on the diagonal
* @return new array with tile data (m x m, or smaller for the bottom right)
*/
public double [][] getDiagLTriangle(
double [] arr,
double [][] a_diag,
int i){
int indx = indx_IJ(i,i);
int l = a_diag.length;
for (int k = 0; k < l; k++) {
int kl = k*l;
System.arraycopy( arr, indx+kl, a_diag[k], 0, k+1);
}
return a_diag;
}
/**
* Save calculated tile L lower diagonal matrix to a packed array
* @param arr A or L packed array (line-scan order for each tile column)
* @param l_diag lower triangular array with Cholesky decomposition
* @param i tile index on a diagonal
*/
public void putDiagLTriangle(
double [] arr,
double [][] l_diag,
int i) {
int indx = indx_IJ(i,i);
int l = l_diag.length;
for (int k = 0; k < l; k++) {
int kl = k*l;
System.arraycopy(l_diag[k], 0, arr, indx+kl, k+1);
}
}
/**
* Get a new diagonal square submatrix
* @param arr A or L packed array (line-scan order for each tile column)
* @param i tile index on the diagonal
* @return new array with tile data (m x m, or smaller for the bottom right)
*/
public double [][] getDiagSquare(
double [] arr,
int i){
int l = (i >= nf) ? (np-nf*m): m;
double [][] a_diag = new double[l][l];
return getDiagSquare (arr, a_diag, i);
}
/**
* Get a new diagonal square submatrix
* @param arr A or L packed array (line-scan order for each tile column)
* @param a_diag - preallocated array (should be smaller if needed)
* @param i tile index on the diagonal
* @return new array with tile data (m x m, or smaller for the bottom right)
*/
public double [][] getDiagSquare(
double [] arr,
double [][] a_diag,
int i){
int indx = indx_IJ(i,i);
int l = a_diag.length;
for (int k = 0; k < l; k++) {
int kl = k*l;
System.arraycopy( arr, indx+kl, a_diag[k], 0, l);
}
return a_diag;
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment