Commit 663a964a authored by Andrey Filippov's avatar Andrey Filippov

Testing parallel Cholesky

parent 54367395
package com.elphel.imagej.common;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicInteger;
import com.elphel.imagej.tileprocessor.ImageDtt;
import Jama.CholeskyDecomposition;
import Jama.Matrix;
public class CholeskyLDLTMulti {
public final int n;
public final double [][] L;
public int solve_step = 10;
public double thread_scale = 2.5; // split jobs in thread_scale * num_treads chunks
// public double [][] L;
public CholeskyLDLTMulti(Matrix matA, boolean b) {
boolean debug = true;
int dbg_j = 100; // 1520;
n = matA.getRowDimension();
L = matA.getArray(); // will be modified, copy externally if needed
final Thread[] threads = ImageDtt.newThreadArray();
final AtomicInteger ai = new AtomicInteger(0);
for (int j = 0; j < n; j++) {
if (j == dbg_j) {
System.out.println("CholeskyLDLTMulti(): j=dbg_j="+j);
}
final int fj=j;
for (int k = 0; k < j; k++) {
final int fk=k;
L[j][j] -= L[j][k]*L[j][k]; // diagonal
final double Ljk = L[j][k];
// for (int i = j+1; i < n; i++) {
// l[i][j] -= l[i][k]*l[j][k]; // not including diagonal
// }
// parallel
ai.set(j+1);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int i = ai.getAndIncrement(); i < n; i = ai.getAndIncrement()) {
L[i][fj] -= L[i][fk]*Ljk;
}
}
};
}
ImageDtt.startAndJoin(threads);
}
// parallel
// for (int i = j+1; i < n; i++) {
// l[i][j] /= l[j][j];
// }
final double Ljj = L[j][j];
ai.set(j+1);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int i = ai.getAndIncrement(); i < n; i = ai.getAndIncrement()) {
L[i][fj] /= Ljj;
}
}
};
}
ImageDtt.startAndJoin(threads);
if (debug) {
System.out.println("CholeskyLDLTMulti(): j="+j);
}
}
return;
}
// L overwites strict lower A, d overwrites diagonal A
public CholeskyLDLTMulti(Matrix matA, int[] ii) {
// boolean debug = true;
int dbg_j = 100; // 1520;
n = matA.getRowDimension();
// not needed
L = matA.getArray(); // will be modified, copy externally if needed
final double [][] a = matA.getArray();
final Thread[] threads = ImageDtt.newThreadArray();
final AtomicInteger ai = new AtomicInteger(0);
for (int j = 0; j < n; j++) {
final int fj = j;
if (j == dbg_j) {
System.out.println("CholeskyLDLTMulti(): j=dbg_j="+j);
}
/*
if (j > 0) {
ai.set(0);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int i = ai.getAndIncrement(); i < fj; i = ai.getAndIncrement()) {
a[fj][fj] -= a[i][i]*a[fj][i]*a[fj][i]; // squared
}
}
};
}
ImageDtt.startAndJoin(threads);
}
*/
for (int i = 0; i < j; i++) {
a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
}
/* for (int i = 0; i < j; i++) {
a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
} */
for (int k = j+1; k < n; k++) {
final int fk = k;
/*
if (j > 0) {
ai.set(0);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int i = ai.getAndIncrement(); i < fj; i = ai.getAndIncrement()) {
a[fk][fj] -= a[i][i]* a[fk][i] * a[fj][i];
}
}
};
}
ImageDtt.startAndJoin(threads);
}
*/
for (int i = 0; i < j; i++) {
a[k][j] -= a[i][i]* a[k][i] * a[j][i];
}
a[k][j] /= a[j][j];
}
/* for (int k = j+1; k < n; k++) {
for (int i = 0; i < j; i++) {
a[k][j] -= a[i][i]* a[k][i] * a[j][i];
}
a[k][j] /= a[j][j];
} */
}
return;
}
public CholeskyLDLTMulti(Matrix matA, int ii) { // single-threaded
n = matA.getRowDimension();
// not needed
// L = matA.getArray(); // will be modified, copy externally if needed
final double [][] a = matA.getArray();
for (int j = 0; j < n; j++) {
for (int i = 0; i < j; i++) {
a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
}
for (int k = j+1; k < n; k++) {
for (int i = 0; i < j; i++) {
a[k][j] -= a[i][i]* a[k][i] * a[j][i];
}
a[k][j] /= a[j][j];
}
}
L = a;
return;
}
public CholeskyLDLTMulti(Matrix matA) {
n = matA.getRowDimension();
final double [][] a = matA.getArray();
final Thread[] threads = ImageDtt.newThreadArray();
final AtomicInteger ai = new AtomicInteger(0);
final AtomicInteger ati = new AtomicInteger(0);
final double [] ajj_threaded = new double[threads.length];
final int threads_chunk = (int) (n / (thread_scale * threads.length));
for (int j = 0; j < n; j++) {
final int fj = j;
final double [] aj = a[j];
double ajj = aj[j];
if (j > threads_chunk) { // use multithreaded
Arrays.fill(ajj_threaded, 0.0);
ai.set(0);
ati.set(0);
for (int ithread = 0; ithread < threads.length; ithread++) { // first sum for pairs
threads[ithread] = new Thread() {
public void run() {
int nthread = ati.getAndIncrement();
for (int i0 = ai.getAndAdd(threads_chunk); i0 < fj; i0 = ai.getAndAdd(threads_chunk)) {
int i1 = Math.min(fj, i0+threads_chunk);
for (int i = i0; i < i1; i++) {
double aji = aj[i];
ajj_threaded[nthread] += a[i][i]*aji*aji; // squared
}
}
}
};
}
ImageDtt.startAndJoin(threads);
for (int nthread = 0; nthread < ajj_threaded.length; nthread++) {
ajj -= ajj_threaded[nthread];
}
} else {
for (int i = 0; i < j; i++) { // use aj, a, i
double aji = aj[i];
ajj -= a[i][i]*aji*aji; // squared
}
}
aj[j] = ajj;
// TODO: above run single-threaded for small, then multithreaded with post-accumulation of ajj
if ((n - j+1) > threads_chunk) { // use multithreaded
ai.set(j+1);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int k0 = ai.getAndAdd(threads_chunk); k0 < n; k0 = ai.getAndAdd(threads_chunk)) {
int k1 = Math.min(n, k0+threads_chunk);
for (int k = k0; k < k1; k++) {
double [] ak = a[k];
for (int i = 0; i < fj; i++) { // ak, aj, i,
ak[fj] -= a[i][i] * ak[i] * aj[i];
}
ak[fj] /= aj[fj];
}
}
}
};
}
ImageDtt.startAndJoin(threads);
} else { // use single-threaded
for (int k = j+1; k < n; k++) {
final double [] ak = a[k];
for (int i = 0; i < j; i++) { // ak, aj, i,
ak[j] -= a[i][i]* ak[i] * aj[i];
}
ak[j] /= ajj;
}
}
}
L = a;
return;
}
public CholeskyLDLTMulti(Matrix matA, long ll) { // single-threaded
n = matA.getRowDimension();
// not needed
final double [][] a = matA.getArray();
for (int j = 0; j < n; j++) {
final double [] aj = a[j];
double ajj = aj[j];
for (int i = 0; i < j; i++) { // use aj, a, i
double aji = aj[i];
// a[j][j] -= a[i][i]*a[j][i]*a[j][i]; // squared
ajj -= a[i][i]*aji*aji; // squared
aj[j] -= a[i][i]*aji*aji; // squared
}
aj[j] = ajj;
for (int k = j+1; k < n; k++) {
final double [] ak = a[k];
for (int i = 0; i < j; i++) { // ak, aj, i,
// a[k][j] -= a[i][i]* a[k][i] * a[j][i]; // use digonal
ak[j] -= a[i][i]* ak[i] * aj[i];
}
// a[k][j] /= a[j][j];
// ak[j] /= aj[j];
ak[j] /= ajj;
}
}
L = a;
return;
}
public Matrix solve0 (Matrix B) { // multithreased
if (B.getRowDimension() != n) {
throw new IllegalArgumentException("Matrix row dimensions must agree.");
}
// Copy right hand side.
double[] x = B.getColumnPackedCopy (); // (for single-column)
// Solve L*D*Y = B;
for (int k = 0; k < n; k++) {
for (int i = 0; i < k ; i++) {
x[k] -= x[i]*L[k][i];
}
}
for (int k = 0; k < n; k++) {
x[k] /= L[k][k];
}
// Solve L'*X = Y;
for (int k = n-1; k >= 0; k--) {
for (int i = k+1; i < n ; i++) {
x[k] -= x[i]*L[i][k];
}
}
return new Matrix(x,n);
}
public Matrix solve (Matrix B) {
if (B.getRowDimension() != n) {
throw new IllegalArgumentException("Matrix row dimensions must agree.");
}
// Copy right hand side.
// double[][] X = B.getArrayCopy();
final Thread[] threads = ImageDtt.newThreadArray();
final AtomicInteger ai = new AtomicInteger(0);
final int threads_chunk = (int) (n / (thread_scale * threads.length));
double[] x = B.getColumnPackedCopy (); // (for single-column)
// Solve L*D*Y = B;
for (int row0=0; row0 < n; row0 += solve_step) {
final int frow0 = row0;
final int frow1 = Math.min(row0 + solve_step, n);
// filling triangle single-threaded
for (int row = frow0; row < frow1; row++) {
double [] l_row = L[row];
for (int i = frow0; i < row ; i++) {
x[row] -= x[i]*l_row[i];
}
}
// Filling rectangle parallel
if (frow1 < n) {
ai.set(frow1);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
double [] l_row;
for (int row = ai.getAndAdd(threads_chunk); row < n; row = ai.getAndAdd(threads_chunk)) {
int row_lim = Math.min(n, row+threads_chunk);
for (; row < row_lim; row++) {
l_row = L[row];
for (int col = frow0; col < frow1; col++) {
x[row] -= x[col]*l_row[col];
}
}
}
}
};
}
ImageDtt.startAndJoin(threads);
}
}
// for (int k = 0; k < n; k++) {
// for (int i = 0; i < k ; i++) {
// x[k] -= x[i]*L[k][i];
// }
// }
// make parallel
ai.set(0);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int row = ai.getAndAdd(threads_chunk); row < n; row = ai.getAndAdd(threads_chunk)) {
int row_lim = Math.min(n, row+threads_chunk);
for (; row < row_lim; row++) {
x[row] /= L[row][row];
}
}
}
};
}
ImageDtt.startAndJoin(threads);
/*
for (int row = 0; row < n; row++) {
x[row] /= L[row][row];
}
*/
// Solve L'*X = Y;
for (int row1 = n-1; row1 > 0 ; row1 -= solve_step) {
final int frow1 = row1;
final int frow0 = Math.max(row1 - solve_step, 0);
// filling triangle single-threaded
for (int row = row1-1; row >= frow0 ; row--) {
for (int i = row+1; i <= frow1; i++) {
x[row] -= x[i]*L[i][row];
}
}
// Filling rectangle parallel
if (frow1 > 0) {
ai.set(0);
for (int ithread = 0; ithread < threads.length; ithread++) {
threads[ithread] = new Thread() {
public void run() {
for (int row = ai.getAndAdd(threads_chunk); row < frow0; row = ai.getAndAdd(threads_chunk)) {
int row_lim = Math.min(frow0, row+threads_chunk);
for (; row < row_lim; row++) {
for (int col = frow0+1; col <= frow1; col++) {
x[row] -= x[col]*L[col][row];
}
}
}
}
};
}
ImageDtt.startAndJoin(threads);
}
}
/**
* What was that?
for (int k = n-1; k >= 0; k--) {
for (int i = k+1; i < n ; i++) {
x[k] -= x[i]*L[i][k];
}
// X[k] /= L[k][k];
}
*/
return new Matrix(x,n);
}
public static Matrix[] testCholesky(
Matrix wjtjlambda,
Matrix jty) {
Matrix wjtjlambda_copy0 = new Matrix(wjtjlambda.getArrayCopy());
CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda_copy0);
Matrix wjtjlambda_copy = new Matrix(wjtjlambda.getArrayCopy());
CholeskyLDLTMulti choleskyLDLTMulti = new CholeskyLDLTMulti(wjtjlambda_copy);
Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
Matrix mdelta_cholesky_multi0 = choleskyLDLTMulti.solve0(jty);
Matrix mdelta_cholesky_multi = choleskyLDLTMulti.solve(jty);
return new Matrix[] {mdelta_cholesky, mdelta_cholesky_multi0, mdelta_cholesky_multi};
}
}
......@@ -12,6 +12,7 @@ import java.util.HashSet;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import com.elphel.imagej.common.CholeskyLDLTMulti;
import com.elphel.imagej.common.DoubleGaussianBlur;
import com.elphel.imagej.common.ShowDoubleFloatArrays;
import com.elphel.imagej.jp4.JP46_Reader_camera;
......@@ -21,6 +22,7 @@ import com.elphel.imagej.tileprocessor.QuadCLT;
import com.elphel.imagej.tileprocessor.TileNeibs;
import com.elphel.imagej.tileprocessor.TileProcessor;
import Jama.CholeskyDecomposition;
import Jama.Matrix;
import ij.IJ;
import ij.ImagePlus;
......@@ -1852,31 +1854,62 @@ public class VegetationLMA {
System.out.println("JtJ + lambda*diag(JtJ");
wjtjlambda.print(18, 6);
}
Matrix jtjl_inv = null;
try {
jtjl_inv = wjtjlambda.inverse(); // check for errors
} catch (RuntimeException e) {
rslt[1] = true;
if (debug_level > -2) {
System.out.println("Singular Matrix!");
}
return rslt;
}
if (debug_level>4) {
System.out.println("(JtJ + lambda*diag(JtJ).inv()");
jtjl_inv.print(18, 6);
}
//last_jt has NaNs
// Matrix jty = (new Matrix(this.last_jt)).times(y_minus_fx_weighted);
Matrix jty = (new Matrix(last_jt_decimated)).times(y_minus_fx_weighted);
if (debug_level>2) {
System.out.println("Jt * (y-fx)");
jty.print(18, 6);
Matrix mdelta = null; // jtjl_inv.times(jty);
boolean use_cholesky = true; // false;
double matrix_start_time = ((double) System.nanoTime()) * 1E-9;
if (use_cholesky) {
try {
mdelta = (new CholeskyDecomposition(wjtjlambda)).solve(jty);
} catch (RuntimeException e) {
rslt[1] = true;
if (debug_level > -2) {
System.out.println("Singular Matrix!");
}
return rslt;
}
} else { // old way - inverse() using LU
Matrix jtjl_inv = null;
try {
jtjl_inv = wjtjlambda.inverse(); // check for errors
} catch (RuntimeException e) {
rslt[1] = true;
if (debug_level > -2) {
System.out.println("Singular Matrix!");
}
return rslt;
}
if (debug_level>4) {
System.out.println("(JtJ + lambda*diag(JtJ).inv()");
jtjl_inv.print(18, 6);
}
//last_jt has NaNs
// Matrix jty = (new Matrix(last_jt_decimated)).times(y_minus_fx_weighted);
if (debug_level>2) {
System.out.println("Jt * (y-fx)");
jty.print(18, 6);
}
mdelta = jtjl_inv.times(jty);
}
Matrix mdelta = jtjl_inv.times(jty);
if (debug_level>-2) {
System.out.println("lmaStep(): Matrix inverted in "+((((double) System.nanoTime()) * 1E-9)-matrix_start_time)+
", used "+(use_cholesky? "Cholesky":"LU"));
}
//
/*
CholeskyDecomposition choleskyDecomposition = new CholeskyDecomposition(wjtjlambda);
CholeskyLDLTMulti choleskyLDLTMulti = new CholeskyLDLTMulti(wjtjlambda);
Matrix mdelta_cholesky = choleskyDecomposition.solve(jty);
Matrix mdelta_cholesky_multi = choleskyLDLTMulti.solve(jty);
*/
CholeskyLDLTMulti.testCholesky(
wjtjlambda, // Matrix wjtjlambda,
jty);
if (debug_level>2) {
System.out.println("mdelta");
mdelta.print(18, 6);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment