more experimenting and visualizations of the inter-scene

a74958ee · Andrey Filippov · 61e720a6 · a74958ee · a74958ee · a74958ee
Commit a74958ee authored Nov 01, 2020 by Andrey Filippov
5 changed files
--- a/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
+++ b/src/main/java/com/elphel/imagej/cameras/CLTParameters.java
@@ -800,19 +800,21 @@ public class CLTParameters {
 	public boolean    taEnMismatch         = false;  // Enable cost of a measurement layer not having same layer in the same location or near

 // gpu processing parameters
-	public double     gpu_corr_scale =    0.75; // reduce GPU-generated correlation values
-	public int        gpu_corr_rad =        7;  // size of the correlation to save - initially only 15x15
-	public double     gpu_weight_r =      0.5; // 25;
-	public double     gpu_weight_b =      0.2; // 0.25; // weight g = 1.0 - gpu_weight_r - gpu_weight_b
-	public double     gpu_sigma_r =       0.9; // 1.1;
-	public double     gpu_sigma_b =       0.9; // 1.1;
-	public double     gpu_sigma_g =       0.6; // 0.7;
-	public double     gpu_sigma_m =       0.4; // 0.7;
-	public double     gpu_sigma_rb_corr = 0.3; // apply LPF after accumulating R and B correlation before G,
-	public double     gpu_sigma_corr =    0.8;
-	public double     gpu_sigma_corr_m =  0.15;
-	public double     gpu_fatz =          500.0; // was 30
-	public double     gpu_fatz_m =        500.0; // was 30
+	public double     gpu_corr_scale =       0.75; // reduce GPU-generated correlation values
+	public int        gpu_corr_rad =         7;  // size of the correlation to save - initially only 15x15
+	public double     gpu_weight_r =         0.5; // 25;
+	public double     gpu_weight_b =         0.2; // 0.25; // weight g = 1.0 - gpu_weight_r - gpu_weight_b
+	public double     gpu_sigma_r =          0.9; // 1.1;
+	public double     gpu_sigma_b =          0.9; // 1.1;
+	public double     gpu_sigma_g =          0.6; // 0.7;
+	public double     gpu_sigma_m =          0.4; // 0.7;
+	public double     gpu_sigma_rb_corr =    0.3; // apply LPF after accumulating R and B correlation before G,
+	public double     gpu_sigma_corr =       0.8;
+	public double     gpu_sigma_corr_m =     0.15;
+	public double     gpu_sigma_log_corr =   3.0; // fill in after testing
+	public double     gpu_sigma_log_corr_m = 3.0; // fill in after testing
+	public double     gpu_fatz =           500.0; // was 30
+	public double     gpu_fatz_m =         500.0; // was 30

 	public boolean    gpu_woi =             false; // if true - use gpu_woi_tx, ...
 	public int        gpu_woi_tx =              0;
@@ -908,6 +910,10 @@ public class CLTParameters {
 		return monochrome ? gpu_sigma_corr_m : gpu_sigma_corr;
 	}

+	public double getGpuCorrLoGSigma(boolean monochrome) { // returns the "LoG sigma for correlation" setting: the *_m field for monochrome, the color one otherwise
+		return monochrome ? gpu_sigma_log_corr_m : gpu_sigma_log_corr;
+	}
+
 	public double getGpuCorrRBSigma(boolean monochrome) { // monochrome always gets the fixed value 1.0; color uses the configured gpu_sigma_rb_corr
 		return monochrome ? 1.0 : gpu_sigma_rb_corr;
 	}
@@ -1648,6 +1654,8 @@ public class CLTParameters {
 		properties.setProperty(prefix+"gpu_sigma_rb_corr",          this.gpu_sigma_rb_corr +"");
 		properties.setProperty(prefix+"gpu_sigma_corr",             this.gpu_sigma_corr +"");
 		properties.setProperty(prefix+"gpu_sigma_corr_m",           this.gpu_sigma_corr_m +"");
+		properties.setProperty(prefix+"gpu_sigma_log_corr",         this.gpu_sigma_log_corr +"");
+		properties.setProperty(prefix+"gpu_sigma_log_corr_m",       this.gpu_sigma_log_corr_m +"");
 		properties.setProperty(prefix+"gpu_fatz",                   this.gpu_fatz +"");
 		properties.setProperty(prefix+"gpu_fatz_m",                 this.gpu_fatz_m +"");

@@ -2468,6 +2476,8 @@ public class CLTParameters {
 		if (properties.getProperty(prefix+"gpu_sigma_rb_corr")!=null)           this.gpu_sigma_rb_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_rb_corr"));
 		if (properties.getProperty(prefix+"gpu_sigma_corr")!=null)              this.gpu_sigma_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr"));
 		if (properties.getProperty(prefix+"gpu_sigma_corr_m")!=null)            this.gpu_sigma_corr_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_corr_m"));
+		if (properties.getProperty(prefix+"gpu_sigma_log_corr")!=null)          this.gpu_sigma_log_corr=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_log_corr"));
+		if (properties.getProperty(prefix+"gpu_sigma_log_corr_m")!=null)        this.gpu_sigma_log_corr_m=Double.parseDouble(properties.getProperty(prefix+"gpu_sigma_log_corr_m"));
 		if (properties.getProperty(prefix+"gpu_fatz")!=null)                    this.gpu_fatz=Double.parseDouble(properties.getProperty(prefix+"gpu_fatz"));
 		if (properties.getProperty(prefix+"gpu_fatz_m")!=null)                  this.gpu_fatz_m=Double.parseDouble(properties.getProperty(prefix+"gpu_fatz_m"));

@@ -3475,6 +3485,12 @@ public class CLTParameters {
 				"LPF sigma to apply to the composite 2D correlation for RGB images");
 		gd.addNumericField("LPF sigma for correlation, mono",                                                           this.gpu_sigma_corr_m, 4, 6,"pix",
 				"LPF sigma to apply to the composite 2D correlation for monochrome images");
+
+		gd.addNumericField("LoG sigma for correlation, color",                                                          this.gpu_sigma_log_corr, 4, 6,"pix",
+				"Use LoG filter to reduce dynamic range of the correlation input to fit into float range");
+		gd.addNumericField("LoG sigma for correlation, mono",                                                           this.gpu_sigma_log_corr_m, 4, 6,"pix",
+				"Use LoG filter to reduce dynamic range of the correlation input to fit into float range");
+		
 		gd.addNumericField("Fat zero (absolute) for phase correlation of color images",                                 this.gpu_fatz, 4, 6,"",
 				"Add squared fat zero to the sum of squared amplitudes, color images");
 		gd.addNumericField("Fat zero (absolute) for phase correlation of monochrome images",                            this.gpu_fatz_m, 4, 6,"",
@@ -4287,6 +4303,8 @@ public class CLTParameters {
 		this.gpu_sigma_rb_corr =    gd.getNextNumber();
 		this.gpu_sigma_corr =       gd.getNextNumber();
 		this.gpu_sigma_corr_m =     gd.getNextNumber();
+		this.gpu_sigma_log_corr =   gd.getNextNumber();
+		this.gpu_sigma_log_corr_m = gd.getNextNumber();
 		this.gpu_fatz =             gd.getNextNumber();
 		this.gpu_fatz_m =           gd.getNextNumber();


--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
@@ -669,8 +669,9 @@ public class ImageDtt extends ImageDttCPU {
 			final double              gpu_sigma_b,     // 0.9, 1.1
 			final double              gpu_sigma_g,     // 0.6, 0.7
 			final double              gpu_sigma_m,     //  =       0.4; // 0.7;
-			final double              gpu_sigma_rb_corr, //  = 0.5; // apply LPF after accumulating R and B correlation before G, monochrome ? 1.0 :
-			final double              gpu_sigma_corr,   //  =    0.9;gpu_sigma_corr_m
+			final double              gpu_sigma_rb_corr,    //  = 0.5; // apply LPF after accumulating R and B correlation before G, monochrome ? 1.0 :
+			final double              gpu_sigma_corr,       //  =    0.9;gpu_sigma_corr_m
+			final double              gpu_sigma_log_corr,   // hpf to reduce dynamic range for correlations
 			final double              corr_red, // +used
 			final double              corr_blue,// +used
 			final int                 threadsMax,       // maximal number of threads to launch
@@ -722,6 +723,42 @@ public class ImageDtt extends ImageDttCPU {
 				"lpf_rb_corr", // String const_name, // "lpf_corr"
 				lpf_rb_flat,
 				globalDebugLevel > -1);
+		
+		final float [] log_flat = floatGetCltHpfFd(gpu_sigma_log_corr);
+		if (globalDebugLevel < -100) {
+			double dbg_sum = 0.0;
+			for (int i = 0; i < log_flat.length; i++) dbg_sum +=log_flat[i];
+			System.out.println("dbg_sum("+gpu_sigma_log_corr+")="+dbg_sum);
+			(new ShowDoubleFloatArrays()).showArrays(
+					log_flat,
+					8,
+					8,
+					"hpf_"+gpu_sigma_log_corr);
+			final float [] log_flat0 = floatGetCltHpfFd(4.0);
+			dbg_sum = 0.0;
+			for (int i = 0; i < log_flat.length; i++) dbg_sum +=log_flat0[i];
+			System.out.println("dbg_sum("+4.0+")="+dbg_sum);
+			(new ShowDoubleFloatArrays()).showArrays(
+					log_flat0,
+					8,
+					8,
+					"hpf_"+4.0);
+			final float [] log_flat1 = floatGetCltHpfFd(1.0);
+			dbg_sum = 0.0;
+			for (int i = 0; i < log_flat.length; i++) dbg_sum +=log_flat1[i];
+			System.out.println("dbg_sum("+1.0+")="+dbg_sum);
+			(new ShowDoubleFloatArrays()).showArrays(
+					log_flat1,
+					8,
+					8,
+					"hpf_"+1.0);
+			System.out.println("dbg_sum("+1.0+")="+dbg_sum);
+		}
+		gpuQuad.setLpfCorr(// constants memory - same for all cameras
+				"LoG_corr", // String const_name, // "lpf_corr"
+				log_flat,
+				globalDebugLevel > -1);
+		

 		gpuQuad.setTasks(                  // copy tp_tasks to the GPU memory
 				tp_tasks,                  // TpTask [] tile_tasks,
@@ -1419,7 +1456,8 @@ public class ImageDtt extends ImageDttCPU {
 			                                           // each of the top elements may be null to skip particular combo type
 			final double [][][][]     corr_tiles,      // [tilesY][tilesX][pair][] ([(2*gpu_corr_rad+1)*(2*gpu_corr_rad+1)]) or null
 			final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
-                                                       // [tilesY][tilesX] should be set by caller
+            // 											  [tilesY][tilesX] should be set by caller
+			final float  [][][]       fcorr_tiles,     // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 			// When clt_mismatch is non-zero, no far objects extraction will be attempted
 			final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
 			                                           // values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
@@ -1436,6 +1474,7 @@ public class ImageDtt extends ImageDttCPU {
 			final int                 threadsMax,      // maximal number of threads to launch
 			final int                 globalDebugLevel)
 	{
+		final float gpu_fcorr_scale = (float) gpu_corr_scale;
 		if (this.gpuQuad == null) {
 			System.out.println("clt_aberrations_quad_corr_GPU(): this.gpuQuad is null, bailing out");
 			return;
@@ -1642,14 +1681,12 @@ public class ImageDtt extends ImageDttCPU {
 							// double [][]  corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15)
 							// added quad and cross combos
 							double [][]  corrs = new double [GPUTileProcessor.NUM_PAIRS + num_combo][corr_length]; // 225-long (15x15)
+							float  [][] fcorrs = (fcorr_tiles == null) ? null : new float  [GPUTileProcessor.NUM_PAIRS + num_combo][corr_length]; // 225-long (15x15)
 							int indx_corr = indx_tile * num_tile_corr;
 							int nt = (corr_indices[indx_corr] >> GPUTileProcessor.CORR_NTILE_SHIFT);
 							int tileX = nt % tilesX;
 							int tileY = nt / tilesX;
 							int tIndex = tileY * tilesX + tileX;
-//							if (tileY >= 122) {
-//								System.out.println("tileY="+tileY+" tileX="+tileX);
-//							}

 							// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
 							int pair_mask = 0;
@@ -1661,6 +1698,9 @@ public class ImageDtt extends ImageDttCPU {
 									for (int i = 0; i < corr_length; i++) {
 										corrs[pair][i] = gpu_corr_scale * fcorr2D[indx_corr][i]; // from float to double
 									}
+									if (fcorrs != null) for (int i = 0; i < corr_length; i++) {
+										fcorrs[pair][i] = gpu_fcorr_scale * fcorr2D[indx_corr][i];
+									}
 									indx_corr++; 
 								}
 							}
@@ -1672,18 +1712,25 @@ public class ImageDtt extends ImageDttCPU {
 									for (int i = 0; i < corr_length; i++) {
 										corrs[pair][i] = gpu_corr_scale * fcorr2D_combo[ncm][indx_tile][i]; // from float to double
 									}
+									if (fcorrs != null) for (int i = 0; i < corr_length; i++) {
+										fcorrs[pair][i] = gpu_fcorr_scale * fcorr2D_combo[ncm][indx_tile][i];
+									}
 								}
 							}
 							if (corr_tiles != null) {
 								corr_tiles[tileY][tileX] = corrs; 
 							}
+							if (fcorr_tiles != null) {
+								fcorr_tiles[tileY * tilesX + tileX] = fcorrs; // does not require corr_common_GPU()
+							}
+							
 							if ((disparity_map != null) || (clt_corr_partial != null) || (clt_mismatch != null)) {
 								int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
 								int tile_lma_debug_level =  ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
 								boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
 								corr_common_GPU(
 										imgdtt_params,        // final ImageDttParameters  imgdtt_params,
-										clt_corr_partial,     // final double [][][][][]   clt_corr_partial,			
+										clt_corr_partial,     // final double [][][][][]   clt_corr_partial,
 										used_pairs,           // final int           used_pairs,
 										disparity_map,        // final double [][]   disparity_map,
 										clt_mismatch,         // final double [][]   clt_mismatch,
@@ -1743,7 +1790,7 @@ public class ImageDtt extends ImageDttCPU {
 	
 	public void corr_common_GPU(
 			final ImageDttParameters  imgdtt_params,
-			final double [][][][][]   clt_corr_partial,			
+			final double [][][][][]   clt_corr_partial,
 			final int           used_pairs,
 			final double [][]   disparity_map,
 			final double [][]   clt_mismatch,
@@ -2041,7 +2088,7 @@ public class ImageDtt extends ImageDttCPU {
 				// create LMA instance, calculate LMA composite argmax
 				// Create 2 groups: ortho & diag
 				Correlations2dLMA lma;
-				if (imgdtt_params.pcorr_use) {
+				if (imgdtt_params.pcorr_use) { // new group phase correlation
 					double [][] fake_corrs = {corrs[6],null,null,null,corrs[7],null};
 					lma = corr2d.corrLMA(
 							imgdtt_params,                // ImageDttParameters  imgdtt_params,

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
@@ -24,6 +24,7 @@ package com.elphel.imagej.tileprocessor;
 */
 // ← → ↑ ↓ ⇖ ⇗ ⇘ ⇙ ↔ ↕ 

+import java.awt.Rectangle;
 import java.util.Arrays;
 import java.util.concurrent.atomic.AtomicInteger;

@@ -5525,13 +5526,13 @@ public class ImageDttCPU {
 	}
 	/**
 	 * Get frequency-domain representation of the LPF (version for the GPU, in floats)
-	 * @param sigma blurring in pixels
+	 * @param sigma2 squared Gaussian sigma in pixels
 	 * @return float array of the filter, 64 long for 8-pixel DTT
 	 */
 	public float [] floatGetCltLpfFd(
-			double   sigma) {
+			double   sigma2) {
 		DttRad2 dtt = new DttRad2(transform_size);
-		double [] clt_fd = dtt.dttt_iiie(getLpf(sigma));
+		double [] clt_fd = dtt.dttt_iiie(getLpf(sigma2));
 		int l = clt_fd.length;
 		float []   lpf_flat = new float [l];
 		for (int j = 0; j < l; j++) {
@@ -5540,18 +5541,36 @@ public class ImageDttCPU {
 		return lpf_flat;
 	}

+	/**
+	 * Get frequency-domain representation of the HPF, built per coefficient as (1.0 - scaled LPF) (version for the GPU, in floats)
+	 * @param sigma Gaussian sigma in pixels (squared here before being passed to getLpf()); 0.0 makes every coefficient 1.0
+	 * @return float array of the filter, 64 long for 8-pixel DTT
+	 */
+	public float [] floatGetCltHpfFd(
+			double   sigma) {
+		DttRad2 dtt = new DttRad2(transform_size);
+		double [] clt_fd = (sigma == 0.0)? (new double[transform_size*transform_size]) : dtt.dttt_iiie(getLpf(sigma * sigma)); // sigma == 0.0: all-zero LPF term; otherwise getLpf() takes sigma squared
+		int l = clt_fd.length;
+		float []   hpf_flat = new float [l];
+		for (int j = 0; j < l; j++) {
+			hpf_flat[j] = (float) (1.0 - clt_fd[j]*2*transform_size); // complement of the LPF coefficient scaled by 2*transform_size
+		}
+		return hpf_flat;
+	}
+	
+	
 	/**
 	 * Get pixel-domain representation of the LPF
-	 * @param sigma blurring in pixels
+	 * @param sigma2  squared Gaussian sigma in pixels
 	 * @return double array of the filter, 64 long for 8-pixel DTT
 	 */

 	public double [] getLpf(
-			double   sigma)
+			double   sigma2) // sigma squared
 	{
 		int transform_len = transform_size * transform_size;
 		final double [] filter_direct= new double[transform_len];
-		if (sigma == 0) {
+		if (sigma2 == 0) {
 			filter_direct[0] = 1.0;
 			for (int i= 1; i<filter_direct.length;i++) {
                filter_direct[i] =0;
@@ -5559,7 +5578,7 @@ public class ImageDttCPU {
 		} else {
 			for (int i = 0; i < transform_size; i++){
 				for (int j = 0; j < transform_size; j++){
-					filter_direct[i*transform_size+j] = Math.exp(-(i*i+j*j)/(2*sigma)); // FIXME: should be sigma*sigma !
+					filter_direct[i*transform_size+j] = Math.exp(-(i*i+j*j)/(2*sigma2));
 				}
 			}
 		}
@@ -5580,6 +5599,95 @@ public class ImageDttCPU {
 		return filter_direct;
 	}

+	/**
+	 * Get frequency-domain representation of the LoG (version for the GPU, in floats): getLoG(sigma) passed through DttRad2.dttt_iiie() and scaled by 2*transform_size
+	 * @param sigma Gaussian sigma in pixels
+	 * @return float array of the filter, 64 long for 8-pixel DTT
+	 */
+	public float [] floatGetCltLoGFd(
+			double   sigma) {
+		DttRad2 dtt = new DttRad2(transform_size);
+		double [] clt_fd = dtt.dttt_iiie(getLoG(sigma)); // NOTE(review): getLoG() also calls showArrays() twice and prints to System.out on every call
+		int l = clt_fd.length;
+		float []   log_flat = new float [l];
+		for (int j = 0; j < l; j++) {
+			log_flat[j] = (float) (clt_fd[j]*2*transform_size); // scale by 2*transform_size, then narrow to float
+		}
+		return log_flat;
+	}
+	
+	/**
+	 * Get pixel-domain representation of the LoG (formula per the URL in the body), normalized; also calls showArrays() and prints the norm
+	 * @param sigma Gaussian sigma in pixels; 0 produces a unit impulse at index 0 instead of evaluating the formula
+	 * @return double array of the filter, transform_size*transform_size long (64 for 8-pixel DTT)
+	 */
+	public double [] getLoG(
+			double   sigma)
+	{
+		int transform_len = transform_size * transform_size;
+		final double sigma2 = sigma*sigma;
+		final double sigma4 = sigma2*sigma2;
+		final double [] filter_direct= new double[transform_len];
+		if (sigma == 0) { // avoid division by zero in the formula below
+			filter_direct[0] = 1.0;
+			for (int i= 1; i<filter_direct.length;i++) {
+                filter_direct[i] =0;
+			}
+		} else {
+			for (int i = 0; i < transform_size; i++){
+				for (int j = 0; j < transform_size; j++){
+//https://homepages.inf.ed.ac.uk/rbf/HIPR2/log.htm					
+					filter_direct[i*transform_size+j] = // -1/(pi*sigma^4) * (1 - r2/(2*sigma^2)) * exp(-r2/(2*sigma^2)), r2 = i*i + j*j
+							-1.0/(Math.PI * sigma4)*(1.0 - (i*i+j*j)/(2*sigma2))*
+							Math.exp(-(i*i+j*j)/(2*sigma2));
+				}
+			}
+		}
+		(new ShowDoubleFloatArrays()).showArrays( // NOTE(review): unconditional debug output of the un-normalized filter, not gated by any debug level
+				filter_direct,
+				8, // NOTE(review): hard-coded 8 rather than transform_size - presumably width; confirm
+				8, // NOTE(review): hard-coded 8 rather than transform_size - presumably height; confirm
+				"log_direct-"+sigma);
+		// normalize: divide by sqrt of the weighted sum of squares accumulated below
+		double sum2 = 0;
+		for (int i = 0; i < transform_size; i++){
+			for (int j = 0; j < transform_size; j++){
+				double d = 	filter_direct[i*transform_size+j];
+				d*=d;
+				d*=Math.cos(Math.PI*i/(2*transform_size))*Math.cos(Math.PI*j/(2*transform_size)); // cosine weight per index
+				if (i > 0) d*= 2.0; // non-zero row index is counted twice
+				if (j > 0) d*= 2.0; // non-zero column index is counted twice
+				sum2 +=d;
+			}
+		}
+		double sum = Math.sqrt(sum2);
+		for (int i = 0; i<filter_direct.length; i++){
+			filter_direct[i] /= sum;
+		}
+		System.out.println("getLoG("+sigma+") sum="+sum); // NOTE(review): unconditional console output
+		/*
+		sum2 = 0;
+		for (int i = 0; i < transform_size; i++){
+			for (int j = 0; j < transform_size; j++){
+				double d = 	filter_direct[i*transform_size+j];
+				d*=d;
+				d*=Math.cos(Math.PI*i/(2*transform_size))*Math.cos(Math.PI*j/(2*transform_size));
+				if (i > 0) d*= 2.0;
+				if (j > 0) d*= 2.0;
+				sum2 +=d;
+			}
+		}
+		*/
+		(new ShowDoubleFloatArrays()).showArrays( // NOTE(review): unconditional debug output of the normalized filter
+				filter_direct,
+				8,
+				8,
+				"log_direct_norm-"+sigma);
+		
+		
+		
+		return filter_direct;
+	}


 	public void clt_lpf(  // USED in lwir
@@ -5838,7 +5946,7 @@ public class ImageDttCPU {
 	}

 	// extract correlation result  in linescan order (for visualization)
-	public double [] corr_dbg( // not used in lwir
+	public double [] corr_dbg(
 			final double [][][] corr_data,
 			final int           corr_size,
 			final double        border_contrast,
@@ -5884,10 +5992,6 @@ public class ImageDttCPU {
 		return corr_data_out;
 	}

-
-//	final float  [][][][]     fcorr_td =       new float[tilesY][tilesX][][];
-//	final float  [][][][]     fcorr_combo_td = new float[4][tilesY][tilesX][];
-
 	public static float [][] corr_td_dbg(
 			final float [][][][] fcorr_td,
 			// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
@@ -5957,6 +6061,60 @@ public class ImageDttCPU {
 	}
 	
 	
+	
+//	final float  [][][][]     fcorr_td =       new float[tilesY][tilesX][][];
+//	final float  [][][][]     fcorr_combo_td = new float[4][tilesY][tilesX][];
+
+	public static void corr_td_normalize( // in-place: scales each group of 4 values ftile[q*transform_len+i], q=0..3, by output_amplitude/sqrt(fat_zero_abs^2 + sum of their squares)
+			final float [][][][] fcorr_td, // will be updated
+			// num_slices == 0: array is laid out as [slice][tilesY][tilesX][] (e.g. fcorr_combo_td = new float[4][tilesY][tilesX][])
+			// num_slices > 0:  array is laid out as [tilesY][tilesX][num_slices][] (fcorr_td)
+			final int            num_slices,
+			final int            transform_size,
+			final double         fat_zero_abs,
+			final double         output_amplitude,
+			final int            threadsMax)     // maximal number of threads to launch
+	{
+		final double fat_zero_abs2 = fat_zero_abs * fat_zero_abs; 
+		final int tilesY = (num_slices == 0) ? fcorr_td[0].length : fcorr_td.length;
+		final int tilesX = (num_slices == 0) ? fcorr_td[0][0].length : fcorr_td[0].length;
+		final int nTiles = tilesX*tilesY;
+		final int fnum_slices = (num_slices == 0) ? fcorr_td.length : num_slices; // slice count comes from the array itself in the slice-first layout
+		final int transform_len = transform_size*transform_size; // 64 for transform_size == 8
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0); // shared counter: each thread claims the next unprocessed tile index
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile/tilesX;
+						int tileX = nTile - tileY * tilesX;
+						if ((num_slices == 0) || (fcorr_td[tileY][tileX] != null)) { // tile-first layout may have null (skipped) tiles
+							for (int slice = 0; slice < fnum_slices; slice ++) {
+								float [] ftile = (num_slices > 0) ? fcorr_td[tileY][tileX][slice] : fcorr_td[slice][tileY][tileX];
+								if (ftile != null) {
+									for (int i = 0; i < transform_len; i++) {
+										double s2 = fat_zero_abs2; // start from squared fat zero
+										for (int q = 0; q < 4; q++) {
+											double d = ftile[q * transform_len + i]; 
+											s2 += d*d;
+										}
+										double k = output_amplitude/Math.sqrt(s2); // NOTE(review): s2 == 0 (fat_zero_abs == 0 and all four values 0) divides by zero and the products below become NaN
+										for (int q = 0; q < 4; q++) {
+											ftile[q * transform_len + i] *= k;
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+	}
+	

 	// extract correlation result  in linescan order (for visualization)
 	public static double [][] corr_partial_dbg( // not used in lwir
@@ -5972,19 +6130,14 @@ public class ImageDttCPU {
 		final int tilesX=corr_data[0].length;
 		final int nTiles=tilesX*tilesY;
 		final int tile_size = corr_size+1;
-		final int corr_len = corr_size*corr_size;
-
-		System.out.println("corr_partial_dbg(): tilesY="+tilesY+", tilesX="+tilesX+", corr_size="+corr_size+", corr_len="+corr_len+
-				" pairs="+pairs +" colors = "+colors+" tile_size="+tile_size);

 		final double [][] corr_data_out = new double[pairs*colors][tilesY*tilesX*tile_size*tile_size];
-//		final String [] colorNames = {"red","blue","green","composite"};

 		final Thread[] threads = newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(0);
 		for (int pair = 0; pair< pairs; pair++) {
 			for (int nColor = 0; nColor < colors; nColor++) {
-				for (int i=0; i<corr_data_out.length;i++) corr_data_out[pair*colors+nColor][i]= 0;
+				Arrays.fill(corr_data_out[pair*colors+nColor], Double.NaN);
 			}
 		}

@@ -6007,10 +6160,10 @@ public class ImageDttCPU {
 												corr_data_out[indx],
 												((tileY*tile_size + i) *tilesX + tileX)*tile_size ,
 												corr_size);
-										corr_data_out[indx][((tileY*tile_size + i) *tilesX + tileX)*tile_size+corr_size] = border_contrast*((i & 1) - 0.5);
+//										corr_data_out[indx][((tileY*tile_size + i) *tilesX + tileX)*tile_size+corr_size] = border_contrast*((i & 1) - 0.5);
 									}
 									for (int i = 0; i < tile_size; i++){
-										corr_data_out[indx][((tileY*tile_size + corr_size) *tilesX + tileX)*tile_size+i] = border_contrast*((i & 1) - 0.5);
+//										corr_data_out[indx][((tileY*tile_size + corr_size) *tilesX + tileX)*tile_size+i] = border_contrast*((i & 1) - 0.5);
 									}
 								}
 							}
@@ -6022,8 +6175,528 @@ public class ImageDttCPU {
 		startAndJoin(threads);
 		return corr_data_out;
 	}
+	
+	public static float [][][] extract_corr_woi( // returns an array as long as fcorr with only the tiles inside woi filled in; all other entries stay null
+			final boolean      copy, // copy tiles stack, not reference (clone() of the per-tile float[][] - the inner float[] layers are still shared)
+			final float [][][] fcorr,
+			final Rectangle    woi, // NOTE: modified in place when it does not fit into tilesX x tilesY
+			final int          tilesX,
+			final int          threadsMax)     // maximal number of threads to launch

+	{
+		final int tilesY = fcorr.length/tilesX;
+		if ((woi.width + woi.x) >= tilesX) { // clip the window on the right
+			int ww = woi.width; 
+			woi.width = tilesX - woi.x;
+			if (woi.width <= 0) { // woi.x is at or beyond the right edge: keep the (limited) requested width and shift the window left
+				if (ww > tilesX) ww = tilesX;
+				woi.width = ww;
+				woi.x = tilesX - woi.width; 
+			}
+		}
+		if ((woi.height + woi.y) >= tilesY) { // same clipping for the vertical direction
+			int wndh = woi.height; 
+			woi.height = tilesY - woi.y;
+			if (woi.height <= 0) {
+				if (wndh > tilesY) wndh = tilesY;
+				woi.height = wndh;
+				woi.y = tilesY - woi.height; 
+			}
+		}
+		final int nTiles=woi.width * woi.height; // number of tiles inside the (possibly adjusted) window
+		final float [][][] fcorr_out = new float [fcorr.length][][];
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile / woi.width + woi.y; // window-relative index converted to absolute tile row
+						int tileX = nTile % woi.width + woi.x; // ... and absolute tile column
+						int tile = tileY * tilesX + tileX;
+						
+						if (copy && (fcorr[tile] != null)) {
+							fcorr_out[tile] = fcorr[tile].clone();	
+						} else {
+							fcorr_out[tile] = fcorr[tile]; // share the reference (or propagate null)
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return fcorr_out;
+	}
+	
+	public static float [][] corr_partial_dbg( // not used in lwir; float version: one output image per layer, tiles placed on a grid with NaN separators
+			final float  [][][]     fcorr_data,       // [tile][index][(2*transform_size-1)*(2*transform_size-1)], a null tile leaves its cell filled with NaN
+			final int               tilesX,
+			final int               corr_size,
+			final int               layers,
+			final double            border_contrast, // not referenced in this method body
+			final int               threadsMax,     // maximal number of threads to launch
+			final int               globalDebugLevel) // not referenced in this method body
+	{
+		final int tilesY=fcorr_data.length/tilesX;
+		final int nTiles=tilesX*tilesY;
+		final int tile_size = corr_size+1; // each cell is one pixel larger than the tile, leaving a NaN row and column
+//		final int corr_len = corr_size*corr_size;
+		final float [][] fcorr_data_out = new float[layers][tilesY*tilesX*tile_size*tile_size];
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int layer = 0; layer < layers; layer++) {
+			Arrays.fill(fcorr_data_out[layer], Float.NaN); // everything not overwritten below stays NaN
+		}
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					int tileY,tileX;
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						tileY = nTile/tilesX;
+						tileX = nTile - tileY * tilesX;
+						if (fcorr_data[nTile] != null) {
+							for (int layer = 0; layer < layers; layer++) {
+								for (int i = 0; i < corr_size;i++){ // copy the tile one row (corr_size values) at a time
+									System.arraycopy(
+											fcorr_data[nTile][layer],
+											corr_size* i,
+											fcorr_data_out[layer],
+											((tileY*tile_size + i) *tilesX + tileX)*tile_size , // start of row i of this tile's cell in the output image
+											corr_size);
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return fcorr_data_out;
+	}
+
+
+	// extract correlation results for the tiles inside woi into a single float image in linescan order (for visualization)
+	// each tile contributes 10 correlation layers arranged as 2 rows x 5 columns (see layout), pixels not written stay NaN
+	public static float [] corr_partial_wnd( // not used in lwir
+			final double [][][][][] corr_data,
+			final int               corr_size,
+			final Rectangle         woi, // NOTE: modified in place when it does not fit into the corr_data dimensions
+			final int               gap, // extra pixels between per-tile clusters
+			final int []            wh, // if not null, receives {width, height} of the output image
+			final int               threadsMax)     // maximal number of threads to launch
+	{
+		final int tile_size = corr_size+1;
+		final int [][] layout = {{0,0,0},{1,1,0},{2,0,1},{3,1,1},{4,0,2},{5,1,2},{6,0,3},{7,1,3},{8,0,4},{9,1,4}}; // {source_index, row, col};
+		if ((woi.width + woi.x) >= corr_data[0].length) { // clip the window on the right (corr_data[0].length is the horizontal tile count)
+			int ww = woi.width; 
+			woi.width = corr_data[0].length - woi.x;
+			if (woi.width <= 0) { // woi.x at or beyond the right edge: keep the (limited) requested width and shift the window left
+				if (ww > corr_data[0].length) ww = corr_data[0].length;
+				woi.width = ww;
+				woi.x = corr_data[0].length - woi.width; 
+			}
+		}
+		if ((woi.height + woi.y) >= corr_data.length) { // same clipping vertically (corr_data.length is the vertical tile count)
+			int wndh = woi.height; 
+			woi.height = corr_data.length - woi.y;
+			if (woi.height <= 0) {
+				if (wndh > corr_data.length) wndh = corr_data.length;
+				woi.height = wndh;
+				woi.y = corr_data.length - woi.height; 
+			}
+		}
+		
+		final int nTiles=woi.width * woi.height;
+		
+		final int clust_width =  5 * tile_size + gap; // 5 layer columns per tile cluster
+		final int clust_height = 2 * tile_size + gap; // 2 layer rows per tile cluster
+		
+		final int width =  woi.width* clust_width - gap; // no trailing gap after the last cluster
+		final int height = woi.height*clust_height - gap;
+		if (wh != null) {
+			wh[0] = width;
+			wh[1] = height;
+		}
+		final float [] corr_data_out = new float[width * height];
+		Arrays.fill(corr_data_out, Float.NaN);
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile / woi.width; // relative to woi
+						int tileX = nTile % woi.width;
+						int stileY = tileY + woi.y;    // absolute in the corr_data
+						int stileX = tileX + woi.x;
+						if (corr_data[stileY][stileX] != null) {
+							for (int n = 0; n < layout.length; n++) {
+								int src_layer = layout[n][0];
+								int v_tile = layout[n][1];
+								int h_tile = layout[n][2];
+								double [] corr_tile = corr_data[stileY][stileX][src_layer/4][src_layer%4]; // tiles were organized as 4x4
+								int out_x = tileX * clust_width +  h_tile * tile_size;
+								int out_y = tileY * clust_height + v_tile * tile_size;
+								for (int i = 0; i < corr_size;i++){
+									int out_start = (out_y + i) * width + out_x;
+									for (int j = 0; j < corr_size; j++) { // element-wise copy: source is double, output is float
+										corr_data_out[out_start+j] = (float) corr_tile[corr_size* i +j];
+									}
+									/*
+									System.arraycopy(
+											corr_tile,
+											corr_size* i,
+											corr_data_out,
+											(out_y + i) * width + out_x,
+											corr_size);
+									*/
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return corr_data_out;
+	}
+
+	public static float [] corr_partial_wnd( // not used in lwir
+			final float  [][][]     fcorr_data,       // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+			final int               tilesX,
+			final int               corr_size,
+			final Rectangle         woi,
+			final int               gap,
+			final int []            wh,
+			final int               threadsMax)     // maximal number of threads to launch
+	{
+		final int tile_size = corr_size+1;
+		final int tilesY = fcorr_data.length/tilesX;
+		final int [][] layout = {{0,0,0},{1,1,0},{2,0,1},{3,1,1},{4,0,2},{5,1,2},{6,0,3},{7,1,3},{8,0,4},{9,1,4}}; // {source_index, row, col};
+		if ((woi.width + woi.x) >= tilesX) {
+			int ww = woi.width; 
+			woi.width = tilesX - woi.x;
+			if (woi.width <= 0) {
+				if (ww > tilesX) ww = tilesX;
+				woi.width = ww;
+				woi.x = tilesX - woi.width; 
+			}
+		}
+		if ((woi.height + woi.y) >= tilesY) {
+			int wndh = woi.height; 
+			woi.height = tilesY - woi.y;
+			if (woi.height <= 0) {
+				if (wndh > tilesY) wndh = tilesY;
+				woi.height = wndh;
+				woi.y = tilesY - woi.height; 
+			}
+		}
+		final int nTiles=woi.width * woi.height;
+		final int clust_width =  5 * tile_size + gap;
+		final int clust_height = 2 * tile_size + gap;
+		final int width =  woi.width* clust_width - gap;
+		final int height = woi.height*clust_height - gap;
+		if (wh != null) {
+			wh[0] = width;
+			wh[1] = height;
+		}
+		final float [] corr_data_out = new float[width * height];
+		Arrays.fill(corr_data_out, Float.NaN);
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile / woi.width; // relative to woi
+						int tileX = nTile % woi.width;
+						int stileY = tileY + woi.y;    // absolute in the corr_data
+						int stileX = tileX + woi.x;
+						int stile = stileY * tilesX + stileX;
+						if (fcorr_data[stile] != null) {
+							for (int n = 0; n < layout.length; n++) {
+								int src_layer = layout[n][0];
+								int v_tile = layout[n][1];
+								int h_tile = layout[n][2];
+								float [] fcorr_tile = fcorr_data[stile][src_layer]; // tiles were organized as 4x4
+								int out_x = tileX * clust_width +  h_tile * tile_size;
+								int out_y = tileY * clust_height + v_tile * tile_size;
+								for (int i = 0; i < corr_size;i++){
+									int out_start = (out_y + i) * width + out_x;
+									/*
+									for (int j = 0; j < corr_size; j++) {
+										corr_data_out[out_start+j] = (float) corr_tile[corr_size* i +j];
+									}
+									*/
+									
+									System.arraycopy(
+											fcorr_tile,
+											corr_size* i,
+											corr_data_out,
+											(out_y + i) * width + out_x,
+											corr_size);
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return corr_data_out;
+	}
+	
+	
+	
+	/**
+	 * Lay out transform-domain correlation tiles of the selected window as a single image.
+	 * <p>
+	 * Every tile of the window becomes a cluster of 5 columns by 2 rows of cells, each cell
+	 * {@code 2 * transform_size} pixels square. The first 6 cells (columns 0..2) come from
+	 * {@code fcorr_td[tileY][tileX][0..5]}, the last 4 (columns 3..4) from
+	 * {@code fcorr_combo_td[0..3][tileY][tileX]}. Inside a cell the four consecutive
+	 * {@code transform_size * transform_size} quadrants of the source tile are placed as a
+	 * 2x2 arrangement. Clusters are separated by {@code gap} pixels, pixels that receive no
+	 * data stay NaN.
+	 * <p>
+	 * NOTE: {@code woi} is modified in place when it reaches or exceeds the tile grid.
+	 *
+	 * @param fcorr_td       float[tilesY][tilesX][num_slices][], null tiles are skipped
+	 * @param fcorr_combo_td float[4][tilesY][tilesX][]; the array itself, any of its top
+	 *                       elements or any tile may be null - such data is skipped
+	 * @param woi            window of interest in tiles (may be adjusted by this method)
+	 * @param gap            spacing between clusters, in pixels
+	 * @param wh             if not null, receives {width, height} of the generated image
+	 * @param transform_size side of one quadrant of a transform-domain tile
+	 * @param threadsMax     maximal number of threads to launch
+	 * @return image pixels in line-scan order
+	 */
+	public static float [] corr_td_wnd(
+			final float [][][][] fcorr_td,       // float[tilesY][tilesX][num_slices][];
+			final float [][][][] fcorr_combo_td, // float[4][tilesY][tilesX][];
+			final Rectangle      woi,
+			final int            gap,
+			final int []         wh,
+			final int            transform_size,
+			final int            threadsMax)     // maximal number of threads to launch
+	{
+		final int tile_size = 2 * transform_size;
+		final int [][] layout1 = {{0,0,0},{1,1,0},{2,0,1},{3,1,1},{4,0,2},{5,1,2}}; // {source_index, row, col};
+		final int [][] layout2 = {{0,0,3},{1,1,3},{2,0,4},{3,1,4}}; // {source_index, row, col};
+		// Keep the window inside the tile grid: trim it when it starts inside the grid,
+		// otherwise move it so that it ends at the last column/row.
+		if ((woi.width + woi.x) >= fcorr_td[0].length) {
+			int ww = woi.width;
+			woi.width = fcorr_td[0].length - woi.x;
+			if (woi.width <= 0) {
+				if (ww > fcorr_td[0].length) ww = fcorr_td[0].length;
+				woi.width = ww;
+				woi.x = fcorr_td[0].length - woi.width;
+			}
+		}
+		if ((woi.height + woi.y) >= fcorr_td.length) {
+			int wndh = woi.height;
+			woi.height = fcorr_td.length - woi.y;
+			if (woi.height <= 0) {
+				if (wndh > fcorr_td.length) wndh = fcorr_td.length;
+				woi.height = wndh;
+				woi.y = fcorr_td.length - woi.height;
+			}
+		}
+
+		final int nTiles=woi.width * woi.height;
+
+		final int clust_width =  5 * tile_size + gap;
+		final int clust_height = 2 * tile_size + gap;
+
+		final int width =  woi.width* clust_width - gap; // no gap after the last cluster
+		final int height = woi.height*clust_height - gap;
+		if (wh != null) {
+			wh[0] = width;
+			wh[1] = height;
+		}
+		final float [] fcorr_data_out = new float[width * height];
+		Arrays.fill(fcorr_data_out, Float.NaN);
+
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile / woi.width; // relative to woi
+						int tileX = nTile % woi.width;
+						int stileY = tileY + woi.y;    // absolute in the source arrays
+						int stileX = tileX + woi.x;
+						int out_y0 = tileY * clust_height; // top-left corner of this tile's cluster
+						int out_x0 = tileX * clust_width;
+						// first 6 tiles from fcorr_td =       new float[tilesY][tilesX][num_slices][];
+						if (fcorr_td[stileY][stileX] != null) {
+							for (int n = 0; n < layout1.length; n++) {
+								corr_td_wnd_copy_tile(
+										fcorr_td[stileY][stileX][layout1[n][0]],
+										fcorr_data_out,
+										width,
+										out_y0 + layout1[n][1] * tile_size,
+										out_x0 + layout1[n][2] * tile_size,
+										transform_size);
+							}
+						}
+						// last 4 tiles from fcorr_combo_td = new float[4][tilesY][tilesX][];
+						if (fcorr_combo_td != null) {
+							for (int n = 0; n < layout2.length; n++) {
+								// test the same slice that is read (the original tested index n)
+								int src_layer = layout2[n][0];
+								if (fcorr_combo_td[src_layer] != null) { // whole combo type may be skipped
+									corr_td_wnd_copy_tile(
+											fcorr_combo_td[src_layer][stileY][stileX],
+											fcorr_data_out,
+											width,
+											out_y0 + layout2[n][1] * tile_size,
+											out_x0 + layout2[n][2] * tile_size,
+											transform_size);
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return fcorr_data_out;
+	}
+
+	// Copy the four consecutive transform_size x transform_size quadrants of one
+	// transform-domain tile into a 2x2 arrangement with the top-left corner at
+	// (out_x, out_y) of the output image; a null tile is silently skipped.
+	private static void corr_td_wnd_copy_tile(
+			final float [] fcorr_tile,
+			final float [] fcorr_data_out,
+			final int      width,
+			final int      out_y,
+			final int      out_x,
+			final int      transform_size)
+	{
+		if (fcorr_tile == null) {
+			return;
+		}
+		final int transform_len = transform_size * transform_size;
+		for (int qy = 0; qy < 2; qy++) {
+			for (int qx = 0; qx < 2; qx++) {
+				for (int i = 0; i < transform_size; i++) {
+					System.arraycopy(
+							fcorr_tile,
+							transform_len * (2 * qy + qx) + transform_size * i,
+							fcorr_data_out,
+							(out_y + qy * transform_size + i) * width + (out_x + qx * transform_size),
+							transform_size);
+				}
+			}
+		}
+	}
+
+
+		
+	/**
+	 * Compare each tile slice of the reference entry {@code fcorrs[ncombo]} with the same
+	 * tile slice of every other entry of {@code fcorrs}.
+	 * <p>
+	 * The stored value is {@code s01 / sqrt(s00 * s11)}, where {@code s00}, {@code s11} and
+	 * {@code s01} are the sums of squares and of products of the two data arrays (NaN when
+	 * the denominator is zero). Result element {@code [tile][slice][k]} corresponds to the
+	 * k-th entry of {@code fcorrs} counted with {@code ncombo} skipped. Tiles that are null
+	 * in the reference entry stay null in the result, pairs with missing data stay 0.
+	 *
+	 * @param fcorrs     [entry][tile][slice][data]; tiles and slices may be null.
+	 *                   Data arrays of compared slices are assumed to have the same length
+	 * @param tilesX     not used by this method
+	 * @param ncombo     index in {@code fcorrs} of the reference entry
+	 * @param slices     number of slices to compare in each tile
+	 * @param threadsMax maximal number of threads to launch
+	 * @return [tile][slice][fcorrs.length - 1] comparison results
+	 */
+	public static float [][][] corr_get_extra(
+			final float [][][][] fcorrs,
+			final int            tilesX,
+			final int            ncombo,
+			final int            slices,
+			final int            threadsMax)     // maximal number of threads to launch
+	{
+		final int tiles = fcorrs[ncombo].length;
+		final int ncorrs = fcorrs.length - 1; // all entries but the reference one
+		final float [][][] fcorr_extra = new float [tiles][][];
+
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < tiles; nTile = ai.getAndIncrement()) if (fcorrs[ncombo][nTile] != null){
+						fcorr_extra[nTile] = new float [slices][ncorrs];
+						for (int indx0 = 0; indx0 < ncorrs; indx0++) {
+							// source index: same as indx0 before ncombo, shifted by one after it
+							int indx = indx0 + ((indx0 < ncombo) ? 0 : 1);
+							if (fcorrs[indx][nTile] != null) {
+								for (int slice = 0; slice < slices; slice++) {
+									if ((fcorrs[ncombo][nTile][slice] != null) && (fcorrs[indx][nTile][slice] != null)) {
+										float [] t0 = fcorrs[ncombo][nTile][slice];
+										float [] t1 = fcorrs[indx][nTile][slice];
+										float s00 = 0.0f, s11 = 0.0f, s01 = 0.0f;
+										for (int i = 0; i < t0.length; i++) {
+											s00 += t0[i] * t0[i];
+											s11 += t1[i] * t1[i];
+											s01 += t0[i] * t1[i];
+										}
+										// Store by the compacted index: the last dimension has only ncorrs
+										// elements, using indx would overflow it when ncombo is not the last entry
+										fcorr_extra[nTile][slice][indx0] = (float) (s01/Math.sqrt(s00*s11));
+									}
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return fcorr_extra;
+	}
+	
+	// prepare tile-to-tile correlation data as an extra layer, after layers by corr_td_wnd()
+	public static float [] corr_show_extra(
+			final float [][][] fcorr_extra,  // float[tile][slices][extra];
+			final int          tilesX,
+			final Rectangle    woi,
+			final int          gap,
+			final int          step, // 3
+			final int          size, //2
+			final int []       wh,
+			final int          transform_size,
+			final int          threadsMax)     // maximal number of threads to launch
+	{
+		final int tilesY = fcorr_extra.length / tilesX;
+		final int tile_size = 2 * transform_size;
+		final int per_row = (tile_size + (step-size)) /step;
+		final int [][] layout = {{0,0,0},{1,1,0},{2,0,1},{3,1,1},{4,0,2},{5,1,2},{6,0,3},{7,1,3},{8,0,4},{9,1,4}}; // {source_index, row, col};
+		if ((woi.width + woi.x) >= tilesX) {
+			int ww = woi.width; 
+			woi.width = tilesX - woi.x;
+			if (woi.width <= 0) {
+				if (ww > tilesX) ww = tilesX;
+				woi.width = ww;
+				woi.x = tilesX - woi.width; 
+			}
+		}
+		if ((woi.height + woi.y) >= tilesY) {
+			int wndh = woi.height; 
+			woi.height = tilesY - woi.y;
+			if (woi.height <= 0) {
+				if (wndh > tilesY) wndh = fcorr_extra.length;
+				woi.height = wndh;
+				woi.y = tilesY - woi.height; 
+			}
+		}
+		final int nTiles=woi.width * woi.height;
+		final int clust_width =  5 * tile_size + gap;
+		final int clust_height = 2 * tile_size + gap;
+		final int width =  woi.width* clust_width - gap;
+		final int height = woi.height*clust_height - gap;
+		if (wh != null) {
+			wh[0] = width;
+			wh[1] = height;
+		}
+		final float [] fcorr_data_out = new float[width * height];
+		Arrays.fill(fcorr_data_out, Float.NaN);
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+		for (int ithread = 0; ithread < threads.length; ithread++) {
+			threads[ithread] = new Thread() {
+				@Override
+				public void run() {
+					for (int nTile = ai.getAndIncrement(); nTile < nTiles; nTile = ai.getAndIncrement()) {
+						int tileY = nTile / woi.width; // relative to woi
+						int tileX = nTile % woi.width;
+						int stileY = tileY + woi.y;    // absolute in the corr_data
+						int stileX = tileX + woi.x;
+						int stile = stileY * tilesX + stileX; 
+						// first 6 tiles from fcorr_td =       new float[tilesY][tilesX][num_slices][];
+						if (fcorr_extra[stile] != null) {
+							for (int n = 0; n < layout.length; n++) {
+								int src_layer = layout[n][0];
+								int v_tile = layout[n][1];
+								int h_tile = layout[n][2];
+								float [] extra_data = fcorr_extra[stile][src_layer];
+								for (int i = 0; i < extra_data.length; i++) {
+									int extra_row = i / per_row;
+									int extra_col = i % per_row;
+									int extra_0 =
+											(tileY * clust_height + v_tile * tile_size + extra_row * step) * width +
+											(tileX * clust_width +  h_tile * tile_size + extra_col * step);
+									for (int sy = 0; sy < size; sy++) {
+										for (int sx = 0; sx < size; sx++) {
+											fcorr_data_out[extra_0 + sy* width + sx] = extra_data[i];
+										}
+									}
+								}
+							}
+						}
+					}
+				}
+			};
+		}
+		startAndJoin(threads);
+		return fcorr_data_out;
+	}

+	// calculate inter-tile correlation from the data already converted to the debug images
+	// (and so only for the selected woi)




--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
@@ -23,6 +23,7 @@
 */
 package com.elphel.imagej.tileprocessor;

+import java.awt.Rectangle;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -3179,6 +3180,7 @@ public class OpticalFlow {
 							indx_ref,       // final int            indx_ref,
 							combo_dsn[0],   // final double []      disparity_ref,  // disparity in the reference view tiles (Double.NaN - invalid)
 							margin,         // final int            margin,
+							nrefine,        // final int            nrefine, // just for debug title
 							debug_level);   // final int            debug_level)

 			Runtime.getRuntime().gc();
@@ -3189,7 +3191,7 @@ public class OpticalFlow {
 					tilesX,
 					tilesY,
 					true,
-					"accumulated_disparity_map",
+					"accumulated_disparity_map-"+nrefine,
 					ImageDtt.DISPARITY_TITLES
 					);
 			// update disparities
@@ -3398,16 +3400,19 @@ public class OpticalFlow {
 	
 	
 	
-	public double [][] correlateInterscene(
+	public double[][] correlateInterscene(
 //	public double [][][][][] correlateInterscene(
 			final CLTParameters  clt_parameters,
 			final QuadCLT []     scenes,
 			final int            indx_ref,
 			final double []      disparity_ref,  // disparity in the reference view tiles (Double.NaN - invalid)
 			final int            margin,
+			final int            nrefine, // just for debug title
 			final int            debug_level
 			)
 	{
+		final double fat_zero_pre = (debug_level>-100)?-1.0:0.0; //100000.0; // 10000.0; // 1000.0; //
+		final double output_amplitude = 30000; // 10000; // 1000; // 200; // 50.0;
 		final int num_scenes = scenes.length;
 		final QuadCLT ref_scene = scenes[indx_ref];
 		final ErsCorrection ers_reference = ref_scene.getErsCorrection();
@@ -3422,10 +3427,34 @@ public class OpticalFlow {
 				ref_scene.isLwir(),
 				clt_parameters.getScaleStrength(ref_scene.isAux()),
 				ref_scene.getGpuQuad());
+		/*
+		double [][][][][] clt_corr_partial = null;		
+		clt_corr_partial = new double [tilesY][tilesX][][][];
+		for (int i = 0; i < tilesY; i++){
+			for (int j = 0; j < tilesX; j++){
+				clt_corr_partial[i][j] = null;
+			}
+		}
+		*/
+		double [][] disparity_map = new double [ImageDtt.DISPARITY_TITLES.length][];

+		int disparity_modes = 
+				ImageDtt.BITS_ALL_DISPARITIES |
+				ImageDtt.BITS_ALL_DIFFS | // needs max_diff?
+				ImageDtt.BITS_OVEREXPOSED; //  |
+//		final Rectangle tile_woi = new Rectangle(127,2,40,60); // for visualizations
+//		final Rectangle tile_woi = new Rectangle(81,77,64,97); // for visualizations
+		final Rectangle tile_woi = new Rectangle(30,70,50,110); // for visualizations
+		final int vis_gap = 2;
+		final float [][] vis_corr_td = new float[num_scenes + 1][]; // transform-domain visualization
+		final float [][] vis_corr_pd = new float[num_scenes + 2][]; // pixel-domain visualization
+		final int [] wis_wh = new int [2];
+		final float [][][][] fclt_corrs = new float [num_scenes+1][tilesX*tilesY][][]; // will only contain tile_woi tiles to save memory
 		for (int i = 0; i < num_scenes; i++) {
 			if (i == indx_ref) {
-				System.out.println("Correlating reference scene");
+				System.out.println("\nCorrelating reference scene\n");
+			} else {
+				System.out.println("\nCorrelating scene "+i+"\n");
 			}
 			String ts = scenes[i].getImageName();
 			double [][] scene_pXpYD;
@@ -3458,7 +3487,10 @@ public class OpticalFlow {
 				scenes[i].getGpuQuad().updateQuadCLT(scenes[i]); // to re-load new set of Bayer images to the GPU
 			}
 			final double disparity_corr = 0.0; // (z_correction == 0) ? 0.0 : geometryCorrection.getDisparityFromZ(1.0/z_correction);
-			final double gpu_sigma_rb_corr = scenes[i].isMonochrome()? 1.0 : clt_parameters.gpu_sigma_rb_corr;
+			final double gpu_sigma_corr =     clt_parameters.getGpuCorrSigma(scenes[i].isMonochrome());
+			final double gpu_sigma_rb_corr =  scenes[i].isMonochrome()? 1.0 : clt_parameters.gpu_sigma_rb_corr;
+			final double gpu_sigma_log_corr = clt_parameters.getGpuCorrLoGSigma(scenes[i].isMonochrome());
+			final float  [][][]       fclt_corr = new float [tilesX * tilesY][][];
 			image_dtt.quadCorrTD(
 					clt_parameters.img_dtt,        // final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
 					scene_pXpYD,                   // final double [][]         pXpYD,            // per-tile array of pX,pY,disparity triplets (or nulls)
@@ -3472,11 +3504,92 @@ public class OpticalFlow {
 					clt_parameters.gpu_sigma_g,    // 0.6, 0.7
 					clt_parameters.gpu_sigma_m,    //  =       0.4; // 0.7;
 					gpu_sigma_rb_corr,             // final double              gpu_sigma_rb_corr, //  = 0.5; // apply LPF after accumulating R and B correlation before G, monochrome ? 1.0 : gpu_sigma_rb_corr;
-					clt_parameters.gpu_sigma_corr, //  =    0.9;gpu_sigma_corr_m
+					gpu_sigma_corr,                //  =    0.9;gpu_sigma_corr_m
+					gpu_sigma_log_corr,            // final double              gpu_sigma_log_corr,   // hpf to reduce dynamic range for correlations
 					clt_parameters.corr_red,       // +used
 					clt_parameters.corr_blue,      // +used
 					threadsMax,                    // final int             threadsMax,       // maximal number of threads to launch
 					debug_level);                  // final int                 globalDebugLevel)
+			if (fat_zero_pre >= 0.0) {
+				ImageDtt.corr_td_normalize(
+						fcorrs_td[i], // final float [][][][] fcorr_td, // will be updated
+						// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
+						// if > 0 - fcorr_td =       new float[tilesY][tilesX][num_slices][];
+						GPUTileProcessor.NUM_PAIRS, // final int            num_slices,
+						image_dtt.transform_size,   // final int            transform_size,
+						fat_zero_pre, // final double         fat_zero_abs,
+						output_amplitude, // final double         output_amplitude,
+						threadsMax); // final int            threadsMax);     // maximal number of threads to launch
+				ImageDtt.corr_td_normalize(
+						fcorrs_combo_td[i], // final float [][][][] fcorr_td, // will be updated
+						// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
+						// if > 0 - fcorr_td =       new float[tilesY][tilesX][num_slices][];
+						0, // final int            num_slices,
+						image_dtt.transform_size,   // final int            transform_size,
+						fat_zero_pre, // final double         fat_zero_abs,
+						output_amplitude, //final double         output_amplitude,
+						threadsMax); // final int            threadsMax);     // maximal number of threads to launch
+			}
+			if (vis_corr_td != null) {
+				vis_corr_td[i] = ImageDtt.corr_td_wnd(
+						fcorrs_td[i],             // final float [][][][] fcorr_td,       // float[tilesY][tilesX][num_slices][];
+						fcorrs_combo_td[i],       // final float [][][][] fcorr_combo_td, // float[4][tilesY][tilesX][];
+						tile_woi,                 // final Rectangle      woi,
+						vis_gap,                  // final int            gap,
+						wis_wh,                   // final int []         wh,
+						image_dtt.transform_size, // final int            transform_size,
+						threadsMax);              // final int            threadsMax)     // maximal number of threads to launch
+				Runtime.getRuntime().gc();
+				System.out.println("--- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
+			}
+			if ((vis_corr_pd != null) || (fclt_corrs != null)) { // calculate and extract correlation
+				image_dtt.clt_process_tl_correlations_GPU(	// convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
+						clt_parameters.img_dtt,				// final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
+						// both arrays should have same non-null tiles
+						fcorrs_td[i],				 	 		// final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+						fcorrs_combo_td[i], 						// final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][pair][4*64] TD of combo corrs: qud, cross, hor,vert
+																// each of the top elements may be null to skip particular combo type
+						null, // 	final double [][][][]     corr_tiles,      // [tilesY][tilesX][pair][] ([(2*gpu_corr_rad+1)*(2*gpu_corr_rad+1)]) or null
+						null, //clt_corr_partial,           			// final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+						// [tilesY][tilesX] should be set by caller
+						fclt_corr, // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+						// When clt_mismatch is non-zero, no far objects extraction will be attempted
+						null, 								// final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
+						// values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
+						disparity_map,						// final double [][]         disparity_map,   // [8][tilesY][tilesX], only [6][] is needed on input or null - do not calculate
+						// last 2 - contrast, avg/ "geometric average)
+						disparity_modes,                      // final int                 disparity_modes, // bit mask of disparity_map slices to calculate/return
+						clt_parameters.gpu_corr_scale,        //  gpu_corr_scale,  //  0.75; // reduce GPU-generated correlation values
+						clt_parameters.getGpuFatZero(ref_scene.isMonochrome()), // final double              gpu_fat_zero,    // clt_parameters.getGpuFatZero(is_mono);absolute == 30.0\
+						image_dtt.transform_size - 1,			// clt_parameters.gpu_corr_rad,   // = transform_size - 1 ?
+						clt_parameters.max_corr_radius,		// final double              max_corr_radius, // 3.9;
+						clt_parameters.clt_window,     		// final int                 window_type,     // GPU: will not be used
+						clt_parameters.tileX,      		    // final int               debug_tileX,
+						clt_parameters.tileY,          		// final int               debug_tileY,
+						threadsMax,
+						debug_level -1 );
+				if (vis_corr_pd != null) {
+					vis_corr_pd[i] = ImageDtt.corr_partial_wnd( // not used in lwir
+							fclt_corr,                      // clt_corr_partial,               // final double [][][][][] corr_data,
+							tilesX,                         // final int               tilesX,
+							2*image_dtt.transform_size - 1, // final int               corr_size,
+							tile_woi,                       // final Rectangle      woi,
+							vis_gap,                        // final int            gap,
+							wis_wh,                         // final int []         wh,
+							threadsMax);                    // final int               threadsMax)
+				}
+				if (fclt_corrs != null) {
+					fclt_corrs[i] = ImageDtt.extract_corr_woi(
+							true, // final boolean      copy, // copy tiles stack, not reference
+							fclt_corr,   // final float [][][] fcorr,
+							tile_woi,    // final Rectangle    woi,
+							tilesX,      // final int          tilesX,
+							threadsMax); // final int          threadsMax)     // maximal number of threads to launch
+							
+				}
+				Runtime.getRuntime().gc();
+				System.out.println("--- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
+			}
 		}
 		// Combine non-null correlations from all scenes (initially combined and individual for visualization and analysis)
 		final float  [][][][]     fcorr_td =       new float[tilesY][tilesX][][];
@@ -3600,7 +3713,7 @@ public class OpticalFlow {
 					wh[0],
 					wh[1],
 					true,
-					ref_scene.getImageName()+"-TD-PART_CORR-D"+clt_parameters.disparity,
+					ref_scene.getImageName()+"-TD-CORR-"+nrefine,
 					dbg_titles);
 		}

@@ -3612,30 +3725,17 @@ public class OpticalFlow {
 		
 		Runtime.getRuntime().gc();
 		System.out.println("--- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
-
-		double [][][][][] clt_corr_partial = null;
-		clt_corr_partial = new double [tilesY][tilesX][][][];
-		for (int i = 0; i < tilesY; i++){
-			for (int j = 0; j < tilesX; j++){
-				clt_corr_partial[i][j] = null;
-			}
-		}
-		double [][] disparity_map = new double [ImageDtt.DISPARITY_TITLES.length][];
-
-		int disparity_modes = 
-				ImageDtt.BITS_ALL_DISPARITIES |
-				ImageDtt.BITS_ALL_DIFFS | // needs max_diff?
-				ImageDtt.BITS_OVEREXPOSED; //  |
-
+		final float  [][][]       fclt_corr = new float [tilesX * tilesY][][];
 		image_dtt.clt_process_tl_correlations_GPU(	// convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
 				clt_parameters.img_dtt,				// final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
 				// both arrays should have same non-null tiles
 				fcorr_td,				 	 		// final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
 				fcorr_combo_td, 						// final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][pair][4*64] TD of combo corrs: qud, cross, hor,vert
-														// each of the top elements may be null to skip particular combo type
+				// each of the top elements may be null to skip particular combo type
 				null, // 	final double [][][][]     corr_tiles,      // [tilesY][tilesX][pair][] ([(2*gpu_corr_rad+1)*(2*gpu_corr_rad+1)]) or null
-				clt_corr_partial,           			// final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+				null, // clt_corr_partial,           			// final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 				// [tilesY][tilesX] should be set by caller
+				fclt_corr, // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 				// When clt_mismatch is non-zero, no far objects extraction will be attempted
 				null, 								// final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
 				// values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
@@ -3662,27 +3762,109 @@ public class OpticalFlow {
 //				 ,dbg_titles
 				 );
 		 */
-			Runtime.getRuntime().gc();
-			System.out.println("--- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
-		 
-		  if (debug_level < -10){ // -1
-			  double [][] dbg_corr_rslt_partial = ImageDtt.corr_partial_dbg(
-					  clt_corr_partial,
-					  2*image_dtt.transform_size - 1,	//final int corr_size,
-					  4,	// final int pairs,
-					  4,    // final int colors,
-					  clt_parameters.corr_border_contrast,
-					  threadsMax,
-					  debug_level);
-			  // titles.length = 15, corr_rslt_partial.length=16!
-			  (new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
-					  dbg_corr_rslt_partial,
-					  tilesX*(2*image_dtt.transform_size),
-					  tilesY*(2*image_dtt.transform_size),
-					  true,
-					  ref_scene.getImageName()+"-TD-PART_CORR-D"+clt_parameters.disparity,
-					  ImageDtt.CORR_TITLES);
-		  }
+		if (vis_corr_pd != null) { // add combined data as the last slice
+			vis_corr_pd[num_scenes] = ImageDtt.corr_partial_wnd( // not used in lwir
+					fclt_corr,                      // clt_corr_partial,               // final double [][][][][] corr_data,
+					tilesX,                         // 
+					2*image_dtt.transform_size - 1, // final int               corr_size,
+					tile_woi,                       // final Rectangle      woi,
+					vis_gap,                        // final int            gap,
+					wis_wh,                         // final int []         wh,
+					threadsMax);                    // final int               threadsMax)				
+		}
+
+		Runtime.getRuntime().gc();
+		System.out.println("--- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
+
+		if (vis_corr_td != null) { // add combined data as the last slice
+			vis_corr_td[num_scenes] = ImageDtt.corr_td_wnd(
+					fcorr_td,                 // final float [][][][] fcorr_td,       // float[tilesY][tilesX][num_slices][];
+					fcorr_combo_td,           // final float [][][][] fcorr_combo_td, // float[4][tilesY][tilesX][];
+					tile_woi,                 // final Rectangle      woi,
+					vis_gap,                  // final int            gap,
+					wis_wh,                   // final int []         wh,
+					image_dtt.transform_size, // final int            transform_size,
+					threadsMax);              // final int            threadsMax)     // maximal number of threads to launch
+		}
+
+
+
+		if (debug_level > -10){ // -1
+			float [][] dbg_corr_rslt_partial = ImageDtt.corr_partial_dbg(
+					fclt_corr, // final float  [][][]     fcorr_data,       // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+					tilesX,    //final int               tilesX,
+					2*image_dtt.transform_size - 1,	//final int corr_size,
+					10, // final int layers 4,	// final int pairs,
+//					4,    // final int colors,
+					clt_parameters.corr_border_contrast,
+					threadsMax,
+					debug_level);
+			// titles.length = 15, corr_rslt_partial.length=16!
+			(new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
+					dbg_corr_rslt_partial,
+					tilesX*(2*image_dtt.transform_size),
+					tilesY*(2*image_dtt.transform_size),
+					true,
+					ref_scene.getImageName()+"-PD-CORR-"+nrefine,
+					ImageDtt.CORR_TITLES);
+		}
+		if (vis_corr_td != null) {
+			String [] dbg_titles = new String[num_scenes+1];
+			for (int i = 0; i < num_scenes; i++) {
+				dbg_titles[i] = scenes[i].getImageName();
+			}
+			dbg_titles[num_scenes] = "combo";
+			(new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
+					vis_corr_td,
+					wis_wh[0],
+					wis_wh[1],
+					true,
+					"TD-"+tile_woi.x+"_"+tile_woi.y+"-"+nrefine,
+					dbg_titles);
+		}
+		float [][][] fcorr_extra = null; 
+	    if (fclt_corrs != null) {
+	        fclt_corrs[num_scenes] = ImageDtt.extract_corr_woi(
+					true, // final boolean      copy, // copy tiles stack, not reference
+	                fclt_corr,   // final float [][][] fcorr,
+	                tile_woi,    // final Rectangle    woi,
+	                tilesX,      // final int          tilesX,
+	                threadsMax); // final int          threadsMax)     // maximal number of threads to launch
+	        fcorr_extra = ImageDtt.corr_get_extra(
+	        		fclt_corrs,    // final float [][][][] fcorrs,
+	    			tilesX,        // final int            tilesX,
+	    			num_scenes,    // final int            ncombo,
+	    			10,            // final int            slices,
+	    			threadsMax);   // final int            threadsMax)
+	    }
+
+		if (vis_corr_pd != null) {
+			if (fcorr_extra != null) {
+				vis_corr_pd[num_scenes + 1]=ImageDtt. corr_show_extra(
+						fcorr_extra,              // final float [][][] fcorr_extra,  // float[tile][slices][extra];
+						tilesX,                   // final int          tilesX,
+						tile_woi,                 // final Rectangle    woi,
+						vis_gap,                  // final int          gap,
+						3,                        // final int          step,
+						2,                        // final int          size,
+						null,                     // final int []       wh,
+						image_dtt.transform_size, // final int          transform_size,
+						threadsMax);              //  final int          threadsMax)     // maximal number of threads to launch
+			}
+			String [] dbg_titles = new String[num_scenes+2];
+			for (int i = 0; i < num_scenes; i++) {
+				dbg_titles[i] = scenes[i].getImageName();
+			}
+			dbg_titles[num_scenes] =     "combo";
+			dbg_titles[num_scenes + 1] = "lucky";
+			(new ShowDoubleFloatArrays()).showArrays( // out of boundary 15
+					vis_corr_pd,
+					wis_wh[0],
+					wis_wh[1],
+					true,
+					"PD-"+"FZ-"+(clt_parameters.getGpuFatZero(ref_scene.isMonochrome()))+"-"+tile_woi.x+"_"+tile_woi.y+"-"+nrefine,
+					dbg_titles);
+		}		  
 		 
 		return disparity_map; // disparity_map
 //		return clt_corr_partial; // disparity_map

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -3326,6 +3326,7 @@ public class QuadCLT extends QuadCLTCPU {
 					null, // 	final double [][][][]     corr_tiles,      // [tilesY][tilesX][pair][] ([(2*gpu_corr_rad+1)*(2*gpu_corr_rad+1)]) or null
 					clt_corr_partial1,           			// final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 					// [tilesY][tilesX] should be set by caller
+					null, // [tile][index][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 					// When clt_mismatch is non-zero, no far objects extraction will be attempted
 					null, 								// final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
 					// values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is