working on a separate corr_td for td accumulation

dd8f74c3 · Andrey Filippov · d87183b6 · dd8f74c3 · dd8f74c3 · dd8f74c3
Commit dd8f74c3 authored Sep 02, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -633,9 +633,9 @@ public class GPUTileProcessor {
        private int texture_stride_rgba;
        private int num_task_tiles;
        private int num_corr_tiles;
-        private int num_texture_tiles;
+        private int num_pairs = 6; // number of correlation pairs per tile (should match tasks)
-        private int num_pairs = 6; // number of correlation pairs per tile (should match tsaks)
        private int num_corr_combo_tiles;
+        private int num_texture_tiles;
        private boolean geometry_correction_set = false;
        private boolean geometry_correction_vector_set = false;
@@ -981,16 +981,8 @@ public class GPUTileProcessor {
        }
 /*
-        public void setCorrIndices(int [] corr_indices)
-        {
-        	num_corr_tiles = corr_indices.length;
-        	float [] fcorr_indices = new float [corr_indices.length];
-        	for (int i = 0; i < num_corr_tiles; i++) {
-        		fcorr_indices[i] = Float.intBitsToFloat(corr_indices[i]);
-        	}
-            cuMemcpyHtoD(gpu_corr_indices, Pointer.to(fcorr_indices),  num_corr_tiles * Sizeof.FLOAT);
-        }
        public void setTextureIndices(int [] texture_indices) // never used
        {
        	num_texture_tiles = texture_indices.length;
@@ -1017,7 +1009,7 @@ public class GPUTileProcessor {
        	return texture_indices;
        }
-    //texture_indices
        public void setConvolutionKernel(
        		float [] kernel,  // [tileY][tileX][color][..]
        		float [] kernel_offsets,
@@ -2025,6 +2017,151 @@ public class GPUTileProcessor {
        	cuCtxSynchronize();
        }
+        public int getNumPairs() {return num_pairs;}
+        /**
+         * Upload transform-domain (TD) per-pair correlation tiles and their indices to the GPU.
+         * Null tiles are skipped; for every non-null tile pairs_map.length consecutive entries
+         * are packed, each holding 4 * DTT_SIZE * DTT_SIZE floats.
+         * Side effect: overwrites the num_pairs field with pairs_map.length.
+         *
+         * @param corr_tiles [tileY][tileX][pair][4*64] TD data, null elements mark missing tiles
+         * @param pairs_map  one element per uploaded pair: [0] - 3rd index in corr_tiles to read from,
+         *                   [1] - value added to the shifted tile number to form the index
+         * @return the index array, sized as if all tiles were non-null; only the first
+         *         (number of non-null tiles) * pairs_map.length elements are filled
+         */
+        public int [] setCorrTilesTd(
+        		final float [][][][] corr_tiles, // [tileY][tileX][pair][4*64]
+        		int [][] pairs_map) // typically {{0,0},{1,1},{2,2},{3,3},{4,4},{5,5}}: [0] - 3rd index in corr_tiles, [1] - pair number put into the index
+        {
+        	int corr_size_td = 4 * DTT_SIZE * DTT_SIZE;
+        	num_pairs = pairs_map.length; // set global num_pairs
+        	int tilesX = corr_tiles[0].length;
+        	int tilesY = corr_tiles.length;
+        	int [] indices = new int [tilesY * tilesX * num_pairs];  // as if all tiles are not null
+        	float [] fdata = new float [tilesY * tilesX * corr_size_td* num_pairs]; // as if all tiles are not null
+        	int ntile = 0;
+        	for (int ty = 0; ty < tilesY; ty++) {
+        		for (int tx = 0; tx < tilesX; tx++) {
+        			if (corr_tiles[ty][tx]!= null) {
+        				for (int pair = 0; pair < num_pairs; pair++) {
+        					int entry = ntile * num_pairs + pair; // position of this (tile, pair) in the packed arrays
+        					indices[entry] = ((ty * tilesX + tx) << GPUTileProcessor.CORR_NTILE_SHIFT) + pairs_map[pair][1];
+        					System.arraycopy(corr_tiles[ty][tx][pairs_map[pair][0]], 0, fdata, entry * corr_size_td, corr_size_td);
+        				}
+        				ntile++;
+        			}
+        		}
+        	}
+        	// setCorrIndicesTdData() uses its first argument as the number of index elements and of
+        	// data rows to copy, and getCorrTilesTd() derives the tile count as num_corr_tiles / num_pairs,
+        	// so the count has to be in (tile, pair) entries. Passing just ntile transferred only
+        	// 1/num_pairs of the packed indices and data.
+        	setCorrIndicesTdData(
+        			ntile * num_pairs, // int    num_tiles,  // corr_indices, fdata may be longer than needed
+        			indices, // int [] corr_indices,
+        			fdata);  // float [] fdata);
+        	return indices;
+        }
+        /**
+         * Read all available per-pair transform-domain correlation tiles from the GPU into
+         * a newly allocated array covering the full image (img_height / DTT_SIZE rows,
+         * img_width / DTT_SIZE columns). Tiles without data stay null.
+         * @return [tileY][tileX][pair][4*64] array filled by getCorrTilesTd(float [][][][])
+         */
+        public float [][][][] getCorrTilesTd() // [tileY][tileX][pair][4*64] , read all available pairs
+        {
+        	// only the two outer dimensions are allocated here, per-tile arrays are created by the reader
+        	final float [][][][] td_tiles = new float[img_height / DTT_SIZE][img_width / DTT_SIZE][][]; // num_pairs
+        	return getCorrTilesTd(td_tiles);
+        }
+        /**
+         * Read all available per-pair transform-domain correlation tiles from the GPU into
+         * the provided array. For every tile found in the GPU index list a new
+         * [num_pairs][4 * DTT_SIZE * DTT_SIZE] array is stored at corr_tiles[tileY][tileX];
+         * other elements are left untouched. The tile number is taken from the first
+         * index of each group of num_pairs entries.
+         * @param corr_tiles [tileY][tileX][][] destination, row length defines the tile grid width
+         * @return the same corr_tiles array
+         */
+        public float [][][][] getCorrTilesTd( // [tileY][tileX][pair][4*64] , read all available pairs
+        		float [][][][] corr_tiles)
+        {
+        	final int tile_len = 4 * DTT_SIZE * DTT_SIZE;
+        	// order matters: getCorrIndices() updates num_corr_tiles, which sizes the data read that follows
+        	final int [] gpu_indices = getCorrIndices();
+        	final float [] td_data =   getCorrTdData();
+        	final int tiles_read =     num_corr_tiles / num_pairs;
+        	final int tiles_in_row =   corr_tiles[0].length;
+        	for (int itile = 0; itile < tiles_read; itile++ ) {
+        		final int first_entry = itile * num_pairs;
+        		final int tile_number = gpu_indices[first_entry] >> GPUTileProcessor.CORR_NTILE_SHIFT;
+        		final float [][] tile_pairs = new float [num_pairs][tile_len];
+        		corr_tiles[tile_number / tiles_in_row][tile_number % tiles_in_row] = tile_pairs;
+        		for (int ipair = 0; ipair < num_pairs; ipair++) {
+        			System.arraycopy(td_data, (first_entry + ipair) * tile_len, tile_pairs[ipair], 0, tile_len);
+        		}
+        	}
+        	return corr_tiles;
+        }
+        /**
+         * Upload transform-domain combined correlation tiles (one per tile) and their
+         * indices to the GPU. Null tiles are skipped, the remaining ones are packed
+         * consecutively, 4 * DTT_SIZE * DTT_SIZE floats each.
+         * @param corr_tiles [tileY][tileX][4*64] TD data, null elements mark missing tiles
+         * @param ipair      value added to the shifted tile number to form each index
+         * @return the index array, sized as if all tiles were non-null; only the first
+         *         (number of non-null tiles) elements are filled
+         */
+        public int [] setCorrTilesComboTd(
+        		final float [][][] corr_tiles, // [tileY][tileX][4*64]
+        		int ipair) // just to set in the index low bits
+        {
+        	final int tile_len = 4 * DTT_SIZE * DTT_SIZE;
+        	final int tiles_in_row = corr_tiles[0].length;
+        	final int tile_rows =    corr_tiles.length;
+        	final int [] combo_indices = new int [tile_rows * tiles_in_row];  // as if all tiles are not null
+        	final float [] packed = new float [tile_rows * tiles_in_row * tile_len];  // as if all tiles are not null
+        	int num_packed = 0;
+        	for (int row = 0; row < tile_rows; row++) {
+        		for (int col = 0; col < tiles_in_row; col++) {
+        			final float [] tile = corr_tiles[row][col];
+        			if (tile == null) {
+        				continue; // nothing to upload for this tile
+        			}
+        			combo_indices[num_packed] = ((row * tiles_in_row + col) << GPUTileProcessor.CORR_NTILE_SHIFT) + ipair;
+        			System.arraycopy(tile, 0, packed, num_packed * tile_len, tile_len);
+        			num_packed++;
+        		}
+        	}
+        	setCorrComboIndicesTdData(
+        			num_packed,    // int    num_tiles,  // corr_indices, fdata may be longer than needed
+        			combo_indices, // int [] corr_indices,
+        			packed);       // float [] fdata);
+        	return combo_indices;
+        }
+        /**
+         * Read all available transform-domain combined correlation tiles from the GPU into
+         * a newly allocated array covering the full image (img_height / DTT_SIZE rows,
+         * img_width / DTT_SIZE columns). Tiles without data stay null.
+         * @return [tileY][tileX][4*64] array filled by getCorrTilesComboTd(float [][][])
+         */
+        public float [][][] getCorrTilesComboTd() // [tileY][tileX][4*64] , read all available pairs
+        {
+        	// only the two outer dimensions are allocated here, per-tile arrays are created by the reader
+        	final float [][][] combo_tiles = new float[img_height / DTT_SIZE][img_width / DTT_SIZE][];
+        	return getCorrTilesComboTd(combo_tiles);
+        }
+        /**
+         * Read all available transform-domain combined correlation tiles from the GPU into
+         * the provided array. For each of the num_corr_combo_tiles entries a new array of
+         * 4 * DTT_SIZE * DTT_SIZE floats is stored at corr_tiles[tileY][tileX]; other
+         * elements are left untouched. num_corr_combo_tiles has to be set before the call
+         * (setCorrComboIndicesTdData() does that).
+         * @param corr_tiles destination, should be initialized as [tilesY][tilesX][];
+         *                   row length defines the tile grid width
+         * @return the same corr_tiles array
+         */
+        public float [][][] getCorrTilesComboTd( // [tileY][tileX][4*64] , read all available pairs
+        		float [][][] corr_tiles // should be initialized as [tilesY][tilesX][]
+        		)
+        {
+        	final int tile_len = 4 * DTT_SIZE * DTT_SIZE;
+        	final int [] combo_indices = getCorrComboIndices(); // num_corr_combo_tiles should be set earlier
+        	final float [] td_data =     getCorrComboTdData();
+        	final int tiles_in_row =     corr_tiles[0].length;
+        	for (int entry = 0; entry < num_corr_combo_tiles; entry++ ) {
+        		final int tile_number = combo_indices[entry] >> GPUTileProcessor.CORR_NTILE_SHIFT;
+        		final float [] tile = new float[tile_len];
+        		corr_tiles[tile_number / tiles_in_row][tile_number % tiles_in_row] = tile;
+        		System.arraycopy(td_data, entry * tile_len, tile, 0, tile_len);
+        	}
+        	return corr_tiles;
+        }
+        /**
+         * Copy per-pair correlation indices, their count and the matching transform-domain
+         * data to the GPU. Only the first num_tiles elements of corr_indices and the first
+         * num_tiles rows (4 * DTT_SIZE * DTT_SIZE floats each) of fdata are transferred.
+         * Side effect: sets the num_corr_tiles field to num_tiles.
+         * NOTE(review): the indices are sent as raw int bit patterns (Float.intBitsToFloat)
+         * while the count is sent as a float VALUE ((float) cast) - confirm that the readers
+         * of gpu_num_corr_tiles expect this encoding.
+         * @param num_tiles    number of index elements / data rows to transfer
+         * @param corr_indices indices, may be longer than num_tiles
+         * @param fdata        packed correlation data, may be longer than needed
+         */
+        public void setCorrIndicesTdData(
+        		int    num_tiles,  // corr_indices, fdata may be longer than needed
+        		int [] corr_indices,
+        		float [] fdata)
+        {
+        	num_corr_tiles = num_tiles; // corr_indices.length;
+        	// indices travel through a float array, bit pattern preserved
+        	final float [] index_bits = new float [num_corr_tiles];
+        	for (int n = 0; n < index_bits.length; n++) {
+        		index_bits[n] = Float.intBitsToFloat(corr_indices[n]);
+        	}
+        	cuMemcpyHtoD(gpu_corr_indices,   Pointer.to(index_bits),  num_corr_tiles * Sizeof.FLOAT);
+        	final float [] count_as_float = {(float) num_corr_tiles};
+        	cuMemcpyHtoD(gpu_num_corr_tiles, Pointer.to(count_as_float), 1 * Sizeof.FLOAT);
+        	// copy the correlation data: one row per index, tightly packed on the host, pitched on the device
+        	final int row_bytes = (4 * DTT_SIZE * DTT_SIZE) * Sizeof.FLOAT;
+        	final CUDA_MEMCPY2D to_device = new CUDA_MEMCPY2D();
+        	to_device.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
+        	to_device.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        	to_device.srcHost =       Pointer.to(fdata);
+        	to_device.dstDevice =     gpu_corrs_td;
+        	to_device.srcPitch =      row_bytes;
+        	to_device.dstPitch =      corr_stride_td *Sizeof.FLOAT;
+        	to_device.WidthInBytes =  row_bytes;
+        	to_device.Height =        num_corr_tiles;
+        	cuMemcpy2D(to_device);
+        }
        public int [] getCorrIndices() {
        	float [] fnum_corrs = new float[1];
        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT);
@@ -2037,9 +2174,54 @@ public class GPUTileProcessor {
        	}
        	num_corr_tiles = num_corrs;
        	return corr_indices;
+        }
+        /**
+         * Read per-pair transform-domain correlation data from the GPU.
+         * Transfers num_corr_tiles rows of 4 * DTT_SIZE * DTT_SIZE floats each from
+         * gpu_corrs_td (row pitch corr_stride_td floats) into a tightly packed array.
+         * num_corr_tiles must be current before the call.
+         * @return packed array of num_corr_tiles * 4 * DTT_SIZE * DTT_SIZE floats
+         */
+        public float [] getCorrTdData(){
+        	final int tile_len =  4 * DTT_SIZE * DTT_SIZE;
+        	final int row_bytes = tile_len * Sizeof.FLOAT;
+        	final float [] host_data = new float [ num_corr_tiles * tile_len];
+        	final CUDA_MEMCPY2D from_device = new CUDA_MEMCPY2D();
+        	from_device.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        	from_device.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
+        	from_device.srcDevice =     gpu_corrs_td;
+        	from_device.dstHost =       Pointer.to(host_data);
+        	from_device.srcPitch =      corr_stride_td * Sizeof.FLOAT;
+        	from_device.dstPitch =      row_bytes;
+        	from_device.WidthInBytes =  row_bytes;
+        	from_device.Height =        num_corr_tiles;
+        	cuMemcpy2D(from_device); // run copy
+        	return host_data;
+        }
+        /**
+         * Copy combined-correlation indices and the matching transform-domain data to the GPU.
+         * Only the first num_tiles elements of corr_combo_indices and the first num_tiles rows
+         * (4 * DTT_SIZE * DTT_SIZE floats each) of fdata are transferred. The count itself is
+         * not uploaded, it is only kept in the num_corr_combo_tiles field.
+         * NOTE(review): the combo indices are written to gpu_corr_indices, the same device
+         * buffer setCorrIndicesTdData() fills with per-pair indices - confirm this is intended
+         * rather than a dedicated combo index buffer.
+         * @param num_tiles          number of index elements / data rows to transfer
+         * @param corr_combo_indices indices, may be longer than num_tiles
+         * @param fdata              packed correlation data, may be longer than needed
+         */
+        public void setCorrComboIndicesTdData(
+        		int    num_tiles,  // corr_combo_indices, fdata may be longer than needed
+        		int [] corr_combo_indices,
+        		float [] fdata)
+        {
+        	num_corr_combo_tiles = num_tiles; // corr_combo_indices.length;
+        	// indices travel through a float array, bit pattern preserved
+        	final float [] index_bits = new float [num_corr_combo_tiles];
+        	for (int n = 0; n < index_bits.length; n++) {
+        		index_bits[n] = Float.intBitsToFloat(corr_combo_indices[n]);
+        	}
+        	cuMemcpyHtoD(gpu_corr_indices,   Pointer.to(index_bits),  num_corr_combo_tiles * Sizeof.FLOAT);
+        	// copy the correlation data: one row per index, tightly packed on the host, pitched on the device
+        	final int row_bytes = (4 * DTT_SIZE * DTT_SIZE) * Sizeof.FLOAT;
+        	final CUDA_MEMCPY2D to_device = new CUDA_MEMCPY2D();
+        	to_device.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
+        	to_device.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        	to_device.srcHost =       Pointer.to(fdata);
+        	to_device.dstDevice =     gpu_corrs_combo_td;
+        	to_device.srcPitch =      row_bytes;
+        	to_device.dstPitch =      corr_stride_combo_td *Sizeof.FLOAT;
+        	to_device.WidthInBytes =  row_bytes;
+        	to_device.Height =        num_corr_combo_tiles;
+        	cuMemcpy2D(to_device);
        }
        public int [] getCorrComboIndices() {
 //        	float [] fnum_corrs = new float[1];
 //        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT);
@@ -2051,7 +2233,22 @@ public class GPUTileProcessor {
        		corr_combo_indices[i] = Float.floatToIntBits(fcorr_combo_indices[i]);
        	}
        	return corr_combo_indices;
+        }
+        /**
+         * Read combined transform-domain correlation data from the GPU.
+         * Transfers num_corr_combo_tiles rows of 4 * DTT_SIZE * DTT_SIZE floats each from
+         * gpu_corrs_combo_td (row pitch corr_stride_combo_td floats) into a tightly packed array.
+         * @return packed array of num_corr_combo_tiles * 4 * DTT_SIZE * DTT_SIZE floats
+         */
+        public float [] getCorrComboTdData(){
+        	final int tile_len =  4 * DTT_SIZE * DTT_SIZE;
+        	final int row_bytes = tile_len * Sizeof.FLOAT;
+        	final float [] host_data = new float [ num_corr_combo_tiles * tile_len];
+        	final CUDA_MEMCPY2D from_device = new CUDA_MEMCPY2D();
+        	from_device.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        	from_device.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
+        	from_device.srcDevice =     gpu_corrs_combo_td;
+        	from_device.dstHost =       Pointer.to(host_data);
+        	from_device.srcPitch =      corr_stride_combo_td * Sizeof.FLOAT;
+        	from_device.dstPitch =      row_bytes;
+        	from_device.WidthInBytes =  row_bytes;
+        	from_device.Height =        num_corr_combo_tiles;
+        	cuMemcpy2D(from_device); // run copy
+        	return host_data;
        }
        public float [][] getCorr2D(int corr_rad){

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
@@ -6,7 +6,7 @@ import java.util.concurrent.atomic.AtomicInteger;
 import com.elphel.imagej.common.ShowDoubleFloatArrays;
 import com.elphel.imagej.gpu.GPUTileProcessor;
-import Jama.Matrix;
+//import Jama.Matrix;
 public class ImageDtt extends ImageDttCPU {
@@ -50,6 +50,9 @@ public class ImageDtt extends ImageDttCPU {
 			final int                 macro_scale,     // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 			final int [][]            tile_op,         // [tilesY][tilesX] - what to do - 0 - nothing for this tile
 			final double [][]         disparity_array, // [tilesY][tilesX] - individual per-tile expected disparity
+			final float  [][][][]     fcorr_td,        // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+			final float  [][][][]     fcorr_combo_td,  // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+			                                           // each of the top elements may be null to skip particular combo type
 			final double [][][][]     clt_corr_combo,  // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
 			                                           // [type][tilesY][tilesX] should be set by caller
 													   // types: 0 - selected correlation (product+offset), 1 - sum
@@ -158,35 +161,11 @@ public class ImageDtt extends ImageDttCPU {
 				col_weights[1] = corr_blue * col_weights[2];
 			}
 		}
-/*
-		double [] scales = isMonochrome() ?
-				(new double [] {1.0}) :
-					(macro_mode?
-							(new double [] {0.25,0.25,0.5}) :
-								(new double [] {
-										corr_red, // 0.25
-										corr_blue, // 0.25
-										1.0 - corr_red - corr_blue})); // 0.5
-*/		
 		final int corr_size = transform_size * 2 - 1;
-//		final int [][] transpose_indices = new int [corr_size*(corr_size-1)/2][2];
 		if ((globalDebugLevel > -10) && (disparity_corr != 0.0)){
 			System.out.println(String.format("Using manual infinity disparity correction of %8.5f pixels",disparity_corr));
 		}
-/*		
-		{ int indx = 0;
-		for (int i =0; i < corr_size-1; i++){
-			for (int j = i+1; j < corr_size; j++){
-				transpose_indices[indx  ][0] = i * corr_size + j;
-				transpose_indices[indx++][1] = j * corr_size + i;
-			}
-		}
-		}
-*/
-////		final int first_color = isMonochrome()? MONO_CHN : 0; // color that is non-zero
 		// reducing weight of on-axis correlation values to enhance detection of vertical/horizontal lines
 		// multiply correlation results inside the horizontal center strip  2*enhortho_width - 1 wide by enhortho_scale
@@ -236,7 +215,7 @@ public class ImageDtt extends ImageDttCPU {
 		// add optional initialization of debug layers here
 		boolean need_macro = false;
-		boolean need_corr = (clt_mismatch != null);
+		boolean need_corr = (clt_mismatch != null) || (fcorr_combo_td !=null) || (fcorr_td !=null) ; // (not the only reason)
 		// skipping DISPARITY_VARIATIONS_INDEX - it was not used
 		if (disparity_map != null){
 			for (int i = 0; i<disparity_map.length;i++) if ((disparity_modes & (1 << i)) != 0){
@@ -274,13 +253,7 @@ public class ImageDtt extends ImageDttCPU {
 			System.out.println("macro_mode="+macro_mode);
 		}
-		Matrix [] corr_rots_aux = null;
+		final boolean use_main = geometryCorrection_main != null;
-		if (geometryCorrection_main != null) {
-			corr_rots_aux = geometryCorrection.getCorrVector().getRotMatrices(geometryCorrection.getRotMatrix(true));
-		}
-		final boolean use_main = corr_rots_aux != null;
-////		final Matrix [] corr_rots = use_main ? corr_rots_aux : geometryCorrection.getCorrVector().getRotMatrices(); // get array of per-sensor rotation matrices
 		boolean [] used_corrs = new boolean[1];
 	    final int all_pairs = imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
 		final GPUTileProcessor.TpTask[] tp_tasks =  gpuQuad.setTpTask(
@@ -297,7 +270,7 @@ public class ImageDtt extends ImageDttCPU {
 		}
 		//texture_tiles
 		final boolean fneed_macro = need_macro;
-		final boolean fneed_corr =  need_corr && used_corrs[0];
+		final boolean fneed_corr =  need_corr && used_corrs[0]; // *** tasks should include correlation
 		final float [][] lpf_rgb = new float[][] {
 			floatGetCltLpfFd(gpu_sigma_r),
@@ -405,17 +378,13 @@ public class ImageDtt extends ImageDttCPU {
 				true,           // boolean   calc_textures,
 				false);         // boolean   calc_extra)
-//			int numcol = quadCLT_main.isMonochrome()?1:3;
-//			int ports = imp_quad_main.length; // quad
 			int [] texture_indices = gpuQuad.getTextureIndices();
 			int          num_src_slices = numcol + 1; //  + (clt_parameters.keep_weights?(ports + numcol + 1):0); // 12 ; // calculate
 			float [] flat_textures =  gpuQuad.getFlatTextures( // fatal error has been detected by the Java Runtime Environment:
 					texture_indices.length,
 					numcol, // int     num_colors,
 		    		false); // clt_parameters.keep_weights); // boolean keep_weights);
-//	    	int texture_slice_size = (2 * gpuQuad.getDttSize())* (2 * gpuQuad.getDttSize());
-//	    	int texture_tile_size = texture_slice_size * num_src_slices ;
-//	    	double [][][][] texture_tiles = new double [tilesY][tilesX][][];
 			gpuQuad.doubleTextures(
 		    		new Rectangle(0, 0, tilesX, tilesY), // Rectangle    woi,
 		    		texture_tiles,                       // double [][][][] texture_tiles, // null or [tilesY][tilesX]
@@ -433,6 +402,10 @@ public class ImageDtt extends ImageDttCPU {
 			//Generate 2D phase correlations from the CLT representation
 			gpuQuad.execCorr2D_TD(col_weights); // Get TD version of correlations (may be read out and saved) 
 			final int [] corr_indices = gpuQuad.getCorrIndices();
+			if (fcorr_td != null) {
+				gpuQuad.getCorrTilesTd(fcorr_td); // read out transform domain correlation pairs
+			}
 			gpuQuad.execCorr2D_normalize(
 	        		false, // boolean combo, // normalize combo correlations (false - per-pair ones) 
 					gpu_fat_zero, // double fat_zero);
@@ -444,6 +417,9 @@ public class ImageDtt extends ImageDttCPU {
 			        true, // boolean init_corr,    // initialize output tiles (false - add to current)
 			        GPUTileProcessor.NUM_PAIRS,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
 			        0x0f); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+			if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 0) && (fcorr_combo_td[0] != null)) {
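+				gpuQuad.getCorrTilesComboTd(fcorr_td[0]); // read out transform domain correlation for quad ortho combination. NOTE(review): the guard tests fcorr_combo_td[0] but the data goes to fcorr_td[0] - probably should be fcorr_combo_td[0]; also (length >= 0) does not protect index 0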
+			}
 			// normalize and convert to pixel domain
 			gpuQuad.execCorr2D_normalize(
 	        		true, // boolean combo, // normalize combo correlations (false - per-pair ones) 
@@ -457,6 +433,9 @@ public class ImageDtt extends ImageDttCPU {
 			        true, // boolean init_corr,    // initialize output tiles (false - add to current)
 			        GPUTileProcessor.NUM_PAIRS,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
 			        0x30); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+			if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 1) && (fcorr_combo_td[1] != null)) {
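+				gpuQuad.getCorrTilesComboTd(fcorr_td[1]); // read out transform domain correlation for cross diagonal combination. NOTE(review): the guard tests fcorr_combo_td[1] but the data goes to fcorr_td[1] - probably should be fcorr_combo_td[1]; also (length >= 1) does not protect index 1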
+			}
 			gpuQuad.execCorr2D_normalize(
 	        		true, // boolean combo, // normalize combo correlations (false - per-pair ones) 
 					gpu_fat_zero, // double fat_zero);
@@ -468,17 +447,22 @@ public class ImageDtt extends ImageDttCPU {
 			        true, // boolean init_corr,    // initialize output tiles (false - add to current)
 			        GPUTileProcessor.NUM_PAIRS,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
 			        0x03); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+			if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 2) && (fcorr_combo_td[2] != null)) {
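+				gpuQuad.getCorrTilesComboTd(fcorr_td[2]); // read out transform domain correlation for horizontal combination. NOTE(review): the guard tests fcorr_combo_td[2] but the data goes to fcorr_td[2] - probably should be fcorr_combo_td[2]; also (length >= 2) does not protect index 2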
+			}
 			gpuQuad.execCorr2D_normalize(
 	        		true, // boolean combo, // normalize combo correlations (false - per-pair ones) 
 					gpu_fat_zero, // double fat_zero);
 		    		gpu_corr_rad); // int corr_radius
 			final float [][] fcorr2D_hor =   gpuQuad.getCorr2DCombo(gpu_corr_rad);
 			// Combine 2 vertical pairs
 			gpuQuad.execCorr2D_combine( // calculate cross pairs
 			        true, // boolean init_corr,    // initialize output tiles (false - add to current)
 			        GPUTileProcessor.NUM_PAIRS,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
 			        0x0c); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+			if ((fcorr_combo_td != null) && (fcorr_combo_td.length >= 3) && (fcorr_combo_td[3] != null)) {
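+				gpuQuad.getCorrTilesComboTd(fcorr_td[3]); // read out transform domain correlation for vertical combination. NOTE(review): the guard tests fcorr_combo_td[3] but the data goes to fcorr_td[3] - probably should be fcorr_combo_td[3]; also (length >= 3) does not protect index 3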
+			}
 			gpuQuad.execCorr2D_normalize(
 	        		true, // boolean combo, // normalize combo correlations (false - per-pair ones) 
 					gpu_fat_zero, // double fat_zero);
@@ -582,13 +566,408 @@ public class ImageDtt extends ImageDttCPU {
 									corrs[pair][i] = gpu_corr_scale * fcorr2D_vert[indx_tile][i]; // from float to double
 								}
-								// does not include combo
 								int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
+								int tile_lma_debug_level =  ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
+								boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
+								corr_common_GPU(
+										imgdtt_params,        // final ImageDttParameters  imgdtt_params,
+										clt_corr_partial,     // final double [][][][][]   clt_corr_partial,			
+										used_pairs,           // final int           used_pairs,
+										disparity_map,        // final double [][]   disparity_map,
+										clt_mismatch,         // final double [][]   clt_mismatch,
+										saturation_imp,       // final boolean [][]  saturation_imp,
+										fneed_macro,          // final boolean       fneed_macro,
+										corr2d,               // final Correlation2d corr2d,
+										corrs,                // final double [][]   corrs,
+										tileX,                // final int           tileX,
+										tileY,                // final int           tileY,
+										max_corr_radius,      // final double        max_corr_radius, // 3.9;
+										tile_lma_debug_level, // int                 tile_lma_debug_level,
+										debugTile,            // boolean             debugTile,
+										globalDebugLevel);    // final int           globalDebugLevel)							
+								// double extra_disparity = 0.0; // used for textures:  if allowed, shift images extra before trying to combine
+/*
+// Disabled for GPU
+								if      (corr_mode == 0) extra_disparity = disparity_map[DISPARITY_INDEX_INT][tIndex];
+								else if (corr_mode == 1) extra_disparity = disparity_map[DISPARITY_INDEX_CM][tIndex];
+								else if (corr_mode == 2) extra_disparity = disparity_map[DISPARITY_INDEX_POLY][tIndex];
+								else if (corr_mode == 3) extra_disparity = disparity_map[DISPARITY_INDEX_HOR][tIndex];  // not used in lwir
+								else if (corr_mode == 4) extra_disparity = disparity_map[DISPARITY_INDEX_VERT][tIndex];  // not used in lwir
+								if (Double.isNaN(extra_disparity)) extra_disparity = 0;  // used in lwir
+*/
+								if (Double.isNaN(disparity_map[DISPARITY_STRENGTH_INDEX][tIndex])) {
+									System.out.println("BUG: 3. disparity_map[DISPARITY_STRENGTH_INDEX][tIndex] should not be NaN");
+								}
+								// only debug is left
+								// old (per-color correlation)
+								// removed
+							} // end of tile
+						}
+					};
+				}
+				startAndJoin(threads);
+			} else {
+				// no correlation tiles to process
+			}
+		}
+		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
+			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
+		}
+/*
+//		final double [][] dbg_distort = debug_distort? (new double [4*quad][tilesX*tilesY]) : null;
+		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
+			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
+		}
+		if (dbg_ports_coords != null) {
+			(new showDoubleFloatArrays()).showArrays(dbg_ports_coords,  tilesX, tilesY, true, "ports_coordinates", dbg_titles);
+		}
+*/
+//		return clt_data;
+	}
+	public void clt_process_tl_correlations_GPU( // convert to pixel domain and process correlations already prepared in fcorr_td and/or fcorr_combo_td
+			final ImageDttParameters  imgdtt_params,   // Now just extra correlation parameters, later will include, most others
+			// both arrays should have same non-null tiles
+			final float  [][][][]     fcorr_td,        // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+			final float  [][][][]     fcorr_combo_td,  // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+			                                           // each of the top elements may be null to skip particular combo type
+			final double [][][][][]   clt_corr_partial,// [tilesY][tilesX][quad]color][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
+                                                       // [tilesY][tilesX] should be set by caller
+			// When clt_mismatch is non-zero, no far objects extraction will be attempted
+			final double [][]         clt_mismatch,    // [12][tilesY * tilesX] // ***** transpose unapplied ***** ?. null - do not calculate
+			                                           // values in the "main" directions have disparity (*_CM) subtracted, in the perpendicular - as is
+			final double [][]         disparity_map,   // [8][tilesY][tilesX], only [6][] is needed on input or null - do not calculate
+			                                           // last 2 - contrast, avg/ "geometric average)
+			final double              gpu_corr_scale,  //  0.75; // reduce GPU-generated correlation values
+			final double              gpu_fat_zero,    // clt_parameters.getGpuFatZero(is_mono);absolute == 30.0
+			final int                 gpu_corr_rad,    // = transform_size - 1 ?
+			final double              max_corr_radius, // 3.9;
+			final int                 window_type,     // GPU: will not be used
+			final int                 debug_tileX,
+			final int                 debug_tileY,
+			final int                 threadsMax,      // maximal number of threads to launch
+			final int                 globalDebugLevel)
+	{
+		if (this.gpuQuad == null) {
+			System.out.println("clt_aberrations_quad_corr_GPU(): this.gpuQuad is null, bailing out");
+			return;
+		}
+		final boolean [][] saturation_imp = gpuQuad.quadCLT.saturation_imp;               // boolean [][] saturation_imp, // (near) saturated pixels or null
+//gpuQuad
+		final boolean debug_distort= globalDebugLevel > 0; ///false; // true;
+		final double [][] debug_offsets = new double[imgdtt_params.lma_dbg_offset.length][2];
+		for (int i = 0; i < debug_offsets.length; i++) for (int j = 0; j < debug_offsets[i].length; j++) {
+			debug_offsets[i][j] = imgdtt_params.lma_dbg_offset[i][j]*imgdtt_params.lma_dbg_scale;
+		}
+		final int quad = 4;   // number of subcameras
+//		final int numcol = isMonochrome()?1:3;
+		final int width =  gpuQuad.getImageWidth();
+		final int height = gpuQuad.getImageHeight();
+		final int tilesX=gpuQuad.getTilesX(); // width/transform_size;
+		final int tilesY=gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
+		final Thread[] threads = newThreadArray(threadsMax);
+		final AtomicInteger ai = new AtomicInteger(0);
+//		final double [] col_weights= new double [numcol]; // colors are RBG
+		final double [][] dbg_distort = debug_distort? (new double [4*quad][tilesX*tilesY]) : null;
+		// not yet used with GPU
+/**		
+		final double [][] corr_wnd = Corr2dLMA.getCorrWnd(
+				transform_size,
+				imgdtt_params.lma_wnd);
+		final double [] corr_wnd_inv_limited = (imgdtt_params.lma_min_wnd <= 1.0)?  new double [corr_wnd.length * corr_wnd[0].length]: null;
+		if (corr_wnd_inv_limited != null) {
+			double inv_pwr = imgdtt_params.lma_wnd_pwr - (imgdtt_params.lma_wnd - 1.0); // compensate for lma_wnd
+			for (int i = imgdtt_params.lma_hard_marg; i < (corr_wnd.length - imgdtt_params.lma_hard_marg); i++) {
+				for (int j = imgdtt_params.lma_hard_marg; j < (corr_wnd.length - imgdtt_params.lma_hard_marg); j++) {
+					corr_wnd_inv_limited[i * (corr_wnd.length) + j] = 1.0/Math.max(Math.pow(corr_wnd[i][j],inv_pwr), imgdtt_params.lma_min_wnd);
+				}
+			}
+		}
+*/
+		// keep for now for mono, find out  what do they mean for macro mode
+		final int corr_size = transform_size * 2 - 1;
+		// reducing weight of on-axis correlation values to enhance detection of vertical/horizontal lines
+		// multiply correlation results inside the horizontal center strip  2*enhortho_width - 1 wide by enhortho_scale
+		final double [] enh_ortho_scale = new double [corr_size];
+		for (int i = 0; i < corr_size; i++){
+			if ((i < (transform_size - imgdtt_params.getEnhOrthoWidth(isAux()))) || (i > (transform_size - 2 + imgdtt_params.getEnhOrthoWidth(isAux())))) {
+				enh_ortho_scale[i] = 1.0;
+			} else {
+				enh_ortho_scale[i] = imgdtt_params.getEnhOrthoScale(isAux());
+			}
+			if (i == (transform_size-1)) enh_ortho_scale[i] = 0.0 ; // hardwired 0 in the center
+			enh_ortho_scale[i] *= Math.sin(Math.PI*(i+1.0)/(2*transform_size));
+		}
+		if (globalDebugLevel > 1){
+			System.out.println("getEnhOrthoWidth(isAux())="+ imgdtt_params.getEnhOrthoWidth(isAux())+" getEnhOrthoScale(isAux())="+ imgdtt_params.getEnhOrthoScale(isAux()));
+			for (int i = 0; i < corr_size; i++){
+				System.out.println(" enh_ortho_scale["+i+"]="+ enh_ortho_scale[i]);
+			}
+		}
+		// Create window  to select center correlation strip using
+		// ortho_height - full width of non-zero elements
+		// ortho_eff_height - effective height (ratio of the weighted column sum to the center value)
+		int wcenter = transform_size - 1;
+		final double [] ortho_weights = new double [corr_size]; // [15]
+		for (int i = 0; i < corr_size; i++){
+			if ((i >= wcenter - imgdtt_params.ortho_height/2) && (i <= wcenter + imgdtt_params.ortho_height/2)) {
+				double dx = 1.0*(i-wcenter)/(imgdtt_params.ortho_height/2 + 1);
+				ortho_weights[i] = 0.5*(1.0+Math.cos(Math.PI*dx))/imgdtt_params.ortho_eff_height;
+			}
+		}
+		if (globalDebugLevel > 0){
+			System.out.println("ortho_height="+ imgdtt_params.ortho_height+" ortho_eff_height="+ imgdtt_params.ortho_eff_height);
+			for (int i = 0; i < corr_size; i++){
+				System.out.println(" ortho_weights["+i+"]="+ ortho_weights[i]);
+			}
+		}
+		if (globalDebugLevel > 0) {
+			System.out.println("clt_aberrations_quad_corr(): width="+width+" height="+height+" transform_size="+transform_size+
+					" debug_tileX="+debug_tileX+" debug_tileY="+debug_tileY+" globalDebugLevel="+globalDebugLevel);
+		}
+		// add optional initialization of debug layers here
+//		boolean need_macro = false;
+//		boolean need_corr = true;
+		if (clt_mismatch != null){
+			for (int i = 0; i<clt_mismatch.length;i++){
+				clt_mismatch[i] = new double [tilesY*tilesX]; // will use only "center of mass" centers
+			}
+		}
+		DttRad2 dtt = new DttRad2(transform_size);
+		dtt.set_window(window_type);
+		final double [] lt_window = dtt.getWin2d();	// [256] - never used
+		final double [] lt_window2 = new double [lt_window.length]; // squared - never used
+		for (int i = 0; i < lt_window.length; i++) lt_window2[i] = lt_window[i] * lt_window[i];
+///		final boolean use_main = geometryCorrection_main != null;
+///	    final int all_pairs = imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
+		int [] corr_indices_ = null; 
+		float [][] fcorr2D_ = null;
+		if (fcorr_td != null) {
+			int [][] pairs_map = {{0,0},{1,1},{2,2},{3,3},{4,4},{5,5}};
+			corr_indices_ = gpuQuad.setCorrTilesTd(
+					fcorr_td, // final float [][][][] corr_tiles, // [tileY][tileX][pair][4*64]
+					pairs_map); // int [][] pairs) // typically {{0,0},{1,1},{2,2},{3,3},{4,4},{5,5}} [0] - 3rd index in corr_tiles, [1] -
+			gpuQuad.execCorr2D_normalize(
+					false, // boolean combo, // normalize combo correlations (false - per-pair ones) 
+					gpu_fat_zero, // double fat_zero);
+					gpu_corr_rad); // int corr_radius
+			fcorr2D_ = gpuQuad.getCorr2D(gpu_corr_rad); //  int corr_rad);
+		}
+		final int [] corr_indices = corr_indices_; 
+		final float [][] fcorr2D = fcorr2D_;
+		final int num_combo= (fcorr_combo_td != null)? fcorr_combo_td.length : 0;
+		final int [][] corr_combo_indices = (fcorr_combo_td != null)? new int [num_combo][] : null;
+		final float [][][] fcorr2D_combo = (fcorr_combo_td != null)? new float[num_combo][][] : null;
+		if (num_combo > 0) {
+			int [] ipairs = {0xf, 0x30, 0x3, 0xc}; // does not matter. using quad, cross, hor, vert
+			for (int i = 0; i < num_combo; i++) {
+				corr_combo_indices[i] = gpuQuad.setCorrTilesComboTd(
+						fcorr_combo_td[i], // final float [][][] corr_tiles, // [tileY][tileX][4*64]
+						ipairs[i]); // change to smth? int ipair) // just to set in the index low bits
+				// normalize and convert to pixel domain
+				gpuQuad.execCorr2D_normalize(
+						true, // boolean combo, // normalize combo correlations (false - per-pair ones) 
+						gpu_fat_zero, // double fat_zero);
+						gpu_corr_rad); // int corr_radius
+				fcorr2D_combo[i] =  gpuQuad.getCorr2DCombo(gpu_corr_rad);
+			}
+		}
+		final boolean fneed_macro = false;
+		if (corr_indices.length > 0) {
+			/*
+				if (true) { // debugging only
+					int [] wh = new int[2];
+					double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
+							tilesX,
+							tilesY,
+							corr_indices,
+							fcorr2D,
+							wh);
+					(new ShowDoubleFloatArrays()).showArrays(
+							dbg_corr,
+							wh[0],
+							wh[1],
+							true,
+							"dbg-corr2D", // name+"-CORR2D-D"+clt_parameters.disparity,
+							GPUTileProcessor.getCorrTitles());
+				}
+			 */
+			final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size
+			// assuming that the correlation pairs sets are the same for each tile that has correlations
+			// find this number
+			int nt0 = (corr_indices[0] >> GPUTileProcessor.CORR_NTILE_SHIFT);
+			int nc0 = 1;
+			for (int i = 1; (i < corr_indices.length) && ((corr_indices[i] >> GPUTileProcessor.CORR_NTILE_SHIFT) == nt0) ; i++) {
+				nc0++;
+			}
+			final int num_tile_corr = nc0; // normally 6
+			final int num_tiles = corr_indices.length / num_tile_corr; 
+			for (int ithread = 0; ithread < threads.length; ithread++) {
+				threads[ithread] = new Thread() {
+					@Override
+					public void run() {
+						Correlation2d corr2d = new Correlation2d(
+								imgdtt_params,              // ImageDttParameters  imgdtt_params,
+								transform_size,             // int transform_size,
+								2.0,                        //  double wndx_scale, // (wndy scale is always 1.0)
+								isMonochrome(), // boolean monochrome,
+								(globalDebugLevel > -1));   //   boolean debug)
+						corr2d.createOrtoNotch(
+								imgdtt_params.getEnhOrthoWidth(isAux()), // double getEnhOrthoWidth(isAux()),
+								imgdtt_params.getEnhOrthoScale(isAux()), //double getEnhOrthoScale(isAux()),
+								(imgdtt_params.lma_debug_level > 1)); // boolean debug);
+						for (int indx_tile = ai.getAndIncrement(); indx_tile < num_tiles; indx_tile = ai.getAndIncrement()) {
+							// double [][]  corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15)
+							// added quad and cross combos
+							double [][]  corrs = new double [GPUTileProcessor.NUM_PAIRS + 4][corr_length]; // 225-long (15x15)
+							int indx_corr = indx_tile * num_tile_corr;
+							int nt = (corr_indices[indx_corr] >> GPUTileProcessor.CORR_NTILE_SHIFT);
+							int tileX = nt % tilesX;
+							int tileY = nt / tilesX;
+							int tIndex = tileY * tilesX + tileX;
+							// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
+							int pair_mask = 0;
+							if (fcorr_td != null) {
+								for (int indx_pair = 0; indx_pair < num_tile_corr; indx_pair++) {
+									int pair = corr_indices[indx_corr] & GPUTileProcessor.CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
+									assert pair < GPUTileProcessor.NUM_PAIRS : "invalid correllation pair";
+									pair_mask |= (1 << pair);
+									for (int i = 0; i < corr_length; i++) {
+										corrs[pair][i] = gpu_corr_scale * fcorr2D[indx_corr][i]; // from float to double
+									}
+									indx_corr++; 
+								}
+							}
+							// add 4 combo layers : quad, cross, hor, vert
+							if (num_combo > 0) {
+								for (int ncm = 0; ncm < num_combo; ncm++) if (corr_combo_indices[ncm]!=null){
+									nt = (corr_combo_indices[ncm][indx_tile] >> GPUTileProcessor.CORR_NTILE_SHIFT); // corr_quad_indices - different sequence
+									int pair = GPUTileProcessor.NUM_PAIRS + ncm; // 6+
+									for (int i = 0; i < corr_length; i++) {
+										corrs[pair][i] = gpu_corr_scale * fcorr2D_combo[ncm][indx_tile][i]; // from float to double
+									}
+								}
+							}
+							int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
 							int tile_lma_debug_level =  ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
 							boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
-								// non-GPU initializaqtion of the data structures
+							corr_common_GPU(
+									imgdtt_params,        // final ImageDttParameters  imgdtt_params,
+									clt_corr_partial,     // final double [][][][][]   clt_corr_partial,			
+									used_pairs,           // final int           used_pairs,
+									disparity_map,        // final double [][]   disparity_map,
+									clt_mismatch,         // final double [][]   clt_mismatch,
+									saturation_imp,       // final boolean [][]  saturation_imp,
+									fneed_macro,          // final boolean       fneed_macro,
+									corr2d,               // final Correlation2d corr2d,
+									corrs,                // final double [][]   corrs,
+									tileX,                // final int           tileX,
+									tileY,                // final int           tileY,
+									max_corr_radius,      // final double        max_corr_radius, // 3.9;
+									tile_lma_debug_level, // int                 tile_lma_debug_level,
+									debugTile,            // boolean             debugTile,
+									globalDebugLevel);    // final int           globalDebugLevel)							
+							// double extra_disparity = 0.0; // used for textures:  if allowed, shift images extra before trying to combine
+							/*
+// Disabled for GPU
+								if      (corr_mode == 0) extra_disparity = disparity_map[DISPARITY_INDEX_INT][tIndex];
+								else if (corr_mode == 1) extra_disparity = disparity_map[DISPARITY_INDEX_CM][tIndex];
+								else if (corr_mode == 2) extra_disparity = disparity_map[DISPARITY_INDEX_POLY][tIndex];
+								else if (corr_mode == 3) extra_disparity = disparity_map[DISPARITY_INDEX_HOR][tIndex];  // not used in lwir
+								else if (corr_mode == 4) extra_disparity = disparity_map[DISPARITY_INDEX_VERT][tIndex];  // not used in lwir
+								if (Double.isNaN(extra_disparity)) extra_disparity = 0;  // used in lwir
+							 */
+							if (Double.isNaN(disparity_map[DISPARITY_STRENGTH_INDEX][tIndex])) {
+								System.out.println("BUG: 3. disparity_map[DISPARITY_STRENGTH_INDEX][tIndex] should not be NaN");
+							}
+							// only debug is left
+							// old (per-color correlation)
+							// removed
+						} // end of tile
+					}
+				};
+			}
+			startAndJoin(threads);
+		} else {
+			// no correlation tiles to process
+		}
+		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
+			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
+		}
+/*
+//		final double [][] dbg_distort = debug_distort? (new double [4*quad][tilesX*tilesY]) : null;
+		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
+			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
+		}
+		if (dbg_ports_coords != null) {
+			(new showDoubleFloatArrays()).showArrays(dbg_ports_coords,  tilesX, tilesY, true, "ports_coordinates", dbg_titles);
+		}
+*/
+//		return clt_data;
+	}
+	public void corr_common_GPU(
+			final ImageDttParameters  imgdtt_params,
+			final double [][][][][]   clt_corr_partial,			
+			final int           used_pairs,
+			final double [][]   disparity_map,
+			final double [][]   clt_mismatch,
+			final boolean [][]  saturation_imp,
+			final boolean       fneed_macro,
+			final Correlation2d corr2d,
+			final double [][]   corrs,
+			final int           tileX,
+			final int           tileY,
+			final double        max_corr_radius, // 3.9;
+			int                 tile_lma_debug_level,
+			boolean             debugTile,
+			final int           globalDebugLevel)
+	{
+		final int quad = 4;   // number of subcameras
+		final int numcol = isMonochrome()?1:3;
+		// does not include combo
+//		int tile_lma_debug_level =  ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
+//		boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
+		// non-GPU initialization of the data structures
+		final int tilesX=gpuQuad.getTilesX(); // width/transform_size;
+//		final int tilesY=gpuQuad.getTilesY(); // final int tilesY=height/transform_size;
+		int tIndex = tileY * tilesX + tileX;
 		final int [] overexp_all = (saturation_imp != null) ? ( new int [2]): null;
 		for (int i = 0; i < disparity_map.length; i++) {
 			if (disparity_map[i] != null) {
@@ -616,10 +995,8 @@ public class ImageDtt extends ImageDttCPU {
 		// calculate all selected pairs correlations
 		//int all_pairs = imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
 		// Code that was after correlations calculation
 		double [][] strips = corr2d.scaleRotateInterpoateCorrelations(
 				corrs,                          // double [][] correlations,
 				used_pairs,                      // int         pairs_mask,
@@ -656,21 +1033,21 @@ public class ImageDtt extends ImageDttCPU {
 				imgdtt_params.corr_offset,     // double      offset);
 				imgdtt_params.twice_diagonal); //    		boolean     twice_diagonal)
 		// Debug feature - only calculated if requested
 		if ((clt_corr_partial != null) && (imgdtt_params.corr_mode_debug || imgdtt_params.gpu_mode_debug)) {
+			@SuppressWarnings("unused")
 			double [] strip_ortho = corr2d.combineInterpolatedCorrelations(
 					strips,                         // double [][] strips,
 					0x0f,                           // int         pairs_mask,
 					imgdtt_params.corr_offset,      // double      offset);
 					imgdtt_params.twice_diagonal);  //    		boolean     twice_diagonal)
+			@SuppressWarnings("unused")
 			double [] strip_diag = corr2d.combineInterpolatedCorrelations(
 					strips,                         // double [][] strips,
 					0x30,                           // int         pairs_mask,
 					imgdtt_params.corr_offset,      // double      offset);
 					imgdtt_params.twice_diagonal);  //    		boolean     twice_diagonal)
+			@SuppressWarnings("unused")
 			double [] strip_all = corr2d.combineInterpolatedCorrelations(
 					strips,                         // double [][] strips,
 					0x3f,                           // int         pairs_mask,
@@ -699,31 +1076,27 @@ public class ImageDtt extends ImageDttCPU {
 			clt_corr_partial[tileY][tileX][1][1] = corrs[5];                        // 6
 			clt_corr_partial[tileY][tileX][1][2] = corrs[6];                        // 7
 			clt_corr_partial[tileY][tileX][1][3] = corrs[7];                        // 8
-//												    	clt_corr_partial[tileY][tileX][1][2] = corrs_ortho;                     // 7
+			//												    	clt_corr_partial[tileY][tileX][1][2] = corrs_ortho;                     // 7
-//												    	clt_corr_partial[tileY][tileX][1][3] = corrs_cross;                     // 8
+			//												    	clt_corr_partial[tileY][tileX][1][3] = corrs_cross;                     // 8
-//												    	clt_corr_partial[tileY][tileX][1][2] = corr2d.debugStrip(strip_hor);    // 7
+			//												    	clt_corr_partial[tileY][tileX][1][2] = corr2d.debugStrip(strip_hor);    // 7
-//												    	clt_corr_partial[tileY][tileX][1][3] = corr2d.debugStrip(strip_vert);   // 8
+			//												    	clt_corr_partial[tileY][tileX][1][3] = corr2d.debugStrip(strip_vert);   // 8
 			//strip_combo_intra						    	
 			clt_corr_partial[tileY][tileX][2][0] = corrs[8];                        // 9
 			clt_corr_partial[tileY][tileX][2][1] = corrs[9];                        // 10
-//												    	clt_corr_partial[tileY][tileX][2][0] = corr2d.debugStrip(strips[4]);    // 9
+			//												    	clt_corr_partial[tileY][tileX][2][0] = corr2d.debugStrip(strips[4]);    // 9
-//												    	clt_corr_partial[tileY][tileX][2][1] = corr2d.debugStrip(strips[5]);    // 10
+			//												    	clt_corr_partial[tileY][tileX][2][1] = corr2d.debugStrip(strips[5]);    // 10
 			clt_corr_partial[tileY][tileX][2][2] = corr2d.debugStrip2(strip_hor);   // 11
 			clt_corr_partial[tileY][tileX][2][3] = corr2d.debugStrip2(strip_vert);  // 12
 			clt_corr_partial[tileY][tileX][3][0] = corr2d.debugStrip2(strips_intra[0]); // 13
 			clt_corr_partial[tileY][tileX][3][1] = corr2d.debugStrip2(strips_intra[1]);  // 14
-//												    	clt_corr_partial[tileY][tileX][3][0] = corr2d.debugStrip2(strip_ortho); // 13
+			//												    	clt_corr_partial[tileY][tileX][3][0] = corr2d.debugStrip2(strip_ortho); // 13
-//												    	clt_corr_partial[tileY][tileX][3][1] = corr2d.debugStrip2(strip_diag);  // 14
+			//												    	clt_corr_partial[tileY][tileX][3][1] = corr2d.debugStrip2(strip_diag);  // 14
 			clt_corr_partial[tileY][tileX][3][2] = corr2d.debugStrip2(strip_combo_intra);    // 15
-//												    	clt_corr_partial[tileY][tileX][3][2] = corr2d.debugStrip(strip_all);    // 15
+			//												    	clt_corr_partial[tileY][tileX][3][2] = corr2d.debugStrip(strip_all);    // 15
 			clt_corr_partial[tileY][tileX][3][3] = corr2d.debugStrip2(strip_combo); // 16
 		}
 		if (imgdtt_params.pcorr_use) {
 			strip_combo = strip_combo_intra;
-						        }
-								if ((clt_corr_combo != null) && imgdtt_params.corr_mode_debug) {
-									// reuse it too?
 		}
 		// calculate CM maximums for all mixed channels
 		// First get integer correlation center, relative to the center
@@ -1056,46 +1429,23 @@ public class ImageDtt extends ImageDttCPU {
 			}
 		} // end of if (corr_stat != null)
 		// double extra_disparity = 0.0; // used for textures:  if allowed, shift images extra before trying to combine
-/*
+		/*
-// Disabled for GPU
+//Disabled for GPU
 			if      (corr_mode == 0) extra_disparity = disparity_map[DISPARITY_INDEX_INT][tIndex];
 			else if (corr_mode == 1) extra_disparity = disparity_map[DISPARITY_INDEX_CM][tIndex];
 			else if (corr_mode == 2) extra_disparity = disparity_map[DISPARITY_INDEX_POLY][tIndex];
 			else if (corr_mode == 3) extra_disparity = disparity_map[DISPARITY_INDEX_HOR][tIndex];  // not used in lwir
 			else if (corr_mode == 4) extra_disparity = disparity_map[DISPARITY_INDEX_VERT][tIndex];  // not used in lwir
 			if (Double.isNaN(extra_disparity)) extra_disparity = 0;  // used in lwir
-*/
+		 */
-								if (Double.isNaN(disparity_map[DISPARITY_STRENGTH_INDEX][tIndex])) {
-									System.out.println("BUG: 3. disparity_map[DISPARITY_STRENGTH_INDEX][tIndex] should not be NaN");
 	} 
-								// only debug is left
-								// old (per-color correlation)
-								// removed
-							} // end of tile
-						}
-					};
-				}
-				startAndJoin(threads);
-			} else {
-				// no correlation tiles to process
-			}
-		}
-		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
-			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
-		}
-/*
-//		final double [][] dbg_distort = debug_distort? (new double [4*quad][tilesX*tilesY]) : null;
-		if ((dbg_distort != null) &&(globalDebugLevel >=0)) {
-			(new ShowDoubleFloatArrays()).showArrays(dbg_distort,  tilesX, tilesY, true, "disparity_distortions"); // , dbg_titles);
-		}
-		if (dbg_ports_coords != null) {
-			(new showDoubleFloatArrays()).showArrays(dbg_ports_coords,  tilesX, tilesY, true, "ports_coordinates", dbg_titles);
-		}
-*/
-//		return clt_data;
-	}
 }
--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -1281,6 +1281,9 @@ public class QuadCLT extends QuadCLTCPU {
 				  1,                             // final int  macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 				  tile_op,                       // per-tile operation bit codes
 				  disparity_array,               // clt_parameters.disparity,     // final double            disparity,
+				  null,							 // final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+				  null, 						 //	final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+					                             // each of the top elements may be null to skip particular combo type
 				  //// Uses quadCLT from gpuQuad			
 				                                 // correlation results - final and partial
 				  clt_corr_combo,                // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
@@ -1523,6 +1526,9 @@ public class QuadCLT extends QuadCLTCPU {
 				  1,                             // final int  macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 				  tile_op,                       // per-tile operation bit codes
 				  disparity_array,               // clt_parameters.disparity,     // final double            disparity,
+				  null,							 // final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+				  null, 						 //	final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+					                             // each of the top elements may be null to skip particular combo type
 				  //// Uses quadCLT from gpuQuad			
 				                                 // correlation results - final and partial
 				  null,                          // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
@@ -1838,6 +1844,9 @@ public class QuadCLT extends QuadCLTCPU {
 				  1,                             // final int  macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 				  tile_op,                       // per-tile operation bit codes
 				  disparity_array,               // clt_parameters.disparity,     // final double            disparity,
+				  null,							 // final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+				  null, 						 //	final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+					                             // each of the top elements may be null to skip particular combo type
 				  //// Uses quadCLT from gpuQuad			
 				                                 // correlation results - final and partial
 				  null,                          // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate
@@ -1951,6 +1960,9 @@ public class QuadCLT extends QuadCLTCPU {
 				  1,                             // final int  macro_scale, // to correlate tile data instead of the pixel data: 1 - pixels, 8 - tiles
 				  tile_op,                       // per-tile operation bit codes
 				  disparity_array,               // clt_parameters.disparity,     // final double            disparity,
+				  null,							 // final float  [][][][]     corr_td,         // [tilesY][tilesX][pair][4*64] transform domain representation of 6 corr pairs
+				  null, 						 //	final float  [][][][]     corr_combo_td,   // [4][tilesY][tilesX][4*64] TD of combo corrs: quad, cross, hor, vert
+					                             // each of the top elements may be null to skip particular combo type
 				  //// Uses quadCLT from gpuQuad			
 				                                 // correlation results - final and partial
 				  null,                          // [type][tilesY][tilesX][(2*transform_size-1)*(2*transform_size-1)] // if null - will not calculate

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -8303,6 +8303,19 @@ if (debugLevel > -100) return true; // temporarily !
 					System.out.println("Adjusting main camera image set for "+quadCLT_main.image_name+
 							", pass "+(num_adjust_main+1)+" of "+adjust_main);
 				}
+				if (debugLevel > -5){
+					int scan_index =  quadCLT_main.tp.clt_3d_passes.size() -1;
+					quadCLT_main.tp.showScan(
+							quadCLT_main.tp.clt_3d_passes.get(scan_index),   // CLTPass3d   scan,
+							"pre-adjust-extrinsic-scan-"+scan_index); //String title)
+					for (int s = 0; (s < 5) && (s < scan_index); s++) {
+						quadCLT_main.tp.showScan(
+								quadCLT_main.tp.clt_3d_passes.get(s),   // CLTPass3d   scan,
+								"pre-adjust-extrinsic-scan-"+s); //String title)
+					}
+				}
+				/*
 				boolean ok = quadCLT_main.extrinsicsCLT(
 						clt_parameters, // EyesisCorrectionParameters.CLTParameters           clt_parameters,
 						false, // adjust_poly,
@@ -8312,6 +8325,7 @@ if (debugLevel > -100) return true; // temporarily !
 // clear memory for main
 				quadCLT_main.tp.resetCLTPasses();
 				if (!ok) break;
+				*/
 			}
 			// Generate 4 main camera images and thumbnail
@@ -8378,6 +8392,15 @@ if (debugLevel > -100) return true; // temporarily !
 						quadCLT_main.tp.clt_3d_passes.get( quadCLT_main.tp.clt_3d_passes.size() -1),
 						false); // boolean force_final);
+				  if (debugLevel > -5){
+					  int scan_index =  quadCLT_main.tp.clt_3d_passes.size() -1;
+					  quadCLT_main.tp.showScan(
+							  quadCLT_main.tp.clt_3d_passes.get(scan_index),   // CLTPass3d   scan,
+							  "test_pre-after-"+scan_index); //String title)
+				  }
 				dsi[DSI_DISPARITY_MAIN] = main_last_scan[0];
 				dsi[DSI_STRENGTH_MAIN] =  main_last_scan[1];
 				if (quadCLT_main.correctionsParameters.clt_batch_dsi) { // Should be always enabled ?
@@ -8479,6 +8502,16 @@ if (debugLevel > -100) return true; // temporarily !
 			// 2) Prepare full D/S and FG/BG data to be embedded within the ML files
 			double [][] main_ds = {dsi[DSI_DISPARITY_MAIN], dsi[DSI_STRENGTH_MAIN]};
+			if ((adjust_aux == 0) &&
+					!quadCLT_main.correctionsParameters.clt_batch_4img_aux &&
+					!quadCLT_main.correctionsParameters.clt_batch_dsi_aux &&
+					!quadCLT_main.correctionsParameters.clt_batch_genMl &&
+					!quadCLT_main.correctionsParameters.clt_batch_save_extrinsics &&
+					!quadCLT_main.correctionsParameters.clt_batch_save_all) {
+				continue; 
+			}
 			quadCLT_aux.ds_from_main = quadCLT_aux.depthMapMainToAux( // only 2 layers for adjustments
 					main_ds, // double [][] ds,
 					quadCLT_main.getGeometryCorrection(), //  GeometryCorrection geometryCorrection_main,