first update for modified GPU code, got

75da8cbd · Andrey Filippov · ca083ffb · 75da8cbd · 75da8cbd · 75da8cbd
Commit 75da8cbd authored Dec 03, 2021 by Andrey Filippov
19 changed files
--- a/pom.xml
+++ b/pom.xml
@@ -43,7 +43,8 @@
            <dependency>
                <groupId>org.jcuda</groupId>
                <artifactId>jcuda</artifactId>
-                <version>10.1.0</version>
+                <!-- <version>10.1.0</version> -->
+                <version>11.2.0</version>
            </dependency>
            <!--
            	As of 2018/09/11 TF for GPU on Maven supports CUDA 9.0 (vs latest 9.2)

--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
--- a/src/main/java/com/elphel/imagej/gpu/ExportForGPUDevelopment.java
+++ b/src/main/java/com/elphel/imagej/gpu/ExportForGPUDevelopment.java
@@ -65,7 +65,7 @@ public class ExportForGPUDevelopment {
 			} // boolean transpose);
 			// make it same length of 16 sensors (for fixed-size struct gc in GPU kernel code 
-			GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.NUM_CAMS) ;
+			GeometryCorrection ext_gc = quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.MAX_NUM_CAMS) ;
 			try {
 //				quadCLT.getGeometryCorrection().saveFloatsGPU(kernel_dir + (quadCLT.isAux()?"aux":"main"));
@@ -184,7 +184,7 @@ public class ExportForGPUDevelopment {
 			}
 		}
 	}
+	@Deprecated
 	public static void saveFloatKernelsBigEndian(String file_prefix, // never used
 			double [][][][][][] clt_kernels,
 			double [][][]       image_data,

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -55,7 +55,10 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
+import com.elphel.imagej.tileprocessor.Correlation2d;
 import ij.IJ;
+import ij.text.TextWindow;
 import jcuda.Pointer;
 import jcuda.driver.CUcontext;
 import jcuda.driver.CUdevice;
@@ -69,7 +72,6 @@ import jcuda.nvrtc.JNvrtc;
 import jcuda.nvrtc.nvrtcProgram;
 public class GPUTileProcessor {
-	static int CORR_VECTOR_MAX_LENGTH =19; // TODO: update to fit for 16-sensor
 	String LIBRARY_PATH = "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // linux
 	static String GPU_RESOURCE_DIR =              "kernels";
 	static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
@@ -91,9 +93,10 @@ public class GPUTileProcessor {
 	public static int DTT_SIZE_LOG2 =             3;
 	public static int DTT_SIZE =                  (1 << DTT_SIZE_LOG2);
 	static int        THREADSX =                  DTT_SIZE;
-	public static int NUM_CAMS =                  16; // 4; Now - maximal number of sensors
+	public static int MAX_NUM_CAMS =             16; // 4; Now - maximal number of sensors
-	public static int NUM_PAIRS =                 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
+	public static int CORR_VECTOR_MAX_LENGTH =   3 + 4 * MAX_NUM_CAMS; // 67; // 19; // TODO: update to fit for 16-sensor
-	public static int NUM_COLORS =                3;
+//	public static int NUM_PAIRS =                 6; // top hor, bottom hor, left vert, right vert, main diagonal, other diagonal
+//	public static int NUM_COLORS =                3;
 //	public static int IMG_WIDTH =              2592;
 //	public static int IMG_HEIGHT =             1936;
 	static int        KERNELS_HOR =             164;
@@ -105,14 +108,17 @@ public class GPUTileProcessor {
 	static int        CORR_TILES_PER_BLOCK	=     4;
 	static int        CORR_TILES_PER_BLOCK_NORMALIZE = 4; // maybe change to 8?	
 	static int        CORR_TILES_PER_BLOCK_COMBINE = 4; // increase to 16?
+	static int        NUM_THREADS              = 32;
 	static int        TEXTURE_THREADS_PER_TILE =  8; // 16;
 	static int        TEXTURE_TILES_PER_BLOCK =   1;
 	static int        IMCLT_THREADS_PER_TILE =   16;
 	static int        IMCLT_TILES_PER_BLOCK =     4;
-	static int        TPTASK_SIZE =                 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats
+//	static int        TPTASK_SIZE =                 1+ 1+ NUM_CAMS * 2 + 1 + NUM_CAMS * 4 ; // tp_task structure size in floats
 	static int        CLTEXTRA_SIZE =               8;
 	static int        CORR_SIZE =                   (2* DTT_SIZE - 1) * (2* DTT_SIZE - 1); // 15x15
 	public static int CORR_NTILE_SHIFT =          8;  // also for texture tiles list
+	// FIXME: CORR_PAIRS_MASK will not work !!!	
 	public static int CORR_PAIRS_MASK =        0x3f;  // lower bits used to address correlation pair for the selected tile
 	public static int CORR_TEXTURE_BIT =          7;  // bit 7 used to request texture for the tile
 	public static int TASK_CORR_BITS =            4;  // start of pair mask
@@ -129,7 +135,7 @@ public class GPUTileProcessor {
 	public static int RBYRDIST_LEN =           5001; //for double, 10001 - float;   // length of rByRDist to allocate shared memory
 	public static double RBYRDIST_STEP =          0.0004; //  for double, 0.0002 - for float; // to fit into GPU shared memory (was 0.001);
-	public static int TILES_PER_BLOCK_GEOM =     32/NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM
+	public static int TILES_PER_BLOCK_GEOM =     32/MAX_NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.x = TILES_PER_BLOCK_GEOM
 	public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT));
@@ -139,17 +145,17 @@ public class GPUTileProcessor {
    public boolean kernels_set = false;
    public boolean bayer_set =   false;
-    CUfunction GPU_CONVERT_DIRECT_kernel =          null;
+    CUfunction GPU_CONVERT_DIRECT_kernel =          null; // "convert_direct"
-    CUfunction GPU_IMCLT_ALL_kernel =               null;
+    CUfunction GPU_IMCLT_ALL_kernel =               null; // "imclt_rbg_all"
-    CUfunction GPU_CORRELATE2D_kernel =             null;
+    CUfunction GPU_CORRELATE2D_kernel =             null; // "correlate2D"
-    CUfunction GPU_CORR2D_COMBINE_kernel =          null;
+    CUfunction GPU_CORR2D_COMBINE_kernel =          null; // "corr2D_combine"
-    CUfunction GPU_CORR2D_NORMALIZE_kernel =        null;
+    CUfunction GPU_CORR2D_NORMALIZE_kernel =        null; // "corr2D_normalize";
-    CUfunction GPU_TEXTURES_kernel =                null;
+    CUfunction GPU_TEXTURES_kernel =                null; // "textures_nonoverlap"
-    CUfunction GPU_RBGA_kernel =                    null;
+    CUfunction GPU_RBGA_kernel =                    null; // "generate_RBGA"
-    CUfunction GPU_ROT_DERIV_kernel =               null;
+    CUfunction GPU_ROT_DERIV_kernel =               null; // "calc_rot_deriv"
-//    private CUfunction GPU_SET_TILES_OFFSETS_kernel =       null;
+//    private CUfunction GPU_SET_TILES_OFFSETS_kernel =       null; // "get_tiles_offsets"
-    CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null;
+    CUfunction GPU_CALCULATE_TILES_OFFSETS_kernel = null; // "calculate_tiles_offsets"
-    CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;
+    CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null; // "calcReverseDistortionTable"
    CUmodule    module; // to access constants memory
    //    private
@@ -175,9 +181,9 @@ public class GPUTileProcessor {
        return"#define JCUDA\n"+
        				"#define DTT_SIZE_LOG2 " +                  DTT_SIZE_LOG2+"\n"+
        				"#define THREADSX " +                       THREADSX+"\n"+
-        				"#define NUM_CAMS " +                       NUM_CAMS+"\n"+
+        				"#define NUM_CAMS " +                       MAX_NUM_CAMS+"\n"+
-        				"#define NUM_PAIRS " +                      NUM_PAIRS+"\n"+
+//        				"#define NUM_PAIRS " +                      NUM_PAIRS+"\n"+
-        				"#define NUM_COLORS " +                     NUM_COLORS+"\n"+
+//        				"#define NUM_COLORS " +                     NUM_COLORS+"\n"+
        				"#define KERNELS_LSTEP " +                  KERNELS_LSTEP+"\n"+
        				"#define THREADS_PER_TILE " +               THREADS_PER_TILE+"\n"+
        				"#define TILES_PER_BLOCK " +                TILES_PER_BLOCK+"\n"+
@@ -185,12 +191,13 @@ public class GPUTileProcessor {
        				"#define CORR_TILES_PER_BLOCK " +           CORR_TILES_PER_BLOCK+"\n"+
        				"#define CORR_TILES_PER_BLOCK_NORMALIZE " + CORR_TILES_PER_BLOCK_NORMALIZE+"\n"+
        				"#define CORR_TILES_PER_BLOCK_COMBINE " +   CORR_TILES_PER_BLOCK_COMBINE+"\n"+
+        				"#define NUM_THREADS " +                    NUM_THREADS+"\n"+
        				"#define TEXTURE_THREADS_PER_TILE " +       TEXTURE_THREADS_PER_TILE+"\n"+
        				"#define TEXTURE_TILES_PER_BLOCK " +        TEXTURE_TILES_PER_BLOCK+"\n"+
        				"#define IMCLT_THREADS_PER_TILE " +         IMCLT_THREADS_PER_TILE+"\n"+
        				"#define IMCLT_TILES_PER_BLOCK " +          IMCLT_TILES_PER_BLOCK+"\n"+
        				"#define CORR_NTILE_SHIFT " +               CORR_NTILE_SHIFT+"\n"+
-        				"#define CORR_PAIRS_MASK " +                CORR_PAIRS_MASK+"\n"+
+//        				"#define CORR_PAIRS_MASK " +                CORR_PAIRS_MASK+"\n"+
        				"#define CORR_TEXTURE_BIT " +               CORR_TEXTURE_BIT+"\n"+
        				"#define TASK_CORR_BITS " +                 TASK_CORR_BITS+"\n"+
        				"#define TASK_TEXTURE_N_BIT " +             TASK_TEXTURE_N_BIT+"\n"+
@@ -240,7 +247,7 @@ public class GPUTileProcessor {
        ClassLoader classLoader = getClass().getClassLoader();
        String [] kernelSources = new String[GPU_SRC_FILES.length];
+        boolean show_source = false; // true;
        for (int cunit = 0; cunit < kernelSources.length; cunit++) {
        	kernelSources[cunit] = ""; // use StringBuffer?
            for (String src_file:GPU_SRC_FILES[cunit]) {
@@ -267,7 +274,17 @@ public class GPUTileProcessor {
                	kernelSources[cunit] += sourceFile;
            	}
            }
+            if (show_source) {
+//            	String lines[] = kernelSources[cunit].split("\\r?\\n");
+//            	String body = "";
+//            	for (int l = 0; l < lines.length; l++) {
+//            		body += (l+1)+"\t"+lines[l]+"\n";
+//            	}
+//            	new TextWindow("GPU_Source_Code", "#\tline", body,400,800);
+            	new TextWindow("GPU_Source_Code", "", kernelSources[cunit],400,800);
+            }
        }
        // Create the kernel functions (first - just test)
        String [] func_names = {
        		GPU_CONVERT_DIRECT_NAME,
@@ -319,13 +336,15 @@ public class GPUTileProcessor {
    	return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other","quad","cross"};
    }
    public static double [][] getCorr2DView(
+    		int num_sensors,
    		int tilesX,
    		int tilesY,
    		int [] indices,
    		float [][] corr2d,
    		int [] wh){ // if is [2] - return width, height
+    	int max_num_pairs = Correlation2d.getNumPairs(num_sensors);
    	if ((corr2d == null) || (corr2d.length == 0)) {
-    		return new double [NUM_PAIRS][0];
+    		return new double [max_num_pairs][0];
    	}
    	int num_pairs = -1; // corr2d.length;
 		for (int n = 0; n < indices.length; n++) {
@@ -334,7 +353,7 @@ public class GPUTileProcessor {
 		}
 		num_pairs++;
 		if (num_pairs < 1) {
-    		return new double [NUM_PAIRS][0];
+    		return new double [max_num_pairs][0];
 		}
    	boolean [] bpairs = new boolean[num_pairs];
 		for (int n = 0; n < indices.length; n++) {

--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
--- a/src/main/java/com/elphel/imagej/gpu/TpTask.java
+++ b/src/main/java/com/elphel/imagej/gpu/TpTask.java
@@ -6,10 +6,18 @@ public class TpTask {
    public int        num_sensors = 4;
 	public int        ty;
 	public int        tx;
+	public float []   centerXY = {Float.NaN,Float.NaN};
 	public float[][]  xy = null;
 	public float[][]  xy_aux = null;
 	public float [][] disp_dist = null;
 //	public float      weight;
+	public static int getSize(int num_sensors) {
+		return 5 + 2* num_sensors + 4 * num_sensors;
+	}
+	public int getSize() {
+		return 5 + 2* num_sensors + 4 * num_sensors;
+	}
 	public TpTask(
 			int num_sensors,
@@ -30,15 +38,21 @@ public class TpTask {
 	}
 	/**
 	 * Initialize from the float array (read from the GPU)
+	 * @param num_sensors number of sesnors in an array
 	 * @param flt float array containing tasks data
 	 * @param indx task number to use
+	 * @param use_aux (always false now)
 	 */
-	public TpTask(float [] flt, int indx, boolean use_aux)
+	public TpTask(int num_sensors, float [] flt, int task_indx, boolean use_aux)
 	{
-		task =    Float.floatToIntBits(flt[indx++]);
+		int indx = task_indx * getSize(num_sensors);
-		int txy = Float.floatToIntBits(flt[indx++]);
+		task =    Float.floatToIntBits(flt[indx++]); // 0
+		int txy = Float.floatToIntBits(flt[indx++]); // 1
 		ty = txy >> 16;
 		tx = txy & 0xffff;
+		target_disparity = flt[indx++];              // 2
+		centerXY[0] = flt[indx++];                   // 3
+		centerXY[1] = flt[indx++];                   // 4
 		if (use_aux) {
 			xy_aux = new float[num_sensors][2];
    		for (int i = 0; i < num_sensors; i++) {
@@ -52,7 +66,6 @@ public class TpTask {
    			xy[i][1] = flt[indx++];
    		}
 		}
-		target_disparity = flt[indx++];
 		disp_dist = new float [num_sensors][4];
 		for (int i = 0; i < num_sensors; i++) {
 			for (int j = 0; j < 4; j++) {
@@ -123,8 +136,6 @@ public class TpTask {
 		return dXY;
 	}
 	public int getTileY(){
 		return ty;
 	}
@@ -137,17 +148,36 @@ public class TpTask {
 	public double getTargetDisparity() {
 		return target_disparity;
 	}
+	public float [] getCenterXY() {
+		return centerXY;
+	}
+	public double [] getDoubleCenterXY() {
+		return new double [] {centerXY[0],centerXY[1]};
+	}
+	public void setCenterXY(double [] centerXY) {
+		this.centerXY = new float [] {(float) centerXY[0],(float) centerXY[1]};
+	}
 	// convert this class instance to float array to match layout of the C struct
 	public float [] asFloatArray(boolean use_aux) {
-		float [] flt = new float [GPUTileProcessor.TPTASK_SIZE];
+		float [] flt = new float [getSize()];
 		return asFloatArray(flt, 0, use_aux);
 	}
 	// convert this class instance to float array to match layout of the C struct,
 	// fill existing float array from the specified index
-	public float [] asFloatArray(float [] flt, int indx, boolean use_aux) {
+	public float [] asFloatArray(float [] flt, int 	task_indx, boolean use_aux) {
-		flt[indx++] = Float.intBitsToFloat(task);
+		int indx = task_indx * getSize(num_sensors);
-		flt[indx++] = Float.intBitsToFloat(tx + (ty << 16));
+		flt[indx++] = Float.intBitsToFloat(task);            // 0
+		flt[indx++] = Float.intBitsToFloat(tx + (ty << 16)); // 1
+		flt[indx++] = this.target_disparity;                 // 2
+		flt[indx++] = centerXY[0];                           // 3
+		flt[indx++] = centerXY[1];                           // 4
 		float [][] offsets = use_aux? this.xy_aux: this.xy;
 		for (int i = 0; i < num_sensors; i++) {
 			if (offsets != null) {
@@ -157,9 +187,8 @@ public class TpTask {
 				indx+= 2;
 			}
 		}
-		flt[indx++] = this.target_disparity;
 		/*
-		for (int i = 0; i < NUM_CAMS; i++) { // actually disp_dist will be initialized by the GPU
+		for (int i = 0; i < num_sensors; i++) { // actually disp_dist will be initialized by the GPU
 			indx+= 4;
 			flt[indx++] = disp_dist[i][0];
 			flt[indx++] = disp_dist[i][1];

--- a/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
@@ -323,6 +323,7 @@ public class GeometryCorrection {
 		System.arraycopy(woi_tops,  0, egc.woi_tops,0, min_nc);
 		for (int n = 0; n < min_nc; n++) {
 			egc.pXY0[n] = pXY0[n].clone(); 
+			egc.rXY[n] =  rXY[n].clone(); 
 		}
 		CorrVector cv = new CorrVector (
 				egc, // GeometryCorrection geometryCorrection,

--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDtt.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/ImageDttCPU.java
@@ -7744,6 +7744,8 @@ public class ImageDttCPU {
 		}
 		final int num_combo = (dcorr_combo == null)? 0 : dcorr_combo.length;
 		final int corr_length = (2 * transform_size - 1) * (2 * transform_size - 1);
+		// FIXME: will not work with combining pairs !!!
+		final int num_pairs = Correlation2d.getNumPairs(quadCLT.getNumSensors());
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
 				@Override
@@ -7765,7 +7767,7 @@ public class ImageDttCPU {
 						int tileX = tIndex % tilesX;
 						if (dcorr[tileY][tileX] != null) {
 							// added quad and cross combos
-							double [][]  corrs = new double [GPUTileProcessor.NUM_PAIRS + num_combo][corr_length]; // 225-long (15x15)
+							double [][]  corrs = new double [num_pairs + num_combo][corr_length]; // 225-long (15x15)
 							// Prepare the same (currently 10-layer) corrs as double [][], as in CPU version
 							int pair_mask = 0;
 							if (dcorr != null) {
@@ -7780,7 +7782,7 @@ public class ImageDttCPU {
 							if (num_combo > 0) {
 								for (int pair_combo = 0; pair_combo < dcorr_combo.length; pair_combo++) {
-									corrs[GPUTileProcessor.NUM_PAIRS + pair_combo] = dcorr_combo[pair_combo][tileY][tileX];
+									corrs[num_pairs + pair_combo] = dcorr_combo[pair_combo][tileY][tileX];
 								}
 							}
 							if (corr_tiles != null) {
@@ -10098,7 +10100,8 @@ public class ImageDttCPU {
 			for (int i = 0; i < transform_size2; i++){
 				System.arraycopy(image_data[chn_img], (ctile_top + i) * width + ctile_left, tile_in, transform_size2 * i, transform_size2);
 			}
-		} else { // copy by 1
+		} else { // copy by 1 ASSUMES Bayer (extends in pairs, incorrect for MONO)
+/*			
 			for (int i = 0; i < transform_size2; i++){
 				int pi = ctile_top + i;
 				if      (pi < 0)       pi &= 1;
@@ -10110,6 +10113,22 @@ public class ImageDttCPU {
 					tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
 				}
 			}
+*/
+			int extend_c = isMonochrome()?0:1;
+			for (int i = 0; i < transform_size2; i++){
+				int pi = ctile_top + i;
+				if      (pi < 0)       pi &= extend_c; // 1;
+//				else if (pi >= height) pi = height - 2 + (pi & 1);
+				else if (pi >= height) pi = height - 1 - extend_c + (pi & extend_c);
+				for (int j = 0; j < transform_size2; j++){
+					int pj = ctile_left + j;
+					if      (pj < 0)      pj &= extend_c; // 1;
+//					else if (pj >= width) pj = width - 2 + (pj & 1);
+					else if (pj >= width) pj = width - 1 - extend_c + (pj & extend_c);
+					tile_in[transform_size2 * i + j] = image_data[chn_img][pi * width + pj];
+				}
+			}
 		}
 		if (debug_gpu) {
 			System.out.println("---Image tile for color="+chn_img+"---");
@@ -10382,7 +10401,7 @@ public class ImageDttCPU {
 		if (!debug_fpga) {
 			for (int dct_mode = 0; dct_mode < 4; dct_mode++) {
-				if (fold_coeff != null){
+				if (fold_coeff != null){ // not null w/o debug
 					clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode, fold_coeff); // DCCT, DSCT, DCST, DSST
 				} else {
 					clt_tile[dct_mode] = dtt.fold_tile (tile_in, transform_size, dct_mode); // DCCT, DSCT, DCST, DSST

--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
@@ -4319,11 +4319,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 			// experimental, not currently used
 			if (fat_zero_pre >= 0.0) {
+				// FIXME: will not work with combining pairs !!!
+				int num_pairs = Correlation2d.getNumPairs(numSens);
 				ImageDtt.corr_td_normalize(
 						fcorrs_td[nscene], // final float [][][][] fcorr_td, // will be updated
 						// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
 						// if > 0 - fcorr_td =       new float[tilesY][tilesX][num_slices][];
-						GPUTileProcessor.NUM_PAIRS, // final int            num_slices,
+						num_pairs, // GPUTileProcessor.NUM_PAIRS, // final int            num_slices,
 						image_dtt.transform_size,   // final int            transform_size,
 						fat_zero_pre, // final double         fat_zero_abs,
 						output_amplitude, // final double         output_amplitude,
@@ -4529,11 +4531,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 			final float [][][][] fcorr_td_dbg =       (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
 			final float [][][][] fcorr_combo_td_dbg = (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
 			int [] wh = new int[2];
+			// FIXME: will not work with combining pairs !!!
+			int num_pairs = Correlation2d.getNumPairs(numSens);
 			float [][] dbg_corr = ImageDtt.corr_td_dbg(
 					fcorr_td_dbg,               // final float [][][][] fcorr_td,
 					// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
 					// if > 0 - fcorr_td =       new float[tilesY][tilesX][num_slices][];
-					GPUTileProcessor.NUM_PAIRS, // final int            num_slices,
+					num_pairs,                  // final int            num_slices,
 					image_dtt.transform_size,   // final int            transform_size,
 					wh,                         // final int []         wh, // should be initialized as int[2];
 					threadsMax); // final int            threadsMax)     // maximal number of threads to launch
@@ -5492,11 +5496,13 @@ public double[][] correlateIntersceneDebug( // only uses GPU and quad
 			final float [][][][] fcorr_td_dbg =       (indx_corr < 0) ? fcorr_td : fcorrs_td[indx_corr];
 			final float [][][][] fcorr_combo_td_dbg = null; // (indx_corr < 0) ? fcorr_combo_td : fcorrs_combo_td[indx_corr];
 			int [] wh = new int[2];
+			// FIXME: will not work with combining pairs !!!
+			int num_pairs = Correlation2d.getNumPairs(numSens);
 			float [][] dbg_corr = ImageDtt.corr_td_dbg(
 					fcorr_td_dbg,               // final float [][][][] fcorr_td,
 					// if 0 - fcorr_combo_td = new float[4][tilesY][tilesX][];
 					// if > 0 - fcorr_td =       new float[tilesY][tilesX][num_slices][];
-					GPUTileProcessor.NUM_PAIRS, // final int            num_slices,
+					num_pairs,                  // final int            num_slices,
 					image_dtt.transform_size,   // final int            transform_size,
 					wh,                         // final int []         wh, // should be initialized as int[2];
 					threadsMax); // final int            threadsMax)     // maximal number of threads to launch

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLTCPU.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLTCPU.java
@@ -115,7 +115,7 @@ public class QuadCLTCPU {
 	static String []                                       fine_corr_dir_names = {"X","Y"};
 	public static String                                   PREFIX =     "EYESIS_DCT.";    // change later (first on save)
 	public static String                                   PREFIX_AUX = "EYESIS_DCT_AUX."; // change later (first on save)
-	static int                                             QUAD =  4; // number of cameras
+//	static int                                             QUAD =  4; // number of cameras
 	public Properties                                      properties = null;
 	public EyesisCorrections                               eyesisCorrections = null;
@@ -1621,7 +1621,7 @@ public class QuadCLTCPU {
 				pXY0);
 		geometryCorrection.planeProjectLenses(); // project all lenses to the common plane
-		// calcualte reverse distortion as a table to be linear interpolated (now cubic!)
+		// calculate reverse distortion as a table to be linear interpolated (now cubic!)
 		geometryCorrection.calcReverseDistortionTable();
 //		if (numSensors == 4){
@@ -7032,8 +7032,8 @@ public class QuadCLTCPU {
 				  this.correctionsParameters.blueProc,
 				  debugLevel);
 		  if (debugLevel > 1) System.out.println("Processed colors to YPbPr, total number of slices="+stack.getSize());
-		  if (saveShowIntermediate && (debugLevel > 1)) {
+		  if (saveShowIntermediate && (stack != null) && (debugLevel > 1)) {
-			  ImagePlus imp_dbg=new ImagePlus("procColors",stack);
+			  ImagePlus imp_dbg=new ImagePlus("procColors",stack);  // null
 			  eyesisCorrections.saveAndShow(
 					  imp_dbg,
 					  this.correctionsParameters);
@@ -13367,8 +13367,10 @@ public class QuadCLTCPU {
 				  IJ.d2s(0.000000001*(System.nanoTime()-this.startSetTime),3)+" sec, --- Free memory="+Runtime.getRuntime().freeMemory()+" (of "+Runtime.getRuntime().totalMemory()+")");
 			  }
 			  this.startSetTime = System.nanoTime();
-			  boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null;
+//			  boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[QUAD][] : null;
-			  double [] scaleExposures = new double[QUAD]; //
+//			  double [] scaleExposures = new double[QUAD]; //
+			  boolean [][] saturation_imp = (clt_parameters.sat_level > 0.0)? new boolean[getNumSensors()][] : null;
+			  double [] scaleExposures = new double[getNumSensors()]; //
 			  ImagePlus [] imp_srcs = conditionImageSetBatch(
 					  nSet,               // final int                           nSet, // index of the 4-image set
 					  clt_parameters,     // final EyesisCorrectionParameters.CLTParameters           clt_parameters,

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -1384,13 +1384,14 @@ public class TwoQuadCLT {
 						port_xy_main_dbg,                     // final double [][][]       port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development)
 						port_xy_aux_dbg);                     // final double [][][]       port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development)
-		int numSensors = GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux
+		int numSensors = quadCLT_main.getNumSensors(); // GPUTileProcessor.NUM_CAMS; // Wrong - different for main and aux
-		String [] sub_titles = new String [numSensors * (GPUTileProcessor.NUM_COLORS+1)];
+		int num_colors_main = quadCLT_main.isMonochrome()?1:3; 
+		String [] sub_titles = new String [numSensors * (num_colors_main+1)];
 		double [][] sub_disparity_map = new double [sub_titles.length][];
 		for (int ncam = 0; ncam < numSensors; ncam++) {
 			sub_disparity_map[ncam] = disparity_map[ncam + ImageDtt.IMG_DIFF0_INDEX];
 			sub_titles[ncam] = ImageDtt.getDisparityTitles(numSensors)[ncam + ImageDtt.IMG_DIFF0_INDEX];
-			for (int ncol = 0; ncol < GPUTileProcessor.NUM_COLORS; ncol++) {
+			for (int ncol = 0; ncol < num_colors_main; ncol++) {
 				sub_disparity_map[ncam + (ncol + 1)* numSensors] =
 						disparity_map[ncam +ncol* numSensors+ ImageDtt.getImgToneRGB(numSensors)];
 				sub_titles[ncam + (ncol + 1)* numSensors] =
@@ -1407,7 +1408,7 @@ public class TwoQuadCLT {
 		// Create list of all correlation pairs
 		double [][][][][][] clt_data = clt_bidata[0];
 		int numTiles = tilesX * tilesY;
-		int numPairs = GPUTileProcessor.NUM_PAIRS;
+		int numPairs = Correlation2d.getNumPairs(quadCLT_main.getNumSensors()); // GPUTileProcessor.NUM_PAIRS;
 		int [] corr_indices = new int [numTiles * numPairs];
 		int indx=0;
 		for (int i = 0; i < numTiles; i++) {
@@ -1430,6 +1431,7 @@ public class TwoQuadCLT {
 		}
 		int [] wh = new int[2];
 		double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
+				quadCLT_main.getNumSensors(),
 	    		tilesX,
 	    		tilesY,
 	    		corr_indices,

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -44,10 +44,13 @@
 extern "C" __global__ void convert_direct( // called with a single block, single thread
 		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+		int                num_cams,           // actual number of cameras
+		int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
 		float           ** gpu_kernel_offsets, // [NUM_CAMS],
 		float           ** gpu_kernels,        // [NUM_CAMS],
 		float           ** gpu_images,         // [NUM_CAMS],
-		struct tp_task   * gpu_tasks,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task   * gpu_tasks,
 		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
@@ -60,15 +63,21 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		int *              pnum_active_tiles,  //  indices to gpu_tasks
 		int                tilesx);
 extern "C" __global__ void correlate2D(
+		int               num_cams,
+//		int *             sel_pairs,
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero,           // here - absolute
-		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
 		int               num_tiles,          // number of tiles in task
 		int               tilesx,             // number of tile rows
 		int             * gpu_corr_indices,   // packed tile+pair
@@ -99,7 +108,9 @@ extern "C" __global__ void corr2D_combine(
 		float           * gpu_corrs_combo);   // combined correlation output (one per tile)
 extern "C" __global__ void textures_nonoverlap(
-		struct tp_task  * gpu_tasks,
+		int               num_cams,           // number of cameras
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats
+		//		struct tp_task  * gpu_tasks,
 		int               num_tiles,          // number of tiles in task list
 //		int               num_tilesx,         // number of tiles in a row
 // declare arrays in device code?
@@ -121,6 +132,7 @@ extern "C" __global__ void textures_nonoverlap(
 extern "C"
 __global__ void imclt_rbg_all(
+		int                num_cams,
 		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
 		int                apply_lpf,
@@ -142,8 +154,10 @@ extern "C" __global__ void imclt_rbg(
 		const size_t      dstride);            // in floats (pixels)
 extern "C" __global__ void generate_RBGA(
+		int                num_cams,           // number of cameras used
 		// Parameters to generate texture tasks
-		struct tp_task   * gpu_tasks,
+		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task list
 		// declare arrays in device code?
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)

--- a/src/main/resources/kernels/dtt8x8.cu
+++ b/src/main/resources/kernels/dtt8x8.cu
@@ -951,6 +951,7 @@ __device__ void dttiv_color_2d(
    dctiv_nodiverg( // all colors
 			clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
 			1); //int inc);
+//	__syncthreads();// worsened
    if (color == BAYER_GREEN){
        dstiv_nodiverg( // all colors
 				clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
@@ -969,6 +970,7 @@ __device__ void dttiv_color_2d(
    dctiv_nodiverg( // all colors
    		clt_tile + threadIdx.x, //  &clt_tile[0][0][threadIdx.x], // pointer to start of column
 			DTT_SIZE1); // int inc,
+//	__syncthreads();// worsened
    if (color == BAYER_GREEN){
          dctiv_nodiverg( // all colors
        		clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
@@ -977,6 +979,50 @@ __device__ void dttiv_color_2d(
  	 __syncthreads();// __syncwarp();
 }
+__device__ void dttiv_mono_2d(
+		float * clt_tile)
+{
+	// Copy 0-> 1
+    dctiv_nodiverg(
+			clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dstiv_nodiverg(
+    		clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dctiv_nodiverg(
+			clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dstiv_nodiverg(
+    		clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+	__syncthreads();// __syncwarp();
+#ifdef DEBUG222
+    if ((threadIdx.x) == 0){
+        printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
+    	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+    }
+     __syncthreads();// __syncwarp();
+#endif
+ 	dctiv_nodiverg( // CC
+ 			clt_tile + threadIdx.x,
+			DTT_SIZE1); // int inc,
+ 	dctiv_nodiverg( // SC
+ 			clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
+			DTT_SIZE1); // int inc,
+ 	dstiv_nodiverg( // CS
+ 			clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
+			DTT_SIZE1); // int inc,
+ 	dstiv_nodiverg( // SS
+ 			clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
+			DTT_SIZE1); // int inc,
+  	 __syncthreads();// __syncwarp();
+}
 //
 // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
 // adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the

--- a/src/main/resources/kernels/dtt8x8.h
+++ b/src/main/resources/kernels/dtt8x8.h
@@ -88,6 +88,8 @@ extern __device__ void dttii_2d(
 extern __device__ void dttiv_color_2d(
 		float * clt_tile,
 		int color);
+extern __device__ void dttiv_mono_2d(
+		float * clt_tile);
 extern __device__ void imclt(
 		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 		float * mclt_tile );

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -41,6 +41,7 @@
 #include "tp_defines.h"
 #endif
 #define NVRTC_BUG 1
 #ifndef M_PI
 #define M_PI  3.14159265358979323846 /* pi */
@@ -60,11 +61,17 @@ struct tp_task {
 		int      txy;
 		unsigned short sxy[2];
 	};
-	float xy[NUM_CAMS][2];
 	float target_disparity;
+	float centerXY[2];          // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
+	                            // if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
+	float xy[NUM_CAMS][2];
 	float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
 };
+#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#define tp_task_xy_offset 5
+#define tp_task_centerXY_offset 3
 struct corr_vector{
 	float tilt    [NUM_CAMS-1]; // 0..2
 	float azimuth [NUM_CAMS-1]; // 3..5
@@ -142,7 +149,10 @@ struct gc {
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
+		int                  num_cams,
+//		struct tp_task     * gpu_tasks,
+		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 		int                  num_tiles,          // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -150,7 +160,10 @@ extern "C" __global__ void get_tiles_offsets(
 		trot_deriv   * gpu_rot_deriv);
 extern "C" __global__ void calculate_tiles_offsets(
-		struct tp_task     * gpu_tasks,
+		int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
+		int                  num_cams,
+		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+//		struct tp_task     * gpu_tasks,
 		int                  num_tiles,          // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
@@ -160,6 +173,7 @@ extern "C" __global__ void calculate_tiles_offsets(
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
+		int                  num_cams,
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv   * gpu_rot_deriv);
@@ -170,3 +184,4 @@ extern "C" __global__ void calcReverseDistortionTable(
 		float * rByRDist);