tested gpu intrascene

fa5947b6 · Andrey Filippov · 2eae496d · fa5947b6 · fa5947b6
Commit fa5947b6 authored Aug 27, 2020 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 364 additions and 56 deletions

GPUTileProcessor.java src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java +298 -51

QuadCLT.java src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java +66 -5

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -100,6 +100,8 @@ public class GPUTileProcessor {
 	static String GPU_CONVERT_DIRECT_NAME =        "convert_direct";      // name in C code
 	static String GPU_IMCLT_ALL_NAME =             "imclt_rbg_all";
 	static String GPU_CORRELATE2D_NAME =           "correlate2D";         // name in C code
+	static String GPU_CORR2D_COMBINE_NAME =        "corr2D_combine";      // name in C code
+	static String GPU_CORR2D_NORMALIZE_NAME =      "corr2D_normalize";    // name in C code
 	static String GPU_TEXTURES_NAME =              "textures_nonoverlap"; // name in C code
 	static String GPU_RBGA_NAME =                  "generate_RBGA";       // name in C code
 	static String GPU_ROT_DERIV =                  "calc_rot_deriv";      // calculate rotation matrices and derivatives
@@ -122,6 +124,8 @@ public class GPUTileProcessor {
 	static int        TILES_PER_BLOCK =           4; // 8 - slower
 	static int        CORR_THREADS_PER_TILE =     8;
 	static int        CORR_TILES_PER_BLOCK	=     4;
+	static int        CORR_TILES_PER_BLOCK_NORMALIZE = 4; // maybe change to 8?	
+	static int        CORR_TILES_PER_BLOCK_COMBINE = 4; // increase to 16?
 	static int        TEXTURE_THREADS_PER_TILE =  8; // 16;
 	static int        TEXTURE_TILES_PER_BLOCK =   1;
 	static int        IMCLT_THREADS_PER_TILE =   16;
@@ -159,6 +163,8 @@ public class GPUTileProcessor {
    private CUfunction GPU_CONVERT_DIRECT_kernel =          null;
    private CUfunction GPU_IMCLT_ALL_kernel =               null;
    private CUfunction GPU_CORRELATE2D_kernel =             null;
+    private CUfunction GPU_CORR2D_COMBINE_kernel =          null;
+    private CUfunction GPU_CORR2D_NORMALIZE_kernel =        null;
    private CUfunction GPU_TEXTURES_kernel =                null;
    private CUfunction GPU_RBGA_kernel =                    null;
    private CUfunction GPU_ROT_DERIV_kernel =               null;
@@ -248,6 +254,8 @@ public class GPUTileProcessor {
        				"#define TILES_PER_BLOCK " +                TILES_PER_BLOCK+"\n"+
        				"#define CORR_THREADS_PER_TILE " +          CORR_THREADS_PER_TILE+"\n"+
        				"#define CORR_TILES_PER_BLOCK " +           CORR_TILES_PER_BLOCK+"\n"+
+        				"#define CORR_TILES_PER_BLOCK_NORMALIZE " + CORR_TILES_PER_BLOCK_NORMALIZE+"\n"+
+        				"#define CORR_TILES_PER_BLOCK_COMBINE " +   CORR_TILES_PER_BLOCK_COMBINE+"\n"+
        				"#define TEXTURE_THREADS_PER_TILE " +       TEXTURE_THREADS_PER_TILE+"\n"+
        				"#define TEXTURE_TILES_PER_BLOCK " +        TEXTURE_TILES_PER_BLOCK+"\n"+
        				"#define IMCLT_THREADS_PER_TILE " +         IMCLT_THREADS_PER_TILE+"\n"+
@@ -267,8 +275,6 @@ public class GPUTileProcessor {
        				"#define RBYRDIST_LEN " +                   RBYRDIST_LEN+"\n"+
        				"#define RBYRDIST_STEP " +                  RBYRDIST_STEP+"\n"+
        				"#define TILES_PER_BLOCK_GEOM " +           TILES_PER_BLOCK_GEOM+"\n";
-
-
    }

    public GPUTileProcessor(
@@ -338,6 +344,8 @@ public class GPUTileProcessor {
        		GPU_CONVERT_DIRECT_NAME,
        		GPU_IMCLT_ALL_NAME,
        		GPU_CORRELATE2D_NAME,
+        		GPU_CORR2D_COMBINE_NAME,
+        		GPU_CORR2D_NORMALIZE_NAME,
        		GPU_TEXTURES_NAME,
        		GPU_RBGA_NAME,
        		GPU_ROT_DERIV,
@@ -351,16 +359,20 @@ public class GPUTileProcessor {
        GPU_CONVERT_DIRECT_kernel =          functions[0];
        GPU_IMCLT_ALL_kernel =               functions[1];
        GPU_CORRELATE2D_kernel =             functions[2];
-        GPU_TEXTURES_kernel=                 functions[3];
-        GPU_RBGA_kernel=                     functions[4];
-        GPU_ROT_DERIV_kernel =               functions[5];
-        GPU_SET_TILES_OFFSETS_kernel =       functions[6];
-        GPU_CALC_REVERSE_DISTORTION_kernel = functions[7];
+        GPU_CORR2D_COMBINE_kernel =          functions[3];
+        GPU_CORR2D_NORMALIZE_kernel =        functions[4];
+        GPU_TEXTURES_kernel=                 functions[5];
+        GPU_RBGA_kernel=                     functions[6];
+        GPU_ROT_DERIV_kernel =               functions[7];
+        GPU_SET_TILES_OFFSETS_kernel =       functions[8];
+        GPU_CALC_REVERSE_DISTORTION_kernel = functions[9];

        System.out.println("GPU kernel functions initialized");
        System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
        System.out.println(GPU_IMCLT_ALL_kernel.toString());
        System.out.println(GPU_CORRELATE2D_kernel.toString());
+        System.out.println(GPU_CORR2D_COMBINE_kernel.toString());
+        System.out.println(GPU_CORR2D_NORMALIZE_kernel.toString());
        System.out.println(GPU_TEXTURES_kernel.toString());
        System.out.println(GPU_RBGA_kernel.toString());
        System.out.println(GPU_ROT_DERIV_kernel.toString());
@@ -372,7 +384,7 @@ public class GPUTileProcessor {
    

    public static String [] getCorrTitles() { // layer titles passed to the 2D correlation viewer
-    	return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other"};
+    	return new String []{"hor-top","hor-bottom","vert-left","vert-right","diag-main","diag-other","quad","cross"}; // 6 pairs, then the combined "quad" and "cross" layers
    }
    public static double [][] getCorr2DView(
    		int tilesX,
@@ -383,28 +395,48 @@ public class GPUTileProcessor {
    	if ((corr2d == null) || (corr2d.length == 0)) {
    		return new double [NUM_PAIRS][0];
    	}
+    	int num_pairs = -1; // corr2d.length;
+		for (int n = 0; n < indices.length; n++) {
+			int np = indices[n] & CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
+			if (np > num_pairs) num_pairs = np;
+		}
+		num_pairs++;
+		if (num_pairs < 1) {
+    		return new double [NUM_PAIRS][0];
+		}
+    	boolean [] bpairs = new boolean[num_pairs];
+		for (int n = 0; n < indices.length; n++) {
+			bpairs[indices[n] & CORR_PAIRS_MASK] = true;
+		}    	
+    	int first_pair = -1;
+    	for (int i = 0; (i < bpairs.length) && (first_pair < 0); i++) {
+    		if (bpairs[i]) first_pair = i; 
+    	}
    	
    	int corr_size = (int)(Math.round(Math.sqrt(corr2d[0].length)));//  make smaller later?
+    	
    	int width =  tilesX * (corr_size + 1) + 1;
    	int height = tilesY * (corr_size + 1) + 1;
-    	double [][] data = new double [NUM_PAIRS][];
-    	data[0] = new double[height*width];
+    	double [][] data = new double [num_pairs][];
+    	data[first_pair] = new double[height*width];
    	for (int ty = 0; ty < tilesY; ty++) {
    		for (int tx = 0; tx < tilesX; tx++) {
    			for (int i = 0; i< corr_size; i++) {
    				for (int j = 0; j < corr_size; j++) {
-    					data[0][(ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1)] = Double.NaN;
+    					data[first_pair][(ty * (corr_size + 1) + i + 1) * width + (tx * (corr_size + 1) + j + 1)] = Double.NaN;
+    				}
    			}
    		}
    	}
+		for (int np = first_pair+1; np < num_pairs; np++) {
+			if (bpairs[np]) {
+				data[np] = data[first_pair].clone();
 			}
-		for (int np = 1; np < NUM_PAIRS; np++) {
-			data[np] = data[0].clone();
 		}
 		for (int n = 0; n < indices.length; n++) {
 			int nt = indices[n] >> CORR_NTILE_SHIFT;
 			int np = indices[n] & CORR_PAIRS_MASK; // ((1 << CORR_NTILE_SHIFT) - 1); // np should
-			assert np < NUM_PAIRS : "invalid correllation pair";
+			assert np < num_pairs : "invalid correllation pair";
 			int tx = nt % tilesX;
 			int ty = nt / tilesX;
 			for (int i = 0; i< corr_size; i++) {
@@ -534,10 +566,15 @@ public class GPUTileProcessor {
        private CUdeviceptr gpu_bayer;
        private CUdeviceptr gpu_tasks;
        private CUdeviceptr gpu_corrs;
+        private CUdeviceptr gpu_corrs_td;
+        private CUdeviceptr gpu_corrs_combo;
+        private CUdeviceptr gpu_corrs_combo_td;
+        
        private CUdeviceptr gpu_textures;
        private CUdeviceptr gpu_clt;
        private CUdeviceptr gpu_4_images;
        private CUdeviceptr gpu_corr_indices;
+        private CUdeviceptr gpu_corr_combo_indices;
        private CUdeviceptr gpu_num_corr_tiles;
        private CUdeviceptr gpu_texture_indices_ovlp;
        private CUdeviceptr gpu_num_texture_ovlp;
@@ -556,12 +593,18 @@ public class GPUTileProcessor {
        private CUdeviceptr gpu_num_active_tiles;
        private int mclt_stride;
        private int corr_stride;
+        private int corr_stride_td;
+        private int corr_stride_combo;
+        private int corr_stride_combo_td;
        private int imclt_stride;
        private int texture_stride;
        private int texture_stride_rgba;
        private int num_task_tiles;
        private int num_corr_tiles;
        private int num_texture_tiles;
+        private int num_pairs = 6; // number of correlation pairs per tile (should match tasks); overwritten by execCorr2D_combine()
+        private int num_corr_combo_tiles;
+        
        private boolean geometry_correction_set = false;
        private boolean geometry_correction_vector_set = false;
    	public GpuQuad(
@@ -598,10 +641,17 @@ public class GPUTileProcessor {
            gpu_bayer =               new CUdeviceptr();
            gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT
            gpu_corrs =               new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.FLOAT
+            
+            gpu_corrs_td =            new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * 4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT
+            gpu_corrs_combo =         new CUdeviceptr(); //  allocate tilesX * tilesY             * CORR_SIZE * Sizeof.FLOAT
+            gpu_corrs_combo_td =      new CUdeviceptr(); //  allocate tilesX * tilesY *             4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT
+            
            gpu_textures =            new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
            gpu_clt =                 new CUdeviceptr();
            gpu_4_images =            new CUdeviceptr();
            gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+            // May add separate gpu_corr_indices_td here
+            gpu_corr_combo_indices =  new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.FLOAT            
            gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
            gpu_texture_indices_ovlp =new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
            gpu_num_texture_ovlp =    new CUdeviceptr(); //  8 ints
@@ -695,6 +745,8 @@ public class GPUTileProcessor {
    //=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
        	// Set corrs array
        	cuMemAlloc(gpu_corr_indices,         tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
+        	cuMemAlloc(gpu_corr_combo_indices,   tilesX * tilesY *             Sizeof.FLOAT);
+        	
        	cuMemAlloc(gpu_num_corr_tiles,                    1 * Sizeof.FLOAT);

        	//#define TILESYA       ((TILESY +3) & (~3))
@@ -716,6 +768,7 @@ public class GPUTileProcessor {
        	cuMemAlloc(gpu_active_tiles,        tilesX * tilesY * Sizeof.FLOAT);
        	cuMemAlloc(gpu_num_active_tiles,                  1 * Sizeof.FLOAT);

+        	// allocate space for pixel-domain correlations (6 per tile)
            cuMemAllocPitch (
            		gpu_corrs,                             // CUdeviceptr dptr,
            		device_stride,                         // long[] pPitch,
@@ -723,6 +776,34 @@ public class GPUTileProcessor {
            		NUM_PAIRS * tilesX * tilesY,           // long Height,
                    Sizeof.FLOAT);                         // int ElementSizeBytes)
            corr_stride = (int)(device_stride[0] / Sizeof.FLOAT);
+            
+        	// allocate space for transform-domain correlations (6 per tile)
+            cuMemAllocPitch (
+            		gpu_corrs_td,                          // CUdeviceptr dptr,
+            		device_stride,                         // long[] pPitch,
+            		4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT,// long WidthInBytes,
+            		NUM_PAIRS * tilesX * tilesY,           // long Height,
+                    Sizeof.FLOAT);                         // int ElementSizeBytes)
+            corr_stride_td = (int)(device_stride[0] / Sizeof.FLOAT);
+            
+        	// allocate space for pixel-domain combined correlations (1 per tile)
+            cuMemAllocPitch (
+            		gpu_corrs_combo,                       // CUdeviceptr dptr,
+            		device_stride,                         // long[] pPitch,
+            		CORR_SIZE * Sizeof.FLOAT,              // long WidthInBytes,
+            		tilesX * tilesY,                       // long Height,
+                    Sizeof.FLOAT);                         // int ElementSizeBytes)
+            corr_stride_combo = (int)(device_stride[0] / Sizeof.FLOAT);
+            
+        	// allocate space for transform-domain combined correlations (1 per tile)
+            cuMemAllocPitch (
+            		gpu_corrs_combo_td,                          // CUdeviceptr dptr,
+            		device_stride,                         // long[] pPitch,
+            		4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT,// long WidthInBytes,
+            		tilesX * tilesY,           // long Height,
+                    Sizeof.FLOAT);                         // int ElementSizeBytes)
+            corr_stride_combo_td = (int)(device_stride[0] / Sizeof.FLOAT);
+            
            int max_texture_size = (num_colors + 1 + (num_cams + num_colors + 1)) * (2 * DTT_SIZE)* (2 * DTT_SIZE);
            cuMemAllocPitch (
            		gpu_textures,                             // CUdeviceptr dptr,
@@ -1556,6 +1637,134 @@ public class GPUTileProcessor {
        	cuCtxSynchronize();
        }

+        /**
+         * Launch the GPU_CORRELATE2D kernel with corr_radius = 0 and fat_zero = 0 so that the per-pair 2D correlations are kept in transform domain (output to gpu_corrs_td), no normalization
+         * @param scales color weights, must have at least one element; at most 3 are used and missing ones are passed as 0. NOTE(review): documented as R,G,B, but the kernel-parameter comments below label scale1 as B and scale2 as G - confirm the order
+         */
+        public void execCorr2D_TD(
+        		double [] scales) {
+        	if (GPU_CORRELATE2D_kernel == null)
+        	{
+        		IJ.showMessage("Error", "No GPU kernel: GPU_CORRELATE2D_kernel");
+        		return; // nothing is launched when the kernel is missing
+        	}
+            int tilesX =  img_width / DTT_SIZE; // number of tiles in a row
+        	int num_colors = scales.length;
+        	if (num_colors > 3) num_colors = 3; // extra weights are ignored
+        	float fscale0 = (float) scales[0];
+        	float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
+        	float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
+    		int [] GridFullWarps =    {1, 1, 1}; // single block ...
+        	int [] ThreadsFullWarps = {1, 1, 1}; // ... of a single thread; presumably the kernel distributes the work itself - confirm in the CUDA source
+        	Pointer kernelParameters = Pointer.to(
+        			Pointer.to(gpu_clt),                         // float          ** gpu_clt,
+        			Pointer.to(new int[] { num_colors }),        // int               colors,             // number of colors (3/1)
+        			Pointer.to(new float[] {fscale0  }),         // float             scale0,             // scale for R
+        			Pointer.to(new float[] {fscale1  }),         // float             scale1,             // scale for B
+        			Pointer.to(new float[] {fscale2  }),         // float             scale2,             // scale for G
+        			Pointer.to(new float[] {(float) 0.0 }),      // float             fat_zero,           // here - absolute
+            		Pointer.to(gpu_tasks),                       // struct tp_task  * gpu_tasks,
+            		Pointer.to(new int[] { num_task_tiles }),    // int               num_tiles           // number of tiles in task
+            		Pointer.to(new int[] { tilesX }),            // int               tilesx,             // number of tiles in a row
+        			Pointer.to(gpu_corr_indices),                // int             * gpu_corr_indices,   // packed tile+pair
+        			Pointer.to(gpu_num_corr_tiles),              // int             * pnum_corr_tiles,    // pointer to a number of tiles to process
+        			Pointer.to(new int[] { corr_stride_td }),    // const size_t      corr_stride,        // in floats
+        			Pointer.to(new int[] { 0 }),  // generate TD // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+        			Pointer.to(gpu_corrs_td)                     // float           * gpu_corrs);         // correlation output data
+        			);
+        	cuCtxSynchronize(); // wait for previously issued GPU work before launching
+        	// Call the kernel function
+        	cuLaunchKernel(GPU_CORRELATE2D_kernel,
+        			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+        			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+        			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+        			kernelParameters, null);   // Kernel- and extra parameters
+        	cuCtxSynchronize(); // block until the kernel has finished
+        }
+
+        /**
+         * Combine intra-scene correlations in transform domain (possible to accumulate more): reads per-pair tiles from gpu_corrs_td and writes one tile per image tile to gpu_corrs_combo_td
+         * @param init_corr - true: init output to 0 before accumulating, false: add to current value
+         * @param num_pairs_in - typically ==6 - number of pairs per tile (tile task should have same number per each tile). Must be non-zero (used as a divisor); it is stored in num_pairs.
+         * This number should match correlations in tasks. Side effect: num_corr_combo_tiles is set to (count read back from gpu_num_corr_tiles) / num_pairs_in
+         * @param pairs_mask - selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+         */
+        public void execCorr2D_combine(
+        		boolean init_corr,    // initialize output tiles (false - add to current)
+        		int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile)
+        		int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+        		) {
+        	if (GPU_CORR2D_COMBINE_kernel == null)
+        	{
+        		IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_COMBINE_kernel");
+        		return; // nothing is launched and no fields are updated when the kernel is missing
+        	}
+        	num_pairs = num_pairs_in;
+        	float [] fnum_corrs = new float[1];
+        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT); // the counter is copied as a float and reinterpreted as int bits below
+        	num_corr_combo_tiles =      Float.floatToIntBits(fnum_corrs[0])/num_pairs; // number of correlation tiles calculated
+        	
+    		int [] GridFullWarps =    {1, 1, 1}; // single block ...
+        	int [] ThreadsFullWarps = {1, 1, 1}; // ... of a single thread
+        	Pointer kernelParameters = Pointer.to(
+            		Pointer.to(new int[] { num_corr_combo_tiles }), // num_task_tiles }), // int   num_tiles           // number of tiles in task
+            		Pointer.to(new int[] { num_pairs }),           // int               num_pairs,          // num pairs per tile (should be the same)
+        			Pointer.to(new int[] { init_corr ? 1 : 0 }),   // int               init_output,        // 1- reset output tiles to zero before accumulating
+            		Pointer.to(new int[] { pairs_mask }),          // int               pairs_mask,         // selected pairs
+        			Pointer.to(gpu_corr_indices),                  // int             * gpu_corr_indices,   // packed tile+pair
+        			Pointer.to(gpu_corr_combo_indices),            // int             * gpu_combo_indices,  // output if not null: packed tile+pairs_mask
+        			Pointer.to(new int[] { corr_stride_td }),      // const size_t      corr_stride,        // (in floats) stride for the input TD correlations
+        			Pointer.to(gpu_corrs_td),                      // float           * gpu_corrs,          // input correlation tiles
+        			Pointer.to(new int[] { corr_stride_combo_td }),// const size_t      corr_stride_combo,  // (in floats) stride for the output TD
+        			Pointer.to(gpu_corrs_combo_td));               // float           * gpu_corrs_combo);   // combined correlation output (one per tile)
+        	cuCtxSynchronize(); // wait for previously issued GPU work before launching
+        	// Call the kernel function
+        	cuLaunchKernel(GPU_CORR2D_COMBINE_kernel,
+        			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+        			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+        			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+        			kernelParameters, null);   // Kernel- and extra parameters
+        	cuCtxSynchronize(); // block until the kernel has finished
+        }
+        /**
+         * Normalize 2D correlations, transform and unfold: input is gpu_corrs_combo_td, output is gpu_corrs_combo. Processes num_corr_combo_tiles tiles, a field set by execCorr2D_combine(), so that method has to run first
+         * @param fat_zero - absolute fat zero - add to correlations before normalization (narrowed to float for the kernel)
+         * @param corr_radius - correlation result size (maximal 7 for 15x15)
+         */
+
+        public void execCorr2D_normalize(
+        		double fat_zero,
+        		int corr_radius) {
+        	if (GPU_CORR2D_NORMALIZE_kernel == null)
+        	{
+        		IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_NORMALIZE_kernel");
+        		return; // nothing is launched when the kernel is missing
+        	}
+//        	float [] fnum_corrs = new float[1];
+//        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT);
+//        	int num_tiles =      Float.floatToIntBits(fnum_corrs[0])/num_pairs; // number of correlation tiles calculated
+        	
+    		int [] GridFullWarps =    {1, 1, 1}; // single block ...
+        	int [] ThreadsFullWarps = {1, 1, 1}; // ... of a single thread
+        	Pointer kernelParameters = Pointer.to(
+            		Pointer.to(new int[] { num_corr_combo_tiles }), // num_task_tiles }), // int   num_corr_tiles,     // number of correlation tiles to process
+        			Pointer.to(new int[] { corr_stride_combo_td }),// const size_t      corr_stride_td,     // in floats
+        			Pointer.to(gpu_corrs_combo_td),                // float           * gpu_corrs_combo);   // combined correlation output (one per tile)
+        			Pointer.to(new int[] { corr_stride_combo }),   // const size_t      corr_stride,        // in floats
+        			Pointer.to(gpu_corrs_combo),                   // float           * gpu_corrs,          // correlation output data (pixel domain)
+        			Pointer.to(new float[] {(float) fat_zero }),   // float             fat_zero,           // here - absolute
+        			Pointer.to(new int[] { corr_radius }));        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+
+        	cuCtxSynchronize(); // wait for previously issued GPU work before launching
+        	// Call the kernel function
+        	cuLaunchKernel(GPU_CORR2D_NORMALIZE_kernel,
+        			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+        			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+        			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+        			kernelParameters, null);   // Kernel- and extra parameters
+        	cuCtxSynchronize(); // block until the kernel has finished
+        }
+        
 /**
 * Generate combined (overlapping) texture
 * @param color_weights - [3] (RGB) or [1] (mono) color weights for matching
@@ -1734,6 +1943,20 @@ public class GPUTileProcessor {

        }

+        public int [] getCorrComboIndices() { // read back num_corr_combo_tiles entries of gpu_corr_combo_indices as ints
+//        	float [] fnum_corrs = new float[1];
+//        	cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles,  1 * Sizeof.FLOAT);
+//        	int num_corrs =      Float.floatToIntBits(fnum_corrs[0]);
+        	float [] fcorr_combo_indices = new float [num_corr_combo_tiles]; // length comes from the last execCorr2D_combine() call
+        	cuMemcpyDtoH(Pointer.to(fcorr_combo_indices), gpu_corr_combo_indices,  num_corr_combo_tiles * Sizeof.FLOAT);
+        	int [] corr_combo_indices = new int [num_corr_combo_tiles];
+        	for (int i = 0; i < num_corr_combo_tiles; i++) {
+        		corr_combo_indices[i] = Float.floatToIntBits(fcorr_combo_indices[i]); // reinterpret the copied bits as int, no numeric conversion
+        	}
+        	return corr_combo_indices; // presumably packed tile+pairs_mask, as the kernel-parameter comment for gpu_combo_indices states - confirm
+
+        }
+
        public float [][] getCorr2D(int corr_rad){
        	int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
        	float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
@@ -1758,6 +1981,30 @@ public class GPUTileProcessor {
        	return corrs;
        }

+        public float [][] getCorr2DCombo(int corr_rad){ // copy combined correlations from gpu_corrs_combo; result is [num_corr_combo_tiles][(2*corr_rad+1)^2]
+        	int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1); // floats per correlation tile
+        	float [] cpu_corrs = new float [ num_corr_combo_tiles * corr_size]; // flat host buffer, rows packed without padding
+        	CUDA_MEMCPY2D copyD2H =   new CUDA_MEMCPY2D();
+        	copyD2H.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
+        	copyD2H.srcDevice =       gpu_corrs_combo;
+        	copyD2H.srcPitch =        corr_stride_combo * Sizeof.FLOAT; // device row pitch in bytes (stride is kept in floats)
+
+        	copyD2H.dstMemoryType =   CUmemorytype.CU_MEMORYTYPE_HOST;
+        	copyD2H.dstHost =         Pointer.to(cpu_corrs);
+        	copyD2H.dstPitch =        corr_size * Sizeof.FLOAT; // host rows are contiguous
+
+        	copyD2H.WidthInBytes =    corr_size * Sizeof.FLOAT;
+        	copyD2H.Height =          num_corr_combo_tiles; // one row per combined tile
+
+        	cuMemcpy2D(copyD2H); // run copy
+
+        	float [][] corrs = new float [num_corr_combo_tiles][ corr_size];
+        	for (int ncorr = 0; ncorr < num_corr_combo_tiles; ncorr++) { // split the flat buffer into one array per tile
+        		System.arraycopy(cpu_corrs, ncorr*corr_size, corrs[ncorr], 0, corr_size);
+        	}
+        	return corrs;
+        }
+        

 //	        
 /**

--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -657,6 +657,30 @@ public class QuadCLT extends QuadCLTCPU {
 	    		clt_parameters.gpu_corr_rad); // int corr_radius

 		long endCorr2d = System.nanoTime();
+
+		
+		// Should be done before execCorr2D_TD as corr_indices are shared to save memory (execCorr2D_TD passes the same gpu_corr_indices buffer to the kernel)
+		int [] corr_indices = quadCLT_main.getGPU().getCorrIndices();
+		// the following is not yet shared
+		float [][] corr2D = quadCLT_main.getGPU().getCorr2D(
+				clt_parameters.gpu_corr_rad); //  int corr_rad);
+		
+		
+		
+// calculate correlations, keep TD
+		quadCLT_main.getGPU().execCorr2D_TD(
+	    		scales);// double [] scales,
+		
+		quadCLT_main.getGPU().execCorr2D_combine( // calculate quad pairs (mask 0x0f)
+		        true, // boolean init_corr,    // initialize output tiles (false - add to current)
+		        6,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
+		        0x0f); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+		
+		quadCLT_main.getGPU().execCorr2D_normalize(
+	    		fat_zero, // double fat_zero);
+	    		clt_parameters.gpu_corr_rad); // int corr_radius
+		
+		
 // run textures
 		long startTextures = System.nanoTime();   // System.nanoTime();
 		boolean   calc_textures = clt_parameters.gpu_show_jtextures; //  true;
@@ -770,16 +794,52 @@ public class QuadCLT extends QuadCLTCPU {
 		//Show 2D correlations
 		int [] wh = new int[2];
 		if (clt_parameters.show_corr) {
-			int [] corr_indices = quadCLT_main.getGPU().getCorrIndices();
-			float [][] corr2D = quadCLT_main.getGPU().getCorr2D(
-					clt_parameters.gpu_corr_rad); //  int corr_rad);
-			// convert to 6-layer image		 using tasks
-			double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
+			int [] corr_quad_indices = quadCLT_main.getGPU().getCorrComboIndices(); // get quad
+			float [][] corr2D_quad = quadCLT_main.getGPU().getCorr2DCombo(clt_parameters.gpu_corr_rad);
+// calculate and get cross here! The quad results above must be read first: the calls below write to the same combo buffers
+			quadCLT_main.getGPU().execCorr2D_combine( // calculate cross pairs
+			        true, // boolean init_corr,    // initialize output tiles (false - add to current)
+			        6,    // int     num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
+			        0x30); // int     pairs_mask    // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
+			
+			quadCLT_main.getGPU().execCorr2D_normalize(
+		    		fat_zero, // double fat_zero);
+		    		clt_parameters.gpu_corr_rad); // int corr_radius
+			
+			int [] corr_cross_indices = quadCLT_main.getGPU().getCorrComboIndices(); // get cross
+			float [][] corr2D_cross = quadCLT_main.getGPU().getCorr2DCombo(clt_parameters.gpu_corr_rad);
+			
+			double [][] dbg_corr_pairs = GPUTileProcessor.getCorr2DView(
 					tilesX,
 					tilesY,
 					corr_indices,
 					corr2D,
 					wh);
+			double [][] dbg_corr_quad = GPUTileProcessor.getCorr2DView(
+					tilesX,
+					tilesY,
+					corr_quad_indices,
+					corr2D_quad,
+					wh);
+
+			double [][] dbg_corr_cross = GPUTileProcessor.getCorr2DView(
+					tilesX,
+					tilesY,
+					corr_cross_indices,
+					corr2D_cross,
+					wh);
+			
+			double [][] dbg_corr = { // 8 layers in the order of GPUTileProcessor.getCorrTitles()
+					dbg_corr_pairs[0],
+					dbg_corr_pairs[1],
+					dbg_corr_pairs[2],
+					dbg_corr_pairs[3],
+					dbg_corr_pairs[4],
+					dbg_corr_pairs[5],
+					dbg_corr_quad[15], // 15 == pairs_mask 0x0f: getCorr2DView() uses (index & CORR_PAIRS_MASK) as the layer number
+					dbg_corr_cross[48] // 48 == pairs_mask 0x30 (cross)
+			};
+
 			(new ShowDoubleFloatArrays()).showArrays(
 					dbg_corr,
 					wh[0],
@@ -788,6 +848,7 @@ public class QuadCLT extends QuadCLTCPU {
 					name+"-CORR2D-D"+clt_parameters.disparity,
 					GPUTileProcessor.getCorrTitles());
 		}
+		
 // convert to overlapping and show
 		if (clt_parameters.gen_chn_img) { // save and show 4-slice image
 			// combine to a sliced color image