changing corr2D to CDP

4fb94627 · Andrey Filippov · 20df596a · 4fb94627 · 4fb94627 · 4fb94627
Commit 4fb94627 authored Apr 16, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -190,6 +190,7 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_clt =                 new CUdeviceptr();
    private CUdeviceptr gpu_4_images =            new CUdeviceptr();
    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
+    private CUdeviceptr gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate 1 * Sizeof.FLOAT (4 bytes): number of correlation tiles, passed to the kernel as int * pnum_corr_tiles
    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
@@ -575,8 +576,8 @@ public class GPUTileProcessor {
    	cuMemAlloc(gpu_tasks,      tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
 //=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
    	// Set corrs array
-///    	cuMemAlloc(gpu_corrs,       tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
-    	cuMemAlloc(gpu_corr_indices,   tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
+    	cuMemAlloc(gpu_corr_indices,   tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_num_corr_tiles,                    1 * Sizeof.FLOAT);

    	//#define TILESYA       ((TILESY +3) & (~3))
    	int tilesYa = (tilesY + 3) & ~3;
@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
    	cuCtxSynchronize(); // remove later
    }

-    public void execConverDirect() {
+    public void execConvertDirect() {
        if (GPU_CONVERT_DIRECT_kernel == null)
        {
            IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
    	float fscale0 = (float) scales[0];
    	float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
    	float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
-		int [] GridFullWarps =    {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
-    	int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
+//		int [] GridFullWarps =    {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
+//    	int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
+		int [] GridFullWarps =    {1, 1, 1};
+    	int [] ThreadsFullWarps = {1, 1, 1};
    	Pointer kernelParameters = Pointer.to(
-    			Pointer.to(gpu_clt),
-    			Pointer.to(new int[] { num_colors }),
-    			Pointer.to(new float[] {fscale0  }),
-    			Pointer.to(new float[] {fscale1  }),
-    			Pointer.to(new float[] {fscale2  }),
-    			Pointer.to(new float[] {(float) fat_zero }),
-    			Pointer.to(new int[] { num_corr_tiles }), // lpf_mask
-    			Pointer.to(gpu_corr_indices),
-    			Pointer.to(new int[] { corr_stride }),
-    			Pointer.to(new int[] { corr_radius }),
-    			Pointer.to(gpu_corrs) // lpf_mask
+    			Pointer.to(gpu_clt),                        // float          ** gpu_clt,
+    			Pointer.to(new int[] { num_colors }),       // int               colors,             // number of colors (3/1)
+    			Pointer.to(new float[] {fscale0  }),        // float             scale0,             // scale for R
+    			Pointer.to(new float[] {fscale1  }),        // float             scale1,             // scale for B
+    			Pointer.to(new float[] {fscale2  }),        // float             scale2,             // scale for G
+    			Pointer.to(new float[] {(float) fat_zero }),// float             fat_zero,           // here - absolute
+        		Pointer.to(gpu_tasks),                      // struct tp_task  * gpu_tasks,
+        		Pointer.to(new int[] { num_task_tiles }),   // int               num_tiles           // number of tiles in task
+    			Pointer.to(gpu_corr_indices),               // int             * gpu_corr_indices,   // packed tile+pair
+    			Pointer.to(gpu_num_corr_tiles),             // int             * pnum_corr_tiles,    // pointer to a number of tiles to process
+    			Pointer.to(new int[] { corr_stride }),      // const size_t      corr_stride,        // in floats
+    			Pointer.to(new int[] { corr_radius }),      // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+    			Pointer.to(gpu_corrs)                       // float           * gpu_corrs);         // correlation output data
    			);
    	cuCtxSynchronize();
    	// Call the kernel function
@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
        }
        return corrs;
    }
+    /**
+     * Read the correlation index list back from the GPU; also stores the number of entries in num_corr_tiles.
+     * @return packed tile+pair correlation indices, one int per correlation tile
+     */
+    public int [] getCorrIndices() {
+    	// Both device buffers hold 32-bit integers (int * in the kernel signature), so copy them as ints.
+    	// Going through float[] and Float.floatToIntBits() would collapse any NaN bit pattern to 0x7fc00000.
+    	int [] num_corrs = new int [1];
+    	cuMemcpyDtoH(Pointer.to(num_corrs), gpu_num_corr_tiles, Sizeof.INT);
+    	int [] corr_indices = new int [num_corrs[0]];
+    	cuMemcpyDtoH(Pointer.to(corr_indices), gpu_corr_indices, num_corrs[0] * Sizeof.INT);
+    	num_corr_tiles = num_corrs[0];
+    	return corr_indices;
+    }

    /**
     * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
 				use_aux); // boolean use_aux)


-		int [] corr_indices = gPUTileProcessor.getCorrTasks(
-				tp_tasks);
+//		int [] corr_indices = gPUTileProcessor.getCorrTasks(
+//				tp_tasks);
 		// corr_indices array of integers to be passed to GPU
-		gPUTileProcessor.setCorrIndices(corr_indices);
+//		gPUTileProcessor.setCorrIndices(corr_indices);

 		int [] texture_indices = gPUTileProcessor.getTextureTasks(
 				tp_tasks);
@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
 		long startDirectConvert=System.nanoTime();

 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execConverDirect();
+			gPUTileProcessor.execConvertDirect();
 		}

 // run imclt;
@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
 		int tilesY =  GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
 		int [] wh = new int[2];
 		if (clt_parameters.show_corr) {
+			int [] corr_indices = gPUTileProcessor.getCorrIndices();
 			float [][] corr2D = gPUTileProcessor.getCorr2D(
 					clt_parameters.gpu_corr_rad); //  int corr_rad);
 			// convert to 6-layer image		 using tasks

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -31,24 +31,19 @@
 */

 /**
-**************************************************************************
-* \file TileProcessor.h
-* \brief header file for  the Tile Processor for frequency domain
+ **************************************************************************
+ * \file TileProcessor.h
+ * \brief header file for  the Tile Processor for frequency domain

-*/
+ */
 #pragma once
 #ifndef NUM_CAMS
 #include "tp_defines.h"
 #endif

-extern "C" __global__ void index_direct(
-		struct tp_task   * gpu_tasks,
-		int                num_tiles,          // number of tiles in task
-		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
-		int *              num_active_tiles);  //  indices to gpu_tasks  // should be initialized to zero

-extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+extern "C" __global__ void convert_direct( // called with a single block, single thread
+		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 		float           ** gpu_kernel_offsets, // [NUM_CAMS],
 		float           ** gpu_kernels,        // [NUM_CAMS],
 		float           ** gpu_images,         // [NUM_CAMS],
@@ -64,49 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
 		int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
 		int *              pnum_active_tiles);  //  indices to gpu_tasks

-extern "C" __global__ void convert_correct_tiles(
-			float           ** gpu_kernel_offsets, // [NUM_CAMS],
-			float           ** gpu_kernels,        // [NUM_CAMS],
-			float           ** gpu_images,         // [NUM_CAMS],
-			struct tp_task   * gpu_tasks,
-			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
-			int                num_active_tiles,   // number of tiles in task
-			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-			size_t             dstride,            // in floats (pixels)
-//			int                num_tiles,          // number of tiles in task
-			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
-			int                woi_width,
-			int                woi_height,
-			int                kernels_hor,
-			int                kernels_vert);
-
+extern "C" __global__ void correlate2D( // host launches it with a single block, single thread (1x1x1 grid and block)
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float             fat_zero,           // here - absolute
+		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
+		int               num_tiles,          // number of tiles in task
+		int             * gpu_corr_indices,   // packed tile+pair; the host reads this list back after the call
+		int             * pnum_corr_tiles,    // pointer to the number of correlation tiles to process; read back by the host
+		const size_t      corr_stride,        // in floats
+		int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		float           * gpu_corrs);          // correlation output data

-extern "C" __global__ void clear_texture_list(
-		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		int                width,  // <= TILESX, use for faster processing of LWIR images
-		int                height); // <= TILESY, use for faster processing of LWIR images

-extern "C" __global__ void mark_texture_tiles(
-		struct tp_task   * gpu_tasks,
-		int                num_tiles,            // number of tiles in task list
-		int              * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
-extern "C" __global__ void mark_texture_neighbor_tiles(
-		struct tp_task   * gpu_tasks,
-		int                num_tiles,           // number of tiles in task list
-		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-		int              * woi);                // x,y,width,height of the woi
-extern "C" __global__ void gen_texture_list(
-		struct tp_task   * gpu_tasks,
-		int                num_tiles,           // number of tiles in task list
-		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
-		int              * num_texture_tiles,  // number of texture tiles to process
-		int              * woi);                // x,y,width,height of the woi
-
-extern "C" __global__ void clear_texture_rbga(
-		int               texture_width,
-		int               texture_slice_height,
-		const size_t      texture_rbga_stride,     // in floats 8*stride
-		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 extern "C" __global__ void textures_accumulate(
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -126,7 +94,7 @@ extern "C" __global__ void textures_accumulate(
 		float             weight2,            // scale for G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
-// combining both non-overlap and overlap (each calculated if pointer is not null )
+		// combining both non-overlap and overlap (each calculated if pointer is not null )
 		size_t            texture_rbg_stride, // in floats
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
@@ -154,18 +122,17 @@ extern "C" __global__ void imclt_rbg(
 		int               woi_theight,
 		const size_t      dstride);            // in floats (pixels)

-extern "C"
-__global__ void generate_RBGA(
-// Parameters to generate texture tasks
+extern "C" __global__ void generate_RBGA(
+		// Parameters to generate texture tasks
 		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task list
-// declare arrays in device code?
+		// declare arrays in device code?
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
 		int              * woi,                // x,y,width,height of the woi
 		int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
 		int                height, // <= TILESY, use for faster processing of LWIR images
-// Parameters for the texture generation
+		// Parameters for the texture generation
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate