moved LPF to 4-image generation to match correlations and textures

94aa6562 · Andrey Filippov · 3d5ddc28 · 94aa6562 · 94aa6562 · 94aa6562
Commit 94aa6562 authored Apr 08, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -979,12 +979,15 @@ public class GPUTileProcessor {
    	cuCtxSynchronize(); // remove later
    }
-    public void execImcltRbg() {
+    public void execImcltRbg(
+    		boolean is_mono
+    		) {
    	if (GPU_IMCLT_RBG_kernel == null)
    	{
    		IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
    		return;
    	}
+    	int apply_lpf =  1;
    	int tilesX =  IMG_WIDTH / DTT_SIZE;
    	int tilesY =  IMG_HEIGHT / DTT_SIZE;
    	int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
@@ -999,6 +1002,8 @@ public class GPUTileProcessor {
 						Pointer kernelParameters = Pointer.to(
 								Pointer.to(gpu_clt_h[ncam]),
 								Pointer.to(gpu_corr_images_h[ncam]),
+								Pointer.to(new int[] { apply_lpf }),
+								Pointer.to(new int[] { is_mono ? 1 : 0 }),
 								Pointer.to(new int[] { color }),
 								Pointer.to(new int[] { v_offs }),
 								Pointer.to(new int[] { h_offs }),

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2088,7 +2088,7 @@ public class TwoQuadCLT {
 // run imclt;
 		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execImcltRbg();
+			gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
 		}
 		long endImcltTime = System.nanoTime();
 // run correlation
@@ -2175,22 +2175,22 @@ public class TwoQuadCLT {
 		int tilesY =  GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
 		int [] wh = new int[2];
 		if (clt_parameters.show_corr) {
-		float [][] corr2D = gPUTileProcessor.getCorr2D(
+			float [][] corr2D = gPUTileProcessor.getCorr2D(
-				clt_parameters.gpu_corr_rad); //  int corr_rad);
+					clt_parameters.gpu_corr_rad); //  int corr_rad);
-// convert to 6-layer image		 using tasks
+			// convert to 6-layer image		 using tasks
-		double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
+			double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
-	    		tilesX,
+					tilesX,
-	    		tilesY,
+					tilesY,
-	    		corr_indices,
+					corr_indices,
-	    		corr2D,
+					corr2D,
-	    		wh);
+					wh);
-		(new ShowDoubleFloatArrays()).showArrays(
+			(new ShowDoubleFloatArrays()).showArrays(
-				dbg_corr,
+					dbg_corr,
-				wh[0],
+					wh[0],
-				wh[1],
+					wh[1],
-				true,
+					true,
-				"CORR2D",
+					"CORR2D",
-				GPUTileProcessor.getCorrTitles());
+					GPUTileProcessor.getCorrTitles());
 		}
 // convert to overlapping and show
 		if (clt_parameters.gen_chn_img) {

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -105,7 +105,9 @@
 #define DBG_TILE_X     161 // 49
 #define DBG_TILE_Y     111 // 66
-#define DBG_TILE     (DBG_TILE_Y * 324 + DBG_TILE_X)
+#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
+#undef DBG_MARK_DBG_TILE 1
 //56494
 // struct tp_task
 //#define TASK_SIZE      12
@@ -879,6 +881,18 @@ extern "C" __global__ void textures_accumulate(
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C" // forward declaration; the definition follows textures_accumulate() further down this file
+__global__ void imclt_rbg(
+		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
+		int               apply_lpf,          // non-zero: each copied CLT value is multiplied by the matching lpf_data coefficient
+		int               mono,               // defines lpf filter: non-zero selects lpf_data[3], otherwise lpf_data[color]
+		int               color,              // defines location of clt data
+		int               v_offset,           // NOTE(review): presumably vertical offset of the processed tiles - use not visible here, confirm
+		int               h_offset,           // NOTE(review): presumably horizontal offset of the processed tiles - use not visible here, confirm
+		const size_t      dstride);            // in floats (pixels)
+//===========================
 extern "C"
 __global__ void correlate2D(
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -2274,16 +2288,12 @@ __global__ void textures_accumulate(
 } // textures_accumulate()
 extern "C"
 __global__ void imclt_rbg(
 		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
+		int               apply_lpf,
+		int               mono,
 		int               color,
 		int               v_offset,
 		int               h_offset,
@@ -2334,13 +2344,27 @@ __global__ void imclt_rbg(
    clt_tile += column + thr3; // first 2 rows
    gpu_tile += column;  // first 2 rows
+    if (apply_lpf) {
+    	// lpf - covers 2 rows, as there are 16 threads
+		float *lpf0 = lpf_data[mono? 3 :color] + threadIdx.x; // lpf_data[3] - mono
+#pragma unroll
+		for (int q = 0; q < 4; q++){
+			float *lpf = lpf0;
+			for (int i = 0; i < DTT_SIZE/2; i++){
+				*clt_tile= *gpu_tile * (*lpf);
+				clt_tile += (2 * DTT_SIZE1);
+				gpu_tile += (2 * DTT_SIZE);
+				lpf +=      (2 * DTT_SIZE);
+			}
+		}
+    } else {
 #pragma unroll
-    for (int i = 0; i < DTT_SIZE2; i++){
+    	for (int i = 0; i < DTT_SIZE2; i++){
-    	*clt_tile= *gpu_tile;
+    		*clt_tile= *gpu_tile;
-    	clt_tile += (2 * DTT_SIZE1);
+    		clt_tile += (2 * DTT_SIZE1);
-    	gpu_tile += (2 * DTT_SIZE);
+    		gpu_tile += (2 * DTT_SIZE);
+    	}
    }
 	float * mclt_top = ((float*) mclt_tiles) +  tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column;
 	float * rbg_top = color_plane + (tileY * DTT_SIZE)* dstride + (tileX * DTT_SIZE) + column;
 	float * mclt_tile = mclt_top;
@@ -2377,31 +2401,33 @@ __global__ void imclt_rbg(
-//	save result (back)
+    //	save result (back)
-	float * rbg_p = rbg_top;
+    float * rbg_p = rbg_top;
-	mclt_tile =     mclt_top;
+    mclt_tile =     mclt_top;
-	if ((tileX == 0)  && (tileY == 0)){
+    if ((tileX == 0)  && (tileY == 0)){
 #pragma unroll
-		for (int i = 0; i < DTT_SIZE2; i++){
+    	for (int i = 0; i < DTT_SIZE2; i++){
-			*rbg_p = 100.0f; // just testing
+    		*rbg_p = 100.0f; // just testing
-			mclt_tile += DTT_SIZE21;
+    		mclt_tile += DTT_SIZE21;
-			rbg_p +=     dstride; // DTT_SIZE2; // FIXME
+    		rbg_p +=     dstride; // DTT_SIZE2; // FIXME
-		}
+    	}
-	} else if ((tileX == DBG_TILE_X)  && (tileY == DBG_TILE_Y)){
+#ifdef DBG_MARK_DBG_TILE
+    } else if ((tileX == DBG_TILE_X)  && (tileY == DBG_TILE_Y)){
 #pragma unroll
-		for (int i = 0; i < DTT_SIZE2; i++){
+    	for (int i = 0; i < DTT_SIZE2; i++){
-			*rbg_p = (*mclt_tile) * 2.0; // just testing
+    		*rbg_p = (*mclt_tile) * 2.0; // just testing
-			mclt_tile += DTT_SIZE21;
+    		mclt_tile += DTT_SIZE21;
-			rbg_p +=     dstride; // DTT_SIZE2; // FIXME
+    		rbg_p +=     dstride; // DTT_SIZE2; // FIXME
-		}
+    	}
-	} else {
+#endif
+    } else {
 #pragma unroll
-		for (int i = 0; i < DTT_SIZE2; i++){
+    	for (int i = 0; i < DTT_SIZE2; i++){
-			*rbg_p = *mclt_tile;
+    		*rbg_p = *mclt_tile;
-			mclt_tile += DTT_SIZE21;
+    		mclt_tile += DTT_SIZE21;
-			rbg_p +=     dstride; // DTT_SIZE2; // FIXME
+    		rbg_p +=     dstride; // DTT_SIZE2; // FIXME
-		}
+    	}
-	}
+    }
 }
@@ -3184,15 +3210,15 @@ __device__ void convertCorrectTile(
    			 lpf   += DTT_SIZE;
    		 }
    	 }
-         __syncthreads();// __syncwarp();
+    	 __syncthreads();// __syncwarp();
 #ifdef DBG_TILE
 #ifdef DEBUG3
-         if ((threadIdx.x) == 0){
+    	 if ((threadIdx.x) == 0){
-        	 printf("\nDTT Tiles after LPF, color = %d\n",color);
+    		 printf("\nDTT Tiles after LPF, color = %d\n",color);
-        	 debug_print_clt1(clt_tile, color,  0xf); // only 1 quadrant for R,B and 2 - for G
+    		 debug_print_clt1(clt_tile, color,  0xf); // only 1 quadrant for R,B and 2 - for G
-        	 printf("\nDTT All done\n");
+    		 printf("\nDTT All done\n");
-         }
+    	 }
-     __syncthreads();// __syncwarp();
+    	 __syncthreads();// __syncwarp();
 #endif
 #endif
     }

--- a/src/main/resources/kernels/test_tp.cu
+++ b/src/main/resources/kernels/test_tp.cu