moved LPF to 4-image generation to match correlations and textures

94aa6562 · Andrey Filippov · 3d5ddc28 · 94aa6562 · 94aa6562 · 94aa6562
Commit 94aa6562 authored Apr 08, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -979,12 +979,15 @@ public class GPUTileProcessor {
    	cuCtxSynchronize(); // remove later
    }
-    public void execImcltRbg() {
+    public void execImcltRbg(
+    		boolean is_mono
+    		) {
    	if (GPU_IMCLT_RBG_kernel == null)
    	{
    		IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
    		return;
    	}
+    	int apply_lpf =  1;
    	int tilesX =  IMG_WIDTH / DTT_SIZE;
    	int tilesY =  IMG_HEIGHT / DTT_SIZE;
    	int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
@@ -999,6 +1002,8 @@ public class GPUTileProcessor {
 						Pointer kernelParameters = Pointer.to(
 								Pointer.to(gpu_clt_h[ncam]),
 								Pointer.to(gpu_corr_images_h[ncam]),
+								Pointer.to(new int[] { apply_lpf }),
+								Pointer.to(new int[] { is_mono ? 1 : 0 }),
 								Pointer.to(new int[] { color }),
 								Pointer.to(new int[] { v_offs }),
 								Pointer.to(new int[] { h_offs }),

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2088,7 +2088,7 @@ public class TwoQuadCLT {
 // run imclt;
 		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execImcltRbg();
+			gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
 		}
 		long endImcltTime = System.nanoTime();
 // run correlation
@@ -2177,7 +2177,7 @@ public class TwoQuadCLT {
 		if (clt_parameters.show_corr) {
 			float [][] corr2D = gPUTileProcessor.getCorr2D(
 					clt_parameters.gpu_corr_rad); //  int corr_rad);
-// convert to 6-layer image		 using tasks
+			// convert to 6-layer image		 using tasks
 			double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
 					tilesX,
 					tilesY,

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -106,6 +106,8 @@
 #define DBG_TILE_Y     111 // 66
 #define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
+#undef DBG_MARK_DBG_TILE // while undefined, the DBG_TILE_X/DBG_TILE_Y special-case branch in imclt_rbg is compiled out; #define to enable
 //56494
 // struct tp_task
 //#define TASK_SIZE      12
@@ -879,6 +881,18 @@ extern "C" __global__ void textures_accumulate(
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C"
+__global__ void imclt_rbg(
+		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
+		int               apply_lpf,          // non-zero: multiply each CLT value by its lpf_data coefficient while copying the tile
+		int               mono,               // defines lpf filter: non-zero selects lpf_data[3], otherwise lpf_data[color]
+		int               color,              // defines location of clt data
+		int               v_offset,
+		int               h_offset,
+		const size_t      dstride);            // in floats (pixels)
+//===========================
 extern "C"
 __global__ void correlate2D(
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -2274,16 +2288,12 @@ __global__ void textures_accumulate(
 } // textures_accumulate()
 extern "C"
 __global__ void imclt_rbg(
 		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
+		int               apply_lpf,
+		int               mono,
 		int               color,
 		int               v_offset,
 		int               h_offset,
@@ -2334,13 +2344,27 @@ __global__ void imclt_rbg(
    clt_tile += column + thr3; // first 2 rows
    gpu_tile += column;  // first 2 rows
+    if (apply_lpf) {
+    	// lpf - covers 2 rows, as there are 16 threads
+		float *lpf0 = lpf_data[mono? 3 :color] + threadIdx.x; // lpf_data[3] - mono
+#pragma unroll
+		for (int q = 0; q < 4; q++){
+			float *lpf = lpf0;
+			for (int i = 0; i < DTT_SIZE/2; i++){
+				*clt_tile= *gpu_tile * (*lpf);
+				clt_tile += (2 * DTT_SIZE1);
+				gpu_tile += (2 * DTT_SIZE);
+				lpf +=      (2 * DTT_SIZE);
+			}
+		}
+    } else {
 #pragma unroll
    	for (int i = 0; i < DTT_SIZE2; i++){
    		*clt_tile= *gpu_tile;
    		clt_tile += (2 * DTT_SIZE1);
    		gpu_tile += (2 * DTT_SIZE);
    	}
+    }
 	float * mclt_top = ((float*) mclt_tiles) +  tile_in_block * (DTT_SIZE2 * DTT_SIZE21) + column;
 	float * rbg_top = color_plane + (tileY * DTT_SIZE)* dstride + (tileX * DTT_SIZE) + column;
 	float * mclt_tile = mclt_top;
@@ -2377,7 +2401,7 @@ __global__ void imclt_rbg(
-//	save result (back)
+    //	save result (back)
    float * rbg_p = rbg_top;
    mclt_tile =     mclt_top;
    if ((tileX == 0)  && (tileY == 0)){
@@ -2387,6 +2411,7 @@ __global__ void imclt_rbg(
    		mclt_tile += DTT_SIZE21;
    		rbg_p +=     dstride; // DTT_SIZE2; // FIXME
    	}
+#ifdef DBG_MARK_DBG_TILE
    } else if ((tileX == DBG_TILE_X)  && (tileY == DBG_TILE_Y)){
 #pragma unroll
    	for (int i = 0; i < DTT_SIZE2; i++){
@@ -2394,6 +2419,7 @@ __global__ void imclt_rbg(
    		mclt_tile += DTT_SIZE21;
    		rbg_p +=     dstride; // DTT_SIZE2; // FIXME
    	}
+#endif
    } else {
 #pragma unroll
    	for (int i = 0; i < DTT_SIZE2; i++){

--- a/src/main/resources/kernels/test_tp.cu
+++ b/src/main/resources/kernels/test_tp.cu