implemented (not tested) corr2D_normalize()

47b1fb86 · Andrey Filippov · 75fa734d · 47b1fb86 · 47b1fb86 · 47b1fb86
Commit 47b1fb86 authored Aug 26, 2020 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 248 additions and 45 deletions

TileProcessor.cuh src/TileProcessor.cuh +203 -9

TileProcessor.h src/TileProcessor.h +8 -0

tp_defines.h src/tp_defines.h +37 -36

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -910,6 +910,15 @@ extern "C" __global__ void correlate2D_inner(
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data

+extern "C" __global__ void corr2D_normalize_inner(
+		int               num_corr_tiles,     // number of correlation tiles to process
+		const size_t      corr_stride_td,     // (in floats) stride for the input TD correlations
+		float           * gpu_corrs_td,       // correlation tiles in transform domain
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+		float             fat_zero,           // here - absolute
+		int               corr_radius);        // radius of the output correlation (7 for 15x15)
+
 extern "C" __global__ void textures_accumulate(
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
@@ -1033,7 +1042,7 @@ extern "C" __global__ void correlate2D_inner(
 {
 	float scales[3] = {scale0, scale1, scale2};
 	int corr_in_block = threadIdx.y;
-	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
+	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
 	if (corr_num >= num_corr_tiles){
 		return; // nothing to do
 	}
@@ -1245,23 +1254,208 @@ extern "C" __global__ void correlate2D_inner(
 #endif
 #endif
    } else { //     if (corr_radius > 0) { transform domain output
-//    	int corr_tile_offset =  + corr_stride * corr_num;
+    	//    	int corr_tile_offset =  + corr_stride * corr_num;
    	float *mem_corr = gpu_corrs + corr_stride * corr_num + threadIdx.x;
    	float *clt = clt_corr + threadIdx.x;
 #pragma unroll
-    	for (int q = 0; q < 4; q++){
-#pragma unroll
-    		for (int i = 0; i < DTT_SIZE; i++){
-    			(*mem_corr) = (*clt);
-    			clt        += DTT_SIZE1;
-    			mem_corr   += DTT_SIZE;
-    		}
+    	for (int i = 0; i < DTT_SIZE4; i++){
+    		(*mem_corr) = (*clt);
+    		clt        += DTT_SIZE1;
+    		mem_corr   += DTT_SIZE;
    	}
    	__syncthreads();// __syncwarp();
    } //     if (corr_radius > 0) ... else

 }

+/**
+ * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles.This is an outer kernel
+ * that calls the inner one with CDP, this one should be configured as correlate2D<<<1,1>>>
+ *
+ * @param num_tiles        number of correlation tiles to process
+ * @param corr_stride_td,  stride (in floats) for correlation input (transform domain).
+ * @param gpu_corrs_td     correlation data in transform domain
+ * @param corr_stride,     stride (in floats) for correlation pixel-domain outputs.
+ * @param gpu_corrs        allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
+ * @param fat_zero         add this value squared to the sum of squared components before normalization
+ * @param corr_radius,     radius of the output correlation (maximal 7 for 15x15)
+ */
+extern "C" __global__ void corr2D_normalize(
+		int               num_corr_tiles,     // number of correlation tiles to process
+		const size_t      corr_stride_td,     // in floats
+		float           * gpu_corrs_td,       // correlation tiles in transform domain
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+		float             fat_zero,           // here - absolute
+		int               corr_radius)        // radius of the output correlation (7 for 15x15)
+{
+	 if (threadIdx.x == 0) { // only 1 thread, 1 block
+		    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+		    dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK_NORMALIZE-1) / CORR_TILES_PER_BLOCK_NORMALIZE,1,1);
+		    corr2D_normalize_inner<<<grid_corr,threads_corr>>>(
+	        		num_corr_tiles,      // int               num_corr_tiles,     // number of correlation tiles to process
+					corr_stride_td,      // const size_t      corr_stride,        // in floats
+					gpu_corrs_td,        // float           * gpu_corrs_td,       // correlation tiles in transform domain
+					corr_stride,         // const size_t      corr_stride,        // in floats
+					gpu_corrs,           // float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+					fat_zero,            // float             fat_zero,           // here - absolute
+					corr_radius);        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+	 }
+}
+
+/**
+ * Normalize, low-pass filter, convert to pixel domain and unfold correlation tiles. This is an inner
+ * kernel that is called from corr2D_normalize.
+ *
+ * @param num_tiles        number of correlation tiles to process
+ * @param corr_stride_td,  stride (in floats) for correlation input (transform domain).
+ * @param gpu_corrs_td     correlation data in transform domain
+ * @param corr_stride,     stride (in floats) for correlation pixel-domain outputs.
+ * @param gpu_corrs        allocated array for the correlation output data (each element stride, payload: (2*corr_radius+1)^2
+ * @param fat_zero         add this value squared to the sum of squared components before normalization
+ * @param corr_radius,     radius of the output correlation (maximal 7 for 15x15)
+ */
+
+extern "C" __global__ void corr2D_normalize_inner(
+		int               num_corr_tiles,     // number of correlation tiles to process
+		const size_t      corr_stride_td,     // (in floats) stride for the input TD correlations
+		float           * gpu_corrs_td,       // correlation tiles in transform domain
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+		float             fat_zero,           // here - absolute
+		int               corr_radius)        // radius of the output correlation (7 for 15x15)
+{
+	int corr_in_block = threadIdx.y;
+	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
+	if (corr_num >= num_corr_tiles){
+		return; // nothing to do
+	}
+    __syncthreads();// __syncwarp();
+    __shared__ float clt_corrs   [CORR_TILES_PER_BLOCK_NORMALIZE][4][DTT_SIZE][DTT_SIZE1];
+    __shared__ float mlt_corrs   [CORR_TILES_PER_BLOCK_NORMALIZE][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
+    // set clt_corr to all zeros
+    float * clt_corr =  ((float *) clt_corrs) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
+    float * mclt_corr = ((float *) mlt_corrs) +  corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
+
+    // Read correlation tile from the device memory to the shared memory
+	float *mem_corr = gpu_corrs_td + corr_stride_td * corr_num + threadIdx.x;
+	float *clt = clt_corr + threadIdx.x;
+#pragma unroll
+	for (int i = 0; i < DTT_SIZE4; i++){
+		(*clt)      = (*mem_corr);
+		clt        += DTT_SIZE1;
+		mem_corr   += DTT_SIZE;
+	}
+	__syncthreads();// __syncwarp();
+
+	// normalize Amplitude
+	normalizeTileAmplitude(
+			clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
+			fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
+	// Low Pass Filter from constant area (is it possible to replace?)
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+		printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
+		debug_print_clt1(clt_corr, -1,  0xf);
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+		printf("\ncorrelate2D LPF\n");
+		debug_print_lpf(lpf_corr);
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+	// Apply LPF filter
+	clt = clt_corr + threadIdx.x;
+#pragma unroll
+	for (int q = 0; q < 4; q++){
+		float *lpf = lpf_corr + threadIdx.x;
+#pragma unroll
+		for (int i = 0; i < DTT_SIZE; i++){
+			(*clt) *= (*lpf);
+			clt   += DTT_SIZE1;
+			lpf   += DTT_SIZE;
+		}
+	}
+	__syncthreads();// __syncwarp();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+		printf("\ncorrelate2D CORRELATION LPF-ed\n");
+		debug_print_clt1(clt_corr, -1,  0xf);
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+	// Convert correlation to pixel domain with DTT-II
+	dttii_2d(clt_corr);
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
+		printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS, corr_radius=%d\n",corr_radius);
+		debug_print_clt1(clt_corr, -1,  0xf);
+	}
+	__syncthreads();// __syncwarp();
+
+#endif
+#endif
+
+	// Unfold center area (2 * corr_radius + 1) * (2 * corr_radius + 1)
+	corrUnfoldTile(
+			corr_radius, // int corr_radius,
+			(float *) clt_corr,   // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+			(float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+
+	__syncthreads();
+
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+		printf("\ncorrelate2D after UNFOLD, corr_radius=%d\n",corr_radius);
+		debug_print_corr_15x15(
+				corr_radius, // int     corr_radius,
+				mclt_corr,
+				-1);
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+	// copy (2 * corr_radius +1) x (2 * corr_radius +1) (up to 15x15) tile to the main memory
+	int size2r1 = 2 * corr_radius + 1;
+	int len2r1x2r1 = size2r1 * size2r1;
+	int corr_tile_offset =  + corr_stride * corr_num;
+	mem_corr = gpu_corrs + corr_tile_offset;
+#pragma unroll
+	for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
+		mem_corr[offs] = mclt_corr[offs];
+	}
+	__syncthreads();
+#ifdef DBG_TILE
+#ifdef DEBUG6
+	if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
+		printf("\ncorrelate2D after copy to main memory\n");
+		//    	debug_print_clt1(clt_corr, -1,  0xf);
+	}
+	__syncthreads();// __syncwarp();
+#endif
+#endif
+
+
+
+}
+
+
 /**
 * Calculate texture as RGBA (or YA for mono) from the in-memory frequency domain representation
 * and the per-tile task array (may be sparse).

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -77,6 +77,14 @@ extern "C" __global__ void correlate2D(
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data

+extern "C" __global__ void corr2D_normalize(
+		int               num_corr_tiles,     // number of correlation tiles to process
+		const size_t      corr_stride_td,     // in floats
+		float           * gpu_corrs_td,       // correlation tiles in transform domain
+		const size_t      corr_stride,        // in floats
+		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+		float             fat_zero,           // here - absolute
+		int               corr_radius);        // radius of the output correlation (7 for 15x15)

 extern "C" __global__ void textures_nonoverlap(
 		struct tp_task  * gpu_tasks,

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -40,42 +40,43 @@
 #pragma once
 #ifndef JCUDA
 #include <stdio.h>
-#define THREADSX         (DTT_SIZE)
-#define NUM_CAMS                  4
-#define NUM_PAIRS                 6
-#define NUM_COLORS                3
-#define IMG_WIDTH              2592
-#define IMG_HEIGHT             1936
-#define KERNELS_HOR             164
-#define KERNELS_VERT            123
-#define KERNELS_LSTEP             4
-#define THREADS_PER_TILE          8
-#define TILES_PER_BLOCK           4
-#define CORR_THREADS_PER_TILE     8
-#define CORR_TILES_PER_BLOCK      4
-#define TEXTURE_THREADS_PER_TILE  8
-#define TEXTURE_TILES_PER_BLOCK   1
-#define IMCLT_THREADS_PER_TILE   16
-#define IMCLT_TILES_PER_BLOCK     4
-#define CORR_NTILE_SHIFT          8 // higher bits - number of a pair, other bits tile number
-#define CORR_PAIRS_MASK        0x3f// lower bits used to address correlation pair for the selected tile
-#define CORR_TEXTURE_BIT          7 // bit 7 used to request texture for the tile
-#define TASK_CORR_BITS            4
-#define TASK_TEXTURE_N_BIT        0 // Texture with North neighbor
-#define TASK_TEXTURE_E_BIT        1 // Texture with East  neighbor
-#define TASK_TEXTURE_S_BIT        2 // Texture with South neighbor
-#define TASK_TEXTURE_W_BIT        3 // Texture with West  neighbor
-#define TASK_TEXTURE_BIT          3 // bit to request texture calculation int task field of struct tp_task
-#define LIST_TEXTURE_BIT          7 // bit to request texture calculation
-#define CORR_OUT_RAD              4
-#define FAT_ZERO_WEIGHT           0.0001 // add to port weights to avoid nan
-
-#define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list
-
-#define DBG_DISPARITY            0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
-#define RBYRDIST_LEN           5001   // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
-#define RBYRDIST_STEP             0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
-#define TILES_PER_BLOCK_GEOM     (32/NUM_CAMS)   // each tile has NUM_CAMS threads
+#define THREADSX              (DTT_SIZE)
+#define NUM_CAMS                       4
+#define NUM_PAIRS                      6
+#define NUM_COLORS                     3
+#define IMG_WIDTH                   2592
+#define IMG_HEIGHT                  1936
+#define KERNELS_HOR                  164
+#define KERNELS_VERT                 123
+#define KERNELS_LSTEP                  4
+#define THREADS_PER_TILE               8
+#define TILES_PER_BLOCK                4
+#define CORR_THREADS_PER_TILE          8
+#define CORR_TILES_PER_BLOCK           4
+#define CORR_TILES_PER_BLOCK_NORMALIZE 4
+#define TEXTURE_THREADS_PER_TILE       8
+#define TEXTURE_TILES_PER_BLOCK        1
+#define IMCLT_THREADS_PER_TILE        16
+#define IMCLT_TILES_PER_BLOCK          4
+#define CORR_NTILE_SHIFT               8 // higher bits - number of a pair, other bits tile number
+#define CORR_PAIRS_MASK             0x3f// lower bits used to address correlation pair for the selected tile
+#define CORR_TEXTURE_BIT               7 // bit 7 used to request texture for the tile
+#define TASK_CORR_BITS                 4
+#define TASK_TEXTURE_N_BIT             0 // Texture with North neighbor
+#define TASK_TEXTURE_E_BIT             1 // Texture with East  neighbor
+#define TASK_TEXTURE_S_BIT             2 // Texture with South neighbor
+#define TASK_TEXTURE_W_BIT             3 // Texture with West  neighbor
+#define TASK_TEXTURE_BIT               3 // bit to request texture calculation int task field of struct tp_task
+#define LIST_TEXTURE_BIT               7 // bit to request texture calculation
+#define CORR_OUT_RAD                   4
+#define FAT_ZERO_WEIGHT                0.0001 // add to port weights to avoid nan
+
+#define THREADS_DYNAMIC_BITS           5 // treads in block for CDP creation of the texture list
+
+#define DBG_DISPARITY                  0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
+#define RBYRDIST_LEN                5001   // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
+#define RBYRDIST_STEP                  0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
+#define TILES_PER_BLOCK_GEOM          (32/NUM_CAMS)   // each tile has NUM_CAMS threads

 // only used in C++ test
 #define TILESX        (IMG_WIDTH / DTT_SIZE)