changing corr2D to CDP, handling sparse tasks

99662bc7 · Andrey Filippov · 80a4578b · 99662bc7 · 99662bc7 · 99662bc7
Commit 99662bc7 authored Apr 16, 2020 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 144 additions and 35 deletions

TileProcessor.cuh src/TileProcessor.cuh +92 -8

TileProcessor.h src/TileProcessor.h +17 -1

test_tp.cu src/test_tp.cu +35 -26

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -858,7 +858,13 @@ __global__ void index_direct(
 		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task
 		int *              active_tiles,       // pointer to the calculated number of non-zero tiles
-		int *              num_active_tiles);  //  indices to gpu_tasks  // should be initialized to zero
+		int *              pnum_active_tiles); //  indices to gpu_tasks  // should be initialized to zero
+__global__ void index_correlate( // forward declaration: builds the packed list of (tile, pair) correlation tasks from gpu_tasks
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,         // number of tiles in task
+		int *              gpu_corr_indices,  // array of correlation tasks
+		int *              pnum_corr_tiles);  // pointer to the length of correlation tasks array
 //extern "C"
 __global__ void convert_correct_tiles(
@@ -877,11 +883,64 @@ __global__ void convert_correct_tiles(
 		int                kernels_hor,
 		int                kernels_vert);
-// ====== end of local declarations ====
+extern "C" __global__ void correlate2D_inner( // forward declaration; correlate2D launches it with (CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1) threads per block
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float             fat_zero,           // here - absolute
+		size_t            num_corr_tiles,     // number of correlation tiles to process
+		int             * gpu_corr_indices,   // packed tile+pair
+		const size_t      corr_stride,        // in floats
+		int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		float           * gpu_corrs);          // correlation output data
+// ====== end of local declarations ====
 extern "C" __global__ void correlate2D( // dynamic-parallelism parent kernel: launch as <<<1,1>>>
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float             fat_zero,           // here - absolute
+		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
+		int               num_tiles,          // number of tiles in task
+		int             * gpu_corr_indices,   // packed tile+pair (filled by index_correlate, consumed by correlate2D_inner)
+		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process (zeroed here, then counted on the device)
+		const size_t      corr_stride,        // in floats
+		int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		float           * gpu_corrs)          // correlation output data
+{
+	// Two child launches are issued from the device by a single thread:
+	//   1) index_correlate  - scans gpu_tasks and writes the packed (tile, pair) list
+	//                         to gpu_corr_indices, counting entries in *pnum_corr_tiles;
+	//   2) correlate2D_inner - processes exactly that many correlation tiles.
+	// Each launch is skipped when it would have nothing to do: a grid with zero
+	// blocks is not a valid launch configuration, so the child launch would fail.
+	if (threadIdx.x != 0) { // only 1 thread, 1 block
+		return;
+	}
+	*pnum_corr_tiles = 0; // the counter must start at zero, index_correlate accumulates into it
+	if (num_tiles <= 0) {
+		return; // no tasks: leave the count at 0 and launch nothing
+	}
+	dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	index_correlate<<<blocks0,threads0>>>(
+			gpu_tasks,           // struct tp_task   * gpu_tasks,
+			num_tiles,           // int                num_tiles,          // number of tiles in task
+			gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
+			pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
+	// Wait for the child grid so that the count it produced can be read below.
+	// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in newer CUDA
+	// toolkits (11.6 and later) - confirm against the toolkit this is built with.
+	cudaDeviceSynchronize();
+	int num_corr_tiles = *pnum_corr_tiles; // read once, used for both the grid size and the kernel argument
+	if (num_corr_tiles <= 0) {
+		return; // no tile requested any correlation pair: nothing to correlate
+	}
+	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+	dim3 grid_corr((num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+	correlate2D_inner<<<grid_corr,threads_corr>>>(
+			gpu_clt,            // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			colors,             // int               colors,             // number of colors (3/1)
+			scale0,             // float             scale0,             // scale for R
+			scale1,             // float             scale1,             // scale for B
+			scale2,             // float             scale2,             // scale for G
+			fat_zero,           // float             fat_zero,           // here - absolute
+			num_corr_tiles,     // size_t            num_corr_tiles,     // number of correlation tiles to process
+			gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
+			corr_stride,        // const size_t      corr_stride,        // in floats
+			corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+			gpu_corrs);         // float           * gpu_corrs);         // correlation output data
+}
+extern "C" __global__ void correlate2D_inner(
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
@@ -1527,18 +1586,43 @@ __global__ void index_direct(
 		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task
 		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
-		int *              num_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
+		int *              pnum_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
 {
 	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
 	if (num_tile >= num_tiles){
 		return;
 	}
 	if (gpu_tasks[num_tile].task != 0) {
-		active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
+		active_tiles[atomicAdd(pnum_active_tiles, 1)] = num_tile;
 	}
 }
-extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+__global__ void index_correlate( // one thread per gpu_tasks entry: appends one packed (tile, pair) entry per requested correlation pair
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,         // number of tiles in task
+		int *              gpu_corr_indices,  // array of correlation tasks
+		int *              pnum_corr_tiles)   // pointer to the length of correlation tasks array
+{
+	int num_tile = blockIdx.x * blockDim.x + threadIdx.x; // global thread index, used as the index into gpu_tasks
+	if (num_tile >= num_tiles){ // threads past the end of the task array have nothing to do
+		return;
+	}
+	int cm = (gpu_tasks[num_tile].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1); // NUM_PAIRS-bit mask of requested correlation pairs taken from the task word
+	if (cm != 0) {
+		int nb = __popc (cm); // number of non-zero bits
+		int indx = atomicAdd(pnum_corr_tiles, nb); // reserve nb consecutive slots; atomicAdd returns the previous count = first reserved slot
+		int txy = gpu_tasks[num_tile].txy; // packed tile coordinates
+		int tx = txy & 0xffff; // low 16 bits: tile x
+		int ty = txy >> 16; // remaining upper bits: tile y
+		int nt = ty * TILESX + tx; // linear tile number (row of TILESX tiles per ty)
+		for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) { // emit one entry for every set pair bit, lowest bit first
+			gpu_corr_indices[indx++] = (nt << CORR_NTILE_SHIFT) | b; // pack tile number (shifted) with pair index b in the low bits
+		}
+	}
+}
+extern "C" __global__ void convert_direct(  // called with a single block, single thread
 //		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 			float           ** gpu_kernel_offsets, // [NUM_CAMS],
 			float           ** gpu_kernels,        // [NUM_CAMS],
@@ -1557,7 +1641,7 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
 {
 	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
 	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
-	 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+	 if (threadIdx.x == 0) { // always 1
 		 *pnum_active_tiles = 0;
 		 index_direct<<<blocks0,threads0>>>(
 				 gpu_tasks,           // struct tp_task   * gpu_tasks,

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -42,7 +42,7 @@
 #endif
-extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+extern "C" __global__ void convert_direct( // called with a single block, single thread
 		//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 		float           ** gpu_kernel_offsets, // [NUM_CAMS],
 		float           ** gpu_kernels,        // [NUM_CAMS],
@@ -59,6 +59,22 @@ extern "C" __global__ void convert_direct( // called with a single block, CONVER
 		int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
 		int *              pnum_active_tiles);  //  indices to gpu_tasks
+extern "C" __global__ void correlate2D( // launched with a single block, single thread; it launches the indexing and correlation kernels from the device
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		int               colors,             // number of colors (3/1)
+		float             scale0,             // scale for R
+		float             scale1,             // scale for B
+		float             scale2,             // scale for G
+		float             fat_zero,           // here - absolute
+		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
+		int               num_tiles,          // number of tiles in task
+		int             * gpu_corr_indices,   // packed tile+pair
+		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+		const size_t      corr_stride,        // in floats
+		int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		float           * gpu_corrs);          // correlation output data
 extern "C" __global__ void textures_accumulate(
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -362,8 +362,11 @@ int main(int argc, char **argv)
    struct tp_task  * gpu_tasks;
    int *             gpu_active_tiles;
    int *             gpu_num_active;
+    int *             gpu_num_corr_tiles;
    checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
    checkCudaErrors (cudaMalloc((void **)&gpu_num_active,                     sizeof(int)));
+    checkCudaErrors (cudaMalloc((void **)&gpu_num_corr_tiles,                 sizeof(int)));
    size_t  dstride;          // in bytes !
    size_t  dstride_rslt;     // in bytes !
@@ -523,7 +526,6 @@ int main(int argc, char **argv)
    }
    // num_corrs now has the total number of correlations
    // copy corr_indices to gpu
-//    gpu_corr_indices = (int  *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
    gpu_corr_indices = (int  *) copyalloc_kernel_gpu(
    		(float * ) corr_indices,
 			num_corrs,
@@ -849,22 +851,6 @@ int main(int argc, char **argv)
 				KERNELS_VERT,          // int                kernels_vert);
 				gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
    			gpu_num_active);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
-#if 0
-    	convert_correct_tiles<<<grid_tp,threads_tp>>>(
-    			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
-				gpu_kernels,           // float           ** gpu_kernels,
-				gpu_images,            // float           ** gpu_images,
-				gpu_tasks,             // struct tp_task   * gpu_tasks,
-				gpu_clt,               // float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-				dstride/sizeof(float), // size_t             dstride, // for gpu_images
-				tp_task_size,          // int                num_tiles) // number of tiles in task
-				0,                     // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
-				IMG_WIDTH,             // int                woi_width,
-				IMG_HEIGHT,            // int                woi_height,
-				KERNELS_HOR,           // int                kernels_hor,
-				KERNELS_VERT);         // int                kernels_vert);
-#endif
    	getLastCudaError("Kernel execution failed");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -878,8 +864,7 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaMemcpy(
 			&num_active_tiles,
 			gpu_num_active,
-			sizeof(int), // 8 sequences (0,2,4,6 - non-border, growing up;
+			sizeof(int),
-			//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
 			cudaMemcpyDeviceToHost));
    printf("Run time =%f ms, num active tiles = %d\n",  avgTime, num_active_tiles);
@@ -987,8 +972,9 @@ int main(int argc, char **argv)
 #ifndef NOCORR
 //    cudaProfilerStart();
    // testing corr
-    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+//    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-    printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
+    //        dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+ //   printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
    StopWatchInterface *timerCORR = 0;
    sdkCreateTimer(&timerCORR);
@@ -1000,9 +986,24 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerCORR);
    		sdkStartTimer(&timerCORR);
    	}
+#if 1
+        correlate2D<<<1,1>>>(
+		gpu_clt,                    // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		3,                          // int               colors,             // number of colors (3/1)
+		0.25,                       // float             scale0,             // scale for R
+		0.25,                       // float             scale1,             // scale for B
+		0.5,                        // float             scale2,             // scale for G
+		30.0,                       // float             fat_zero,           // here - absolute
+		gpu_tasks,                  // struct tp_task  * gpu_tasks,
+		tp_task_size,               // int               num_tiles) // number of tiles in task
+		gpu_corr_indices,           //  int            * gpu_corr_indices,   // packed tile+pair
+		gpu_num_corr_tiles,         // int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+		dstride_corr/sizeof(float), // const size_t      corr_stride,        // in floats
+		CORR_OUT_RAD,               // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		gpu_corrs);                 // float           * gpu_corrs);          // correlation output data
+#else
        dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-        correlate2D<<<grid_corr,threads_corr>>>(
+        correlate2D_inner<<<grid_corr,threads_corr>>>(
 		gpu_clt,   // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		3,         // int               colors,             // number of colors (3/1)
 		0.25,      // float             scale0,             // scale for R
@@ -1014,6 +1015,8 @@ int main(int argc, char **argv)
 		dstride_corr/sizeof(float), // const size_t      corr_stride,        // in floats
 		CORR_OUT_RAD, // int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		gpu_corrs); // float           * gpu_corrs);          // correlation output data
+#endif
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);
@@ -1022,14 +1025,19 @@ int main(int argc, char **argv)
    sdkStopTimer(&timerCORR);
    float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
    sdkDeleteTimer(&timerCORR);
-    printf("Average CORR run time =%f ms\n",  avgTimeCORR);
+    printf("Average CORR run time =%f ms, num cor tiles (old) = %d\n",  avgTimeCORR, num_corrs);
+	checkCudaErrors(cudaMemcpy(
+			&num_corrs,
+			gpu_num_corr_tiles,
+			sizeof(int),
+			cudaMemcpyDeviceToHost));
+    printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n",  avgTimeCORR, num_corrs);
    int corr_size =        2 * CORR_OUT_RAD + 1;
    int rslt_corr_size =   num_corrs * corr_size * corr_size;
    float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
    checkCudaErrors(cudaMemcpy2D(
    		cpu_corr,
 			(corr_size * corr_size) * sizeof(float),
@@ -1361,6 +1369,7 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaFree(gpu_corr_images));
 	checkCudaErrors(cudaFree(gpu_corrs));
 	checkCudaErrors(cudaFree(gpu_corr_indices));
+	checkCudaErrors(cudaFree(gpu_num_corr_tiles));
 	checkCudaErrors(cudaFree(gpu_texture_indices));
 	checkCudaErrors(cudaFree(gpu_port_offsets));
 	checkCudaErrors(cudaFree(gpu_textures));