4 images with CDP

f134cfa4 · Andrey Filippov · bd04c118 · f134cfa4 · f134cfa4 · f134cfa4
Commit f134cfa4 authored Apr 14, 2020 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 58 additions and 6 deletions

TileProcessor.cuh src/TileProcessor.cuh +28 -1

TileProcessor.h src/TileProcessor.h +10 -2

test_tp.cu src/test_tp.cu +20 -3

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -2055,7 +2055,34 @@ __global__ void imclt_rbg_all(
 		int                woi_theight,
 		const size_t       dstride)            // in floats (pixels)
 {
+	dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1); // block shape shared by every child imclt_rbg launch
+	if (threadIdx.x == 0) { // single launcher thread (the host test starts this kernel with grid and block of 1,1,1)
+		for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
+			for (int color = 0; color < colors; color++) {
+				for (int v_offs = 0; v_offs < 2; v_offs++){
+					for (int h_offs = 0; h_offs < 2; h_offs++){
+						int tilesy_half = (woi_theight + (v_offs ^ 1)) >> 1; // tile rows in this pass: rounds up for v_offs == 0, down for v_offs == 1
+						int tilesx_half = (woi_twidth + (h_offs ^ 1)) >> 1;  // tile columns in this pass, same rounding rule with h_offs
+						int tiles_in_pass = tilesy_half * tilesx_half;
+						dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1); // ceil-div: enough blocks for all tiles of the pass
+						//    				printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
+						imclt_rbg<<<grid_imclt,threads_imclt>>>(
+								gpu_clt[ncam],         // float           * gpu_clt,     // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+								gpu_corr_images[ncam], // float           * gpu_rbg,     // WIDTH, 3 * HEIGHT
+								apply_lpf,             // int               apply_lpf,   // forwarded from the caller (was a hard-coded 1 that ignored the parameter)
+								colors,                // int               colors,      // defines lpf filter
+								color,                 // int               color,       // defines location of clt data
+								v_offs,                // int               v_offset,
+								h_offs,                // int               h_offset,
+								woi_twidth,            // int               woi_twidth,  // will increase by DTT_SIZE (todo - cut away?)
+								woi_theight,           // int               woi_theight, // will increase by DTT_SIZE (todo - cut away?)
+								dstride);              // const size_t      dstride);    // in floats (pixels)
+						cudaDeviceSynchronize(); // device-side wait after each child launch; NOTE(review): deprecated in newer CUDA toolkits - confirm target version
+					}
+				}
+			}
+		}
+	}
 }

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -109,6 +109,16 @@ extern "C" __global__ void textures_accumulate(
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C"
+__global__ void imclt_rbg_all(
+		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
+		int                apply_lpf,          // presumably enables the LPF inside imclt_rbg - TODO confirm (test harness passes 1)
+		int                colors,             // number of colors to loop over; also defines lpf filter
+		int                woi_twidth,         // window width in tiles (test harness passes TILESX)
+		int                woi_theight,        // window height in tiles (test harness passes TILESY)
+		const size_t       dstride);            // in floats (pixels)
 extern "C" __global__ void imclt_rbg(
 		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
@@ -151,5 +161,3 @@ __global__ void generate_RBGA(
 			const size_t      texture_rbga_stride,     // in floats
 			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -823,8 +823,8 @@ int main(int argc, char **argv)
    // testing imclt
-    dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
+//    dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
-    printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
+//    printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    StopWatchInterface *timerIMCLT = 0;
    sdkCreateTimer(&timerIMCLT);
@@ -836,7 +836,23 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerIMCLT);
    		sdkStartTimer(&timerIMCLT);
    	}
+#define CDP1
+#ifdef CDP1
+        dim3 threads_imclt_all(1, 1, 1);
+		dim3 grid_imclt_all(1, 1, 1);
+        printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
+        printf("grid_imclt_all=   (%d, %d, %d)\n",grid_imclt_all.x,   grid_imclt_all.y,   grid_imclt_all.z);
+        imclt_rbg_all<<<grid_imclt_all,threads_imclt_all>>>(
+        		gpu_clt,                     // float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				gpu_corr_images,             // float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
+				1,                           // int               apply_lpf,
+				NUM_COLORS,                  // int               colors,               // defines lpf filter
+				TILESX,                      // int               woi_twidth,
+				TILESY,                      // int               woi_theight,
+				dstride_rslt/sizeof(float)); // const size_t      dstride);            // in floats (pixels)
+#else
+        dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
+        printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    		for (int color = 0; color < NUM_COLORS; color++) {
    			for (int v_offs = 0; v_offs < 2; v_offs++){
@@ -861,6 +877,7 @@ int main(int argc, char **argv)
    			}
    		}
    	}
+#endif
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);