simplified by using larger output array

39e75987 · Andrey Filippov · 3f0b0bc0 · 39e75987 · 39e75987
Commit 39e75987 authored Apr 06, 2020 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 32 additions and 27 deletions

GPUTileProcessor.java src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java +7 -4

TileProcessor.cuh src/main/resources/kernels/TileProcessor.cuh +25 -23

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -495,8 +495,8 @@ public class GPUTileProcessor {
        		tilesX * tilesY,             // long Height,
                Sizeof.FLOAT);                         // int ElementSizeBytes)
        texture_stride = (int)(device_stride[0] / Sizeof.FLOAT);
-        int max_rgba_width  =  tilesX * DTT_SIZE;
+        int max_rgba_width  =  (tilesX + 1) * DTT_SIZE;
-        int max_rgba_height =  tilesY * DTT_SIZE;
+        int max_rgba_height =  (tilesY + 1) * DTT_SIZE;
        int max_rbga_slices =  NUM_COLORS + 1;
        cuMemAllocPitch (
@@ -1095,7 +1095,7 @@ public class GPUTileProcessor {
    	int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
    	Pointer kernelParameters = Pointer.to(
-    			Pointer.to(new int[] {0}), // 0,          // int               border_tile,        // if 1 - watch for border
+//    			Pointer.to(new int[] {0}), // 0,          // int               border_tile,        // if 1 - watch for border
    			Pointer.to(gpu_texture_indices), //  int             * woi, - not used
    			Pointer.to(gpu_clt),
    			Pointer.to(new int[] { num_texture_tiles }),
@@ -1155,6 +1155,7 @@ public class GPUTileProcessor {
    /**
     * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
+     * device array has 4 pixels margins on each side, skip them here
     * @param num_colors number of colors (1 or 3)
     * @param woi should be initialized as Rectangle(). x,y,width, height will be populated (in pixels,)
     * @return RBGA slices, last (alpha) in 0.0... 1.0 range, colors match input range
@@ -1180,9 +1181,11 @@ public class GPUTileProcessor {
        copy_rbga.WidthInBytes =    woi.width * Sizeof.FLOAT;
        copy_rbga.Height =          woi.height;
+        copy_rbga.srcXInBytes =     4 * Sizeof.FLOAT;
        for (int ncol = 0; ncol<= num_colors; ncol++ ) {
        	copy_rbga.dstHost =     Pointer.to(rslt[ncol]);
-            copy_rbga.srcY =        woi.height * ncol;
+            copy_rbga.srcY =        4 + (woi.height +DTT_SIZE) * ncol;
            cuMemcpy2D(copy_rbga); // run copy
        }
        return rslt;

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -1149,7 +1149,7 @@ extern "C" __global__ void clear_texture_rbga(
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 extern "C" __global__ void textures_accumulate(
-		int               border_tile,        // if 1 - watch for border
+//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
@@ -1537,9 +1537,9 @@ __global__ void generate_RBGA(
 	 __syncthreads();
 // Zero output textures. Trim
 // texture_rbga_stride
-	 int texture_width =        *(woi + 2) * DTT_SIZE;
+	 int texture_width =        (*(woi + 2) + 1)* DTT_SIZE;
-	 int texture_tiles_height = *(woi + 3) * DTT_SIZE;
+	 int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE;
-	 int texture_height =       texture_tiles_height * DTT_SIZE;
+///	 int texture_height =       texture_tiles_height * DTT_SIZE;
 	 int texture_slices =       colors + 1;
 	 if (threadIdx.x == 0) {
@@ -1581,7 +1581,8 @@ __global__ void generate_RBGA(
 #endif
 			    /* */
 			    textures_accumulate<<<grid_texture,threads_texture>>>(
-			    		border_tile,                     // int               border_tile,        // if 1 - watch for border
+			    		// get rid of border tile
+//			    		border_tile,                     // int               border_tile,        // if 1 - watch for border
 			    		woi,                             // int             * woi,                // x, y, width,height
 						gpu_clt,                         // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
@@ -2205,7 +2206,7 @@ __global__ void textures_gen(
 #endif // ifdef USE_textures_gen
 extern "C"
 __global__ void textures_accumulate(
-		int               border_tile,        // if 1 - watch for border
+//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
@@ -2484,8 +2485,9 @@ __global__ void textures_accumulate(
 	if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
 #ifdef DEBUG12
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
+//			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
-					tile_num, (int) tile_code, border_tile);
+//					tile_num, (int) tile_code, border_tile);
+			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d\n", tile_num, (int) tile_code);
 			for (int ncol = 0; ncol <= colors; ncol++) {
 				printf("\ntile[%d]\n",ncol);
@@ -2517,11 +2519,11 @@ __global__ void textures_accumulate(
 				}
 			}
 		}
-		int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color
+		int slice_stride = texture_rbg_stride * (*(woi + 3) + 1) * DTT_SIZE; // offset to the next color
 		int tileY = tile_num / TILESX; // slow, but 1 per tile
 		int tileX = tile_num - tileY * TILESX;
-		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
+		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
-		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
+		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
 		int height = *(woi + 3) << DTT_SIZE_LOG2;
 #ifdef DEBUG12
@@ -2556,9 +2558,9 @@ __global__ void textures_accumulate(
 			__syncthreads();// __syncwarp();
 #endif // DEBUG12
-			if (!border_tile ||
+			///			if (!border_tile ||
-///					((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESY)) && (g_col < (DTT_SIZE * TILESX)))){
+			///					((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESY)) && (g_col < (DTT_SIZE * TILESX)))){
-					((g_row >= 0) && (g_col >= 0) && (g_row < height) && (g_col < (DTT_SIZE * TILESX)))){
+			///					((g_row >= 0) && (g_col >= 0) && (g_row < height) && (g_col < (DTT_SIZE * TILESX)))){
 			// always copy 3 (1) colors + alpha
 			if (colors == 3){
 #pragma unroll
@@ -2571,7 +2573,7 @@ __global__ void textures_accumulate(
 					*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
 				}
 			}
-			}
+///			}
 		}
 	} // 	if (gpu_texture_rbg) { // generate RGBA
 } // textures_accumulate()