Commit 39e75987 authored by Andrey Filippov's avatar Andrey Filippov

simplified by using larger output array

parent 3f0b0bc0
...@@ -495,8 +495,8 @@ public class GPUTileProcessor { ...@@ -495,8 +495,8 @@ public class GPUTileProcessor {
tilesX * tilesY, // long Height, tilesX * tilesY, // long Height,
Sizeof.FLOAT); // int ElementSizeBytes) Sizeof.FLOAT); // int ElementSizeBytes)
texture_stride = (int)(device_stride[0] / Sizeof.FLOAT); texture_stride = (int)(device_stride[0] / Sizeof.FLOAT);
int max_rgba_width = tilesX * DTT_SIZE; int max_rgba_width = (tilesX + 1) * DTT_SIZE;
int max_rgba_height = tilesY * DTT_SIZE; int max_rgba_height = (tilesY + 1) * DTT_SIZE;
int max_rbga_slices = NUM_COLORS + 1; int max_rbga_slices = NUM_COLORS + 1;
cuMemAllocPitch ( cuMemAllocPitch (
...@@ -1095,7 +1095,7 @@ public class GPUTileProcessor { ...@@ -1095,7 +1095,7 @@ public class GPUTileProcessor {
int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1}; int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border // Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border
Pointer.to(gpu_texture_indices), // int * woi, - not used Pointer.to(gpu_texture_indices), // int * woi, - not used
Pointer.to(gpu_clt), Pointer.to(gpu_clt),
Pointer.to(new int[] { num_texture_tiles }), Pointer.to(new int[] { num_texture_tiles }),
...@@ -1155,6 +1155,7 @@ public class GPUTileProcessor { ...@@ -1155,6 +1155,7 @@ public class GPUTileProcessor {
/** /**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices. * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
* device array has 4 pixels margins on each side, skip them here
* @param num_colors number of colors (1 or 3) * @param num_colors number of colors (1 or 3)
* @param woi should be initialized as Rectangle(). x,y,width, height will be populated (in pixels,) * @param woi should be initialized as Rectangle(). x,y,width, height will be populated (in pixels,)
* @return RBGA slices, last (alpha) in 0.0... 1.0 range, colors match input range * @return RBGA slices, last (alpha) in 0.0... 1.0 range, colors match input range
...@@ -1180,9 +1181,11 @@ public class GPUTileProcessor { ...@@ -1180,9 +1181,11 @@ public class GPUTileProcessor {
copy_rbga.WidthInBytes = woi.width * Sizeof.FLOAT; copy_rbga.WidthInBytes = woi.width * Sizeof.FLOAT;
copy_rbga.Height = woi.height; copy_rbga.Height = woi.height;
copy_rbga.srcXInBytes = 4 * Sizeof.FLOAT;
for (int ncol = 0; ncol<= num_colors; ncol++ ) { for (int ncol = 0; ncol<= num_colors; ncol++ ) {
copy_rbga.dstHost = Pointer.to(rslt[ncol]); copy_rbga.dstHost = Pointer.to(rslt[ncol]);
copy_rbga.srcY = woi.height * ncol; copy_rbga.srcY = 4 + (woi.height +DTT_SIZE) * ncol;
cuMemcpy2D(copy_rbga); // run copy cuMemcpy2D(copy_rbga); // run copy
} }
return rslt; return rslt;
......
...@@ -1149,7 +1149,7 @@ extern "C" __global__ void clear_texture_rbga( ...@@ -1149,7 +1149,7 @@ extern "C" __global__ void clear_texture_rbga(
const size_t texture_rbga_stride, // in floats 8*stride const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate( extern "C" __global__ void textures_accumulate(
int border_tile, // if 1 - watch for border // int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process size_t num_texture_tiles, // number of texture tiles to process
...@@ -1537,9 +1537,9 @@ __global__ void generate_RBGA( ...@@ -1537,9 +1537,9 @@ __global__ void generate_RBGA(
__syncthreads(); __syncthreads();
// Zero output textures. Trim // Zero output textures. Trim
// texture_rbga_stride // texture_rbga_stride
int texture_width = *(woi + 2) * DTT_SIZE; int texture_width = (*(woi + 2) + 1)* DTT_SIZE;
int texture_tiles_height = *(woi + 3) * DTT_SIZE; int texture_tiles_height = (*(woi + 3) + 1) * DTT_SIZE;
int texture_height = texture_tiles_height * DTT_SIZE; /// int texture_height = texture_tiles_height * DTT_SIZE;
int texture_slices = colors + 1; int texture_slices = colors + 1;
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
...@@ -1581,7 +1581,8 @@ __global__ void generate_RBGA( ...@@ -1581,7 +1581,8 @@ __global__ void generate_RBGA(
#endif #endif
/* */ /* */
textures_accumulate<<<grid_texture,threads_texture>>>( textures_accumulate<<<grid_texture,threads_texture>>>(
border_tile, // int border_tile, // if 1 - watch for border // get rid of border tile
// border_tile, // int border_tile, // if 1 - watch for border
woi, // int * woi, // x, y, width,height woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process ntt, // size_t num_texture_tiles, // number of texture tiles to process
...@@ -2205,7 +2206,7 @@ __global__ void textures_gen( ...@@ -2205,7 +2206,7 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen #endif // ifdef USE_textures_gen
extern "C" extern "C"
__global__ void textures_accumulate( __global__ void textures_accumulate(
int border_tile, // if 1 - watch for border // int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process size_t num_texture_tiles, // number of texture tiles to process
...@@ -2484,8 +2485,9 @@ __global__ void textures_accumulate( ...@@ -2484,8 +2485,9 @@ __global__ void textures_accumulate(
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
#ifdef DEBUG12 #ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){ if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n", // printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
tile_num, (int) tile_code, border_tile); // tile_num, (int) tile_code, border_tile);
printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d\n", tile_num, (int) tile_code);
for (int ncol = 0; ncol <= colors; ncol++) { for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol); printf("\ntile[%d]\n",ncol);
...@@ -2517,11 +2519,11 @@ __global__ void textures_accumulate( ...@@ -2517,11 +2519,11 @@ __global__ void textures_accumulate(
} }
} }
} }
int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color int slice_stride = texture_rbg_stride * (*(woi + 3) + 1) * DTT_SIZE; // offset to the next color
int tileY = tile_num / TILESX; // slow, but 1 per tile int tileY = tile_num / TILESX; // slow, but 1 per tile
int tileX = tile_num - tileY * TILESX; int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4 int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4 int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int height = *(woi + 3) << DTT_SIZE_LOG2; int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12 #ifdef DEBUG12
...@@ -2556,22 +2558,22 @@ __global__ void textures_accumulate( ...@@ -2556,22 +2558,22 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif // DEBUG12 #endif // DEBUG12
if (!border_tile || /// if (!border_tile ||
/// ((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESY)) && (g_col < (DTT_SIZE * TILESX)))){ /// ((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESY)) && (g_col < (DTT_SIZE * TILESX)))){
((g_row >= 0) && (g_col >= 0) && (g_row < height) && (g_col < (DTT_SIZE * TILESX)))){ /// ((g_row >= 0) && (g_col >= 0) && (g_row < height) && (g_col < (DTT_SIZE * TILESX)))){
// always copy 3 (1) colors + alpha // always copy 3 (1) colors + alpha
if (colors == 3){ if (colors == 3){
#pragma unroll #pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4 for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)); *(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
} }
} else { // assuming colors = 1 } else { // assuming colors = 1
#pragma unroll #pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2 for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)); *(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} }
} }
/// }
} }
} // if (gpu_texture_rbg) { // generate RGBA } // if (gpu_texture_rbg) { // generate RGBA
} // textures_accumulate() } // textures_accumulate()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment