Commit 16bc5c47 authored by Andrey Filippov's avatar Andrey Filippov

debugging rgba texture generation

parent 11455aa9
......@@ -72,8 +72,8 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#undef HAS_PRINTF
//#define HAS_PRINTF
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
......@@ -87,7 +87,8 @@
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
......@@ -1533,10 +1534,15 @@ __global__ void generate_RBGA(
int texture_slices = colors + 1;
if (threadIdx.x == 0) {
//DTT_SIZE_LOG2
// dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
// int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
// dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
clear_texture_rbga<<<blocks2,threads2>>>(
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
......@@ -1547,12 +1553,23 @@ __global__ void generate_RBGA(
for (int pass = 0; pass < 8; pass++){
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2)); // 1/4
if (border_tile){
ti_offset += TILESX * (TILESYA >> 2) - ntt;
}
#ifdef DEBUG12
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= 0x%x, gpu_texture_indices + ti_offset=0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
textures_accumulate<<<grid_texture,threads_texture>>>(
border_tile, // int border_tile, // if 1 - watch for border
......@@ -1578,9 +1595,8 @@ __global__ void generate_RBGA(
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles); // (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
}
}
......@@ -1590,21 +1606,20 @@ __global__ void generate_RBGA(
// blockDim.x * gridDim.x >= width
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_width, // aligned to DTT_SIZE
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
if (col > texture_width) {
return;
}
int row = (blockIdx.y << 3); // includes slices
int row = blockIdx.y;; // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix) = 0.0;
pix += texture_rbga_stride;
*(pix++) = 0.0;
}
}
......@@ -1778,26 +1793,51 @@ __global__ void gen_texture_list(
int cxy = gpu_tasks[task_num].txy;
int x = (cxy & 0xffff);
int y = (cxy >> 16);
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() x = %d, y= %d\n",x, y);
printf("\ngen_texture_list() num_texture_tiles = %d(%d) %d(%d) %d(%d) %d(%d)\n",
num_texture_tiles[0],num_texture_tiles[1],num_texture_tiles[2],num_texture_tiles[3],
num_texture_tiles[4],num_texture_tiles[5],num_texture_tiles[6],num_texture_tiles[7]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if calculate extra pixels that still fit into memory
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
int buff_head = 0;
int num_offset = 0;
if (x & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_texture_tiles += 2; // int *
buff_head += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_offset += 2; // int *
}
if (y & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 1);
num_texture_tiles += 4; // int *
buff_head += TILESX * (TILESYA >> 1);
num_offset += 4; // int *
}
if (is_border){
gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_texture_tiles += 1; // int *
buff_head += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_offset += 1; // int *
}
gpu_texture_indices += buff_head;
num_texture_tiles += num_offset;
// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
int buf_offset = atomicAdd(num_texture_tiles, 1);
if (is_border){
buf_offset = -buf_offset;
}
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() buff_head=%d, buf_offset = %d, num_offset= %d, is_border=%d\n",
buff_head, buf_offset, num_offset,is_border);
printf("\ngen_texture_list() gpu_texture_indices = 0x%x, gpu_texture_indices + buf_offset = 0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + buf_offset));
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
*(gpu_texture_indices + buf_offset) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
......@@ -2420,7 +2460,7 @@ __global__ void textures_accumulate(
}
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
printf("\textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
__syncthreads();// __syncwarp();
#endif
......@@ -2432,6 +2472,20 @@ __global__ void textures_accumulate(
}
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
tile_num, (int) tile_code, border_tile);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
......@@ -2453,12 +2507,26 @@ __global__ void textures_accumulate(
}
}
}
int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color
int tileY = tile_num / TILESX; // slow, but 1 per tile
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
tileX, tileY, tile_x0, tile_y0, slice_stride);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1); // row inside a tile (0..15)
int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
......@@ -2468,6 +2536,15 @@ __global__ void textures_accumulate(
int gi = g_row * texture_rbg_stride + g_col; // offset to the top left corner
float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
pass, row, col, g_row, g_col, i, gi);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (!border_tile ||
((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
// always copy 3 (1) colors + alpha
......
......@@ -261,6 +261,7 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
......@@ -429,7 +430,6 @@ int main(int argc, char **argv)
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG_TILE
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
......@@ -445,7 +445,6 @@ int main(int argc, char **argv)
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
#endif
// segfault in the next
......@@ -660,24 +659,10 @@ int main(int argc, char **argv)
}
}
}
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerIMCLT);
......@@ -746,12 +731,6 @@ int main(int argc, char **argv)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerCORR);
......@@ -809,8 +788,6 @@ int main(int argc, char **argv)
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
#define TEXTURES_ACCUMULATE
#ifdef TEXTURES_ACCUMULATE
textures_accumulate<<<grid_texture,threads_texture>>> (
0, // int border_tile, // if 1 - watch for border
(int *) 0, // int * woi, // x, y, width,height
......@@ -835,37 +812,9 @@ int main(int argc, char **argv)
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#else
textures_gen<<<grid_texture,threads_texture>>> (
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
......@@ -945,12 +894,6 @@ int main(int argc, char **argv)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURELIST);
......@@ -958,7 +901,7 @@ int main(int argc, char **argv)
sdkDeleteTimer(&timerTEXTURELIST);
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
int cpu_num_texture_tiles[4];
int cpu_num_texture_tiles[8];
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
......@@ -966,7 +909,6 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
checkCudaErrors(cudaMemcpy(
// &cpu_num_texture_tiles,
cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
......@@ -1003,6 +945,121 @@ int main(int argc, char **argv)
}
#endif //GEN_TEXTURE_LIST
#ifndef NOTEXTURE_RGBA
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
generate_RBGA<<<grid_rgba,threads_rgba>>> (
// Parameters to generate texture tasks
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
int rgba_woi_width = cpu_woi[2] * DTT_SIZE;
int rgba_woi_height = cpu_woi[3] * DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES
#ifdef SAVE_CLT
free(cpu_clt);
#endif
......
......@@ -45,9 +45,11 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE 8
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment