Commit 12758c3e authored by Andrey Filippov's avatar Andrey Filippov

Before adding offset parameter to textures_accumulate

parent 0638c622
...@@ -978,33 +978,30 @@ __device__ void imclt_plane( // not implemented, not used ...@@ -978,33 +978,30 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels) const size_t dstride); // in floats (pixels)
__global__ void clear_texture_list( extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILES-X, use for faster processing of LWIR images int width, // <= TILES-X, use for faster processing of LWIR images
int height); // <= TILES-Y, use for faster processing of LWIR images int height); // <= TILES-Y, use for faster processing of LWIR images
__global__ void mark_texture_tiles( extern "C" __global__ void mark_texture_tiles(
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
int width, // number of tiles in a row int width, // number of tiles in a row
int * gpu_texture_indices);// packed tile + bits (now only (1 << 7) int * gpu_texture_indices);// packed tile + bits (now only (1 << 7)
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__? extern "C" __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
int width, // number of tiles in a row int width, // number of tiles in a row
int height, // number of tiles rows int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7) int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi int * woi); // x,y,width,height of the woi
__global__ void gen_texture_list( extern "C" __global__ void gen_texture_list(
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
int width, // number of tiles in a row int width, // number of tiles in a row
int height, // number of tiles rows int height, // number of tiles rows
...@@ -1012,7 +1009,7 @@ __global__ void gen_texture_list( ...@@ -1012,7 +1009,7 @@ __global__ void gen_texture_list(
int * num_texture_tiles, // number of texture tiles to process int * num_texture_tiles, // number of texture tiles to process
int * woi); // min_x, min_y, max_x, max_y input int * woi); // min_x, min_y, max_x, max_y input
__global__ void clear_texture_rbga( extern "C" __global__ void clear_texture_rbga(
int texture_width, int texture_width,
int texture_slice_height, int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride const size_t texture_rbga_stride, // in floats 8*stride
...@@ -1044,10 +1041,9 @@ __global__ void index_correlate( ...@@ -1044,10 +1041,9 @@ __global__ void index_correlate(
int * gpu_corr_indices, // array of correlation tasks int * gpu_corr_indices, // array of correlation tasks
int * pnum_corr_tiles); // pointer to the length of correlation tasks array int * pnum_corr_tiles); // pointer to the length of correlation tasks array
__global__ void create_nonoverlap_list( extern "C" __global__ void create_nonoverlap_list(
int num_cams, int num_cams,
float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks , // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int width, // number of tiles in a row int width, // number of tiles in a row
int * nonoverlap_list, // pointer to the calculated number of non-zero tiles int * nonoverlap_list, // pointer to the calculated number of non-zero tiles
...@@ -2279,8 +2275,7 @@ __global__ void mark_texture_tiles( ...@@ -2279,8 +2275,7 @@ __global__ void mark_texture_tiles(
* the result textures to fade along the border. * the result textures to fade along the border.
* *
* @param num_cams number of cameras * @param num_cams number of cameras
* @param gpu_ftasks flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 * @param gpu_ftasks flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// * @param gpu_tasks array of per-tile tasks (struct tp_task)
* @param num_tiles number of tiles int gpu_tasks array prepared for processing * @param num_tiles number of tiles int gpu_tasks array prepared for processing
* @param width number of tiles in a row * @param width number of tiles in a row
* @param height number of tiles rows * @param height number of tiles rows
...@@ -2291,7 +2286,6 @@ __global__ void mark_texture_tiles( ...@@ -2291,7 +2286,6 @@ __global__ void mark_texture_tiles(
__global__ void mark_texture_neighbor_tiles( // TODO: remove __global__? __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
int width, // number of tiles in a row int width, // number of tiles in a row
int height, // number of tiles rows int height, // number of tiles rows
...@@ -2304,12 +2298,10 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__? ...@@ -2304,12 +2298,10 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
return; // nothing to do return; // nothing to do
} }
/// int task = gpu_tasks[task_num].task;
int task = get_task_task(task_num, gpu_ftasks, num_cams); int task = get_task_task(task_num, gpu_ftasks, num_cams);
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile return; // NOP tile
} }
/// int cxy = gpu_tasks[task_num].txy;
int cxy = get_task_txy(task_num, gpu_ftasks, num_cams); int cxy = get_task_txy(task_num, gpu_ftasks, num_cams);
int x = (cxy & 0xffff); int x = (cxy & 0xffff);
...@@ -2344,12 +2336,11 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__? ...@@ -2344,12 +2336,11 @@ __global__ void mark_texture_neighbor_tiles( // TODO: remove __global__?
*/ */
__global__ void gen_texture_list( __global__ void gen_texture_list(
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
/// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list int num_tiles, // number of tiles in task list
int width, // number of tiles in a row int width, // number of tiles in a row
int height, // number of tiles rows int height, // number of tiles rows
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7) int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process int * num_texture_tiles, // number of texture tiles to process
int * woi) // min_x, min_y, max_x, max_y input int * woi) // min_x, min_y, max_x, max_y input
...@@ -2413,7 +2404,10 @@ __global__ void gen_texture_list( ...@@ -2413,7 +2404,10 @@ __global__ void gen_texture_list(
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif // DEBUG12 #endif // DEBUG12
*(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT); // *(gpu_texture_indices + buf_offset) = task | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
// keep only 8 LSBs of task, use higher 24 for task number
*(gpu_texture_indices + buf_offset) = (task & ((1 << CORR_NTILE_SHIFT) -1)) | ((x + y * width) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
//CORR_NTILE_SHIFT
} }
//inline __device__ int get_task_size(int num_cams){ //inline __device__ int get_task_size(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams); // return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
...@@ -2465,7 +2459,7 @@ __global__ void index_direct( ...@@ -2465,7 +2459,7 @@ __global__ void index_direct(
* @param nonoverlap_list integer array to place the generated list * @param nonoverlap_list integer array to place the generated list
* @param pnonoverlap_length single-element integer array return generated list length * @param pnonoverlap_length single-element integer array return generated list length
*/ */
__global__ void create_nonoverlap_list( extern "C" __global__ void create_nonoverlap_list(
int num_cams, int num_cams,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
...@@ -3334,7 +3328,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1) ...@@ -3334,7 +3328,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif // DEBUG12 #endif // DEBUG12
int alpha_mode = alphaIndex[tile_code]; int alpha_mode = alphaIndex[tile_code]; // only 4 lowest bits
if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is. if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
for (int pass = 0; pass < 8; pass ++) { for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1); int row = pass * 2 + (threadIdx.y >> 1);
......
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
#define NOCORR #define NOCORR
#define NOCORR_TD #define NOCORR_TD
//#define NOTEXTURES_HOST #define NOTEXTURES_HOST
#define NOTEXTURES #define NOTEXTURES
#define NOTEXTURE_RGBA #define NOTEXTURE_RGBA
#define SAVE_CLT #define SAVE_CLT
...@@ -345,8 +345,8 @@ void generate_RBGA_host( ...@@ -345,8 +345,8 @@ void generate_RBGA_host(
// TODO: create gpu_woi to pass (copy from woi) // TODO: create gpu_woi to pass (copy from woi)
// set lower 4 bits in each gpu_ftasks task // set lower 4 bits in each gpu_ftasks task
mark_texture_neighbor_tiles <<<blocks,threads>>>( mark_texture_neighbor_tiles <<<blocks,threads>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
num_tiles, // number of tiles in task list num_tiles, // number of tiles in task list
width, // number of tiles in a row width, // number of tiles in a row
height, // number of tiles rows height, // number of tiles rows
...@@ -431,6 +431,22 @@ void generate_RBGA_host( ...@@ -431,6 +431,22 @@ void generate_RBGA_host(
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1); dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2); int blocks_x2 = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical dim3 blocks2 (blocks_x2, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
#ifdef DEBUG8A
int cpu_texture_indices [TILESX*TILESYA];
checkCudaErrors(cudaMemcpy(
(float * ) cpu_texture_indices,
gpu_texture_indices,
TILESX*TILESYA * sizeof(float),
cudaMemcpyDeviceToHost));
for (int i = 0; i < 256; i++){
int indx = cpu_texture_indices[i];
printf("%02d %04x %03d %03d %x\n",i,indx, (indx>>8) / 80, (indx >> 8) % 80, indx&0xff);
}
#endif // #ifdef DEBUG8A
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width, texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height, texture_tiles_height * texture_slices, // int texture_slice_height,
...@@ -461,22 +477,21 @@ void generate_RBGA_host( ...@@ -461,22 +477,21 @@ void generate_RBGA_host(
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n", printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z); threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n"); printf("\n");
#endif #endif
/* */ /* */
int shared_size = host_get_textures_shared_size( // in bytes int shared_size = host_get_textures_shared_size( // in bytes
num_cams, // int num_cams, // actual number of cameras num_cams, // int num_cams, // actual number of cameras
colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
0); // int * offsets); // in floats 0); // int * offsets); // in floats
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
num_cams, // int num_cams, // number of cameras used num_cams, // int num_cams, // number of cameras used
gpu_woi, // int * woi, // x, y, width,height gpu_woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process ntt, // size_t num_texture_tiles, // number of texture tiles to process
// TODO: add int parameter ti_offset (can not pass to the middle of a device array)?
// alternatively (to minimize dynamic parallelism mods) temporarily copy the 1/4 of a gpu_texture_indices
// or still pass parameter?
// first - try to add offset to GPU address
gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction, gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1) colors, // int colors, // number of colors (3/1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment