Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
11455aa9
Commit
11455aa9
authored
Apr 04, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
compared with java, disabled duplicate function to reduce ptx size
parent
9619c4ac
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
670 additions
and
67 deletions
+670
-67
TileProcessor.cuh
src/TileProcessor.cuh
+594
-41
dtt8x8.cu
src/dtt8x8.cu
+76
-26
No files found.
src/TileProcessor.cuh
View file @
11455aa9
...
...
@@ -73,6 +73,7 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#undef HAS_PRINTF
//#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
...
...
@@ -85,6 +86,9 @@
#define DEBUG8 1
#define DEBUG9 1
*/
#define DEBUG10 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
...
...
@@ -1128,7 +1132,8 @@ extern "C"
__global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C"
__global__ void gen_texture_list(
struct tp_task * gpu_tasks,
...
...
@@ -1137,6 +1142,36 @@ __global__ void gen_texture_list(
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C"
__global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -1407,6 +1442,175 @@ Java code:
}
#define USE_CDP
#ifdef USE_CDP
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
// TODO use atomic_add to increment num_texture_tiles
// TODO calculate woi
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
if (threadIdx.x == 0) {
clear_texture_list<<<blocks0,threads0>>>(
gpu_texture_indices,
width,
height);
cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + ((1 << THREADS_DYNAMIC_BITS)) -1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = TILESX;
*(woi + 1) = TILESY;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices,// packed tile + bits (now only (1 << 7)
num_texture_tiles, // number of texture tiles to process
woi); // x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0); // width
*(woi + 3) += 1 - *(woi + 1); // height
}
__syncthreads();
// Zero output textures. Trim
// texture_rbga_stride
int texture_width = *(woi + 2) * DTT_SIZE;
int texture_tiles_height = *(woi + 3) * DTT_SIZE;
int texture_height = texture_tiles_height * DTT_SIZE;
int texture_slices = colors + 1;
if (threadIdx.x == 0) {
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
clear_texture_rbga<<<blocks2,threads2>>>(
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
gpu_texture_tiles) ; // float * gpu_texture_tiles);
// Run 8 times - first 4 1-tile offsets inner tiles (w/o verifying margins), then - 4 times with verification and ignoring 4-pixel
// oversize (border 16x116 tiles overhang by 4 pixels)
cudaDeviceSynchronize(); // not needed yet, just for testing
for (int pass = 0; pass < 8; pass++){
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2)); // 1/4
if (border_tile){
ti_offset += TILESX * (TILESYA >> 2) - ntt;
}
/* */
textures_accumulate<<<grid_texture,threads_texture>>>(
border_tile, // int border_tile, // if 1 - watch for border
woi, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ntt, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices + ti_offset, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weight0, // float weight0, // scale for R
weight1, // float weight1, // scale for B
weight2, // float weight2, // scale for G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
texture_rbga_stride, // size_t texture_rbg_stride, // in floats
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles); // (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
/* */
}
}
__syncthreads();
}
// blockDim.x * gridDim.x >= width
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
if (col > texture_width) {
return;
}
int row = (blockIdx.y << 3); // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix) = 0.0;
pix += texture_rbga_stride;
}
}
/**
* prepare list of texture tiles, woi, and calculate orthogonal neighbors for tiles (in 4 bits of the task field
* use 4x8=32 threads,
...
...
@@ -1416,6 +1620,8 @@ __global__ void prepare_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// modified to have 8 length - split each subsequence into non-border/border tiles. Non-border will grow up,
// border - down from the sam3\e 1/4 of the buffer
int * num_texture_tiles, // number of texture tiles to process (4 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
...
...
@@ -1426,7 +1632,9 @@ __global__ void prepare_texture_list(
// int task_num = blockIdx.x;
// int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
// int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
dim3 blocks0 (blocks_x, height, 1);
if (threadIdx.x == 0) {
...
...
@@ -1436,40 +1644,45 @@ __global__ void prepare_texture_list(
height);
cudaDeviceSynchronize(); // not needed yet, just for testing
dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_t = (num_tiles + 1) >> THREADS_DYNAMIC_BITS;//
int blocks_t = (num_tiles +
((1 << THREADS_DYNAMIC_BITS)) -
1) >> THREADS_DYNAMIC_BITS;//
dim3 blocks(blocks_t, 1, 1);
// mark used tiles in gpu_texture_indices memory
mark_texture_tiles <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize();
// not needed yet, just for testing
cudaDeviceSynchronize();
// mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
*(woi + 0) = TILESX;
*(woi + 1) = TILESY;
*(woi + 2) = 0; // maximal x
*(woi + 3) = 0; // maximal y
mark_texture_neighbor_tiles <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices); // packed tile + bits (now only (1 << 7)
cudaDeviceSynchronize(); // not needed yet, just for testing
num_tiles, // number of tiles in task list
gpu_texture_indices, // packed tile + bits (now only (1 << 7)
woi); // min_x, min_y, max_x, max_y
cudaDeviceSynchronize();
// Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
*(woi + 0) = TILESX;
*(woi + 1) = TILESY;
*(woi + 2) = 0;
*(woi + 3) = 0;
*(num_texture_tiles+0) = 0;
*(num_texture_tiles+1) = 0;
*(num_texture_tiles+2) = 0;
*(num_texture_tiles+3) = 0;
*(num_texture_tiles+4) = 0;
*(num_texture_tiles+5) = 0;
*(num_texture_tiles+6) = 0;
*(num_texture_tiles+7) = 0;
gen_texture_list <<<blocks,threads>>>(
gpu_tasks,
num_tiles, // number of tiles in task list
gpu_texture_indices,
// packed tile + bits (now only (1 << 7)
gpu_texture_indices,// packed tile + bits (now only (1 << 7)
num_texture_tiles, // number of texture tiles to process
woi);
// x,y,width,height of the woi
woi);
// x,y, here woi[2] = max_X, woi[3] - max-Y
cudaDeviceSynchronize(); // not needed yet, just for testing
*(woi + 2) += 1 - *(woi + 0);
*(woi + 3) += 1 - *(woi + 1);
*(woi + 2) += 1 - *(woi + 0);
// width
*(woi + 3) += 1 - *(woi + 1);
// height
}
__syncthreads();
}
...
...
@@ -1500,13 +1713,10 @@ __global__ void mark_texture_tiles(
if (task_num >= num_tiles) {
return; // nothing to do
}
// struct tp_task * gpu_task = &gpu_tasks[task_num];
// int task = gpu_task->task;
int task = gpu_tasks[task_num].task;
if (!(task & TASK_TEXTURE_BITS)){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
}
// int cxy = gpu_task->txy;
int cxy = gpu_tasks[task_num].txy;
*(gpu_texture_indices + (cxy & 0xffff) + (cxy >> 16) * TILESX) = 1;
}
...
...
@@ -1517,7 +1727,9 @@ extern "C"
__global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices) // packed tile + bits (now only (1 << 7)
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi) // x,y,width,height of the woi
{
int task_num = blockDim.x * blockIdx.x + threadIdx.x;
if (task_num >= num_tiles) {
...
...
@@ -1533,6 +1745,10 @@ __global__ void mark_texture_neighbor_tiles(
int cxy = gpu_tasks[task_num].txy;
int x = (cxy & 0xffff);
int y = (cxy >> 16);
atomicMin(woi+0, x);
atomicMin(woi+1, y);
atomicMax(woi+2, x);
atomicMax(woi+3, y);
int d = 0;
if ((y > 0) && *(gpu_texture_indices + x + (y - 1) * TILESX)) d |= (1 << TASK_TEXTURE_N_BIT);
if ((x < (TILESX - 1)) && *(gpu_texture_indices + (x + 1) + y * TILESX)) d |= (1 << TASK_TEXTURE_E_BIT);
...
...
@@ -1547,45 +1763,44 @@ __global__ void gen_texture_list(
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi) //
x,y,width,height of the woi
int * woi) //
min_x, min_y, max_x, max_y input
{
// int *gpu_texture_indices1 = gpu_texture_indices0 + TILESX * (TILESYA >> 2);
// int *gpu_texture_indices2 = gpu_texture_indices1 + TILESX * (TILESYA >> 2);
// int *gpu_texture_indices3 = gpu_texture_indices2 + TILESX * (TILESYA >> 2);
int task_num = blockDim.x * blockIdx.x + threadIdx.x;
if (task_num >= num_tiles) {
return; // nothing to do
}
// struct tp_task * gpu_task = &gpu_tasks[task_num];
// int task = gpu_task->task;
int task = gpu_tasks[task_num].task & TASK_TEXTURE_BITS;
if (!task){ // here any bit in TASK_TEXTURE_BITS is sufficient
return; // NOP tile
}
// int cxy = gpu_task->txy;
int cxy = gpu_tasks[task_num].txy;
int x = (cxy & 0xffff);
int y = (cxy >> 16);
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if calculate extra pixels that still fit into memory
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
if (x & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_texture_tiles +=
1
; // int *
num_texture_tiles +=
2
; // int *
}
if (y & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 1);
num_texture_tiles += 2; // int *
num_texture_tiles += 4; // int *
}
if (is_border){
gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_texture_tiles += 1; // int *
}
// using atomic operation in global memory - slow, but as operations here are per-til, not per- pixel, it should be OK
atomicMin(woi+0, x
);
atomicMin(woi+1, y);
atomicMax(woi+2, x)
;
atomicMax(woi+3, y);
*(gpu_texture_indices +
atomicAdd(num_texture_tiles, 1)
) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
int buf_offset = atomicAdd(num_texture_tiles, 1
);
if (is_border){
buf_offset = -buf_offset
;
}
*(gpu_texture_indices +
buf_offset
) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
//CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
#endif //#ifdef USE_CDP
...
...
@@ -1669,6 +1884,9 @@ __global__ void convert_correct_tiles(
}
}
}
//#undef USE_textures_gen
#ifdef USE_textures_gen
extern "C"
__global__ void textures_gen(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -1933,10 +2151,347 @@ __global__ void textures_gen(
}
__syncthreads();// __syncwarp();
#endif
}
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
float weights[3] = {weight0, weight1, weight2};
// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
int camera_num = threadIdx.y;
int tile_indx = blockIdx.x; // * TEXTURE_TILES_PER_BLOCK + tile_in_block;
if (tile_indx >= num_texture_tiles){
return; // nothing to do
}
// get number of tile
int tile_code = gpu_texture_indices[tile_indx];
if ((tile_code & (1 << CORR_TEXTURE_BIT)) == 0){
return; // nothing to do
}
int tile_num = tile_code >> CORR_NTILE_SHIFT;
__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
__shared__ union {
float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // NUM_CAMS == 4
float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; // to align with clt_tiles
} shr;
__shared__ union {
float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// add more
} shr1;
// __shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
// __shared__ float color_avg [NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
__shared__ float port_offsets[NUM_CAMS][2];
__shared__ float ports_rgb [NUM_CAMS][NUM_COLORS]; // return to system memory (optionally pass null to skip calculation)
__shared__ float max_diff [NUM_CAMS]; // return to system memory (optionally pass null to skip calculation)
if (threadIdx.x < 2){
port_offsets[camera_num][threadIdx.x] = * (gpu_port_offsets + 2 * camera_num + threadIdx.x);
}
#ifdef DBG_TILE
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile = %d\n",tile_num);
// debug_print_clt1(clt_tile1, color, 0xf); //
// printf("\textures_gen tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
// debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// serially for each color, parallel for each camera
// copy clt (frequency domain data)
for (int color = 0; color < colors; color++){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
float * clt_tile = ((float *) shr.clt_tiles[camera_num][color]); // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
float * clt_tilei = clt_tile + threadIdx.x;
float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * mclt_tile = (float *) mclt_tiles [camera_num][color];
float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
float * mclt_tmp = (float *) shr1.mclt_tmp[camera_num][color];
// float scale = 0.25;
#pragma unroll
for (int q = 0; q < 4; q++) {
float *lpf = lpf_data[(colors > 1)? color : 3] + threadIdx.x; // lpf_data[3] - mono
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){ // copy 32 rows (4 quadrants of 8 rows)
// *clt_tilei = *gpu_tile * (*lpf) * scale;
*clt_tilei = *gpu_tile * (*lpf);
clt_tilei += DTT_SIZE1;
gpu_tile += DTT_SIZE;
lpf += DTT_SIZE;
}
}
__syncthreads();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen LPF for color = %d\n",color);
debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
printf("\ntextures_gen tile = %d, color = %d \n",tile_num, color);
debug_print_clt_scaled(clt_tile, color, 0xf, 0.25); //
}
__syncthreads();// __syncwarp();
#endif
// perform idct
imclt8threads(
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color);
}
__syncthreads();// __syncwarp();
#endif
if (colors > 1) {
debayer_shot(
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#pragma unroll
for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
float * msp = mclt_tile + threadIdx.x + n;
float * dst = mclt_dst + threadIdx.x + n;
#pragma unroll
for (int row = 0; row < DTT_SIZE2; row++){
*dst = *msp;
msp += DTT_SIZE21;
dst += DTT_SIZE21;
}
}
__syncthreads();
}
#ifdef DEBUG77
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
debug_print_mclt(
mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][0], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
} // for (int color = 0; color < colors; color++)
__syncthreads(); // __syncwarp();
/// return;
#ifdef DEBUG77
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
}
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG77
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
(float *) shr.mclt_debayer[ccam][nncol], // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
tile_combine_rgba(
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
// multiply all 4(2) slices by a window (if not all directions)
if (gpu_texture_tiles && (texture_stride != 0)){ // generate non-ovelapping tiles
float * gpu_texture_tile = gpu_texture_tiles + tile_indx * texture_stride;
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
int gi = row * DTT_SIZE2 + col;
float * gpu_texture_tile_gi = gpu_texture_tile + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
// always copy 3 (1) colors + alpha
if (colors == 3){
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1 ; ncol++) { // 12
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
} else { // assuming colors = 1
if (keep_weights) {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1 + NUM_CAMS + 1 + 1 ; ncol++) { // 8
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else {
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
}
}
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
__syncthreads();// __syncwarp();
#endif
} // if (gpu_texture_tiles){ // generate non-ovelapping tiles
tile_code &= TASK_TEXTURE_BITS;
if (!tile_code){
return; // should not happen
}
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int i = row * DTT_SIZE21 + col;
int gi = row * DTT_SIZE2 + col;
float * rgba_i = ((float *) shr1.rgbaw) + i;
// always copy 3 (1) colors + alpha
if (colors == 3){
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[tile_code][gi]; // reduce [tile_code] by LUT
}
} else { // assuming colors = 1
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[tile_code][gi]; // reduce [tile_code] by LUT
}
}
}
}
int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
int tileY = tile_num / TILESX; // slow, but 1 per tile
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1); // row inside a tile (0..15)
int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
int g_row = row + tile_y0;
int g_col = col + tile_x0;
int i = row * DTT_SIZE21 + col;
int gi = g_row * texture_rbg_stride + g_col; // offset to the top left corner
float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
if (!border_tile ||
((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
// always copy 3 (1) colors + alpha
if (colors == 3){
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
} else { // assuming colors = 1
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(gpu_texture_rbg_gi + ncol * slice_stride) += *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
}
}
}
}
} // if (gpu_texture_rbg) { // generate RGBA
} // textures_accumulate()
}
extern "C"
...
...
@@ -2966,9 +3521,7 @@ __device__ void convertCorrectTile(
//#endif
}
//#ifndef NOICLT1
#ifdef NOICLT1
extern "C"
__global__ void test_imclt(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
...
...
@@ -3024,7 +3577,7 @@ __global__ void test_imclt(
__syncthreads();// __syncwarp();
}
}
#endif // NOICLT1
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
...
...
src/dtt8x8.cu
View file @
11455aa9
...
...
@@ -325,6 +325,7 @@ int main(int argc, char **argv)
int * gpu_corr_indices;
float * gpu_textures;
float * gpu_textures_rbga;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_num_texture_tiles;
...
...
@@ -346,6 +347,7 @@ int main(int argc, char **argv)
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
float lpf_rbg[3][64]; // not used
...
...
@@ -491,7 +493,8 @@ int main(int argc, char **argv)
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
...
...
@@ -510,6 +513,15 @@ int main(int argc, char **argv)
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = TILESX * DTT_SIZE;
int rgba_height = TILESY * DTT_SIZE;
int rbga_slices = texture_colors + 1; // 4/1
gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height);
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
...
...
@@ -797,26 +809,53 @@ int main(int argc, char **argv)
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
textures_gen<<<grid_texture,threads_texture>>> (
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#define TEXTURES_ACCUMULATE
#ifdef TEXTURES_ACCUMULATE
textures_accumulate<<<grid_texture,threads_texture>>> (
0, // int border_tile, // if 1 - watch for border
(int *) 0, // int * woi, // x, y, width,height
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // const size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#else
textures_gen<<<grid_texture,threads_texture>>> (
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
...
...
@@ -856,7 +895,7 @@ int main(int argc, char **argv)
result_textures_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG
9
#ifdef DEBUG
10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
...
...
@@ -898,7 +937,7 @@ int main(int argc, char **argv)
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (
4
elements)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (
8
elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY); // int height); // <= TILESY, use for faster processing of LWIR images
...
...
@@ -930,10 +969,20 @@ int main(int argc, char **argv)
// &cpu_num_texture_tiles,
cpu_num_texture_tiles,
gpu_num_texture_tiles,
4 * sizeof(float), // 4 separate sequences
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("cpu_num_texture_tiles=(%d, %d, %d, %d)\n", cpu_num_texture_tiles[0],
cpu_num_texture_tiles[1], cpu_num_texture_tiles[2], cpu_num_texture_tiles[3]);
printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
tp_task_size
);
for (int q = 0; q < 4; q++) {
checkCudaErrors(cudaMemcpy(
texture_indices + q * TILESX * (TILESYA >> 2),
...
...
@@ -980,6 +1029,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment