Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
f9641f6c
Commit
f9641f6c
authored
Dec 08, 2021
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Tested nonoverlap textures with 16xmono, without Dynamic Parallelism
parent
29147908
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
146 additions
and
136 deletions
+146
-136
TileProcessor.cuh
src/TileProcessor.cuh
+92
-123
test_tp.cu
src/test_tp.cu
+50
-9
tp_defines.h
src/tp_defines.h
+4
-4
No files found.
src/TileProcessor.cuh
View file @
f9641f6c
...
...
@@ -1131,6 +1131,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx);
...
...
@@ -2082,6 +2083,7 @@ extern "C" __global__ void generate_RBGA(
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
(float *)0, //);//gpu_diff_rgb_combo); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
width);
cudaDeviceSynchronize(); // not needed yet, just for testing
...
...
@@ -2853,30 +2855,31 @@ extern "C" __global__ void textures_nonoverlap(
__syncthreads();
#endif
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
*pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
num_tilesx);
textures_accumulate <<<grid_texture,threads_texture, shared_size>>>( // 65536>>>( //
num_cams, // int num_cams, // number of cameras used
(int *) 0, // int * woi, // x, y, width,height
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
*pnum_texture_tiles, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
colors, // int colors, // number of colors (3/1)
is_lwir, // int is_lwir, // do not perform shot correction
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float weights[3], // scale for R,B,G
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
1, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
num_tilesx);
}
}
...
...
@@ -2909,7 +2912,10 @@ extern "C" __global__ void textures_nonoverlap(
* @param gpu_texture_rbg output array (number of colors +1 + ?) * woi.height * output stride(first woi.width valid) float values (or 0)
* @param texture_stride output stride for non-overlapping texture tile output in floats (or 0 to skip)
* @param gpu_texture_tiles output of the non-overlapping tiles (or 0 to skip)
* @param linescan_order if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
* @param gpu_diff_rgb_combo low-resolution output, with per-camera mismatch and each color average. Will not be calculated if null
* @param tilesx number of tiles in a row. The output order of gpu_diff_rgb_combo (linescan order vs.
* gpu_texture_indices order) is selected by linescan_order; the earlier sign-of-tilesx convention is superseded
*/
extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
int num_cams, // number of cameras used
...
...
@@ -2917,7 +2923,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
struct gc * gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
...
...
@@ -2934,6 +2939,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles
int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
float * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
int tilesx)
{
...
...
@@ -3003,25 +3009,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
float * max_diff_tmp = &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
float * ports_rgb_tmp = &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8 = 0x60 | [4*3][8]
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
// __shared__ union {
// float clt_tiles [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9 = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
// float mclt_debayer [NUM_CAMS][NUM_COLORS][MCLT_UNION_LEN]; // 16 * 1 * 16 * 18 = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
// } shr;
// __shared__ union {
// float mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// // (1 + 1 + 16 + 1 + 1)*16*17 = 0x1540 | (3 + 1 + 4 + 3 + 1)*16*17 = 0xcc0
// // add more
// } shr1;
// __shared__ float port_offsets [NUM_CAMS][2]; // 16 * 2 = 0x20 | 4*2 = 0x8
// __shared__ float ports_rgb_shared [NUM_COLORS][NUM_CAMS]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
// __shared__ float max_diff_shared [NUM_CAMS]; // 16 = 0x10 | 4 = 0x4 | return to system memory (optionally pass null to skip calculation)
// __shared__ float max_diff_tmp [NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
// __shared__ float ports_rgb_tmp [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8 = 0x60 | [4*3][8]
#ifdef DBG_TILE
#ifdef DEBUG7AXX
...
...
@@ -3045,20 +3032,13 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads();// __syncwarp(); // is it needed?
for (int color = 0; color < colors; color++){
// int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
// float * clt_tile = ((float *) shr.clt_tiles[camera_num][color]); // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
// float * clt_tilei = clt_tile + threadIdx.x;
// float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
// float * mclt_tile = (float *) mclt_tiles [camera_num][color];
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
// float * mclt_tmp = (float *) shr1.mclt_tmp[camera_num][color];
int cam_col = (camera_num * colors + color);
float * clt_tile = clt_tiles + cam_col * 2 * DTT_SIZE * DTT_SIZE21; // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
// clt_tiles is union with mclt_debayer, so has to have same step
float * clt_tile = clt_tiles + (camera_num * colors + color) * MCLT_UNION_LEN;
float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here
float * gpu_tile = ((float *) gpu_clt[camera_num]) + (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * mclt_tile = mclt_tiles + (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;
float * mclt_dst = mclt_debayer + (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18
float * mclt_tmp = mclt_tmps + (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21;
float * mclt_tmp = mclt_tmps + (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21;
// 16*17
// no camera_num below
#pragma unroll
for (int q = 0; q < 4; q++) {
...
...
@@ -3098,12 +3078,12 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
0);
#endif
__syncthreads();// __syncwarp();
#ifdef DEBUG7A
XXX
#ifdef DEBUG7A
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){
printf("\ntextures_gen mclt camera = % d, color = %d\n",ncam, color);
printf("\n
3104
textures_gen mclt camera = % d, color = %d\n",ncam, color);
debug_print_mclt(
mclt_tile + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_tile
s
+ (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
color);
}
}
...
...
@@ -3134,6 +3114,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
// copy? - no, just remember to use mclt_tile, not mclt_dst
// will have to copy mclt_tiles -> mclt_dst as they have different gaps
// untested copy for mono mode
#ifdef DEBUG7AXXX
if (tile_num == DBG_TILE) {
// for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
...
...
@@ -3147,49 +3128,54 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG7AXX // Good here
if (tile_num == DBG_TILE) {
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((threadIdx.x == 0) && (camera_num == ccam)){
printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
debug_print_mclt( // broken for camera 1
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
if (tile_num == DBG_TILE) {
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((threadIdx.x == 0) && (camera_num == ccam)){
printf("\n3155 textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
debug_print_mclt( // broken for camera 1
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
printf("3162 camera_num_offs= %d threadIdx.y= %d, color = %d mclt_tile=0x%x, mclt_dst=0x%x\n",
camera_num_offs,threadIdx.y, color, (int) mclt_tile, (int) mclt_dst);
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
//#ifdef DEBUGXXXX // no copy at all
//#pragma unroll
//#pragma unroll
for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
float * msp = mclt_tile + threadIdx.x + n;
float * dst = mclt_dst + threadIdx.x + n;
//#pragma unroll
//#pragma unroll
for (int row = 0; row < DTT_SIZE2; row++){
*dst = *msp;
msp += DTT_SIZE21;
dst += DTT_SIZE21;
}
}
//#endif
__syncthreads();
}
#ifdef DEBUG7AXXX
} //if (colors > 1) else
#ifdef DEBUG7AXX // still good here
if (tile_num == DBG_TILE) {
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((threadIdx.x == 0) && (
camera_num == ccam
)){
printf("\n
textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs
,threadIdx.y, color);
if ((threadIdx.x == 0) && (
(camera_num & 0x3) == (ccam & 0x3)
)){
printf("\n
3185 mclt_tile : textures_gen mclt_tile camera_num_offs= %d camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam
,threadIdx.y, color);
debug_print_mclt( // broken for camera 1
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
// mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_tiles + (ccam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21,
-1);
printf("\n
textures_gen AFTER DEBAER camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,
threadIdx.y, color);
printf("\n
3190 mclt_dst: textures_gen AFTER DEBAER camera_num_offs= %d camera number= %d threadIdx.y= %d, color = %d\n", camera_num_offs, ccam,
threadIdx.y, color);
debug_print_mclt(
mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
// mclt_dst, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
mclt_debayer +(ccam * colors + color) * MCLT_UNION_LEN, // 16 * 18
-1);
/*
printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
...
...
@@ -3197,7 +3183,6 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
mclt_debayer + (ccam * colors * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
*/
}
__syncthreads();// __syncwarp();
}
...
...
@@ -3208,58 +3193,45 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
__syncthreads(); // __syncwarp();
/// return;
#ifdef DEBUG7AXXX
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < num_cams; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 camera_num_offs = %d, cam= %d, color = %d\n", camera_num_offs, ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
}
}
__syncthreads();// __syncwarp();
#endif
#ifdef DEBUG7AXXX
//#ifdef DEBUG22
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
mclt_debayer+ ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
// __shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
#ifdef DEBUG7A
//#ifdef DEBUG22
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\n3227: mclt_tiles + (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21 cam= %d, color = %d\n",ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
mclt_tiles + (ccam * colors + nncol) * 2 * DTT_SIZE * DTT_SIZE21,
-1);
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
// __shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
#ifdef DEBUG7A
//#ifdef DEBUG22
for (int ccam = 0; ccam < num_cams; ccam++) {
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
for (int ccam = 0; ccam < num_cams; ccam++) {
// if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == ccam)){
for (int nncol = 0; nncol < colors; nncol++){
printf("\ntextures_gen AFTER DEBAYERs all cameras cam= %d, color = %d\n", ccam, nncol);
// float * mclt_dst = (float *) shr.mclt_debayer[camera_num][color];
debug_print_mclt(
mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
for (int nncol = 0; nncol < colors; nncol++){
printf("\n 3244 mclt_dst: textures_gen AFTER DEBAER camera number= %d threadIdx.y= %d, color = %d\n", ccam, threadIdx.y, nncol);
debug_print_mclt(
mclt_debayer +(ccam * colors + nncol) * MCLT_UNION_LEN, // 16 * 18
-1);
}
}
__syncthreads();// __syncwarp();
}
__syncthreads();// __syncwarp();
#endif
#ifdef DBG_TILE
int debug = (tile_num == DBG_TILE);
#else
...
...
@@ -3474,20 +3446,17 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
//DBG_TILE
#endif// #ifdef DEBUG7A
int tile_offset = (linescan_order ? tile_num : tile_indx) * num_cams* (colors + 1);
for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
int camera_num = threadIdx.y + camera_num_offs;
// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;// tile_num
// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position
float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_
num * num_cams* (colors + 1)
+ camera_num;//
// float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num * num_cams* (colors + 1) + camera_num;//
float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_
offset
+ camera_num;//
if (threadIdx.x == 0){
*pdiff_rgb_combo = max_diff_shared[camera_num];
}
if (threadIdx.x < colors){
// *(pdiff_rgb_combo + (threadIdx.x + 1) * NUM_CAMS) = ports_rgb_shared[threadIdx.x][camera_num];// [color][camera]
*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]
}
}
...
...
src/test_tp.cu
View file @
f9641f6c
...
...
@@ -861,7 +861,8 @@ int main(int argc, char **argv)
gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) generate_RBGA_params, sizeof(generate_RBGA_params));
/// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0)) *256;
int tile_texture_layers = (texture_colors + 1 + (keep_texture_weights? (num_cams + texture_colors + 1): 0));
int tile_texture_size = tile_texture_layers *256;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
...
...
@@ -1475,7 +1476,7 @@ int main(int argc, char **argv)
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
int linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
int cpu_pnum_texture_tiles = 0;
...
...
@@ -1549,12 +1550,13 @@ int main(int argc, char **argv)
generate_RBGA_params[4], // min_agree, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
gpu_color_weights, // float weights[3], // scale for R,B,G
1, // dust_remove, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
1, // 0,
// int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
keep_texture_weights, // 0, // 1
// int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
(float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures /sizeof(float), // texture_stride, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures, // (float *) 0, // gpu_texture_tiles, //(float *)0);// float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
linescan_order, // int linescan_order, // if !=0 then output gpu_diff_rgb_combo in linescan order, else - in gpu_texture_indices order
gpu_diff_rgb_combo, //); // float * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
TILESX);
getLastCudaError("Kernel failure");
...
...
@@ -1568,9 +1570,14 @@ int main(int argc, char **argv)
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
int rslt_texture_size = num_textures * tile_texture_size;
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy(
(float * ) texture_indices,
gpu_texture_indices,
cpu_pnum_texture_tiles * sizeof(float),
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy2D( // something wrong with size
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
...
...
@@ -1578,6 +1585,33 @@ int main(int argc, char **argv)
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
// float non_overlap_layers [tile_texture_layers][TILESY*16][TILESX*16];
int num_nonoverlap_pixels = tile_texture_layers * TILESY*16 * TILESX*16;
float * non_overlap_layers = (float *)malloc(num_nonoverlap_pixels* sizeof(float));
for (int i = 0; i < num_nonoverlap_pixels; i++){
non_overlap_layers[i] = NAN;
}
for (int itile = 0; itile < cpu_pnum_texture_tiles; itile++) { // if (texture_indices[itile] & ((1 << LIST_TEXTURE_BIT))){
int ntile = texture_indices[itile] >> CORR_NTILE_SHIFT;
int tileX = ntile % TILESX;
int tileY = ntile / TILESX;
for (int ilayer = 0; ilayer < tile_texture_layers; ilayer++){
int src_index0 = itile * tile_texture_size + 256 * ilayer;
int dst_index0 = ilayer * (TILESX * TILESYA * 256) + (tileY * 16) * (16 * TILESX) + (tileX * 16);
for (int iy = 0; iy < 16; iy++){
int src_index1 = src_index0 + 16 * iy;
int dst_index1 = dst_index0 + iy * (16 * TILESX);
for (int ix = 0; ix < 16; ix++){
// int src_index = src_index1 + ix;
// int dst_index = dst_index1 + ix;
int src_index= itile * tile_texture_size + 256 * ilayer + 16 * iy + ix;
int dst_index = ilayer * (TILESX * TILESYA * 256) + (tileY * 16 + iy) * (16 * TILESX) + (tileX * 16) + ix;
non_overlap_layers[dst_index] = cpu_textures[src_index];
}
}
}
}
int ntiles = TILESX * TILESY;
int nlayers = num_cams * (num_colors + 1);
...
...
@@ -1604,12 +1638,19 @@ int main(int argc, char **argv)
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
*/
*/
writeFloatsToFile(
non_overlap_layers, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
/*
* non_overlap_layers
writeFloatsToFile(
cpu_diff_rgb_combo, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
*/
printf("Writing low-res data to %s\n", result_diff_rgb_combo_file);
writeFloatsToFile(
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
...
...
src/tp_defines.h
View file @
f9641f6c
...
...
@@ -106,8 +106,8 @@
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
#define DBG_TILE_X 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE_X 5
0 // 5
2 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y
19 //
5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
...
...
@@ -128,7 +128,7 @@
//#define DEBUG6 1
// #define DEBUG7 1
#define DEBUG7A 1
////
#define DEBUG7A 1
/*
#define DEBUG7 1
#define DEBUG8 1
...
...
@@ -148,7 +148,7 @@
#define DEBUG20 1 // Geometry Correction
#define DEBUG21 1 // Geometry Correction
//#define DEBUG210 1
#define DEBUG30 1
////
#define DEBUG30 1
//#define DEBUG22 1
//#define DEBUG23 1
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment