Commit 35b71554 authored by Andrey Filippov's avatar Andrey Filippov

Mitigating bug, could not resolve

parent 4398007c
......@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles(
int kernels_vert, //);
int tilesx);
extern "C" __global__ void correlate2D_inner(
int num_cams,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
......@@ -1076,12 +1075,12 @@ extern "C" __global__ void correlate2D_inner(
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
float fat_zero2, // here - absolute
int num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process
......@@ -1177,12 +1176,12 @@ extern "C" __global__ void correlate2D(
float scale2, // scale for G
float fat_zero2, // here - absolute
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
// const size_t corr_stride, // in floats
size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data
{
......@@ -1247,12 +1246,16 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
int num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
size_t corr_stride, // in floats
int corr_radius0, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{
// int corr_radius = corr_radius0 & 0x1f;// minimal "bad"
// int corr_radius = corr_radius0 & 0xf; // maximal "good"
int corr_radius = corr_radius0 & 0x7; // actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
......@@ -1268,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner(
if (corr_pair > pair_list_len){
return; // BUG - should not happen
}
// int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
// int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
int cam1 = all_pairs[pair_list_start + corr_pair][0]; // number of the first camera in a pair
int cam2 = all_pairs[pair_list_start + corr_pair][1]; // number of the second camera in a pair
__syncthreads();// __syncwarp();
......@@ -1281,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner(
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int color = 0; color < colors; color++){
// copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
......@@ -1380,13 +1382,14 @@ extern "C" __global__ void correlate2D_inner(
#endif
} // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){
// corr_radius = 7;
// Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
if (corr_radius > 0) {
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero2); // float fat_zero2 ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
__syncthreads(); /// ***** Was not here: probably not needed
#ifdef DBG_TILE
#ifdef DEBUG6
......@@ -1408,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
......@@ -1422,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner(
}
}
__syncthreads();// __syncwarp();
// corr_radius = 7;
#ifdef DBG_TILE
#ifdef DEBUG6
......@@ -1434,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
dttii_2d(clt_corr);
// has __syncthreads() inside
#ifdef DBG_TILE
#ifdef DEBUG6
......@@ -1445,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner(
#endif
#endif
__syncthreads();
// corr_radius = 7;
corrUnfoldTile(
corr_radius, // int corr_radius,
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads();
#ifdef DBG_TILE
......@@ -1467,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner(
#endif
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
int size2r1 = 2 * corr_radius + 1;
int len2r1x2r1 = size2r1 * size2r1;
int size2r1 = 2 * corr_radius + 1; // 15 for full corr tile
int len2r1x2r1 = size2r1 * size2r1; // 225 for full corr tile
int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_tile_offset;
#pragma unroll
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs];
mem_corr[offs] = mclt_corr[offs]; // copy OK
}
__syncthreads();
......@@ -1751,6 +1753,9 @@ extern "C" __global__ void corr2D_normalize_inner(
float fat_zero2, // here - absolute, squared
int corr_radius) // radius of the output correlation (7 for 15x15)
{
corr_radius &= 0x7; // actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
if (corr_num >= num_corr_tiles){
......
......@@ -82,7 +82,8 @@ extern "C" __global__ void correlate2D(
int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats
size_t corr_stride, // in floats
// int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data
......
......@@ -30,7 +30,7 @@
** -----------------------------------------------------------------------------**
*/
#define NOCORR
// #define NOCORR
//#define NOCORR_TD
//#define NOTEXTURES_HOST
#define NOTEXTURES
......@@ -53,6 +53,25 @@
#include "geometry_correction.h"
#include "TileProcessor.cuh"
#if TEST_LWIR
#define IMG_WIDTH 640
#define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
// only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3))
float * copyalloc_kernel_gpu(float * kernel_host,
int size, // size in floats
......@@ -1259,23 +1278,9 @@ int main(int argc, char **argv)
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
// printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy);
// printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
#ifdef DBG_TILE
printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
/*
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][0], task_data1[DBG_TILE].xy[ncam][0],
task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
printf("camera %d pY old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][1], task_data1[DBG_TILE].xy[ncam][1],
task_data [DBG_TILE].xy[ncam][1]- task_data1[DBG_TILE].xy[ncam][1]);
}
*/
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
for (int ncam = 0; ncam < num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
......@@ -1288,16 +1293,7 @@ int main(int argc, char **argv)
(*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
}
#if 0
// temporarily restore tasks
checkCudaErrors(cudaMemcpy(
gpu_tasks,
&task_data,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyHostToDevice));
#endif
#endif //#ifdef DBG_TILE
#endif // TEST_GEOM_CORR
......@@ -1305,13 +1301,8 @@ int main(int argc, char **argv)
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
#if 0
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
#else
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
#endif
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
......@@ -1333,7 +1324,6 @@ int main(int argc, char **argv)
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
......@@ -1373,12 +1363,10 @@ int main(int argc, char **argv)
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
//#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path
//#endif
}
#endif
......@@ -1453,14 +1441,11 @@ int main(int argc, char **argv)
// 3* (IMG_HEIGHT + DTT_SIZE),
num_colors* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
///#ifndef DBG_TILE
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
///#endif
}
free(cpu_corr_image);
......@@ -1482,7 +1467,6 @@ int main(int argc, char **argv)
}
correlate2D<<<1,1>>>(
num_cams, // int num_cams,
// 0, // int * sel_pairs, // unused bits should be 0
sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0
......@@ -1778,7 +1762,7 @@ int main(int argc, char **argv)
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
result_corr_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
......@@ -1985,8 +1969,7 @@ int main(int argc, char **argv)
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path
//DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
......@@ -2002,6 +1985,7 @@ int main(int argc, char **argv)
}
}
#endif // DEBUG9
#endif //#ifdef DBG_TILE
#endif
free(cpu_textures);
free (cpu_diff_rgb_combo);
......@@ -2108,7 +2092,7 @@ int main(int argc, char **argv)
diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path
//DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
......@@ -2124,6 +2108,8 @@ int main(int argc, char **argv)
}
}
#endif // DEBUG9
#endif //#ifdef DBG_TILE
#endif
free(cpu_textures);
free (cpu_diff_rgb_combo);
......@@ -2243,6 +2229,8 @@ int main(int argc, char **argv)
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DBG_TILE
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
......@@ -2257,6 +2245,7 @@ int main(int argc, char **argv)
}
}
#endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES
......
......@@ -41,22 +41,11 @@
#ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define TEST_LWIR 1
#define NUM_CAMS 16 // now maximal number of cameras
//#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#if TEST_LWIR
#define IMG_WIDTH 640
#define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define TEST_LWIR 1
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
......@@ -79,30 +68,22 @@
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
//#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
//#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3))
#define DEBUG_OOB1 1
#ifdef DEBUG_ANY
//#define DEBUG_OOB1 1
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
......@@ -119,6 +100,8 @@
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
......@@ -146,8 +129,8 @@
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#define DEBUG20 1 // Geometry Correction
#define DEBUG21 1 // Geometry Correction
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
//#define DEBUG210 1
////#define DEBUG30 1
//#define DEBUG22 1
......@@ -155,5 +138,8 @@
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment