Commit 35b71554 authored by Andrey Filippov's avatar Andrey Filippov

Mitigating bug, could not resolve

parent 4398007c
...@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles( ...@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles(
int kernels_vert, //); int kernels_vert, //);
int tilesx); int tilesx);
extern "C" __global__ void correlate2D_inner( extern "C" __global__ void correlate2D_inner(
int num_cams, int num_cams,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
...@@ -1076,12 +1075,12 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1076,12 +1075,12 @@ extern "C" __global__ void correlate2D_inner(
float scale0, // scale for R float scale0, // scale for R
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute float fat_zero2, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius0, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data (either pixel domain or transform domain
extern "C" __global__ void corr2D_normalize_inner( extern "C" __global__ void corr2D_normalize_inner(
int num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
...@@ -1177,12 +1176,12 @@ extern "C" __global__ void correlate2D( ...@@ -1177,12 +1176,12 @@ extern "C" __global__ void correlate2D(
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute float fat_zero2, // here - absolute
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats // const size_t corr_stride, // in floats
size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data float * gpu_corrs) // correlation output data
{ {
...@@ -1247,12 +1246,16 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1247,12 +1246,16 @@ extern "C" __global__ void correlate2D_inner(
float scale1, // scale for B float scale1, // scale for B
float scale2, // scale for G float scale2, // scale for G
float fat_zero2, // here - absolute float fat_zero2, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process int num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats size_t corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius0, // radius of the output correlation (7 for 15x15)
float * gpu_corrs) // correlation output data (either pixel domain or transform domain float * gpu_corrs) // correlation output data (either pixel domain or transform domain
{ {
// int corr_radius = corr_radius0 & 0x1f;// minimal "bad"
// int corr_radius = corr_radius0 & 0xf; // maximal "good"
int corr_radius = corr_radius0 & 0x7; // actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
float scales[3] = {scale0, scale1, scale2}; float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4 int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
...@@ -1268,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1268,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner(
if (corr_pair > pair_list_len){ if (corr_pair > pair_list_len){
return; // BUG - should not happen return; // BUG - should not happen
} }
// int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
// int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
int cam1 = all_pairs[pair_list_start + corr_pair][0]; // number of the first camera in a pair int cam1 = all_pairs[pair_list_start + corr_pair][0]; // number of the first camera in a pair
int cam2 = all_pairs[pair_list_start + corr_pair][1]; // number of the second camera in a pair int cam2 = all_pairs[pair_list_start + corr_pair][1]; // number of the second camera in a pair
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
...@@ -1281,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1281,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner(
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0 float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1); float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr); resetCorrelation(clt_corr);
__syncthreads(); /// ***** Was not here: probably not needed
for (int color = 0; color < colors; color++){ for (int color = 0; color < colors; color++){
// copy clt (frequency domain data) // copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
...@@ -1380,13 +1382,14 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1380,13 +1382,14 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
} // if (color == 1){ // LPF only after B (nothing in mono) } // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){ } // for (int color = 0; color < colors; color++){
// corr_radius = 7;
// Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required // Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
if (corr_radius > 0) { if (corr_radius > 0) {
normalizeTileAmplitude( normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero2); // float fat_zero2 ) // fat zero is absolute, scale it outside fat_zero2); // float fat_zero2 ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?) // Low Pass Filter from constant area (is it possible to replace?)
__syncthreads(); /// ***** Was not here: probably not needed
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
...@@ -1408,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1408,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
#endif #endif
float *clt = clt_corr + threadIdx.x; float *clt = clt_corr + threadIdx.x;
#pragma unroll #pragma unroll
for (int q = 0; q < 4; q++){ for (int q = 0; q < 4; q++){
...@@ -1422,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1422,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner(
} }
} }
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
// corr_radius = 7;
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
...@@ -1434,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1434,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
#endif #endif
dttii_2d(clt_corr); dttii_2d(clt_corr);
// has __syncthreads() inside
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
...@@ -1445,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1445,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
#endif #endif
__syncthreads();
// corr_radius = 7;
corrUnfoldTile( corrUnfoldTile(
corr_radius, // int corr_radius, corr_radius, // int corr_radius,
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports (float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15 (float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads(); __syncthreads();
#ifdef DBG_TILE #ifdef DBG_TILE
...@@ -1467,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner( ...@@ -1467,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner(
#endif #endif
// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1) // copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
int size2r1 = 2 * corr_radius + 1; int size2r1 = 2 * corr_radius + 1; // 15 for full corr tile
int len2r1x2r1 = size2r1 * size2r1; int len2r1x2r1 = size2r1 * size2r1; // 225 for full corr tile
int corr_tile_offset = + corr_stride * corr_num; int corr_tile_offset = + corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_tile_offset; float *mem_corr = gpu_corrs + corr_tile_offset;
#pragma unroll #pragma unroll
// for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread // for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs]; mem_corr[offs] = mclt_corr[offs]; // copy OK
} }
__syncthreads(); __syncthreads();
...@@ -1751,6 +1753,9 @@ extern "C" __global__ void corr2D_normalize_inner( ...@@ -1751,6 +1753,9 @@ extern "C" __global__ void corr2D_normalize_inner(
float fat_zero2, // here - absolute, squared float fat_zero2, // here - absolute, squared
int corr_radius) // radius of the output correlation (7 for 15x15) int corr_radius) // radius of the output correlation (7 for 15x15)
{ {
corr_radius &= 0x7; // actual never >7. Still did not understand where is the problem,
// providing literal "7" in the call does not fix the problem
int corr_in_block = threadIdx.y; int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4 int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
if (corr_num >= num_corr_tiles){ if (corr_num >= num_corr_tiles){
......
...@@ -82,7 +82,8 @@ extern "C" __global__ void correlate2D( ...@@ -82,7 +82,8 @@ extern "C" __global__ void correlate2D(
int tilesx, // number of tile rows int tilesx, // number of tile rows
int * gpu_corr_indices, // packed tile+pair int * gpu_corr_indices, // packed tile+pair
int * pnum_corr_tiles, // pointer to a number of correlation tiles to process int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
const size_t corr_stride, // in floats size_t corr_stride, // in floats
// int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15) int corr_radius, // radius of the output correlation (7 for 15x15)
float * gpu_corrs); // correlation output data float * gpu_corrs); // correlation output data
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
** -----------------------------------------------------------------------------** ** -----------------------------------------------------------------------------**
*/ */
#define NOCORR // #define NOCORR
//#define NOCORR_TD //#define NOCORR_TD
//#define NOTEXTURES_HOST //#define NOTEXTURES_HOST
#define NOTEXTURES #define NOTEXTURES
...@@ -53,6 +53,25 @@ ...@@ -53,6 +53,25 @@
#include "geometry_correction.h" #include "geometry_correction.h"
#include "TileProcessor.cuh" #include "TileProcessor.cuh"
#if TEST_LWIR
#define IMG_WIDTH 640
#define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
// only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3))
float * copyalloc_kernel_gpu(float * kernel_host, float * copyalloc_kernel_gpu(float * kernel_host,
int size, // size in floats int size, // size in floats
...@@ -1259,23 +1278,9 @@ int main(int argc, char **argv) ...@@ -1259,23 +1278,9 @@ int main(int argc, char **argv)
struct tp_task * old_task = &task_data [DBG_TILE]; struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE]; struct tp_task * new_task = &task_data1[DBG_TILE];
#endif #endif
// printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy); #ifdef DBG_TILE
// printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy); printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy); printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
/*
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][0], task_data1[DBG_TILE].xy[ncam][0],
task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
printf("camera %d pY old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][1], task_data1[DBG_TILE].xy[ncam][1],
task_data [DBG_TILE].xy[ncam][1]- task_data1[DBG_TILE].xy[ncam][1]);
}
*/
// for (int ncam = 0; ncam < NUM_CAMS; ncam++){
for (int ncam = 0; ncam < num_cams; ncam++){ for (int ncam = 0; ncam < num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam, printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0), *(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
...@@ -1288,16 +1293,7 @@ int main(int argc, char **argv) ...@@ -1288,16 +1293,7 @@ int main(int argc, char **argv)
(*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) - (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
(*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1))); (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
} }
#endif //#ifdef DBG_TILE
#if 0
// temporarily restore tasks
checkCudaErrors(cudaMemcpy(
gpu_tasks,
&task_data,
tp_task_size * sizeof(struct tp_task),
cudaMemcpyHostToDevice));
#endif
#endif // TEST_GEOM_CORR #endif // TEST_GEOM_CORR
...@@ -1305,13 +1301,8 @@ int main(int argc, char **argv) ...@@ -1305,13 +1301,8 @@ int main(int argc, char **argv)
StopWatchInterface *timerTP = 0; StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP); sdkCreateTimer(&timerTP);
#if 0
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
#else
dim3 threads_tp(1, 1, 1); dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1); dim3 grid_tp(1, 1, 1);
#endif
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z); printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z); printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
...@@ -1333,7 +1324,6 @@ int main(int argc, char **argv) ...@@ -1333,7 +1324,6 @@ int main(int argc, char **argv)
gpu_kernels, // float ** gpu_kernels, gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images, gpu_images, // float ** gpu_images,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task tp_task_size, // int num_tiles) // number of tiles in task
...@@ -1373,12 +1363,10 @@ int main(int argc, char **argv) ...@@ -1373,12 +1363,10 @@ int main(int argc, char **argv)
gpu_clt_h[ncam], gpu_clt_h[ncam],
rslt_size * sizeof(float), rslt_size * sizeof(float),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
//#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]); printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path ports_clt_file[ncam]); // const char * path) // file path
//#endif
} }
#endif #endif
...@@ -1453,14 +1441,11 @@ int main(int argc, char **argv) ...@@ -1453,14 +1441,11 @@ int main(int argc, char **argv)
// 3* (IMG_HEIGHT + DTT_SIZE), // 3* (IMG_HEIGHT + DTT_SIZE),
num_colors* (IMG_HEIGHT + DTT_SIZE), num_colors* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
///#ifndef DBG_TILE
printf("Writing RBG data to %s\n", result_rbg_file[ncam]); printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path result_rbg_file[ncam]); // const char * path) // file path
///#endif
} }
free(cpu_corr_image); free(cpu_corr_image);
...@@ -1482,7 +1467,6 @@ int main(int argc, char **argv) ...@@ -1482,7 +1467,6 @@ int main(int argc, char **argv)
} }
correlate2D<<<1,1>>>( correlate2D<<<1,1>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
// 0, // int * sel_pairs, // unused bits should be 0
sel_pairs[0], // int sel_pairs0 // unused bits should be 0 sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0 sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0 sel_pairs[2], // int sel_pairs2, // unused bits should be 0
...@@ -1778,7 +1762,7 @@ int main(int argc, char **argv) ...@@ -1778,7 +1762,7 @@ int main(int argc, char **argv)
#ifndef NSAVE_CORR #ifndef NSAVE_CORR
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n", printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ; result_corr_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile( writeFloatsToFile(
corr_img, // float * data, // allocated array corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements corr_img_size, // int size, // length in elements
...@@ -1985,8 +1969,7 @@ int main(int argc, char **argv) ...@@ -1985,8 +1969,7 @@ int main(int argc, char **argv)
cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo, // float * data, // allocated array
diff_rgb_combo_size, // int size, // length in elements diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path result_diff_rgb_combo_file); // const char * path) // file path
#ifdef DBG_TILE
//DBG_TILE
#ifdef DEBUG10 #ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size; int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0; int chn = 0;
...@@ -2002,6 +1985,7 @@ int main(int argc, char **argv) ...@@ -2002,6 +1985,7 @@ int main(int argc, char **argv)
} }
} }
#endif // DEBUG9 #endif // DEBUG9
#endif //#ifdef DBG_TILE
#endif #endif
free(cpu_textures); free(cpu_textures);
free (cpu_diff_rgb_combo); free (cpu_diff_rgb_combo);
...@@ -2108,7 +2092,7 @@ int main(int argc, char **argv) ...@@ -2108,7 +2092,7 @@ int main(int argc, char **argv)
diff_rgb_combo_size, // int size, // length in elements diff_rgb_combo_size, // int size, // length in elements
result_diff_rgb_combo_file); // const char * path) // file path result_diff_rgb_combo_file); // const char * path) // file path
//DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG10 #ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size; int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0; int chn = 0;
...@@ -2124,6 +2108,8 @@ int main(int argc, char **argv) ...@@ -2124,6 +2108,8 @@ int main(int argc, char **argv)
} }
} }
#endif // DEBUG9 #endif // DEBUG9
#endif //#ifdef DBG_TILE
#endif #endif
free(cpu_textures); free(cpu_textures);
free (cpu_diff_rgb_combo); free (cpu_diff_rgb_combo);
...@@ -2243,6 +2229,8 @@ int main(int argc, char **argv) ...@@ -2243,6 +2229,8 @@ int main(int argc, char **argv)
rslt_rgba_size, // int size, // length in elements rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path result_textures_rgba_file); // const char * path) // file path
#endif #endif
#ifdef DBG_TILE
#ifdef DEBUG11 #ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]); int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){ for (int chn = 0; chn < rbga_slices; chn++){
...@@ -2257,6 +2245,7 @@ int main(int argc, char **argv) ...@@ -2257,6 +2245,7 @@ int main(int argc, char **argv)
} }
} }
#endif // DEBUG11 #endif // DEBUG11
#endif //#ifdef DBG_TILE
free(cpu_textures_rgba); free(cpu_textures_rgba);
#endif // ifndef NOTEXTURES #endif // ifndef NOTEXTURES
......
...@@ -41,22 +41,11 @@ ...@@ -41,22 +41,11 @@
#ifndef JCUDA #ifndef JCUDA
#include <stdio.h> #include <stdio.h>
#define THREADSX (DTT_SIZE) #define THREADSX (DTT_SIZE)
#define TEST_LWIR 1
#define NUM_CAMS 16 // now maximal number of cameras #define NUM_CAMS 16 // now maximal number of cameras
//#define NUM_PAIRS 6 //#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3 //#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64] // kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#if TEST_LWIR #define TEST_LWIR 1
#define IMG_WIDTH 640
#define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define KERNELS_LSTEP 4 #define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8 #define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4 #define TILES_PER_BLOCK 4
...@@ -79,30 +68,22 @@ ...@@ -79,30 +68,22 @@
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor #define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor #define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor #define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task //#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation #define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9) //#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan #define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list #define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 0.0 // 56.0// 0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory #define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001); #define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads #define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// only used in C++ test
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define TILESYA ((TILESY +3) & (~3))
#define DEBUG_OOB1 1 #ifdef DEBUG_ANY
//#define DEBUG_OOB1 1
// Use CORR_OUT_RAD for the correlation output // Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40 //#define DBG_TILE_X 40
//#define DBG_TILE_Y 80 //#define DBG_TILE_Y 80
#if TEST_LWIR #if TEST_LWIR
...@@ -119,6 +100,8 @@ ...@@ -119,6 +100,8 @@
//#undef HAS_PRINTF //#undef HAS_PRINTF
#define HAS_PRINTF #define HAS_PRINTF
//7 //7
//#define DEBUG1 1 //#define DEBUG1 1
//#define DEBUG2 1 //#define DEBUG2 1
...@@ -146,8 +129,8 @@ ...@@ -146,8 +129,8 @@
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0) #if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#define DEBUG20 1 // Geometry Correction //#define DEBUG20 1 // Geometry Correction
#define DEBUG21 1 // Geometry Correction //#define DEBUG21 1 // Geometry Correction
//#define DEBUG210 1 //#define DEBUG210 1
////#define DEBUG30 1 ////#define DEBUG30 1
//#define DEBUG22 1 //#define DEBUG22 1
...@@ -155,5 +138,8 @@ ...@@ -155,5 +138,8 @@
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0) #endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA #endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment