Commit cde525c8 authored by Andrey Filippov's avatar Andrey Filippov

debugging

parent 40239aff
...@@ -1031,7 +1031,11 @@ __global__ void index_direct( ...@@ -1031,7 +1031,11 @@ __global__ void index_direct(
__global__ void index_correlate( __global__ void index_correlate(
int num_cams, int num_cams,
int * sel_pairs, // int * sel_pairs, // unused bits should be 0
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
...@@ -1161,7 +1165,11 @@ __device__ int get_textures_shared_size( // in bytes ...@@ -1161,7 +1165,11 @@ __device__ int get_textures_shared_size( // in bytes
*/ */
extern "C" __global__ void correlate2D( extern "C" __global__ void correlate2D(
int num_cams, int num_cams,
int * sel_pairs, // int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
float scale0, // scale for R float scale0, // scale for R
...@@ -1184,29 +1192,34 @@ extern "C" __global__ void correlate2D( ...@@ -1184,29 +1192,34 @@ extern "C" __global__ void correlate2D(
*pnum_corr_tiles = 0; *pnum_corr_tiles = 0;
index_correlate<<<blocks0,threads0>>>( index_correlate<<<blocks0,threads0>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
sel_pairs, // int * sel_pairs, // sel_pairs, // int * sel_pairs,
sel_pairs0, // int sel_pairs0,
sel_pairs1, // int sel_pairs1,
sel_pairs2, // int sel_pairs2,
sel_pairs3, // int sel_pairs3,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks, // gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task num_tiles, // int num_tiles, // number of tiles in task
tilesx, // int width, // number of tiles in a row tilesx, // int width, // number of tiles in a row
gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks gpu_corr_indices, // int * gpu_corr_indices, // array of correlation tasks
pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array pnum_corr_tiles); // int * pnum_corr_tiles); // pointer to the length of correlation tasks array
cudaDeviceSynchronize(); cudaDeviceSynchronize();
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1); dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1); dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D_inner<<<grid_corr,threads_corr>>>( correlate2D_inner<<<grid_corr,threads_corr>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
colors, // int colors, // number of colors (3/1) colors, // int colors, // number of colors (3/1)
scale0, // float scale0, // scale for R scale0, // float scale0, // scale for R
scale1, // float scale1, // scale for B scale1, // float scale1, // scale for B
scale2, // float scale2, // scale for G scale2, // float scale2, // scale for G
fat_zero, // float fat_zero, // here - absolute fat_zero, // float fat_zero, // here - absolute
*pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process *pnum_corr_tiles, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
corr_stride, // const size_t corr_stride, // in floats corr_stride, // const size_t corr_stride, // in floats
corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15) corr_radius, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data gpu_corrs); // float * gpu_corrs); // correlation output data
} }
} }
...@@ -2493,7 +2506,11 @@ __global__ void create_nonoverlap_list( ...@@ -2493,7 +2506,11 @@ __global__ void create_nonoverlap_list(
*/ */
__global__ void index_correlate( __global__ void index_correlate(
int num_cams, int num_cams,
int * sel_pairs, // unused bits should be 0 // int * sel_pairs, // unused bits should be 0
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16 float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks, // struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
...@@ -2505,6 +2522,7 @@ __global__ void index_correlate( ...@@ -2505,6 +2522,7 @@ __global__ void index_correlate(
if (num_tile >= num_tiles){ if (num_tile >= num_tiles){
return; return;
} }
int sel_pairs[] = {sel_pairs0, sel_pairs1, sel_pairs2, sel_pairs3};
// int task_size = get_task_size(num_cams); // int task_size = get_task_size(num_cams);
int task_task =get_task_task(num_tile, gpu_ftasks, num_cams); int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0? if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?
......
...@@ -65,7 +65,11 @@ extern "C" __global__ void convert_direct( // called with a single block, single ...@@ -65,7 +65,11 @@ extern "C" __global__ void convert_direct( // called with a single block, single
extern "C" __global__ void correlate2D( extern "C" __global__ void correlate2D(
int num_cams, int num_cams,
int * sel_pairs, // int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1) int colors, // number of colors (3/1)
float scale0, // scale for R float scale0, // scale for R
......
...@@ -37,11 +37,16 @@ ...@@ -37,11 +37,16 @@
*/ */
#ifndef JCUDA #ifndef JCUDA
#include "tp_defines.h" #include "tp_defines.h"
#include "dtt8x8.h" #include "dtt8x8.h"
#include "geometry_correction.h" #include "geometry_correction.h"
#endif // #ifndef JCUDA #endif // #ifndef JCUDA
#ifndef get_task_size
#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
#endif
// Using NUM_CAMS threads per tile // Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS) #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM) ///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
......
...@@ -30,6 +30,12 @@ ...@@ -30,6 +30,12 @@
** -----------------------------------------------------------------------------** ** -----------------------------------------------------------------------------**
*/ */
#define NOCORR
#define NOCORR_TD
#define NOTEXTURES
#define NOTEXTURE_RGBA
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <math.h> #include <math.h>
...@@ -297,11 +303,29 @@ int main(int argc, char **argv) ...@@ -297,11 +303,29 @@ int main(int argc, char **argv)
const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist"; const char* rByRDist_file = "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist";
const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector"; const char* correction_vector_file = "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector";
const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction"; const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction";
int sel_pairs[4];
#if TEST_LWIR
// testing with 16 LWIR
int num_cams = 16;
int num_colors = 1;
sel_pairs[0] = 0xffffffff;
sel_pairs[1] = 0xffffffff;
sel_pairs[2] = 0xffffffff;
sel_pairs[3] = 0x00ffffff;
int num_pairs = 120;
#else
// testing with quad RGB // testing with quad RGB
int num_cams = 4; int num_cams = 4;
int num_colors = 3; int num_colors = 3;
int task_size = sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams); sel_pairs[0] = 0x3f;
sel_pairs[1] = 0;
sel_pairs[2] = 0;
sel_pairs[3] = 0;
int num_pairs = 6;
#endif
int task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// FIXME: update to use new correlations and num_cams // FIXME: update to use new correlations and num_cams
float port_offsets4[4][2] = {// used only in textures to scale differences float port_offsets4[4][2] = {// used only in textures to scale differences
...@@ -318,7 +342,6 @@ int main(int argc, char **argv) ...@@ -318,7 +342,6 @@ int main(int argc, char **argv)
} }
} else { } else {
for (int ncam = 0; ncam < num_cams; ncam++) { for (int ncam = 0; ncam < num_cams; ncam++) {
// double alpha = 2 * Math.PI * (i + (topis0 ? 0 : 0.5))/num_sensors;
double alpha = 2 * M_PI * (ncam) /num_cams; // math.h double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
port_offsets[ncam][0] = 0.5 * sin((alpha)); port_offsets[ncam][0] = 0.5 * sin((alpha));
port_offsets[ncam][1] = -0.5 * cos((alpha)); port_offsets[ncam][1] = -0.5 * cos((alpha));
...@@ -338,8 +361,8 @@ int main(int argc, char **argv) ...@@ -338,8 +361,8 @@ int main(int argc, char **argv)
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float)); float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile /// static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile /// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float)); float * ftask_data = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * task_size * sizeof(float)); float * ftask_data1 = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
...@@ -387,8 +410,8 @@ int main(int argc, char **argv) ...@@ -387,8 +410,8 @@ int main(int argc, char **argv)
// GPU pointers to GPU memory // GPU pointers to GPU memory
struct tp_task * gpu_tasks; // TODO: ***** remove ! **** /// struct tp_task * gpu_tasks; // TODO: ***** remove ! **** DONE
float * gpu_ftasks; // TODO: ***** allocate ! **** float * gpu_ftasks; // TODO: ***** allocate ! **** DONE
int * gpu_active_tiles; int * gpu_active_tiles;
int * gpu_num_active; int * gpu_num_active;
int * gpu_num_corr_tiles; int * gpu_num_corr_tiles;
...@@ -473,13 +496,13 @@ int main(int argc, char **argv) ...@@ -473,13 +496,13 @@ int main(int argc, char **argv)
gpu_corrs = alloc_image_gpu( gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width, CORR_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height); num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it) // read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be // allocate all other correlation data, some may be
gpu_corrs_td = alloc_image_gpu( gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * DTT_SIZE * DTT_SIZE, // int width, 4 * DTT_SIZE * DTT_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height); num_pairs * TILESX * TILESY); // int height);
gpu_corrs_combo = alloc_image_gpu( gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes) &dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
...@@ -534,7 +557,8 @@ int main(int argc, char **argv) ...@@ -534,7 +557,8 @@ int main(int argc, char **argv)
for (int ty = 0; ty < TILESY; ty++){ for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){ for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx; int nt = ty * TILESX + tx;
int task_task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS); // int task_task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
int task_task = 0xf | (1 << TASK_CORR_BITS); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16); int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY; float task_target_disparity = DBG_DISPARITY;
float * tp = ftask_data + task_size * nt; float * tp = ftask_data + task_size * nt;
...@@ -548,7 +572,7 @@ int main(int argc, char **argv) ...@@ -548,7 +572,7 @@ int main(int argc, char **argv)
} }
} }
int tp_task_size = sizeof(ftask_data)/sizeof(float)/task_size; int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
int num_active_tiles; // will be calculated by convert_direct int num_active_tiles; // will be calculated by convert_direct
...@@ -574,8 +598,9 @@ int main(int argc, char **argv) ...@@ -574,8 +598,9 @@ int main(int argc, char **argv)
#endif #endif
// segfault in the next // segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float))); /// gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
gpu_ftasks = (float *) copyalloc_kernel_gpu((float * ) &ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float))); // gpu_ftasks = (float *) copyalloc_kernel_gpu((float * ) &ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices - not needed anymore? // build corr_indices - not needed anymore?
/* /*
...@@ -599,7 +624,7 @@ int main(int argc, char **argv) ...@@ -599,7 +624,7 @@ int main(int argc, char **argv)
NUM_PAIRS * TILESX * TILESY); NUM_PAIRS * TILESX * TILESY);
*/ */
// just allocate // just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, NUM_PAIRS * TILESX * TILESY*sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int))); checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
// //
...@@ -849,12 +874,20 @@ int main(int argc, char **argv) ...@@ -849,12 +874,20 @@ int main(int argc, char **argv)
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations; float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM); sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM); printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
/*
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
&task_data1, &task_data1,
gpu_tasks, gpu_tasks,
tp_task_size * sizeof(struct tp_task), tp_task_size * sizeof(struct tp_task),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
*/
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
ftask_data1,
gpu_ftasks,
tp_task_size * task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing #if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE]; struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE]; struct tp_task * new_task = &task_data1[DBG_TILE];
...@@ -1079,10 +1112,13 @@ int main(int argc, char **argv) ...@@ -1079,10 +1112,13 @@ int main(int argc, char **argv)
sdkResetTimer(&timerCORR); sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR); sdkStartTimer(&timerCORR);
} }
// FIXME: update to provide sel_pairs
correlate2D<<<1,1>>>( correlate2D<<<1,1>>>(
num_cams, // int num_cams, num_cams, // int num_cams,
0, // int * sel_pairs, // unused bits should be 0 // 0, // int * sel_pairs, // unused bits should be 0
sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0
sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1) 3, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R color_weights[0], // 0.25, // float scale0, // scale for R
...@@ -1156,8 +1192,11 @@ int main(int argc, char **argv) ...@@ -1156,8 +1192,11 @@ int main(int argc, char **argv)
// FIXME: provide sel_pairs // FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization correlate2D<<<1,1>>>( // output TD tiles, no normalization
num_cams, // int num_cams, num_cams, // int num_cams,
0, // int * sel_pairs, // unused bits should be 0 // 0, // int * sel_pairs, // unused bits should be 0
sel_pairs[0], // int sel_pairs0 // unused bits should be 0
sel_pairs[1], // int sel_pairs1, // unused bits should be 0
sel_pairs[2], // int sel_pairs2, // unused bits should be 0
sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1) 3, // int colors, // number of colors (3/1)
color_weights[0], // 0.25, // float scale0, // scale for R color_weights[0], // 0.25, // float scale0, // scale for R
...@@ -1182,11 +1221,11 @@ int main(int argc, char **argv) ...@@ -1182,11 +1221,11 @@ int main(int argc, char **argv)
gpu_num_corr_tiles, gpu_num_corr_tiles,
sizeof(int), sizeof(int),
cudaMemcpyDeviceToHost)); cudaMemcpyDeviceToHost));
num_corr_combo = num_corrs/NUM_PAIRS; num_corr_combo = num_corrs/num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs) num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
NUM_PAIRS, // int num_pairs, // num pairs per tile (should be the same) num_pairs, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating 1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) 0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
...@@ -1464,7 +1503,8 @@ int main(int argc, char **argv) ...@@ -1464,7 +1503,8 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_clt_h[ncam])); checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam])); checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
} }
checkCudaErrors(cudaFree(gpu_tasks)); // checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_ftasks));
checkCudaErrors(cudaFree(gpu_active_tiles)); checkCudaErrors(cudaFree(gpu_active_tiles));
checkCudaErrors(cudaFree(gpu_num_active)); checkCudaErrors(cudaFree(gpu_num_active));
checkCudaErrors(cudaFree(gpu_kernels)); checkCudaErrors(cudaFree(gpu_kernels));
......
...@@ -41,14 +41,22 @@ ...@@ -41,14 +41,22 @@
#ifndef JCUDA #ifndef JCUDA
#include <stdio.h> #include <stdio.h>
#define THREADSX (DTT_SIZE) #define THREADSX (DTT_SIZE)
#define TEST_LWIR 1 #define TEST_LWIR 0
#define NUM_CAMS 16 // now maximal number of cameras #define NUM_CAMS 16 // now maximal number of cameras
#define NUM_PAIRS 6 //#define NUM_PAIRS 6
#define NUM_COLORS 1 //3 #define NUM_COLORS 1 //3
#define IMG_WIDTH 2592 // kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#define IMG_HEIGHT 1936 #if TEST_LWIR
#define KERNELS_HOR 164 #define IMG_WIDTH 640
#define KERNELS_VERT 123 #define IMG_HEIGHT 512
#define KERNELS_HOR 82 // 80+2
#define KERNELS_VERT 66 // 64+2
#else
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164 // 2592 / 16 + 2
#define KERNELS_VERT 123 // 1936 / 16 + 2
#endif
#define KERNELS_LSTEP 4 #define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8 #define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4 #define TILES_PER_BLOCK 4
...@@ -73,7 +81,7 @@ ...@@ -73,7 +81,7 @@
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor #define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task #define TASK_TEXTURE_BIT 3 // bit to request texture calculation int task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation #define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define CORR_OUT_RAD 4 #define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan #define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list #define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment