......@@ -110,14 +110,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
// Length constant for the MCLT union, expressed through DTT_SIZE2 (defined outside this excerpt).
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
// Use CORR_OUT_RAD for the correlation output
// Debug tile selection: X/Y coordinates of the single tile for which debug output is produced.
// Earlier choices are kept as commented-out lines and as trailing comments.
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_Y 111 // 66
// Linear index of the debug tile; kernels compare it with tile_num to enable debug output.
// NOTE(review): 324 looks like the number of tiles per row (TILESX) - confirm, and prefer the named constant if so.
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
// struct tp_task
......@@ -1150,6 +1142,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -1900,11 +1893,11 @@ __global__ void textures_gen(
#endif // ifdef USE_textures_gen
extern "C"
__global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -2006,14 +1999,21 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
// perform idct
#ifdef DBG_TILE // perform idct
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
((tile_num == DBG_TILE) && (threadIdx.x == 0)));
0, // int do_acc, // 1 - add to previous value, 0 - overwrite
clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
mclt_tile, // float * mclt_tile )
__syncthreads();// __syncwarp();
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen mclt color = %d\n",color);
......@@ -2024,6 +2024,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
if (colors > 1) {
#ifdef DBG_TILE
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
......@@ -2032,6 +2033,16 @@ __global__ void textures_accumulate(
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
((tile_num == DBG_TILE) && (threadIdx.x == 0))); // int debug);
(color < 2), // const int rb_mode, // 0 - green, 1 - r/b
min_shot, // float min_shot, // 10.0
scale_shot, // float scale_shot, // 3.0 (0.0 for mono)
mclt_tile, // float * mclt_src, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_dst, // float * mclt_dst, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
mclt_tmp, // float * mclt_tmp,
0); // int debug);
__syncthreads();// __syncwarp();
} else {
// copy? - no, just remember to use mclt_tile, not mclt_dst
......@@ -2105,6 +2116,7 @@ __global__ void textures_accumulate(
__syncthreads();// __syncwarp();
// __shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
#ifdef DBG_TILE
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
......@@ -2120,7 +2132,23 @@ __global__ void textures_accumulate(
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
(tile_num == DBG_TILE) ); //int debug );
colors, // int colors, // number of colors
(float*) shr.mclt_debayer, // float * mclt_tile, // debayer // has gaps to align with union !
(float*) mclt_tiles, // float * rbg_tile, // if not null - original (not-debayered) rbg tile to use for the output
(float *) shr1.rgbaw, // float * rgba, // result
(float * ) 0, // float * ports_rgb, // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
(float * ) 0, // float * max_diff, // maximal (weighted) deviation of each channel from the average /null
(float *) port_offsets, // float * port_offsets, // [port]{x_off, y_off} - just to scale pixel value differences
diff_sigma, // float diff_sigma, // pixel value/pixel change
diff_threshold, // float diff_threshold, // pixel value/pixel change
min_agree, // float min_agree, NOT USED? // minimal number of channels to agree on a point (real number to work with fuzzy averages)
weights, // float * chn_weights, // color channel weights, sum == 1.0
dust_remove, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_weights, // int keep_weights, // return channel weights and rms after A in RGBA (weight are always calculated)
0); //int debug );
// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
// float rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
// size_t texture_tile_offset = + tile_indx * texture_stride;
......@@ -80,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
......@@ -127,6 +127,7 @@ __global__ void generate_RBGA(
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
// TODO: use geometry_correction rXY !
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
......@@ -42,6 +42,7 @@
// Geometry constants.
// NOTE(review): both literals are of type double; if they are used in float device code,
// an 'f' suffix would avoid implicit promotion to double - check the use sites before changing.
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
int task;
union {
......@@ -114,17 +115,19 @@ struct gc {
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
// Prototype of the get_tiles_offsets kernel (single, 6-argument form).
// The last argument, gpu_rot_deriv, follows gpu_rByRDist - the same order in which the
// host launch site passes them.
// NOTE(review): the host code copies the task array back after the launch and compares the
// per-camera xy values, so the kernel presumably updates tp_task::xy in gpu_tasks - confirm
// against the kernel definition.
extern "C" __global__ void get_tiles_offsets(
		struct tp_task     * gpu_tasks,
		int                  num_tiles,               // number of tiles in task
		struct gc          * gpu_geometry_correction,
		struct corr_vector * gpu_correction_vector,
		float              * gpu_rByRDist,            // length should match RBYRDIST_LEN
		union trot_deriv   * gpu_rot_deriv);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
// Prototype of the calc_rot_matrices kernel; its only argument is a pointer to the correction vector.
extern "C"
__global__ void calc_rot_matrices(struct corr_vector * gpu_correction_vector);
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
......@@ -339,7 +339,8 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
......@@ -634,8 +635,8 @@ struct tp_task {
// gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
calc_rot_deriv<<<grid_rot,threads_rot>>> (
(corr_vector * ) gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
(trot_deriv * ) gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
......@@ -683,7 +684,7 @@ struct tp_task {
dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
......@@ -703,7 +704,8 @@ struct tp_task {
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
......@@ -714,6 +716,38 @@ struct tp_task {
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
// gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// static struct tp_task task_data1 [TILESX*TILESY]; // maximal length - each tile
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
tp_task_size * sizeof(struct tp_task),
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
printf("old_task txy = 0x%x\n", task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < NUM_CAMS; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][0], task_data1[DBG_TILE].xy[ncam][0],
task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
printf("camera %d pY old %f new %f diff = %f\n", ncam,
task_data [DBG_TILE].xy[ncam][1], task_data1[DBG_TILE].xy[ncam][1],
task_data [DBG_TILE].xy[ncam][1]- task_data1[DBG_TILE].xy[ncam][1]);
#if 0
// temporarily restore tasks
tp_task_size * sizeof(struct tp_task),
#endif // TEST_GEOM_CORR
......@@ -72,10 +72,22 @@
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
// Each macro below is defined exactly once: redefining a macro with a different replacement
// list is ill-formed, and compilers that accept it anyway use the later definition - the one kept here.
#define DBG_DISPARITY 56.0 // disparity for which to calculate offsets (not needed in Java); was 32.0
#define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
// Tiles handled by one thread block of the geometry kernel. The block is launched as
// (NUM_CAMS, TILES_PER_BLOCK_GEOM, 1) threads, i.e. 32 threads when NUM_CAMS divides 32.
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
// Use CORR_OUT_RAD for the correlation output
// Debug tile selection: X/Y coordinates of the single tile for which debug output is produced.
// Earlier choices are kept as commented-out lines and as trailing comments.
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#define DBG_TILE_X 151 // 161 // 49
#define DBG_TILE_Y 69 // 111 // 66
// Linear index of the debug tile; it is compared with tile_num in kernels and used by the host
// test code to index the task arrays.
// NOTE(review): 324 looks like the number of tiles per row (TILESX) - confirm, and prefer the named constant if so.
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
//#undef HAS_PRINTF
......@@ -99,7 +111,9 @@
//#define USE_textures_gen
//#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
//#define DEBUG20 1
#define DEBUG21 1
#endif //#ifndef JCUDA
