Commit dba4dfce authored by Andrey Filippov

implemented calc_rot_deriv()

parent b0f7d665
...@@ -45,6 +45,15 @@ ...@@ -45,6 +45,15 @@
#include "TileProcessor.h" #include "TileProcessor.h"
#endif // #ifndef JCUDA #endif // #ifndef JCUDA
// CUDA fast math is slower!
//#define FASTMATH 1
/*
fast
GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
nofast
GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
*/
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT)) #define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14 //#define IMCLT14
...@@ -1019,19 +1028,7 @@ __global__ void correlate2D( ...@@ -1019,19 +1028,7 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp(); __syncthreads();// __syncwarp();
#endif #endif
#endif #endif
} // if (color == 1){ // LPF only after B (nothing in mono) } // if (color == 1){ // LPF only after B (nothing in mono)
} // for (int color = 0; color < colors; color++){ } // for (int color = 0; color < colors; color++){
normalizeTileAmplitude( normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
...@@ -1083,23 +1080,6 @@ __global__ void correlate2D( ...@@ -1083,23 +1080,6 @@ __global__ void correlate2D(
#endif #endif
#endif #endif
dttii_2d(clt_corr); dttii_2d(clt_corr);
/*
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
*/
#ifdef DBG_TILE #ifdef DBG_TILE
#ifdef DEBUG6 #ifdef DEBUG6
...@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude( ...@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude(
*(clt_tile_j1) * *(clt_tile_j1) + *(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) + *(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3); *(clt_tile_j3) * *(clt_tile_j3);
#ifdef FASTMATH
float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
#else
float scale = rsqrtf(s2); // 1.0/sqrt(s2) float scale = rsqrtf(s2); // 1.0/sqrt(s2)
#endif
*(clt_tile_j0) *= scale; *(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale; *(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale; *(clt_tile_j2) *= scale;
...@@ -3333,7 +3317,12 @@ __device__ void debayer_shot( ...@@ -3333,7 +3317,12 @@ __device__ void debayer_shot(
if (scale_shot > 0.0) { if (scale_shot > 0.0) {
#ifdef FASTMATH
float k = __frsqrt_rn(min_shot);
#else
float k = rsqrtf(min_shot); float k = rsqrtf(min_shot);
#endif
// double k = 1.0/Math.sqrt(min_shot); //sqrtf // double k = 1.0/Math.sqrt(min_shot); //sqrtf
//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i])); //for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
...@@ -3343,7 +3332,14 @@ __device__ void debayer_shot( ...@@ -3343,7 +3332,14 @@ __device__ void debayer_shot(
#pragma unroll #pragma unroll
for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){ for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
float d = *mcltp; float d = *mcltp;
#ifdef FASTMATH
*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
#else
*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d)); *mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
#endif
mcltp += DTT_SIZE; mcltp += DTT_SIZE;
} }
mcltp += (DTT_SIZE21-DTT_SIZE2); mcltp += (DTT_SIZE21-DTT_SIZE2);
...@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba( ...@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba(
s2 += d * d; s2 += d * d;
} }
float mse = (s0*s2 - s1*s1) / (s0 * s0); float mse = (s0*s2 - s1*s1) / (s0 * s0);
#ifdef FASTMATH
* crms_col_i = __fsqrt_rn(mse);
#else
* crms_col_i = sqrtf(mse); * crms_col_i = sqrtf(mse);
#endif
sw += *(chn_weights +ncol) * mse; sw += *(chn_weights +ncol) * mse;
} }
#ifdef FASTMATH
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
#else
*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window *(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
#endif
} }
#ifdef DEBUG9 #ifdef DEBUG9
} }
...@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba( ...@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba(
dc *= wnd2_inv; // to compensate fading near the edges dc *= wnd2_inv; // to compensate fading near the edges
d+= *(chn_weights + ncol) * dc * dc; d+= *(chn_weights + ncol) * dc * dc;
} }
#ifdef FASTMATH
d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#else
d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
#endif
// Add weight to both channels in a pair // Add weight to both channels in a pair
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d; *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d; *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
...@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba( ...@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba(
} }
// TODO: Should it use pair_dist2r ? no as it is relative? // TODO: Should it use pair_dist2r ? no as it is relative?
// port_weights[ip][i] = Math.exp(-ksigma * d2[ip]); // port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
#ifdef FASTMATH
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#else
*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT); *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
#endif
} }
// and now make a new average with those weights // and now make a new average with those weights
// Inserting dust remove here // Inserting dust remove here
...@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba( ...@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba(
for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){ for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
mx = fmaxf(mx, max_diff_tmp[cam][i]); mx = fmaxf(mx, max_diff_tmp[cam][i]);
} }
#ifdef FASTMATH
max_diff[cam] = __fsqrt_rn(mx);
#else
max_diff[cam] = sqrtf(mx); max_diff[cam] = sqrtf(mx);
#endif
} }
} }
......
...@@ -41,6 +41,18 @@ ...@@ -41,6 +41,18 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
// Prototype of the convert_correct_tiles kernel, declared here so that host code
// including this header can launch it. extern "C" keeps the symbol name unmangled.
// The per-camera arguments are arrays of NUM_CAMS device pointers.
extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks, // task list; NOTE(review): presumably num_tiles entries - confirm against the definition
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask); // apply lpf to colors: bit 0 - red, bit 1 - blue, bit 2 - green. Currently always 0!
extern "C" __global__ void clear_texture_list( extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7) int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images int width, // <= TILESX, use for faster processing of LWIR images
...@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg( ...@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg(
int h_offset, int h_offset,
const size_t dstride); // in floats (pixels) const size_t dstride); // in floats (pixels)
// Prototype of the generate_RBGA kernel (extern "C" keeps the symbol name unmangled).
// The first group of parameters describes the texture task list, the second group
// controls the texture generation itself; the result goes to gpu_texture_tiles.
extern "C"
__global__ void generate_RBGA(
// Parameters to generate texture tasks
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7))
int * num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int * woi, // x,y,width,height of the woi
int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
...@@ -72,9 +72,9 @@ ...@@ -72,9 +72,9 @@
// kernels (not used so far) // kernels (not used so far)
#ifdef BBBB #if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode); extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif// #ifdef BBBB #endif// #if 0
//=========================== 2D functions =============== //=========================== 2D functions ===============
extern __device__ void corrUnfoldTile( extern __device__ void corrUnfoldTile(
......
This diff is collapsed.
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
struct tp_task { struct tp_task {
int task; int task;
union { union {
...@@ -62,18 +63,36 @@ struct corr_vector{ ...@@ -62,18 +63,36 @@ struct corr_vector{
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
}; };
// Five sets of per-camera 3x3 matrices, addressable either by name (anonymous struct)
// or by index through `matrices`; both union members overlay the same storage, so the
// order of the named arrays below defines the first index of `matrices`.
// NOTE(review): the d_* members are presumably derivatives of the rotation matrices by
// azimuth, tilt, roll and zoom - confirm against calc_rot_deriv().
union trot_deriv{
struct {
float rots [NUM_CAMS][3][3]; // same storage as matrices[0]
float d_daz [NUM_CAMS][3][3]; // same storage as matrices[1]
float d_tilt [NUM_CAMS][3][3]; // same storage as matrices[2]
float d_roll [NUM_CAMS][3][3]; // same storage as matrices[3]
float d_zoom [NUM_CAMS][3][3]; // same storage as matrices[4]
};
float matrices [5][NUM_CAMS][3][3]; // indexed view: [matrix set][camera][row][column]
};
struct gc { struct gc {
float pixelCorrectionWidth; // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float pixelCorrectionHeight; // =1936;
float line_time; // duration of one scan line readout (for ERS)
float focalLength; // =FOCAL_LENGTH; float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
union {
float distortionA8; //r^8 (normalized to focal length or to sensor half width?) struct {
float distortionA7; //r^7 (normalized to focal length or to sensor half width?) float distortionC; // r^2
float distortionA6; //r^6 (normalized to focal length or to sensor half width?) float distortionB; // r^3
float distortionA5; //r^5 (normalized to focal length or to sensor half width?) float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA; // r^4 (normalized to focal length or to sensor half width?) float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionB; // r^3 float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionC; // r^2 float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
};
float rad_coeff [7];
};
// parameters, common for all sensors // parameters, common for all sensors
float elevation; // degrees, up - positive; float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive float heading; // degrees, CW (from top) - positive
...@@ -81,19 +100,34 @@ struct gc { ...@@ -81,19 +100,34 @@ struct gc {
float forward [NUM_CAMS]; float forward [NUM_CAMS];
float right [NUM_CAMS]; float right [NUM_CAMS];
float height [NUM_CAMS]; float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0 [NUM_CAMS][2];
float common_right; // mm right, camera center float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved) // float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll) // float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius float rXY [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}}; // float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems // only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
}; };
// Prototype of the get_tiles_offsets kernel (extern "C" keeps the symbol name unmangled).
// NOTE(review): the host test launches it with TILES_PER_BLOCK_GEOM threads per block and
// enough blocks to cover num_tiles - confirm that this is the layout the definition expects.
extern "C" __global__ void get_tiles_offsets(
struct tp_task * gpu_tasks, // task list
int num_tiles, // number of tiles in task
struct gc * gpu_geometry_correction, // geometry correction data (struct gc) in device memory
struct corr_vector * gpu_correction_vector, // correction vector in device memory
float * gpu_rByRDist); // length should match RBYRDIST_LEN
// Prototype of the calc_rot_matrices kernel.
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
// NOTE(review): i.e. presumably a single block of (3, 3, NUM_CAMS) threads - confirm against the definition.
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector); // correction vector in device memory
// Prototype of the calc_rot_deriv kernel.
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector, // correction vector in device memory
union trot_deriv * gpu_rot_deriv); // output matrices (union trot_deriv) in device memory
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
//#include "dtt8x8.cuh" //#include "dtt8x8.cuh"
#include "dtt8x8.h" #include "dtt8x8.h"
#include "geometry_correction.h"
#include "TileProcessor.cuh" #include "TileProcessor.cuh"
///#include "cuda_profiler_api.h" ///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h" //#include "cudaProfiler.h"
...@@ -339,6 +340,7 @@ struct tp_task { ...@@ -339,6 +340,7 @@ struct tp_task {
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float)); float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
// static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile static struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
union trot_deriv rot_deriv;
int corr_indices [NUM_PAIRS*TILESX*TILESY]; int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY]; // int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA]; int texture_indices [TILESX*TILESYA];
...@@ -386,13 +388,13 @@ struct tp_task { ...@@ -386,13 +388,13 @@ struct tp_task {
struct gc fgeometry_correction; struct gc fgeometry_correction;
float* correction_vector; float* correction_vector;
int correction_vector_length; int correction_vector_length;
// float rByRDist
float * rByRDist; float * rByRDist;
int rByRDist_length; int rByRDist_length;
float * gpu_geometry_correction; struct gc * gpu_geometry_correction;
float * gpu_correction_vector; struct corr_vector * gpu_correction_vector;
float * gpu_rByRDist; float * gpu_rByRDist;
union trot_deriv * gpu_rot_deriv;
readFloatsFromFile( readFloatsFromFile(
(float *) &fgeometry_correction, // float * data, // allocated array (float *) &fgeometry_correction, // float * data, // allocated array
...@@ -405,11 +407,11 @@ struct tp_task { ...@@ -405,11 +407,11 @@ struct tp_task {
correction_vector_file, // const char * path, correction_vector_file, // const char * path,
&correction_vector_length); // int * len_in_floats) &correction_vector_length); // int * len_in_floats)
gpu_geometry_correction = copyalloc_kernel_gpu( gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &fgeometry_correction, (float *) &fgeometry_correction,
sizeof(fgeometry_correction)/sizeof(float)); sizeof(fgeometry_correction)/sizeof(float));
gpu_correction_vector = copyalloc_kernel_gpu( gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
correction_vector, correction_vector,
correction_vector_length); correction_vector_length);
...@@ -417,6 +419,8 @@ struct tp_task { ...@@ -417,6 +419,8 @@ struct tp_task {
rByRDist, rByRDist,
rByRDist_length); rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
float lpf_rbg[3][64]; // not used float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) { for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) { if (lpf_sigmas[ncol] > 0.0) {
...@@ -597,6 +601,125 @@ struct tp_task { ...@@ -597,6 +601,125 @@ struct tp_task {
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS); gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS); // gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
#define TEST_ROT_MATRICES
#ifdef TEST_ROT_MATRICES
    // Timing and dump test for calc_rot_deriv: launches the kernel with NUM_CAMS blocks of
    // (3,3,3) threads, averages the run time over numIterations passes (passes with
    // i < 0 run before the timer is reset and act as warm-up), then copies the resulting
    // matrices back to the host and prints them.
    // Previous single-block configuration, kept for reference:
    //    dim3 threads_rot(3,3,NUM_CAMS);
    //    dim3 grid_rot (1, 1, 1);
    dim3 threads_rot(3,3,3);
    dim3 grid_rot (NUM_CAMS, 1, 1);

    // dim3 members are unsigned, so they are printed with %u
    printf("ROT_MATRICES: threads_list=(%u, %u, %u)\n",threads_rot.x,threads_rot.y,threads_rot.z);
    printf("ROT_MATRICES: grid_list=(%u, %u, %u)\n",grid_rot.x,grid_rot.y,grid_rot.z);
    StopWatchInterface *timerROT_MATRICES = 0;
    sdkCreateTimer(&timerROT_MATRICES);
    for (int i = i0; i < numIterations; i++)
    {
        if (i == 0)
        {
            // start timing only after the warm-up passes have completed
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&timerROT_MATRICES);
            sdkStartTimer(&timerROT_MATRICES);
        }
        //      calc_rot_matrices<<<grid_rot,threads_rot>>> (
        //              gpu_correction_vector); // struct corr_vector * gpu_correction_vector,
        // Both pointers already have the parameter types, so no casts are needed
        // (casts here would only hide a future type mismatch).
        calc_rot_deriv<<<grid_rot,threads_rot>>> (
                gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
                gpu_rot_deriv);        // union trot_deriv * gpu_rot_deriv);

        getLastCudaError("calc_rot_deriv kernel failure");
        checkCudaErrors(cudaDeviceSynchronize());
        printf("test pass: %d\n",i);
    }
    /// cudaProfilerStop();
    sdkStopTimer(&timerROT_MATRICES);
    float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
    sdkDeleteTimer(&timerROT_MATRICES);
    // report the kernel that was actually timed
    printf("Average calc_rot_deriv run time =%f ms\n", avgTimeROT_MATRICES);

    checkCudaErrors(cudaMemcpy(
            &rot_deriv,
            gpu_rot_deriv,
            sizeof(trot_deriv),
            cudaMemcpyDeviceToHost));

    // Names follow the order of the matrix sets in union trot_deriv
    const char* matrices_names[] = {
            "rot","d_daz","d_tilt","d_roll","d_zoom"};
    for (int i = 0; i < 5;i++){
        printf("Matrix %s for camera\n",matrices_names[i]);
        // one text line per matrix row, with the same row of all cameras side by side
        for (int row = 0; row<3; row++){
            for (int ncam = 0; ncam<NUM_CAMS;ncam++){
                for (int col = 0; col <3; col++){
                    printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
                    if (col == 2){
                        if (ncam == (NUM_CAMS-1)){
                            printf("\n");
                        } else {
                            printf(" ");
                        }
                    } else {
                        printf(" ");
                    }
                }
            }
        }
    }
#endif // TEST_ROT_MATRICES
#define TEST_GEOM_CORR
#ifdef TEST_GEOM_CORR
    // Timing test for get_tiles_offsets: TILES_PER_BLOCK_GEOM threads per block and enough
    // blocks to cover tp_task_size (ceiling division). The run time is averaged over
    // numIterations passes; passes with i < 0 run before the timer is reset (warm-up).
    dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
    dim3 grid_geom ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
    // dim3 members are unsigned, so they are printed with %u
    printf("GEOM: threads_list=(%u, %u, %u)\n",threads_geom.x,threads_geom.y,threads_geom.z);
    printf("GEOM: grid_list=(%u, %u, %u)\n",grid_geom.x,grid_geom.y,grid_geom.z);
    StopWatchInterface *timerGEOM = 0;
    sdkCreateTimer(&timerGEOM);
    for (int i = i0; i < numIterations; i++)
    {
        if (i == 0)
        {
            // start timing only after the warm-up passes have completed
            checkCudaErrors(cudaDeviceSynchronize());
            sdkResetTimer(&timerGEOM);
            sdkStartTimer(&timerGEOM);
        }
        get_tiles_offsets<<<grid_geom,threads_geom>>> (
                gpu_tasks,               // struct tp_task * gpu_tasks,
                tp_task_size,            // int num_tiles, // number of tiles in task list
                gpu_geometry_correction, // struct gc * gpu_geometry_correction,
                gpu_correction_vector,   // struct corr_vector * gpu_correction_vector,
                gpu_rByRDist);           // float * gpu_rByRDist) // length should match RBYRDIST_LEN

        getLastCudaError("get_tiles_offsets kernel failure");
        checkCudaErrors(cudaDeviceSynchronize());
        printf("test pass: %d\n",i);
    }
    /// cudaProfilerStop();
    sdkStopTimer(&timerGEOM);
    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
    sdkDeleteTimer(&timerGEOM);
    // report the kernel that was actually timed (the label used to say "TextureList")
    printf("Average get_tiles_offsets run time =%f ms\n", avgTimeGEOM);
#endif // TEST_GEOM_CORR
//create and start CUDA timer //create and start CUDA timer
StopWatchInterface *timerTP = 0; StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP); sdkCreateTimer(&timerTP);
...@@ -607,28 +730,23 @@ struct tp_task { ...@@ -607,28 +730,23 @@ struct tp_task {
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z); printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z); printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared); cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart(); /// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS]; float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++) for (int i = i0; i < numIterations; i++)
{ {
if (i == 0) if (i == 0)
{ {
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP); sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP); sdkStartTimer(&timerTP);
} }
convert_correct_tiles<<<grid_tp,threads_tp>>>( convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets, fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels, gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images, gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks, gpu_tasks, // struct tp_task * gpu_tasks,
...@@ -638,11 +756,11 @@ struct tp_task { ...@@ -638,11 +756,11 @@ struct tp_task {
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green 0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed"); getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize()); checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i); printf("%d\n",i);
} }
// checkCudaErrors(cudaDeviceSynchronize()); // checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP); sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations; float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP); sdkDeleteTimer(&timerTP);
...@@ -1154,6 +1272,8 @@ struct tp_task { ...@@ -1154,6 +1272,8 @@ struct tp_task {
checkCudaErrors(cudaFree(gpu_geometry_correction)); checkCudaErrors(cudaFree(gpu_geometry_correction));
checkCudaErrors(cudaFree(gpu_correction_vector)); checkCudaErrors(cudaFree(gpu_correction_vector));
checkCudaErrors(cudaFree(gpu_rByRDist)); checkCudaErrors(cudaFree(gpu_rByRDist));
checkCudaErrors(cudaFree(gpu_rot_deriv));
free (rByRDist); free (rByRDist);
free (correction_vector); free (correction_vector);
......
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
// Avoiding includes in jcuda, all source files will be merged // Avoiding includes in jcuda, all source files will be merged
#pragma once #pragma once
#ifndef JCUDA #ifndef JCUDA
#include <stdio.h>
#define THREADSX (DTT_SIZE) #define THREADSX (DTT_SIZE)
#define NUM_CAMS 4 #define NUM_CAMS 4
#define NUM_PAIRS 6 #define NUM_PAIRS 6
...@@ -72,7 +73,11 @@ ...@@ -72,7 +73,11 @@
#define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list #define THREADS_DYNAMIC_BITS 5 // treads in block for CDP creation of the texture list
#define DBG_DISPARITY 32.0 // disparity for which to calculate offsets (not needed in Java) #define DBG_DISPARITY 32.0 // disparity for which to calculate offsets (not needed in Java)
#define RBYRDIST_LEN 20001 // length of #define RBYRDIST_LEN 5001 // for doubles 10001 - floats // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
#define TILES_PER_BLOCK_GEOM 32 // each tile has NUM_CAMS threads
//#undef HAS_PRINTF //#undef HAS_PRINTF
#define HAS_PRINTF #define HAS_PRINTF
//7 //7
...@@ -87,10 +92,15 @@ ...@@ -87,10 +92,15 @@
#define DEBUG8 1 #define DEBUG8 1
#define DEBUG9 1 #define DEBUG9 1
*/ */
#define DEBUG10 1 //textures
#define DEBUG11 1 //#define DEBUG10 1
#define DEBUG12 1 //#define DEBUG11 1
//#define DEBUG12 1
//#define USE_textures_gen //#define USE_textures_gen
#define DEBUG_OOB1 1 //#define DEBUG_OOB1 1
// geom
#define DEBUG20 1
#endif //#ifndef JCUDA #endif //#ifndef JCUDA
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment