Commit 14d7689b authored by Andrey Filippov

More methods replacing flat code

parent dc090454
@@ -280,22 +280,318 @@ void TpHostGpu::setRGBA(){
(m_tpParams.num_colors + 1) * sizeof(float)));
}
-void TpHostGpu::testRotMatrices (int num_runs){ // 424
+trot_deriv TpHostGpu::testRotMatrices (int num_runs){ // 424
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
dim3 threads_rot (3,3,3);
dim3 grid_rot (m_tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
m_tpParams.num_cams, // int num_cams,
m_gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testRotMatrices pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
trot_deriv rot_deriv{};
checkCudaErrors(cudaMemcpy(
&rot_deriv,
m_gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
return rot_deriv;
}
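// --- Illustrative aside (not part of this commit): the i0 = -1 / "if (i == 0)" pattern above
// runs one untimed warm-up pass before the timer is reset, so one-time startup costs stay out
// of the reported average. A minimal self-contained sketch of the same idiom using plain CUDA
// events instead of the helper StopWatchInterface; the kernel and launch shape are placeholders.
__global__ void sketch_kernel() {}                // stands in for calc_rot_deriv
static float sketch_avg_kernel_ms(int num_runs) {
    cudaEvent_t ev_start, ev_stop;
    cudaEventCreate(&ev_start);
    cudaEventCreate(&ev_stop);
    for (int i = -1; i < num_runs; i++) {         // i == -1 is the untimed warm-up pass
        if (i == 0) {                             // warm-up done: synchronize, then start timing
            cudaDeviceSynchronize();
            cudaEventRecord(ev_start);
        }
        sketch_kernel<<<1, 32>>>();
    }
    cudaEventRecord(ev_stop);
    cudaEventSynchronize(ev_stop);
    float total_ms = 0.0f;
    cudaEventElapsedTime(&total_ms, ev_start, ev_stop);
    cudaEventDestroy(ev_start);
    cudaEventDestroy(ev_stop);
    return total_ms / num_runs;                   // average per timed launch
}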
void TpHostGpu::testReverseDistortions (int num_runs){ // 468
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_rByRDist){
throw std::runtime_error("Error: m_rByRDist is not initialized ");
}
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testReverseDistortions pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(m_tpParams.rbyrdist_len * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
m_gpu_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < m_tpParams.rbyrdist_len; i++){
float err = fabsf(rByRDist_gen[i] - m_rByRDist[i]);
if (err > max_err){
max_err = err;
}
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
// temporarily restore
if (0) {
checkCudaErrors(cudaMemcpy(
m_gpu_rByRDist,
m_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyHostToDevice));
}
}
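// --- Illustrative aside (not part of this commit): the read-back-and-compare block above is a
// pattern the other tests could reuse; a small host-side helper, sketched here under the
// assumption that both tables are plain float arrays of equal length, would factor it out.
#include <cmath>   // for fabsf (normally placed at the top of the file)
static float sketch_max_abs_error(const float *generated, const float *reference, int len) {
    float max_err = 0.0f;
    for (int i = 0; i < len; i++) {
        float err = fabsf(generated[i] - reference[i]);   // element-wise absolute difference
        if (err > max_err) {
            max_err = err;
        }
    }
    return max_err;
}
// Usage in the test above would be:
// max_err = sketch_max_abs_error(rByRDist_gen, m_rByRDist, m_tpParams.rbyrdist_len);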
void TpHostGpu::testGeomCorrect (int num_runs){ // 534
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
if (!m_ftask_data){
throw std::runtime_error("Error: m_ftask_data is not initialized ");
}
if (!m_ftask_data1){
throw std::runtime_error("Error: m_ftask_data1 is not initialized ");
}
dim3 threads_geom(m_tpParams.num_cams, m_tpParams.tiles_per_block_geom, 1);
dim3 grid_geom ((m_tpParams.tp_tasks_size + m_tpParams.tiles_per_block_geom-1)/m_tpParams.tiles_per_block_geom, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
m_tpParams.num_cams, // int num_cams,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
m_gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
m_ftask_data1,
m_gpu_ftasks,
m_tpParams.tp_tasks_size * m_tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
if( m_tpParams.debug_tile) {
printf("old_task txy = 0x%x\n", *(int *) (m_ftask_data + m_tpParams.tp_tasks_size * m_tpParams.dbg_tile + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)));
}
}
}
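// --- Illustrative aside (not part of this commit): grid_geom above is the usual ceiling
// division of the task count by tiles_per_block_geom; a tiny helper makes that intent explicit.
// The name below is a placeholder for illustration only.
static inline int sketch_ceil_div(int num_items, int items_per_block) {
    return (num_items + items_per_block - 1) / items_per_block;   // smallest block count that covers all items
}
// e.g. dim3 grid_geom(sketch_ceil_div(m_tpParams.tp_tasks_size, m_tpParams.tiles_per_block_geom), 1, 1);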
void TpHostGpu::testConvertDirect (int num_runs){ // 608
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_kernel_offsets){
throw std::runtime_error("Error: m_gpu_kernel_offsets is not initialized ");
}
if (!m_gpu_kernels){
throw std::runtime_error("Error: m_gpu_kernels is not initialized ");
}
if (!m_gpu_images){
throw std::runtime_error("Error: m_gpu_images is not initialized ");
}
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_active_tiles){
throw std::runtime_error("Error: m_gpu_active_tiles is not initialized ");
}
if (!m_gpu_num_active){
throw std::runtime_error("Error: m_gpu_num_active is not initialized ");
}
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // IMG_WIDTH, // int woi_width,
m_tpParams.img_height, // IMG_HEIGHT, // int woi_height,
0, // m_tpParams.kernels_hor, // KERNELS_HOR, // int kernels_hor,
m_tpParams.kernels_hor, // KERNELS_VERT, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
int num_active_tiles; // calculated by convert_direct
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
m_gpu_num_active, // make it local?
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
saveClt(
m_tpPaths.ports_clt_file, // const char ** paths, // tpPaths.ports_clt_file
"CLT data", // const char * prompt, // "CLT data"
m_gpu_clt_h); // float ** gpu_clt_h);
}
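// --- Illustrative aside (not part of this commit): the dstride/sizeof(float) argument above
// follows the usual cudaMallocPitch convention - the pitch comes back in bytes, while a kernel
// walking a float image needs the stride in elements. A minimal self-contained sketch of that
// conversion; the image dimensions are arbitrary placeholders.
static void sketch_pitched_image() {
    float *d_img = nullptr;
    size_t pitch_bytes = 0;                                  // row stride in bytes, chosen by the driver
    const int width = 640, height = 480;
    checkCudaErrors(cudaMallocPitch((void**)&d_img, &pitch_bytes, width * sizeof(float), height));
    size_t pitch_floats = pitch_bytes / sizeof(float);       // stride as a float* kernel sees it
    printf("pitch: %zu bytes = %zu floats per row\n", pitch_bytes, pitch_floats);
    checkCudaErrors(cudaFree(d_img));
}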
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
const char * prompt, // "CLT data"
float ** gpu_clt_h){ // m_gpu_clt_h
if (!paths) return;
int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam]){
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing CLT data to %s\n", paths[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
free(cpu_clt);
}
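// --- Illustrative aside (not part of this commit): writeFloatsToFile() belongs to this code
// base and its format is not shown in this diff. A hypothetical stand-in with the same shape
// of interface, assuming a raw binary dump of floats, could look like this
// (relies on <cstdio>, which the file already uses for printf):
static int sketch_write_floats(const float *data, int size, const char *path) {
    FILE *f = fopen(path, "wb");                      // raw float dump (assumed format)
    if (!f) {
        return -1;                                    // could not open the output file
    }
    size_t written = fwrite(data, sizeof(float), (size_t)size, f);
    fclose(f);
    return (written == (size_t)size) ? 0 : -1;        // 0 on success
}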
void TpHostGpu::testImclt (int num_runs){ // 682
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testCorrelate2DIntra(int num_runs){
@@ -122,7 +122,7 @@ public:
void setTextures();
void setRGBA();
-void testRotMatrices (int num_runs); // 424
+trot_deriv testRotMatrices (int num_runs); // 424
void testReverseDistortions (int num_runs); // 468
void testGeomCorrect (int num_runs); // 534
void testConvertDirect (int num_runs); // 608
@@ -133,6 +133,7 @@ public:
void testCorrelate2DIntra (int num_runs);
void testCorrelate2DInterSelf(int num_runs);
void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
// for both intra and inter!
void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
@@ -24,6 +24,9 @@ public:
static constexpr int kernels_hor = KERNELS_HOR;
static constexpr int kernels_vert = KERNELS_VERT;
static constexpr int rbyrdist_len = RBYRDIST_LEN;
static constexpr int tiles_per_block_geom = TILES_PER_BLOCK_GEOM;
//
static constexpr int task_inter_en = TASK_INTER_EN; // 10 // Task bit to enable interscene correlation
static constexpr int task_corr_en = TASK_CORR_EN; // 9 // Task bit to enable intrascene correlation (pairs defined separately)
static constexpr int task_text_en = TASK_TEXT_EN; // 8 // task bit to enable texture generation
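// --- Illustrative aside (not part of this commit): these *_en constants are bit positions in
// the per-tile task word, so individual features are enabled and tested with shifts and masks.
// A sketch of that use; the helper name is a placeholder.
static inline bool sketch_task_bit_set(int task_word, int bit_index) {
    return (task_word & (1 << bit_index)) != 0;      // test one task-enable bit
}
// e.g. sketch_task_bit_set(task_word, task_corr_en) asks whether intrascene correlation is enabled.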
@@ -50,8 +53,10 @@ public:
#ifdef DBG_TILE
static constexpr int debug_tile{1};
static constexpr int dbg_tile{DBG_TILE};
#else
static constexpr int debug_tile{0};
static constexpr int dbg_tile{-1};
#endif
private: