Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
14d7689b
Commit
14d7689b
authored
Apr 08, 2025
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
More methods replacing flat code
parent
dc090454
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
306 additions
and
4 deletions
+306
-4
TpHostGpu.cu
src/TpHostGpu.cu
+298
-2
TpHostGpu.h
src/TpHostGpu.h
+2
-1
TpParams.h
src/TpParams.h
+6
-1
No files found.
src/TpHostGpu.cu
View file @
14d7689b
...
...
@@ -280,22 +280,318 @@ void TpHostGpu::setRGBA(){
(m_tpParams.num_colors + 1) * sizeof(float)));
}
void TpHostGpu::testRotMatrices (int num_runs){ // 424
trot_deriv TpHostGpu::testRotMatrices (int num_runs){ // 424
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
dim3 threads_rot (3,3,3);
dim3 grid_rot (m_tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
m_tpParams.num_cams, // int num_cams,
m_gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testRotMatrices pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
trot_deriv rot_deriv{};
checkCudaErrors(cudaMemcpy(
&rot_deriv,
m_gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
return rot_deriv;
}
void TpHostGpu::testReverseDistortions (int num_runs){ // 468
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_rByRDist){
throw std::runtime_error("Error: m_rByRDist is not initialized ");
}
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testReverseDistortions pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(m_tpParams.rbyrdist_len * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
m_gpu_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < m_tpParams.rbyrdist_len; i++){
float err = abs(rByRDist_gen[i] - m_rByRDist[i]);
if (err > max_err){
max_err = err;
}
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
// temporarily restore
if (0) {
checkCudaErrors(cudaMemcpy(
m_gpu_rByRDist,
m_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyHostToDevice));
}
}
void TpHostGpu::testGeomCorrect (int num_runs){ // 534
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
if (!m_ftask_data){
throw std::runtime_error("Error: m_ftask_data is not initialized ");
}
if (!m_ftask_data1){
throw std::runtime_error("Error: m_ftask_data1 is not initialized ");
}
dim3 threads_geom(m_tpParams.num_cams, m_tpParams.tiles_per_block_geom, 1);
dim3 grid_geom ((m_tpParams.tp_tasks_size + m_tpParams.tiles_per_block_geom-1)/m_tpParams.tiles_per_block_geom, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
m_tpParams.num_cams, // int num_cams,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
m_gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
m_ftask_data1,
m_gpu_ftasks,
m_tpParams.tp_tasks_size * m_tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
if( m_tpParams.debug_tile) {
printf("old_task txy = 0x%x\n", *(int *) (m_ftask_data + m_tpParams.tp_tasks_size * m_tpParams.dbg_tile + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)));
}
}
}
void TpHostGpu::testConvertDirect (int num_runs){ // 608
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_kernel_offsets){
throw std::runtime_error("Error: m_gpu_kernel_offsets is not initialized ");
}
if (!m_gpu_kernels){
throw std::runtime_error("Error: m_gpu_kernels is not initialized ");
}
if (!m_gpu_images){
throw std::runtime_error("Error: m_gpu_images is not initialized ");
}
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_active_tiles){
throw std::runtime_error("Error: m_gpu_active_tiles is not initialized ");
}
if (!m_gpu_num_active){
throw std::runtime_error("Error: m_gpu_num_active is not initialized ");
}
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // IMG_WIDTH, // int woi_width,
m_tpParams.img_height, // IMG_HEIGHT, // int woi_height,
0, // m_tpParams.kernels_hor, // KERNELS_HOR, // int kernels_hor,
m_tpParams.kernels_hor, // KERNELS_VERT, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
int num_active_tiles; // calculated by convert_direct
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
m_gpu_num_active, // make it local?
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
saveClt(
m_tpPaths.ports_clt_file, // const char ** paths, // tpPaths.ports_clt_file
"CLT data", // const char * prompt, // "CLT data"
m_gpu_clt_h); // float ** gpu_clt_h);
}
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
const char * prompt, // "CLT data"
float ** gpu_clt_h){ // m_gpu_clt_h
if (!paths) return;
int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam]){
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
m_gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing CLT data to %s\n", paths[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
hfree(cpu_clt);
}
void TpHostGpu::testImclt (int num_runs){ // 682
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testCorrelate2DIntra(int num_runs){
...
...
src/TpHostGpu.h
View file @
14d7689b
...
...
@@ -122,7 +122,7 @@ public:
void
setTextures
();
void
setRGBA
();
void
testRotMatrices
(
int
num_runs
);
// 424
trot_deriv
testRotMatrices
(
int
num_runs
);
// 424
void
testReverseDistortions
(
int
num_runs
);
// 468
void
testGeomCorrect
(
int
num_runs
);
// 534
void
testConvertDirect
(
int
num_runs
);
// 608
...
...
@@ -133,6 +133,7 @@ public:
void
testCorrelate2DIntra
(
int
num_runs
);
void
testCorrelate2DInterSelf
(
int
num_runs
);
void
saveClt
(
const
char
**
paths
,
const
char
*
prompt
,
float
**
gpu_clt_h
);
// for both intra and inter!
void
saveIntraCorrFile
(
const
char
*
path
,
const
char
*
prompt
,
int
num_corrs
,
int
num_corr_indices
,
float
*
gpu_corrs
,
int
*
gpu_corr_indices
,
int
num_sel_sensors
);
void
saveInterCorrFile
(
const
char
*
path
,
const
char
*
prompt
,
int
num_corrs
,
int
num_corr_indices
,
float
*
gpu_corrs_td
,
int
*
gpu_corr_indices
,
int
num_sel_sensors
);
...
...
src/TpParams.h
View file @
14d7689b
...
...
@@ -24,6 +24,9 @@ public:
static
constexpr
int
kernels_hor
=
KERNELS_HOR
;
static
constexpr
int
kernels_vert
=
KERNELS_VERT
;
static
constexpr
int
rbyrdist_len
=
RBYRDIST_LEN
;
static
constexpr
int
tiles_per_block_geom
=
TILES_PER_BLOCK_GEOM
;
//
static
constexpr
int
task_inter_en
=
TASK_INTER_EN
;
// 10 // Task bit to enable interscene correlation
static
constexpr
int
task_corr_en
=
TASK_CORR_EN
;
// 9 // Task bit to enable intrascene correlation (pairs defined separately)
static
constexpr
int
task_text_en
=
TASK_TEXT_EN
;
// 8 // task bit to enable texture generation
...
...
@@ -50,8 +53,10 @@ public:
#ifdef DBG_TILE
static
constexpr
int
debug_tile
{
1
};
static
constexpr
int
dbg_tile
{
DBG_TILE
};
#else
static
constexpr
int
debug_tile
{
0
};
static
constexpr
int
dbg_tile
{
-
1
};
#endif
private
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment