Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
3610b7a6
Commit
3610b7a6
authored
Apr 10, 2025
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
testCorrelate2DIntraTD
parent
6f9c7399
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
352 additions
and
142 deletions
+352
-142
TpHostGpu.cu
src/TpHostGpu.cu
+331
-117
TpHostGpu.h
src/TpHostGpu.h
+5
-2
test_tp.cu
src/test_tp.cu
+16
-23
No files found.
src/TpHostGpu.cu
View file @
3610b7a6
...
...
@@ -549,51 +549,6 @@ void TpHostGpu::testConvertDirect (int num_runs){ // 608
m_gpu_clt_h); // float ** gpu_clt_h);
}
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
const char * prompt, // "CLT data"
float ** gpu_clt_h){ // m_gpu_clt_h
if (!paths) return;
int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && m_gpu_clt_h[ncam]){
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
m_gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing %s to %s\n", prompt, paths[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
hfree(cpu_clt);
}
void TpHostGpu::saveRgb(
const char ** paths, // m_tpPaths.result_rbg_file
const char * prompt, // "RBG data"
float ** gpu_corr_images_h){
if (!paths) return;
int rslt_img_size = m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size) * (m_tpParams.img_width + m_tpParams.dtt_size);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_corr_images_h[ncam]){
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
m_tpParams.num_colors* (m_tpParams.img_height + m_tpParams.dtt_size),
cudaMemcpyDeviceToHost));
printf("Writing %s to %s\n", prompt, paths[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
free(cpu_corr_image);
}
/*
// not implemented
...
...
@@ -716,10 +671,9 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
m_gpu_corrs, // float * gpu_corrs,
m_gpu_corr_indices, // int * gpu_corr_indices)
16); //int num_sel_sensors) { // only for interscene
}
void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
// 889
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
...
...
@@ -748,7 +702,7 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
dim3 grid_tp(1, 1, 1);
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
// use gpu_images and convert to gpu_clt_ref
// use gpu_images and convert to gpu_clt_ref
was 1152:
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
...
...
@@ -865,6 +819,267 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){
gfree(gpu_clt_ref);
}
void TpHostGpu::testCorrelate2DIntraTD (int num_runs, int quad_combine){ // 886 - 1123
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
// check/replace names
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_corrs_td){
throw std::runtime_error("Error: m_gpu_corrs_td is not initialized ");
}
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_corr_indices){
throw std::runtime_error("Error: m_gpu_corr_indices is not initialized ");
}
if (!m_gpu_num_corr_tiles){
throw std::runtime_error("Error: m_gpu_num_corr_tiles is not initialized ");
}
if (!m_gpu_corrs_combo_td){
throw std::runtime_error("Error: m_gpu_corrs_combo_td is not initialized ");
}
if (!m_gpu_corrs_combo_indices){
throw std::runtime_error("Error: m_gpu_corrs_combo_indices is not initialized ");
}
if (!m_gpu_corrs_combo){
throw std::runtime_error("Error: m_gpu_corrs_combo is not initialized ");
}
//m_gpu_corrs_combo
// testing corr
StopWatchInterface *timerCORRTD = 0;
sdkCreateTimer(&timerCORRTD);
int num_corr_combo{};
int num_corrs{}; // will get data from the gpu memory
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORRTD);
sdkStartTimer(&timerCORRTD);
}
// FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization
m_tpParams.num_cams, // int num_cams,
m_tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
m_tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
m_tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
m_tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_tpParams.num_colors, // int colors, // number of colors (3/1)
m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
m_tpParams.tilesx, // int tilesx, // number of tile rows
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride, // in floats
0, // int corr_radius, // radius of the output correlation (7 for 15x15)
m_gpu_corrs_td); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure:correlate2D");
checkCudaErrors(cudaDeviceSynchronize());
printf("correlate2D-TD pass: %d\n",i);
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
if (quad_combine) {
num_corr_combo = num_corrs/m_tpParams.num_pairs;
corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
num_corr_combo, // tp_task_size, // int num_tiles, // number of tiles to process (each with num_pairs)
m_tpParams.num_pairs, // int num_pairs, // num pairs per tile (should be the same)
1, // int init_output, // !=0 - reset output tiles to zero before accumulating
0x0f, // int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_corrs_combo_indices, // int * gpu_combo_indices, // output if noty null: packed tile+pairs_mask (will point to the first used pair
dstride_corr_td/sizeof(float), // const size_t corr_stride, // (in floats) stride for the input TD correlations
m_gpu_corrs_td, // float * gpu_corrs, // input correlation tiles
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
m_gpu_corrs_combo_td); // float * gpu_corrs_combo); // combined correlation output (one per tile)
getLastCudaError("Kernel failure:corr2D_combine");
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corr_combo, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_combo_td/sizeof(float), // const size_t corr_stride_td, // in floats
m_gpu_corrs_combo_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr_combo/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs_combo, // float * gpu_corrs, // correlation output data (pixel domain)
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
printf("corr2D_combine pass: %d\n",i);
}else { // if (quad_combine) {
checkCudaErrors(cudaDeviceSynchronize());
corr2D_normalize<<<1,1>>>(
num_corrs, //tp_task_size, // int num_corr_tiles, // number of correlation tiles to process
dstride_corr_td/sizeof(float), // const size_t corr_stride_td, // in floats
m_gpu_corrs_td, // float * gpu_corrs_td, // correlation tiles in transform domain
(float *) 0, // corr_weights, // float * corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
m_gpu_corrs, // float * gpu_corrs, // correlation output data (pixel domain)
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_tpParams.corr_out_rad); // int corr_radius); // radius of the output correlation (7 for 15x15)
} // if (quad_combine) {
getLastCudaError("Kernel failure:corr2D_normalize");
checkCudaErrors(cudaDeviceSynchronize());
printf("corr2D_normalize pass: %d\n",i);
}
sdkStopTimer(&timerCORRTD);
float avgTimeCORRTD = (float)sdkGetTimerValue(&timerCORRTD) / (float)numIterations;
sdkDeleteTimer(&timerCORRTD);
printf("Average CORR-TD and companions run time =%f ms, num cor tiles (old) = %d\n", avgTimeCORRTD, num_corrs); // 981
if (quad_combine) {
int corr_size_combo = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size_combo = num_corr_combo * corr_size_combo * corr_size_combo;
float * cpu_corr_combo = (float *)malloc(rslt_corr_size_combo * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
m_gpu_corrs_combo,
dstride_corr_combo,
(corr_size_combo * corr_size_combo) * sizeof(float),
num_corr_combo,
cudaMemcpyDeviceToHost));
printf("Writing phase correlation data to %s\n", m_tpPaths.result_corr_quad_file);
writeFloatsToFile(
cpu_corr_combo, // float * data, // allocated array
rslt_corr_size_combo, // int size, // length in elements
m_tpPaths.result_corr_quad_file); // const char * path) // file path
free(cpu_corr_combo);
} else { // if (quad_combine) { // 1006
// Reading / formatting / saving correlate2D(TD) + corr2D_normalize /1007
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
// printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
// int corr_size = 2 * CORR_OUT_RAD + 1;
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
// int num_corr_indices = num_pairs_inter * m_tpParams.num_tiles;
int num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
int rslt_corr_size = num_corrs * m_tpParams.corr_length; // corr_size * corr_size;
int corr_img_size = num_corr_indices * 16*16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
int * cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
m_tpParams.corr_length * sizeof(float),
m_gpu_corrs,
dstride_corr,
m_tpParams.corr_length * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
m_gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
// Reading / formatting / saving correlate2D(TD) + corr2D_normalize
float * corr_img = getCorrImg(
corr_img_size, // int corr_img_size,
num_corr_indices, //int num_corr_indices,
cpu_corr_indices, // int * cpu_corr_indices,
cpu_corr, // float * cpu_corr,
16); // num_sel_sensors); // int num_sel_sensors) // Will not be used
printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
m_tpPaths.result_corr_td_norm_file, (m_tpParams.tilesx * 16),(m_tpParams.tilesya*16),
m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
m_tpPaths.result_corr_td_norm_file); // const char * path) // file path
// export TD intra // 1076
int intra_corr_size_td = num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2; // DTT_SIZE2*DTT_SIZE2;
float * cpu_corr_td = (float *) malloc(intra_corr_size_td * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
(m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float),
m_gpu_corrs_td,
dstride_corr_td,
(m_tpParams.dtt_size2 * m_tpParams.dtt_size2) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
float * corr_img_td = getCorrTdImg(
corr_img_size, // int corr_img_size,
num_corr_indices, //int num_corr_indices,
cpu_corr_indices, // int * cpu_corr_indices,
m_gpu_corrs_td, // float * cpu_corr,
16); // num_sel_sensors); // int num_sel_sensors) // // Will not be used
printf("Writing intrascene phase correlation TD data tp %s\n", m_tpPaths.result_intrascene_td);
writeFloatsToFile(
corr_img_td, // float * data, // allocated array
corr_img_size, // int size, // length in elements
m_tpPaths.result_intrascene_td); // "clt/aux_intrascene-TD.raw"); // const char * path) // file path
free (cpu_corr_td);
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
free (corr_img_td);
} // if (quad_combine) {
}
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
const char * prompt, // "CLT data"
float ** gpu_clt_h){ // m_gpu_clt_h
if (!paths) return;
int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && m_gpu_clt_h[ncam]){
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
m_gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing %s to %s\n", prompt, paths[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
hfree(cpu_clt);
}
void TpHostGpu::saveRgb(
const char ** paths, // m_tpPaths.result_rbg_file
const char * prompt, // "RBG data"
float ** gpu_corr_images_h){
if (!paths) return;
int rslt_img_size = m_tpParams.num_colors * (m_tpParams.img_height + m_tpParams.dtt_size) * (m_tpParams.img_width + m_tpParams.dtt_size);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam] && gpu_corr_images_h[ncam]){
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(m_tpParams.img_width + m_tpParams.dtt_size) * sizeof(float),
m_tpParams.num_colors* (m_tpParams.img_height + m_tpParams.dtt_size),
cudaMemcpyDeviceToHost));
printf("Writing %s to %s\n", prompt, paths[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
free(cpu_corr_image);
}
void TpHostGpu::saveIntraCorrFile(
const char * path,
const char * prompt,
...
...
@@ -919,72 +1134,6 @@ void TpHostGpu::saveIntraCorrFile(
tpPaths.result_inter_td_norm_file); // const char * path) // file path
*/
float * TpHostGpu::getCorrImg(
int corr_img_size,
int num_corr_indices,
int * cpu_corr_indices,
float * cpu_corr,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iy = 0; iy < m_tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
for (int ix = 0; ix < m_tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
return corr_img;
}
float * TpHostGpu::getCorrTdImg(
int corr_img_size,
int num_corr_indices,
int * cpu_corr_indices,
float * cpu_corr_td,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
return corr_img;
}
...
...
@@ -1016,7 +1165,7 @@ void TpHostGpu::saveInterCorrFile(
gpu_corr_indices,
num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
float * corr_img = getCorrTdImg(
float * corr_img
_td
= getCorrTdImg(
corr_img_size, // int corr_img_size,
num_corr_indices, //int num_corr_indices,
cpu_corr_indices, // int * cpu_corr_indices,
...
...
@@ -1024,11 +1173,11 @@ void TpHostGpu::saveInterCorrFile(
num_sel_sensors); // int num_sel_sensors)
printf("Writing %s TD data to %s\n", prompt, path);
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img
_td
, // float * data, // allocated array
corr_img_size, // int size, // length in elements
path); // const char * path) // file path
free (cpu_corr_indices);
free (corr_img);
free (corr_img
_td
);
free (cpu_corr_td);
}
...
...
@@ -1069,7 +1218,72 @@ void TpHostGpu::saveInterCorrIndicesFile(
free (cpu_corr_indices);
}
float * TpHostGpu::getCorrImg(
int corr_img_size,
int num_corr_indices,
int * cpu_corr_indices,
float * cpu_corr,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iy = 0; iy < m_tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
for (int ix = 0; ix < m_tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
return corr_img;
}
float * TpHostGpu::getCorrTdImg(
int corr_img_size,
int num_corr_indices,
int * cpu_corr_indices,
float * cpu_corr_td,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
return corr_img;
}
void TpHostGpu::hfree(float * p) {if (p) free(p); p = {};}
void TpHostGpu::hfree(struct CltExtra * p) {if (p) free(p); p = {};}
...
...
src/TpHostGpu.h
View file @
3610b7a6
...
...
@@ -129,8 +129,11 @@ public:
// void testImclt (int num_runs); // 682 // not implemented
void
testImcltRbgAll
(
int
num_runs
);
// 701
void
testCorrelate2DIntra
(
int
num_runs
);
void
testCorrelate2DInterSelf
(
int
num_runs
);
void
testCorrelate2DIntra
(
int
num_runs
);
// 762 - 885
void
testCorrelate2DInterSelf
(
int
num_runs
);
// 1136 - 1411
void
testCorrelate2DIntraTD
(
int
num_runs
,
int
quad_combine
);
// 886 - 1123
void
saveClt
(
const
char
**
paths
,
const
char
*
prompt
,
float
**
gpu_clt_h
);
void
saveRgb
(
const
char
**
paths
,
const
char
*
prompt
,
float
**
gpu_corr_images_h
);
...
...
src/test_tp.cu
View file @
3610b7a6
...
...
@@ -32,7 +32,7 @@
// all of the next 5 were disabled
//#define NOCORR
#define NOCORR_TD
//
#define NOCORR_TD
#define NOTEXTURES //
#define NOTEXTURE_RGBA //
//#define NOTEXTURE_RGBAXXX //
...
...
@@ -883,10 +883,8 @@ int main(int argc, char **argv)
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
//#define QUAD_COMBINE
// cudaProfilerStart();
// testing corr
StopWatchInterface *timerCORRTD = 0;
...
...
@@ -903,10 +901,10 @@ int main(int argc, char **argv)
// FIXME: provide sel_pairs
correlate2D<<<1,1>>>( // output TD tiles, no normalization
tpParams.num_cams, // int num_cams,
T
pParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
T
pParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
T
pParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
T
pParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
t
pParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
t
pParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
t
pParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
t
pParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
...
...
@@ -1019,20 +1017,18 @@ int main(int argc, char **argv)
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs *
corr_length * corr_length
;
rslt_corr_size = num_corrs *
tpParams.corr_length; // corr_size * corr_size
;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
cpu_corr_indices = (int *) malloc(num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_length * corr_length)
* sizeof(float),
tpParams.corr_length
* sizeof(float),
gpu_corrs,
dstride_corr,
(corr_length * corr_length)
* sizeof(float),
tpParams.corr_length
* sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
...
...
@@ -1056,13 +1052,13 @@ int main(int argc, char **argv)
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict *
corr_length *
corr_length;
int src_offs0 = ict *
tpParams.
corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy <
corr_length
; iy++){
int src_offs = src_offs0 + iy *
corr_length
; // ict * tpParams.num_pairs * corr_size * corr_size;
for (int iy = 0; iy <
tpParams.corr_size
; iy++){
int src_offs = src_offs0 + iy *
tpParams.corr_size
; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix <
corr_length
; ix++){
for (int ix = 0; ix <
tpParams.corr_size
; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
...
...
@@ -1116,15 +1112,9 @@ int main(int argc, char **argv)
"clt/aux_intrascene-TD.raw"); // const char * path) // file path
#endif
free (cpu_corr_td);
#endif // if 1
// reuse image, export TD data
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
...
...
@@ -1135,6 +1125,9 @@ int main(int argc, char **argv)
// Testing "interframe" correlation with itself, assuming direct convert already ran
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment