Commit 67816dbf authored by Andrey Filippov

More refactoring

parent 68491042
@@ -4,9 +4,528 @@
* Created on: Apr 2, 2025
* Author: elphel
*/
#include "TpParams.h"
#include <stdexcept>
#include <helper_cuda.h> // for checkCudaErrors
#include <cuda_runtime.h> // cudaFree
#include <helper_functions.h> // timer functions
//#include "TpParams.h" // TpHostGpu.h has it
#include "tp_paths.h"
#include "tp_files.h"
#include "tp_utils.h" // for copyalloc_kernel_gpu
#include "TpHostGpu.h"
TpHostGpu::~TpHostGpu(){
hfree(m_host_kern_buf);
hfree(m_ftask_data);
hfree(m_ftask_data1);
hfree(m_correction_vector);
hfree(m_rByRDist);
gfree(m_gpu_geometry_correction);
gfree(m_gpu_correction_vector);
gfree(m_gpu_rByRDist);
gfree(m_gpu_rot_deriv);
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// Free used host memory, initialized as arrays of pointers
hfree(m_gpu_kernels_h[ncam]);
hfree(m_gpu_kernel_offsets_h[ncam]);
hfree(m_gpu_images_h[ncam]);
hfree(m_gpu_clt_h[ncam]);
hfree(m_gpu_corr_images_h[ncam]);
// Free used GPU memory initialized as **
if (m_gpu_kernels) gfree(m_gpu_kernels[ncam]);
if (m_gpu_kernel_offsets) gfree(m_gpu_kernel_offsets[ncam]);
if (m_gpu_images) gfree(m_gpu_images[ncam]);
if (m_gpu_clt) gfree(m_gpu_clt[ncam]);
if (m_gpu_corr_images) gfree(m_gpu_corr_images[ncam]);
}
gfree(m_gpu_corrs);
gfree(m_gpu_corrs_td);
gfree(m_gpu_corrs_combo);
gfree(m_gpu_corrs_combo_td);
gfree(m_gpu_corr_indices);
gfree(m_gpu_corrs_combo_indices);
gfree(m_gpu_ftasks);
gfree(m_gpu_active_tiles);
gfree(m_gpu_num_active);
gfree(m_gpu_num_corr_tiles);
gfree(m_gpu_texture_indices);
gfree(m_gpu_woi);
gfree(m_gpu_twh);
gfree(m_gpu_num_texture_tiles);
gfree(m_gpu_port_offsets);
gfree(m_gpu_color_weights);
gfree(m_gpu_generate_RBGA_params);
gfree(m_gpu_textures);
gfree(m_gpu_diff_rgb_combo);
gfree(m_gpu_textures_rbga);
return;
};
void TpHostGpu::setImageKernels(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.kernel_file[ncam]); // char * path) // file path
m_gpu_kernels_h[ncam] = copyalloc_kernel_gpu(m_host_kern_buf, m_tpParams.kern_size);
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.kernel_offs_file[ncam]); // char * path) // file path
m_gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
m_host_kern_buf,
m_tpParams.kern_tiles * (sizeof( struct CltExtra)/sizeof(float)));
}
m_gpu_kernels = copyalloc_pointers_gpu (m_gpu_kernels_h, m_tpParams.num_cams); // NUM_CAMS);
m_gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) m_gpu_kernel_offsets_h, m_tpParams.num_cams); // NUM_CAMS);
}
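// Size bookkeeping for the uploads above (values set in TpParams::TpParams()):
//   kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors;  // kernel tiles per camera
//   kern_size  = kern_tiles * 4 * 64;                      // floats per camera (4 * 64 floats per tile)
// so each camera uploads kern_size floats of kernels plus
// kern_tiles * sizeof(struct CltExtra) / sizeof(float) floats of per-tile offsets.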
void TpHostGpu::setCltBuffers(){
const int slice_size {m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size};
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// will get results back
//gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * tpParams.num_colors * 4 * DTT_SIZE * DTT_SIZE);
m_gpu_clt_h[ncam] = alloc_kernel_gpu(slice_size);
printf("Allocating GPU memory, 0x%x floats\n", slice_size) ;
}
m_gpu_clt = copyalloc_pointers_gpu (m_gpu_clt_h, m_tpParams.num_cams); // NUM_CAMS);
}
void TpHostGpu::setCorrImgBuffers(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
// allocate result images (3x height to accommodate 3 colors)
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
m_gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
m_tpParams.img_width + m_tpParams.dtt_size, // int width,
m_tpParams.num_colors*(m_tpParams.img_height + m_tpParams.dtt_size)); // int height);
}
m_gpu_corr_images = copyalloc_pointers_gpu (m_gpu_corr_images_h, m_tpParams.num_cams); // NUM_CAMS);
}
void TpHostGpu::setImgBuffers(){
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
readFloatsFromFile(
m_host_kern_buf, // float * data, // allocated array
m_tpPaths.image_files[ncam]); // char * path) // file path
m_gpu_images_h[ncam] = copyalloc_image_gpu(
m_host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
m_tpParams.img_width, // IMG_WIDTH, // int width,
m_tpParams.img_height); // IMG_HEIGHT); // int height);
}
m_gpu_images = copyalloc_pointers_gpu (m_gpu_images_h, m_tpParams.num_cams); // NUM_CAMS);
}
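// m_host_kern_buf (kern_size floats) is reused here as a staging buffer for the image files,
// which assumes kern_size >= img_width * img_height (see the matching comment in
// setCorrelationBuffers()).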
void TpHostGpu::setGeometryCorrectionBuffers() {
readFloatsFromFile(
(float *) &m_fgeometry_correction, // float * data, // allocated array, no need to free
m_tpPaths.geometry_correction_file); // char * path) // file path
m_rByRDist = readAllFloatsFromFile(
m_tpPaths.rByRDist_file, // const char * path,
&m_rByRDist_length); // int * len_in_floats)
m_correction_vector = readAllFloatsFromFile(
m_tpPaths.correction_vector_file, // const char * path,
&m_correction_vector_length); // int * len_in_floats)
m_gpu_geometry_correction = (struct gc *) copyalloc_kernel_gpu(
(float *) &m_fgeometry_correction,
sizeof(m_fgeometry_correction)/sizeof(float));
m_gpu_correction_vector = (struct corr_vector * ) copyalloc_kernel_gpu(
m_correction_vector,
m_correction_vector_length);
m_gpu_rByRDist = copyalloc_kernel_gpu(
m_rByRDist,
m_rByRDist_length);
checkCudaErrors(cudaMalloc((void **)&m_gpu_rot_deriv, sizeof(trot_deriv)));
}
void TpHostGpu::setCorrelationBuffers(){
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
m_gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
m_tpParams.corr_length, // int width,
m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
m_gpu_corrs_td = alloc_image_gpu(
&dstride_corr_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width,
m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy); // int height);
m_gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
m_tpParams.corr_length, // int width,
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
m_gpu_corrs_combo_td = alloc_image_gpu(
&dstride_corr_combo_td, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
4 * m_tpParams.dtt_size *m_tpParams.dtt_size, // int width,
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
// just allocate
checkCudaErrors (cudaMalloc((void **)&m_gpu_corr_indices, m_tpParams.num_pairs * m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&m_gpu_corrs_combo_indices, m_tpParams.tilesx * m_tpParams.tilesy * sizeof(int)));
}
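// Buffer shapes allocated above (one row per correlation tile, pitched by the dstride_* values):
//   m_gpu_corrs / m_gpu_corrs_combo       : corr_length floats wide (15x15 pixel-domain tile)
//   m_gpu_corrs_td / m_gpu_corrs_combo_td : 4 * dtt_size * dtt_size floats wide (transform domain)
//   heights: num_pairs * tilesx * tilesy for the per-pair buffers, tilesx * tilesy for the combined ones.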
void TpHostGpu::setTasks(const float target_disparity, const float scale){
// allocate m_ftask_data
m_ftask_data = (float *) malloc( m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.task_size * sizeof(float));
// float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
// tasks for all tiles
for (int ty = 0; ty < m_tpParams.tilesy ; ty++){
for (int tx = 0; tx < m_tpParams.tilesx ; tx++){
int nt = ty * m_tpParams.tilesx + tx;
int task_task = (1 << m_tpParams.task_inter_en) | (1 << m_tpParams.task_corr_en) | (1 << m_tpParams.task_text_en); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
// float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = m_ftask_data + m_tpParams.task_size * nt;
*(tp + m_tpParams.tp_task_task_offset) = *(float *) &task_task;
*(tp + m_tpParams.tp_task_txy_offset) = *(float *) &task_txy;
*(tp + m_tpParams.tp_task_disparity_offset) = target_disparity;
// tp += 2; // skip centerX, centerY
*(tp + m_tpParams.tp_task_scale_offset) = scale; // 0; // 0.5f; // ,0; // scale, 0 - old way, just set
tp+= m_tpParams.tp_task_xy_offset;
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) {
*(tp++) = m_tile_coords_h[ncam][nt][0];
*(tp++) = m_tile_coords_h[ncam][nt][1];
}
}
}
m_gpu_ftasks = (float *) copyalloc_kernel_gpu(m_ftask_data, m_tpParams.tp_tasks_size * m_tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
}
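// Per-tile task record written above (task_size floats per tile, offsets from TpParams):
//   [tp_task_task_offset      = 0]  task bits (1<<TASK_INTER_EN)|(1<<TASK_CORR_EN)|(1<<TASK_TEXT_EN), bit-cast to float
//   [tp_task_txy_offset       = 1]  packed tile coordinates tx | (ty << 16), bit-cast to float
//   [tp_task_disparity_offset = 2]  target disparity
//   [tp_task_centerxy_offset  = 3]  centerX, centerY (not set here; presumably filled by calculate_tiles_offsets)
//   [tp_task_scale_offset     = 5]  scale (0 - old way, just set)
//   [tp_task_xy_offset        = 6]  num_cams pairs of per-camera x,y from m_tile_coords_h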
void TpHostGpu::setTextures(){
if (!m_ftask_data) { // ftask_data is not initialized
throw std::runtime_error("Error: m_ftask_data is not initialized ");
}
m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0));
m_tile_texture_size = m_tile_texture_layers *256;
// task data should be initialized
m_num_textures=0;
for (int ty = 0; ty < m_tpParams.tilesy; ty++){
for (int tx = 0; tx < m_tpParams.tilesx; tx++){
int nt = ty * m_tpParams.tilesx + tx;
float *tp = m_ftask_data + m_tpParams.task_size * nt;
int cm = (*(int *) tp) & m_tpParams.task_texture_bits; // non-zero any of 4 lower task bits
if (cm){
m_texture_indices[m_num_textures++] = (nt << m_tpParams.text_ntile_shift) | (1 << m_tpParams.list_texture_bit); // setting 0x80 in texture indices
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
m_gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) m_texture_indices,
m_num_textures,
m_tpParams.tilesx * m_tpParams.tilesya); // number of rows - multiple of 4
m_gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
m_tile_texture_size, // int width (floats),
m_tpParams.tilesx * m_tpParams.tilesy); // int height);
}
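// Texture index packing used above: each selected tile is stored as
//   (nt << text_ntile_shift) | (1 << list_texture_bit)
// so the tile number is recovered with (index >> text_ntile_shift) and the
// "listed" flag with (index >> list_texture_bit) & 1.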
void TpHostGpu::setRGBA(){
/*
// appears in setTextures
m_tile_texture_layers = (m_tpParams.texture_colors + 1 + (m_tpParams.keep_texture_weights? (m_tpParams.num_cams + m_tpParams.texture_colors + 1): 0));
m_tile_texture_size = m_tile_texture_layers *256;
*/
m_rgba_width = (m_tpParams.tilesx + 1) * m_tpParams.dtt_size;
m_rgba_height = (m_tpParams.tilesy + 1) * m_tpParams.dtt_size;
m_rbga_slices = m_tpParams.texture_colors + 1; // 4/1
if (m_tpParams.keep_texture_weights & 2){
m_rbga_slices += m_tpParams.texture_colors * m_tpParams.num_cams;
}
checkCudaErrors(cudaMalloc((void **)&m_gpu_woi, 4 * sizeof(float))); // RGBA
checkCudaErrors(cudaMalloc((void **)&m_gpu_twh, 2 * sizeof(float))); // RGBA, dynamic
checkCudaErrors(cudaMalloc((void **)&m_gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border, RGBA all
m_gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.port_offsets, m_tpParams.num_cams * 2); // num_ports * 2);
m_gpu_color_weights = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.color_weights, sizeof(m_tpParams.color_weights));
m_gpu_generate_RBGA_params = (float *) copyalloc_kernel_gpu((float * ) m_tpParams.generate_RBGA_params, sizeof(m_tpParams.generate_RBGA_params));
m_gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
m_rgba_width, // int width (floats),
m_rgba_height * m_rbga_slices); // int height);
checkCudaErrors(cudaMalloc((void **)&m_gpu_diff_rgb_combo, m_tpParams.tilesx * m_tpParams.tilesy * m_tpParams.num_cams *
(m_tpParams.num_colors + 1) * sizeof(float)));
}
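// RGBA buffer geometry set above:
//   m_rgba_width  = (tilesx + 1) * dtt_size
//   m_rgba_height = (tilesy + 1) * dtt_size
//   m_rbga_slices = texture_colors + 1 (4 for RGB, 2 for mono)
//                   + texture_colors * num_cams when (keep_texture_weights & 2) is set;
// m_gpu_textures_rbga holds m_rgba_width x (m_rgba_height * m_rbga_slices) floats,
// pitched by dstride_textures_rbga.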
void TpHostGpu::testCorrelate2DIntra(int num_runs){
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_ftasks) {
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_corrs) {
throw std::runtime_error("Error: m_gpu_corrs is not initialized ");
}
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
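// Loop structure: without debug_tile the loop starts at i = -1 as a warm-up pass; the timer is
// reset/started only at i == 0, so the average below covers exactly numIterations timed passes.
// With debug_tile a single pass (i = 0) is run.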
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
correlate2D<<<1,1>>>(
m_tpParams.num_cams, // int num_cams,
m_tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
m_tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
m_tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
m_tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
m_gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
m_tpParams.num_colors, // int colors, // number of colors (3/1)
m_tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
m_tpParams.color_weights[1], // 0.25, // float scale1, // scale for B
m_tpParams.color_weights[2], // 0.5, // float scale2, // scale for G
m_tpParams.fat_zero * m_tpParams.fat_zero, // float fat_zero2, // here - absolute
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_tpParams.tp_tasks_size, // int num_tiles) // number of tiles in task
m_tpParams.tilesx, // int tilesx, // number of tile rows
m_gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
m_gpu_num_corr_tiles, // int * pnum_corr_tiles, // pointer to a number of correlation tiles to process
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
m_tpParams.corr_out_rad, // int corr_radius, // radius of the output correlation (7 for 15x15)
m_gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkStopTimer(&timerCORR);
sdkDeleteTimer(&timerCORR);
int num_corrs{}; // will get data from the gpu memory
checkCudaErrors(cudaMemcpy(
&num_corrs,
m_gpu_num_corr_tiles,
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Average CORR run time =%f ms, num cor tiles (new) = %d\n", avgTimeCORR, num_corrs);
saveIntraCorrFile(
m_tpPaths.result_corr_file, // const char * path,
"phase correlation data", // const char * prompt,
num_corrs, // int num_corrs,
m_gpu_corrs, // float * gpu_corrs,
m_gpu_corr_indices, // int * gpu_corr_indices)
16); // //int num_sel_sensors) { // only for interscene
}
void TpHostGpu::saveIntraCorrFile(
const char * path,
const char * prompt,
int num_corrs,
float * gpu_corrs,
int * gpu_corr_indices,
int num_sel_sensors) { // only for interscene
if (!path) return;
int rslt_corr_length = num_corrs * m_tpParams.corr_length;
int corr_img_size = m_tpParams.num_corr_indices * 16 * 16; // NAN
// float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr = (float *)malloc(rslt_corr_length * sizeof(float));
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
m_tpParams.corr_length * sizeof(float),
gpu_corrs,
dstride_corr,
m_tpParams.corr_length * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
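// The cudaMemcpy2D above repacks the pitched device rows (one correlation tile per row, padded
// to dstride_corr bytes by alloc_image_gpu) into a contiguous host array:
//   cudaMemcpy2D(dst, dst_pitch, src, src_pitch, row_bytes, rows, kind)
// with dst_pitch == row_bytes == corr_length * sizeof(float) and rows == num_corrs.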
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
float * corr_img = getCorrImg(
corr_img_size, // int corr_img_size,
cpu_corr_indices, // int * cpu_corr_indices,
cpu_corr, // float * cpu_corr,
num_sel_sensors); // int num_sel_sensors)
printf("Writing %s to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
prompt, path, (m_tpParams.tilesx * 16),(m_tpParams.tilesya * 16), m_tpParams.num_pairs, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
path); // const char * path) // file path
free (cpu_corr);
free (cpu_corr_indices);
free (corr_img);
}
/*
printf("Writing interscene phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
tpPaths.result_inter_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs_inter, (corr_img_size * sizeof(float)) ) ;
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
tpPaths.result_inter_td_norm_file); // const char * path) // file path
*/
float * TpHostGpu::getCorrImg(
int corr_img_size,
int * cpu_corr_indices,
float * cpu_corr,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iy = 0; iy < m_tpParams.corr_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.corr_size; // ict * num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (m_tpParams.tilesx * 16);
for (int ix = 0; ix < m_tpParams.corr_size; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
}
return corr_img;
}
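// Correlation index unpacking used above (corr_ntile_shift = CORR_NTILE_SHIFT = 8):
//   tile number: ctt   = cpu_corr_indices[ict] >> corr_ntile_shift
//   pair number: cpair = cpu_corr_indices[ict] & ((1 << corr_ntile_shift) - 1)  // 0xff is remapped to num_sel_sensors (interscene)
// Each tile is written into a 16x16 slot of a tilesx*16 wide, tilesy*16 tall plane, one plane per pair index.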
float * TpHostGpu::getCorrTdImg(
int corr_img_size,
int * cpu_corr_indices,
float * cpu_corr_td,
int num_sel_sensors){
float * corr_img = (float *)malloc(corr_img_size * sizeof(float));
for (int i = 0; i < corr_img_size; i++){
corr_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){ // Was not here - only for interscene. Will it hurt?
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
int src_offs0 = ict * m_tpParams.corr_length;
int dst_offs0 = cpair * (m_tpParams.num_tiles * 16 * 16) + (ty * 16 * m_tpParams.tilesx * 16) + (tx * 16);
for (int iquad = 0; iquad < 4; iquad ++) {
int iqy = (iquad >> 1) & 1;
int iqx = (iquad >> 0) & 1;
for (int iy = 0; iy < m_tpParams.dtt_size; iy++){
int src_offs = src_offs0 + iy * m_tpParams.dtt_size + iquad * m_tpParams.dtt_size * m_tpParams.dtt_size;
int dst_offs = dst_offs0 + (iy + m_tpParams.dtt_size * iqy)* (m_tpParams.tilesx * 16) + iqx * m_tpParams.dtt_size;
for (int ix = 0; ix < m_tpParams.dtt_size; ix++){
corr_img[dst_offs++] = cpu_corr_td[src_offs++];
}
}
}
}
return corr_img;
}
void TpHostGpu::saveInterCorrFile(
const char * path, // "clt/aux_interscene-TD.raw"
const char * prompt, // "interscene phase correlation"
int num_corrs,
float * gpu_corrs_td,
int * gpu_corr_indices,
int num_sel_sensors){
if (!path) return;
int corr_img_size = m_tpParams.num_corr_indices * 16 * 16; // NAN
int rslt_corr_size_td = num_corrs * m_tpParams.dtt_size2 * m_tpParams.dtt_size2;
float * cpu_corr_td = (float *)malloc(rslt_corr_size_td * sizeof(float));
int dtile_bytes = (m_tpParams.dtt_size2*m_tpParams.dtt_size2) * sizeof(float);
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy2D(
cpu_corr_td,
dtile_bytes,
gpu_corrs_td,
dstride_corr_td,
dtile_bytes,
num_corrs,
cudaMemcpyDeviceToHost));
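// Transform-domain tiles copied above are dtt_size2 * dtt_size2 floats each (presumably four
// dtt_size x dtt_size quadrants, as unpacked in getCorrTdImg()), so
// rslt_corr_size_td = num_corrs * dtt_size2 * dtt_size2 and dtile_bytes is the contiguous
// host-side pitch passed to cudaMemcpy2D.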
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
float * corr_img = getCorrTdImg(
corr_img_size, // int corr_img_size,
cpu_corr_indices, // int * cpu_corr_indices,
cpu_corr_td, // float * cpu_corr_td,
num_sel_sensors); // int num_sel_sensors)
printf("Writing %s TD data to %s\n", prompt, path);
writeFloatsToFile(
corr_img, // float * data, // allocated array
corr_img_size, // int size, // length in elements
path); // const char * path) // file path
free(cpu_corr_indices);
free (corr_img);
free (cpu_corr_td);
}
void TpHostGpu::saveInterCorrIndicesFile(
const char * path, // "clt/aux_inter-indices.raw"
const char * prompt, // "interscene indices"
int * gpu_corr_indices,
int num_sel_sensors){
if (!path) return;
int * cpu_corr_indices = (int *) malloc(m_tpParams.num_corr_indices * sizeof(int));
checkCudaErrors(cudaMemcpy(
cpu_corr_indices,
gpu_corr_indices,
m_tpParams.num_corr_indices * sizeof(int),
cudaMemcpyDeviceToHost));
int corr_index_img_length = m_tpParams.tilesx * m_tpParams.tilesy * (num_sel_sensors+1) ;
float *corr_index_img = (float *)malloc(corr_index_img_length * sizeof(float));
for (int i = 0; i < corr_index_img_length; i++){
corr_index_img[i] = NAN;
}
for (int ict = 0; ict < m_tpParams.num_corr_indices; ict++){
int ctt = ( cpu_corr_indices[ict] >> m_tpParams.corr_ntile_shift); // CORR_NTILE_SHIFT);
int cpair = cpu_corr_indices[ict] & ((1 << m_tpParams.corr_ntile_shift) - 1);
if (cpair == 0xff){
cpair = num_sel_sensors;
}
int ty = ctt / m_tpParams.tilesx;
int tx = ctt % m_tpParams.tilesx;
corr_index_img[cpair * m_tpParams.tilesx * m_tpParams.tilesy + m_tpParams.tilesx * ty + tx] = ict; // cpu_corr_indices[ict];
}
printf("Writing %s to %s\n",prompt,path);
writeFloatsToFile(
corr_index_img, // float * data, // allocated array
corr_index_img_length, // int size, // length in elements
"clt/aux_inter-indices.raw"); // const char * path) // file path
free (corr_index_img);
free(cpu_corr_indices);
}
void TpHostGpu::hfree(float * p) {if (p) free(p); p = {};}
void TpHostGpu::hfree(struct CltExtra * p) {if (p) free(p); p = {};}
void TpHostGpu::gfree(float * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(int * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct CltExtra * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct gc * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct corr_vector * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
void TpHostGpu::gfree(struct trot_deriv * p) {if (p) checkCudaErrors(cudaFree(p));p = {};}
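// Note: these helpers take the pointer by value, so the trailing "p = {};" only clears the
// local copy - the caller's member pointer is unchanged after free()/cudaFree().
// A by-reference sketch (an assumption, requiring matching header declarations) could look like:
//   void TpHostGpu::hfree(float *& p) { if (p) free(p); p = nullptr; }
//   void TpHostGpu::gfree(float *& p) { if (p) checkCudaErrors(cudaFree(p)); p = nullptr; }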
@@ -7,17 +7,139 @@
#ifndef SRC_TPHOSTGPU_H_
#define SRC_TPHOSTGPU_H_
#include "geometry_correction.h"
#include "TpParams.h"
#include "TileProcessor.h"
class TpHostGpu{
static constexpr int m_num_cams_lwir = 16;
static constexpr int m_num_cams_rgb = 4;
static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)}; // always 16 elements, RGB uses only the first 4
/*
*
*/
public:
TpParams& m_tpParams;
TpPaths& m_tpPaths;
private:
// no need to free
float m_tile_coords_h [max_num_cams][TpParams::tilesx*TpParams::tilesy][2]; // [TILESX * TILESY][2];
struct gc m_fgeometry_correction;
int m_correction_vector_length{};
int m_rByRDist_length{};
int m_texture_indices [TpParams::tilesx * TpParams::tilesya]; // [TILESX*TILESYA];
int m_num_textures{};
int m_tile_texture_layers{};
int m_tile_texture_size{};
int m_rgba_width{}; // = (TILESX+1) * DTT_SIZE;
int m_rgba_height{}; // = (TILESY+1) * DTT_SIZE;
int m_rbga_slices{}; // = tpParams.texture_colors + 1; // 4/1
// need to free
float * m_host_kern_buf{};
float * m_ftask_data{};
float * m_ftask_data1{};
float * m_gpu_ftasks{};
int * m_gpu_active_tiles{}; // tasks
int * m_gpu_num_active{}; // tasks
int * m_gpu_num_corr_tiles{}; // correlations
// host memory
// dstride* - size in byte to be passed to the GPU kernels
size_t dstride{};
size_t dstride_rslt{}; // in bytes !
size_t dstride_corr{}; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
// in the future, dstride_corr can reuse that of dstride_corr_td?
size_t dstride_corr_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_corr_combo{}; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_corr_combo_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
size_t dstride_textures{}; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga{}; // in bytes ! for one rgba/ya 16x16 tile
float * m_gpu_kernels_h[max_num_cams]{};
struct CltExtra * m_gpu_kernel_offsets_h[max_num_cams]{};
float * m_gpu_images_h[max_num_cams]{};
float * m_gpu_clt_h[max_num_cams]{};
float * m_gpu_corr_images_h[max_num_cams]{};
float * m_correction_vector{};
float * m_rByRDist{};
int * m_gpu_texture_indices{};
// GPU memory
float ** m_gpu_kernels{};
struct CltExtra ** m_gpu_kernel_offsets{};
float ** m_gpu_images{};
float ** m_gpu_clt{};
float ** m_gpu_corr_images{};
struct gc * m_gpu_geometry_correction{};
struct corr_vector * m_gpu_correction_vector{};
float * m_gpu_rByRDist{};
struct trot_deriv * m_gpu_rot_deriv{};
// correlations device memory
float * m_gpu_corrs{}; // correlation tiles (per tile, per pair) in pixel domain
float * m_gpu_corrs_td{}; // correlation tiles (per tile, per pair) in transform domain
float * m_gpu_corrs_combo{}; // correlation tiles combined (1 per tile), pixel domain
float * m_gpu_corrs_combo_td{}; // correlation tiles combined (1 per tile), transform domain
int * m_gpu_corr_indices{}; // shared by gpu_corrs gpu_corrs_td
int * m_gpu_corrs_combo_indices{}; // shared by gpu_corrs_combo and gpu_corrs_combo_td
float * m_gpu_textures{};
float * m_gpu_diff_rgb_combo{};
float * m_gpu_textures_rbga{};
int * m_gpu_woi{};
int * m_gpu_twh{};
int * m_gpu_num_texture_tiles{};
float * m_gpu_port_offsets{};
float * m_gpu_color_weights{};
float * m_gpu_generate_RBGA_params{};
public:
TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
:m_tpParams{tpParams}
,m_tpPaths{tpPaths}
,m_host_kern_buf{(float *) malloc(tpParams.kern_size * sizeof(float))}
{};
};
~TpHostGpu();
void setImageKernels();
void setCltBuffers();
void setCorrImgBuffers();
void setImgBuffers();
void setGeometryCorrectionBuffers();
void setCorrelationBuffers();
void setTasks(const float target_disparity, const float scale);
void setTextures();
void setRGBA();
void testCorrelate2DIntra(int num_runs);
// for both intra and inter!
void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrIndicesFile(const char * path, const char * prompt, int * gpu_corr_indices, int num_sel_sensors);
private:
float * getCorrImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
float * getCorrTdImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
void hfree(float * p); // {if (p) free (p);}
void hfree(struct CltExtra * p);
void gfree(float * p);
void gfree(int * p);
void gfree(struct CltExtra * p);
void gfree(struct gc * p);
void gfree(struct corr_vector * p);
void gfree(struct trot_deriv * p);
};
@@ -36,10 +36,10 @@ TpParams::TpParams(int lwir){
texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors; // NUM_COLORS;
kern_size = kern_tiles * 4 * 64;
corr_size = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
corr_size = 2 * corr_out_rad + 1;
corr_length = corr_size * corr_size;
num_tiles = tp_tasks_size;
num_corr_indices = num_pairs * num_tiles;
}
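// With corr_out_rad = CORR_OUT_RAD = 7 the derived sizes above are:
//   corr_size        = 2 * 7 + 1 = 15    // side of one output correlation tile
//   corr_length      = 15 * 15   = 225   // floats per correlation tile
//   num_tiles        = tp_tasks_size (= tilesx * tilesy)
//   num_corr_indices = num_pairs * num_tiles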
@@ -8,8 +8,54 @@
#ifndef SRC_TPPARAMS_H_
#define SRC_TPPARAMS_H_
#include <math.h>
#include "dtt8x8.h"
#include "tp_defines.h"
#include "geometry_correction.h" // TP_TASK_TASK_*
#include "TileProcessor.h"
class TpParams{
static constexpr int m_num_cams_lwir = 16;
public:
static constexpr int tilesx = TILESX;
static constexpr int tilesy = TILESY;
static constexpr int tilesya = TILESYA;
static constexpr int dtt_size = DTT_SIZE;
static constexpr int dtt_size2 = DTT_SIZE2;
static constexpr int img_width = IMG_WIDTH;
static constexpr int img_height = IMG_HEIGHT;
static constexpr int kernels_hor = KERNELS_HOR;
static constexpr int kernel_vert = KERNELS_VERT;
static constexpr int task_inter_en = TASK_INTER_EN; // 10 // Task bit to enable interscene correlation
static constexpr int task_corr_en = TASK_CORR_EN; // 9 // Task bit to enable intrascene correlation (pairs defined separately)
static constexpr int task_text_en = TASK_TEXT_EN; // 8 // task bit to enable texture generation
static constexpr int list_texture_bit = LIST_TEXTURE_BIT; // 8 // 7 // bit to request texture calculation
static constexpr int text_ntile_shift = TEXT_NTILE_SHIFT; // 9 // 8 // tile number shift for texture calculation (will be different from CORR_NTILE_SHIFT!)
static constexpr int task_texture_bits = TASK_TEXTURE_BITS; // TileProcessor.h
static constexpr int corr_ntile_shift = CORR_NTILE_SHIFT; // 8 // higher bits - number of a pair, other bits tile number
static constexpr int corr_out_rad = CORR_OUT_RAD; // 7
//???
// static constexpr int tp_task_size = TASK_TEXTURE_BITS; // TileProcessor.h
// int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
static constexpr int tp_tasks_size = tilesx * tilesy; //
static constexpr int tp_task_task_offset = TP_TASK_TASK_OFFSET;// 0
static constexpr int tp_task_txy_offset = TP_TASK_TXY_OFFSET;// 1
static constexpr int tp_task_disparity_offset = TP_TASK_DISPARITY_OFFSET;// 2
static constexpr int tp_task_centerxy_offset = TP_TASK_CENTERXY_OFFSET;// 3
static constexpr int tp_task_scale_offset = TP_TASK_SCALE_OFFSET;// 5
static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
#ifdef DBG_TILE
static constexpr int debug_tile{1};
#else
static constexpr int debug_tile{0};
#endif
private:
static constexpr int m_num_cams_lwir = 16; // refactor to s_
static constexpr int m_num_colors_lwir = 1;
static constexpr int m_num_pairs_lwir = 120;
static constexpr int m_num_cams_rgb = 4;
@@ -38,10 +84,13 @@ public:
float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
int keep_texture_weights {3}; // 0; // 1; // try with 0 also
int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles;
int kern_size;
int corr_size;
int texture_colors{}; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles{};
int kern_size{};
int num_tiles{};
int corr_size{};
int corr_length{};
int num_corr_indices{};
// std::vector<float[2]> m_port_offsets;
@@ -31,7 +31,7 @@
*/
// all of the next 5 were disabled
#define NOCORR
//#define NOCORR
#define NOCORR_TD
#define NOTEXTURES //
#define NOTEXTURE_RGBA //
@@ -129,6 +129,8 @@ int main(int argc, char **argv)
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost();
// return 0;
float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
@@ -264,7 +266,7 @@ int main(int argc, char **argv)
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
@@ -275,7 +277,7 @@ int main(int argc, char **argv)
gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu(
@@ -309,7 +311,7 @@ int main(int argc, char **argv)
int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY;
float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
@@ -325,15 +327,18 @@ int main(int argc, char **argv)
}
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
@@ -353,10 +358,11 @@ int main(int argc, char **argv)
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_twh, 2 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
@@ -410,7 +416,7 @@ int main(int argc, char **argv)
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
float * cpu_corr_td;
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
@@ -540,16 +546,6 @@ int main(int argc, char **argv)
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
/*
get_tiles_offsets<<<grid_geom,threads_geom>>> (
tpParams.num_cams, // int num_cams,
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
tpParams.num_cams, // int num_cams,
@@ -778,10 +774,10 @@ int main(int argc, char **argv)
}
correlate2D<<<1,1>>>(
tpParams.num_cams, // int num_cams,
TpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
TpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
TpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
TpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
@@ -886,6 +882,9 @@ int main(int argc, char **argv)
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
// cudaProfilerStart();
// testing corr
@@ -1019,7 +1018,7 @@ int main(int argc, char **argv)
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
rslt_corr_size = num_corrs * corr_length * corr_length;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
@@ -1029,10 +1028,10 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
@@ -1056,13 +1055,13 @@ int main(int argc, char **argv)
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int src_offs0 = ict * corr_length * corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
for (int iy = 0; iy < corr_length; iy++){
int src_offs = src_offs0 + iy * corr_length; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
for (int ix = 0; ix < corr_length; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
@@ -33,6 +33,9 @@ public:
const char* rByRDist_file;
const char* correction_vector_file;
const char* geometry_correction_file;
const char* result_interscene_td = "clt/aux_interscene-TD.raw";
const char* result_intrascene_td = "clt/aux_intrascene-TD.raw";
const char* result_interscene_indices = "clt/aux_inter-indices.raw";
private:
const char * m_kernel_file_lwir[16] ={