Commit 67816dbf authored by Andrey Filippov

More refactoring

parent 68491042
This diff is collapsed.
......@@ -7,17 +7,139 @@
#ifndef SRC_TPHOSTGPU_H_
#define SRC_TPHOSTGPU_H_
#include "geometry_correction.h"
#include "TpParams.h"
#include "TileProcessor.h"
/**
 * Host-side container for the tile-processor test state: references to the
 * run parameters and file paths, buffer sizes/strides, and the raw host and
 * device pointers that the set*(), test*() and save*() members operate on.
 *
 * Only the constructor is defined here; every other member function is
 * declared in this class and defined elsewhere.
 *
 * The object owns raw allocations (see the "need to free" group below), so
 * copying is disabled: a member-wise copy would leave two objects holding the
 * same pointers.
 */
class TpHostGpu{
    static constexpr int m_num_cams_lwir = 16;
    static constexpr int m_num_cams_rgb  = 4;
    // Per-camera arrays are always sized for the larger configuration (16 elements);
    // RGB uses only the first 4. A plain conditional is used instead of std::max so
    // this header does not depend on <algorithm> being pulled in by another include.
    static constexpr int max_num_cams {(m_num_cams_rgb > m_num_cams_lwir) ? m_num_cams_rgb : m_num_cams_lwir};

public:
    TpParams& m_tpParams;
    TpPaths&  m_tpPaths;

private:
    // no need to free
    float      m_tile_coords_h [max_num_cams][TpParams::tilesx*TpParams::tilesy][2]; // [TILESX * TILESY][2];
    struct gc  m_fgeometry_correction;
    int        m_correction_vector_length{};
    int        m_rByRDist_length{};
    int        m_texture_indices [TpParams::tilesx * TpParams::tilesya]; // [TILESX*TILESYA];
    int        m_num_textures{};
    int        m_tile_texture_layers{};
    int        m_tile_texture_size{};
    int        m_rgba_width{};   // = (TILESX+1) * DTT_SIZE;
    int        m_rgba_height{};  // = (TILESY+1) * DTT_SIZE;
    int        m_rbga_slices{};  // = tpParams.texture_colors + 1; // 4/1

    // need to free
    float *    m_host_kern_buf{};
    float *    m_ftask_data{};
    float *    m_ftask_data1{};
    float *    m_gpu_ftasks{};
    int *      m_gpu_active_tiles{};    // tasks
    int *      m_gpu_num_active{};      // tasks
    int *      m_gpu_num_corr_tiles{};  // correlations

    // host memory
    // dstride* - sizes in bytes, to be passed to the GPU kernels
    size_t     dstride{};
    size_t     dstride_rslt{};          // in bytes !
    size_t     dstride_corr{};          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
    // in the future, dstride_corr can reuse that of dstride_corr_td?
    size_t     dstride_corr_td{};       // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
    size_t     dstride_corr_combo{};    // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
    size_t     dstride_corr_combo_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
    size_t     dstride_textures{};      // in bytes ! for one rgba/ya 16x16 tile
    size_t     dstride_textures_rbga{}; // in bytes ! for one rgba/ya 16x16 tile
    float *            m_gpu_kernels_h[max_num_cams]{};
    struct CltExtra *  m_gpu_kernel_offsets_h[max_num_cams]{};
    float *            m_gpu_images_h[max_num_cams]{};
    float *            m_gpu_clt_h[max_num_cams]{};
    float *            m_gpu_corr_images_h[max_num_cams]{};
    float *            m_correction_vector{};
    float *            m_rByRDist{};
    int *              m_gpu_texture_indices{};

    // GPU memory
    float **               m_gpu_kernels{};
    struct CltExtra **     m_gpu_kernel_offsets{};
    float **               m_gpu_images{};
    float **               m_gpu_clt{};
    float **               m_gpu_corr_images{};
    struct gc *            m_gpu_geometry_correction{};
    struct corr_vector *   m_gpu_correction_vector{};
    float *                m_gpu_rByRDist{};
    struct trot_deriv *    m_gpu_rot_deriv{};

    // correlations device memory
    float *    m_gpu_corrs{};               // correlation tiles (per tile, per pair) in pixel domain
    float *    m_gpu_corrs_td{};            // correlation tiles (per tile, per pair) in transform domain
    float *    m_gpu_corrs_combo{};         // correlation tiles combined (1 per tile), pixel domain
    float *    m_gpu_corrs_combo_td{};      // correlation tiles combined (1 per tile), transform domain
    int *      m_gpu_corr_indices{};        // shared by gpu_corrs gpu_corrs_td
    int *      m_gpu_corrs_combo_indices{}; // shared by gpu_corrs_combo and gpu_corrs_combo_td
    float *    m_gpu_textures{};
    float *    m_gpu_diff_rgb_combo{};
    float *    m_gpu_textures_rbga{};
    int *      m_gpu_woi{};
    int *      m_gpu_twh{};
    int *      m_gpu_num_texture_tiles{};
    float *    m_gpu_port_offsets{};
    float *    m_gpu_color_weights{};
    float *    m_gpu_generate_RBGA_params{};

public:
    /**
     * Binds the parameter and path objects by reference (both must outlive this
     * object) and allocates the host kernel buffer of tpParams.kern_size floats.
     * The malloc() result is stored as-is; it is not checked for nullptr here.
     */
    TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
        :m_tpParams{tpParams}
        ,m_tpPaths{tpPaths}
        ,m_host_kern_buf{(float *) malloc(tpParams.kern_size * sizeof(float))}
    {}

    // Owning raw pointers: forbid copies so the same buffers are never held by two objects.
    TpHostGpu(const TpHostGpu&)            = delete;
    TpHostGpu& operator=(const TpHostGpu&) = delete;

    ~TpHostGpu(); // defined elsewhere; NOTE(review): expected to release the "need to free" buffers - confirm

    void setImageKernels();
    void setCltBuffers();
    void setCorrImgBuffers();
    void setImgBuffers();
    void setGeometryCorrectionBuffers();
    void setCorrelationBuffers();
    void setTasks(const float target_disparity, const float scale);
    void setTextures();
    void setRGBA();
    void testCorrelate2DIntra(int num_runs);
    // for both intra and inter!
    void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
    void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
    void saveInterCorrIndicesFile(const char * path, const char * prompt, int * gpu_corr_indices, int num_sel_sensors);

private:
    float * getCorrImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
    float * getCorrTdImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
    // Overloaded release helpers for host (hfree) and, by their naming, device (gfree) pointers;
    // implementations are not in this header.
    void hfree(float * p); // {if (p) free (p);}
    void hfree(struct CltExtra * p);
    void gfree(float * p);
    void gfree(int * p);
    void gfree(struct CltExtra * p);
    void gfree(struct gc * p);
    void gfree(struct corr_vector * p);
    void gfree(struct trot_deriv * p);
};
......
......@@ -36,10 +36,10 @@ TpParams::TpParams(int lwir){
texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
kern_tiles = KERNELS_HOR * KERNELS_VERT * num_colors; // NUM_COLORS;
kern_size = kern_tiles * 4 * 64;
corr_size = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
corr_size = 2 * corr_out_rad + 1;
corr_length = corr_size * corr_size;
num_tiles = tp_tasks_size;
num_corr_indices = num_pairs * num_tiles;
}
......@@ -8,8 +8,54 @@
#ifndef SRC_TPPARAMS_H_
#define SRC_TPPARAMS_H_
#include <math.h>
#include "dtt8x8.h"
#include "tp_defines.h"
#include "geometry_correction.h" // TP_TASK_TASK_*
#include "TileProcessor.h"
class TpParams{
static constexpr int m_num_cams_lwir = 16;
public:
static constexpr int tilesx = TILESX;
static constexpr int tilesy = TILESY;
static constexpr int tilesya = TILESYA;
static constexpr int dtt_size = DTT_SIZE;
static constexpr int dtt_size2 = DTT_SIZE2;
static constexpr int img_width = IMG_WIDTH;
static constexpr int img_height = IMG_HEIGHT;
static constexpr int kernels_hor = KERNELS_HOR;
static constexpr int kernel_vert = KERNELS_VERT;
static constexpr int task_inter_en = TASK_INTER_EN; // 10 // Task bit to enable interscene correlation
static constexpr int task_corr_en = TASK_CORR_EN; // 9 // Task bit to enable intrascene correlation (pairs defined separately)
static constexpr int task_text_en = TASK_TEXT_EN; // 8 // task bit to enable texture generation
static constexpr int list_texture_bit = LIST_TEXTURE_BIT; // 8 // 7 // bit to request texture calculation
static constexpr int text_ntile_shift = TEXT_NTILE_SHIFT; // 9 // 8 // tile number shift for texture calculation (will be different from CORR_NTILE_SHIFT!)
static constexpr int task_texture_bits = TASK_TEXTURE_BITS; // TileProcessor.h
static constexpr int corr_ntile_shift = CORR_NTILE_SHIFT; // 8 // higher bits - number of a pair, other bits tile number
static constexpr int corr_out_rad = CORR_OUT_RAD; // 7
//???
// static constexpr int tp_task_size = TASK_TEXTURE_BITS; // TileProcessor.h
// int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
static constexpr int tp_tasks_size = tilesx * tilesy; //
static constexpr int tp_task_task_offset = TP_TASK_TASK_OFFSET;// 0
static constexpr int tp_task_txy_offset = TP_TASK_TXY_OFFSET;// 1
static constexpr int tp_task_disparity_offset = TP_TASK_DISPARITY_OFFSET;// 2
static constexpr int tp_task_centerxy_offset = TP_TASK_CENTERXY_OFFSET;// 3
static constexpr int tp_task_scale_offset = TP_TASK_SCALE_OFFSET;// 5
static constexpr int tp_task_xy_offset = TP_TASK_XY_OFFSET;// 6
static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
#ifdef DBG_TILE
static constexpr int debug_tile{1};
#else
static constexpr int debug_tile{0};
#endif
private:
static constexpr int m_num_cams_lwir = 16; // refactor to s_
static constexpr int m_num_colors_lwir = 1;
static constexpr int m_num_pairs_lwir = 120;
static constexpr int m_num_cams_rgb = 4;
......@@ -38,10 +84,13 @@ public:
float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
int keep_texture_weights {3}; // 0; // 1; // try with 0 also
int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles;
int kern_size;
int corr_size;
int texture_colors{}; // 3; // result will be 3+1 RGBA (for mono - 2)
int kern_tiles{};
int kern_size{};
int num_tiles{};
int corr_size{};
int corr_length{};
int num_corr_indices{};
// std::vector<float[2]> m_port_offsets;
......
......@@ -31,7 +31,7 @@
*/
// all of the next 5 were disabled
#define NOCORR
//#define NOCORR
#define NOCORR_TD
#define NOTEXTURES //
#define NOTEXTURE_RGBA //
......@@ -129,6 +129,8 @@ int main(int argc, char **argv)
GenerateRgbaHost generateRgbaHost{}; // = new GenerateRgbaHost();
// return 0;
float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
float * ftask_data = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
float * ftask_data1 = (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
......@@ -264,7 +266,7 @@ int main(int argc, char **argv)
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
tpParams.num_pairs * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
// allocate all other correlation data, some may be
......@@ -275,7 +277,7 @@ int main(int argc, char **argv)
gpu_corrs_combo = alloc_image_gpu(
&dstride_corr_combo, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
tpParams.corr_size, // int width,
tpParams.corr_length, // int width,
TILESX * TILESY); // int height);
gpu_corrs_combo_td = alloc_image_gpu(
......@@ -309,7 +311,7 @@ int main(int argc, char **argv)
int nt = ty * TILESX + tx;
int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
int task_txy = tx + (ty << 16);
float task_target_disparity = DBG_DISPARITY;
float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
float * tp = ftask_data + tpParams.task_size * nt;
*(tp + TP_TASK_TASK_OFFSET) = *(float *) &task_task;
*(tp + TP_TASK_TXY_OFFSET) = *(float *) &task_txy;
......@@ -325,15 +327,18 @@ int main(int argc, char **argv)
}
int tp_task_size = TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
int num_active_tiles; // will be calculated by convert_direct
int rslt_corr_size;
int corr_img_size;
gpu_ftasks = (float *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
// just allocate
checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices, TILESX * TILESY*sizeof(int)));
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
......@@ -353,10 +358,11 @@ int main(int argc, char **argv)
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_twh, 2 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
......@@ -410,7 +416,7 @@ int main(int argc, char **argv)
float * corr_img; // = (float *)malloc(corr_img_size * sizeof(float));
float * cpu_corr; // = (float *)malloc(rslt_corr_size * sizeof(float));
float * cpu_corr_td;
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
int * cpu_corr_indices; // = (int *) malloc(num_corr_indices * sizeof(int));
......@@ -540,16 +546,6 @@ int main(int argc, char **argv)
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
/*
get_tiles_offsets<<<grid_geom,threads_geom>>> (
tpParams.num_cams, // int num_cams,
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
*/
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
tpParams.num_cams, // int num_cams,
......@@ -778,10 +774,10 @@ int main(int argc, char **argv)
}
correlate2D<<<1,1>>>(
tpParams.num_cams, // int num_cams,
TpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
TpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
TpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
TpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
tpParams.sel_pairs[0], // int sel_pairs0 // unused bits should be 0
tpParams.sel_pairs[1], // int sel_pairs1, // unused bits should be 0
tpParams.sel_pairs[2], // int sel_pairs2, // unused bits should be 0
tpParams.sel_pairs[3], // int sel_pairs3, // unused bits should be 0
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
tpParams.num_colors, // int colors, // number of colors (3/1)
tpParams.color_weights[0], // 0.25, // float scale0, // scale for R
......@@ -886,6 +882,9 @@ int main(int argc, char **argv)
free (corr_img);
#endif // ifndef NOCORR
#ifndef NOCORR_TD
// cudaProfilerStart();
// testing corr
......@@ -1019,7 +1018,7 @@ int main(int argc, char **argv)
// int rslt_corr_size = num_corrs * corr_size * corr_size;
// float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
rslt_corr_size = num_corrs * corr_size * corr_size;
rslt_corr_size = num_corrs * corr_length * corr_length;
corr_img_size = num_corr_indices * 16*16; // NAN
corr_img = (float *)malloc(corr_img_size * sizeof(float));
cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
......@@ -1029,10 +1028,10 @@ int main(int argc, char **argv)
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
(corr_length * corr_length) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
// checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices, num_pairs * TILESX * TILESY*sizeof(int)));
......@@ -1056,13 +1055,13 @@ int main(int argc, char **argv)
int ty = ctt / TILESX;
int tx = ctt % TILESX;
// int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
int src_offs0 = ict * corr_size * corr_size;
int src_offs0 = ict * corr_length * corr_length;
int dst_offs0 = cpair * (num_tiles * 16 * 16) + (ty * 16 * TILESX * 16) + (tx * 16);
for (int iy = 0; iy < corr_size; iy++){
int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
for (int iy = 0; iy < corr_length; iy++){
int src_offs = src_offs0 + iy * corr_length; // ict * tpParams.num_pairs * corr_size * corr_size;
int dst_offs = dst_offs0 + iy * (TILESX * 16);
for (int ix = 0; ix < corr_size; ix++){
for (int ix = 0; ix < corr_length; ix++){
corr_img[dst_offs++] = cpu_corr[src_offs++];
}
}
......
......@@ -33,6 +33,9 @@ public:
const char* rByRDist_file;
const char* correction_vector_file;
const char* geometry_correction_file;
const char* result_interscene_td = "clt/aux_interscene-TD.raw";
const char* result_intrascene_td = "clt/aux_intrascene-TD.raw";
const char* result_interscene_indices = "clt/aux_inter-indices.raw";
private:
const char * m_kernel_file_lwir[16] ={
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment