More refactoring

67816dbf · Andrey Filippov · 68491042 · 67816dbf · 67816dbf · 67816dbf
Commit 67816dbf authored Apr 06, 2025 by Andrey Filippov
6 changed files
--- a/src/TpHostGpu.cu
+++ b/src/TpHostGpu.cu
--- a/src/TpHostGpu.h
+++ b/src/TpHostGpu.h
@@ -7,17 +7,139 @@

 #ifndef SRC_TPHOSTGPU_H_
 #define SRC_TPHOSTGPU_H_
+
+#include "geometry_correction.h"
 #include "TpParams.h"
+#include "TileProcessor.h"
+
 class TpHostGpu{
+	static constexpr int m_num_cams_lwir =   16; // number of cameras in the LWIR configuration
+	static constexpr int m_num_cams_rgb =     4; // number of cameras in the RGB configuration
+	static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)}; // always 16 elements; RGB uses only the first 4. NOTE(review): std::max needs the algorithm header, which this header does not include directly - confirm it arrives transitively
+	/* Holds references to the run parameters and file paths plus the host-side and
+	 * GPU-side buffer pointers declared below. The set*(), save*File() and *free()
+	 * members and the destructor are only declared in this header. */
+
 public:
 	TpParams& m_tpParams; // reference bound in the constructor; not owned by this object
 	TpPaths& m_tpPaths;   // reference bound in the constructor; not owned by this object
+private:
+	// members below are stored inside the object itself (arrays and scalars) - no need to free
+    float               m_tile_coords_h        [max_num_cams][TpParams::tilesx*TpParams::tilesy][2]; // [max_num_cams][TILESX * TILESY][2]: two floats per tile per camera (presumably x,y - confirm)
+    struct gc           m_fgeometry_correction;
+    int                 m_correction_vector_length{};
+    int                 m_rByRDist_length{};
+    int                 m_texture_indices [TpParams::tilesx * TpParams::tilesya]; // [TILESX*TILESYA]
+    int                 m_num_textures{};
+    int                 m_tile_texture_layers{};
+    int                 m_tile_texture_size{};
+
+    int                 m_rgba_width{};  // zero here; original note: = (TILESX+1) * DTT_SIZE
+    int                 m_rgba_height{}; // zero here; original note: = (TILESY+1) * DTT_SIZE
+    int                 m_rbga_slices{}; // zero here; original note: = tpParams.texture_colors + 1 (4/1)
+
+
+	// separately allocated buffers - need to free
+	float *             m_host_kern_buf{}; // malloc'ed in the constructor: tpParams.kern_size floats
+	float *             m_ftask_data{};
+	float *             m_ftask_data1{};
+    float *             m_gpu_ftasks{};
+    int *               m_gpu_active_tiles{};   // tasks
+    int *               m_gpu_num_active{};     // tasks
+    int *               m_gpu_num_corr_tiles{}; // correlations
+
+
+	// host memory
+	// dstride* - size in bytes to be passed to the GPU kernels
+	size_t              dstride{}; // in bytes; NOTE(review): which buffer this stride describes is not stated here
+    size_t              dstride_rslt{};          // in bytes !
+    size_t              dstride_corr{};          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+    // in the future, dstride_corr can reuse that of dstride_corr_td?
+    size_t              dstride_corr_td{};       // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
+    size_t              dstride_corr_combo{};    // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
+    size_t              dstride_corr_combo_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
+    size_t              dstride_textures{}; // in bytes ! for one rgba/ya 16x16 tile
+    size_t              dstride_textures_rbga{}; // in bytes ! for one rgba/ya 16x16 tile. NOTE(review): comment is identical to dstride_textures - verify it applies to the RBGA buffer
+
+	float *             m_gpu_kernels_h[max_num_cams]{}; // host-side arrays with one pointer per camera (presumably the device addresses later mirrored into the m_gpu_* arrays below - confirm)
+	struct CltExtra *   m_gpu_kernel_offsets_h[max_num_cams]{};
+	float *             m_gpu_images_h[max_num_cams]{};
+	float *             m_gpu_clt_h[max_num_cams]{};
+	float *             m_gpu_corr_images_h[max_num_cams]{};
+    float *             m_correction_vector{};
+    float *             m_rByRDist{};
+    int   *             m_gpu_texture_indices{};
+
+
+
+	// GPU memory
+	float **            m_gpu_kernels{};
+	struct CltExtra  ** m_gpu_kernel_offsets{};
+	float **            m_gpu_images{};
+	float **            m_gpu_clt{};
+	float **            m_gpu_corr_images{};
+
+    struct gc          * m_gpu_geometry_correction{};
+    struct corr_vector * m_gpu_correction_vector{};
+    float              * m_gpu_rByRDist{};
+    struct trot_deriv  * m_gpu_rot_deriv{};
+
+    // correlations device memory
+    float              * m_gpu_corrs{};               // correlation tiles (per tile, per pair) in pixel domain
+    float              * m_gpu_corrs_td{};            // correlation tiles (per tile, per pair) in transform domain
+    float              * m_gpu_corrs_combo{};         // correlation tiles combined (1 per tile), pixel domain
+    float              * m_gpu_corrs_combo_td{};      // correlation tiles combined (1 per tile), transform domain
+    int                * m_gpu_corr_indices{};        // shared by gpu_corrs and gpu_corrs_td
+    int                * m_gpu_corrs_combo_indices{}; // shared by gpu_corrs_combo and gpu_corrs_combo_td
+
+    float *             m_gpu_textures{};
+    float *             m_gpu_diff_rgb_combo{};
+    float *             m_gpu_textures_rbga{};
+
+    int   *             m_gpu_woi{};
+    int   *             m_gpu_twh{};
+    int   *             m_gpu_num_texture_tiles{};
+
+    float *             m_gpu_port_offsets{};
+    float *             m_gpu_color_weights{};
+    float *             m_gpu_generate_RBGA_params{};
+
+public:
 	TpHostGpu(TpParams& tpParams, TpPaths& tpPaths) // binds both references and allocates m_host_kern_buf
 	:m_tpParams{tpParams}
 	,m_tpPaths{tpPaths}
+	,m_host_kern_buf{(float *) malloc(tpParams.kern_size * sizeof(float))} // NOTE(review): the malloc result is not checked for NULL here
 	{};
-};
+	~TpHostGpu(); // defined outside this header. NOTE(review): copy construction is not disabled although raw buffers are owned - confirm objects are never copied
+	void setImageKernels(); // the set*() members are only declared here; presumably each prepares one group of the buffers above - confirm in the .cu file
+	void setCltBuffers();
+	void setCorrImgBuffers();
+	void setImgBuffers();
+	void setGeometryCorrectionBuffers();
+	void setCorrelationBuffers();
+	void setTasks(const float target_disparity, const float scale);
+	void setTextures();
+	void setRGBA();
+	void testCorrelate2DIntra(int num_runs);
+	// for both intra and inter!
+	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrIndicesFile(const char * path, const char * prompt, int * gpu_corr_indices, int num_sel_sensors);
+
+private:
+	float * getCorrImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr, int num_sel_sensors);
+	float * getCorrTdImg(int corr_img_size, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
+	void hfree(float * p); // host free; intended body per original note: {if (p) free (p);}
+	void hfree(struct CltExtra * p);
+	void gfree(float * p); // gfree overloads: presumably release a GPU allocation of each pointer type - confirm in the .cu file
+	void gfree(int * p);
+	void gfree(struct CltExtra * p);

+	void gfree(struct gc * p);
+	void gfree(struct corr_vector * p);
+	void gfree(struct trot_deriv * p);
+
+};




--- a/src/TpParams.cu
+++ b/src/TpParams.cu
@@ -36,10 +36,10 @@ TpParams::TpParams(int lwir){
    texture_colors = num_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
    kern_tiles = KERNELS_HOR *  KERNELS_VERT * num_colors; // NUM_COLORS;
    kern_size =  kern_tiles * 4 * 64;
-    corr_size =  (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1); // CORR_SIZE;
-
-
-
+	corr_size =  2 * corr_out_rad + 1;
+	corr_length =  corr_size * corr_size;
+    num_tiles =  tp_tasks_size;
+    num_corr_indices = num_pairs * num_tiles;
 }


--- a/src/TpParams.h
+++ b/src/TpParams.h
@@ -8,8 +8,54 @@
 #ifndef SRC_TPPARAMS_H_
 #define SRC_TPPARAMS_H_
 #include <math.h>
+#include "dtt8x8.h"
+#include "tp_defines.h"
+#include "geometry_correction.h" // TP_TASK_TASK_*
+#include "TileProcessor.h"
 class TpParams{
-	static constexpr int m_num_cams_lwir =   16;
+public:
+	static constexpr int tilesx =                   TILESX;
+	static constexpr int tilesy =                   TILESY;
+	static constexpr int tilesya =                  TILESYA;
+	static constexpr int dtt_size =                 DTT_SIZE;
+	static constexpr int dtt_size2 =                DTT_SIZE2;
+	static constexpr int img_width =                IMG_WIDTH;
+	static constexpr int img_height =               IMG_HEIGHT;
+	static constexpr int kernels_hor =              KERNELS_HOR;
+	static constexpr int kernel_vert =              KERNELS_VERT;
+
+	static constexpr int task_inter_en =            TASK_INTER_EN; //          10 // Task bit to enable interscene correlation
+	static constexpr int task_corr_en =             TASK_CORR_EN;  //           9 // Task bit to enable intrascene correlation (pairs defined separately)
+	static constexpr int task_text_en =             TASK_TEXT_EN;  //           8 // task bit to enable texture generation
+
+	static constexpr int list_texture_bit =         LIST_TEXTURE_BIT; //        8 // 7 // bit to request texture calculation
+    static constexpr int text_ntile_shift =         TEXT_NTILE_SHIFT; //        9 // 8 // tile number shift for texture calculation (will be different from CORR_NTILE_SHIFT!)
+    static constexpr int task_texture_bits =        TASK_TEXTURE_BITS; //       TileProcessor.h
+
+    static constexpr int corr_ntile_shift =         CORR_NTILE_SHIFT; // 8 // higher bits - number of a pair, other bits tile number
+
+    static constexpr int corr_out_rad =             CORR_OUT_RAD; // 7
+    //???
+//    static constexpr int tp_task_size =        TASK_TEXTURE_BITS; //       TileProcessor.h
+//    int tp_task_size =  TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
+    static constexpr int tp_tasks_size =       tilesx * tilesy; //
+
+	static constexpr int tp_task_task_offset =      TP_TASK_TASK_OFFSET;//      0
+	static constexpr int tp_task_txy_offset =       TP_TASK_TXY_OFFSET;//       1
+	static constexpr int tp_task_disparity_offset = TP_TASK_DISPARITY_OFFSET;// 2
+	static constexpr int tp_task_centerxy_offset =  TP_TASK_CENTERXY_OFFSET;//  3
+	static constexpr int tp_task_scale_offset =     TP_TASK_SCALE_OFFSET;//     5
+	static constexpr int tp_task_xy_offset =        TP_TASK_XY_OFFSET;//        6
+	static constexpr float fat_zero = 1000.0f; // 300.0f; // 30.0;
+
+#ifdef DBG_TILE
+	static constexpr int debug_tile{1};
+#else
+	static constexpr int debug_tile{0};
+#endif
+
+private:
+	static constexpr int m_num_cams_lwir =   16; // refactor to s_
 	static constexpr int m_num_colors_lwir =  1;
 	static constexpr int m_num_pairs_lwir = 120;
 	static constexpr int m_num_cams_rgb =     4;
@@ -38,10 +84,13 @@ public:

    float port_offsets[max_num_cams][2]; // [NUM_CAMS][2];
    int keep_texture_weights {3}; // 0; // 1; // try with 0 also
-    int texture_colors; // 3; // result will be 3+1 RGBA (for mono - 2)
-    int kern_tiles;
-    int kern_size;
-    int corr_size;
+    int texture_colors{}; // 3; // result will be 3+1 RGBA (for mono - 2)
+    int kern_tiles{};
+    int kern_size{};
+    int num_tiles{};
+	int corr_size{};
+    int corr_length{};
+    int num_corr_indices{};
 //    std::vector<float[2]> m_port_offsets;



--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -31,7 +31,7 @@
 */

 // all of the next 5 were disabled
-#define NOCORR
+//#define NOCORR
 #define NOCORR_TD
 #define NOTEXTURES //
 #define NOTEXTURE_RGBA //
@@ -129,6 +129,8 @@ int main(int argc, char **argv)

    GenerateRgbaHost generateRgbaHost{}; //  = new GenerateRgbaHost();

+//    return 0;
+
    float * host_kern_buf = (float *) malloc(tpParams.kern_size * sizeof(float));
    float * ftask_data  =   (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
    float * ftask_data1  =  (float *) malloc(TILESX * TILESY * tpParams.task_size * sizeof(float));
@@ -264,7 +266,7 @@ int main(int argc, char **argv)
    // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
    gpu_corrs = alloc_image_gpu(
    		&dstride_corr,                  // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
-			tpParams.corr_size,                      // int width,
+			tpParams.corr_length,                      // int width,
 			tpParams.num_pairs * TILESX * TILESY);   // int height);
    // read channel images (assuming host_kern_buf size > image size, reusing it)
 // allocate all other correlation data, some may be
@@ -275,7 +277,7 @@ int main(int argc, char **argv)

    gpu_corrs_combo = alloc_image_gpu(
    		&dstride_corr_combo,             // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
-			tpParams.corr_size,                       // int width,
+			tpParams.corr_length,                       // int width,
 			TILESX * TILESY);                // int height);

    gpu_corrs_combo_td = alloc_image_gpu(
@@ -309,7 +311,7 @@ int main(int argc, char **argv)
            int nt = ty * TILESX + tx;
            int task_task = (1 << TASK_INTER_EN) | (1 << TASK_CORR_EN) | (1 << TASK_TEXT_EN); // just 1 bit, correlation selection is defined by common corr_sel bits
            int task_txy = tx + (ty << 16);
-            float task_target_disparity = DBG_DISPARITY;
+            float task_target_disparity = DBG_DISPARITY; // disparity for which to calculate offsets (not needed in Java)
            float * tp = ftask_data + tpParams.task_size * nt;
            *(tp + TP_TASK_TASK_OFFSET) =    *(float *) &task_task;
            *(tp + TP_TASK_TXY_OFFSET) =      *(float *) &task_txy;
@@ -325,15 +327,18 @@ int main(int argc, char **argv)
    }

    int tp_task_size =  TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/tpParams.task_size; // number of task tiles
+    gpu_ftasks = (float  *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));
+
    int num_active_tiles; // will be calculated by convert_direct
 	int rslt_corr_size;
 	int corr_img_size;

-    gpu_ftasks = (float  *) copyalloc_kernel_gpu(ftask_data, tp_task_size * tpParams.task_size); // (sizeof(struct tp_task)/sizeof(float)));

    // just allocate
    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        tpParams.num_pairs * TILESX * TILESY*sizeof(int)));
    checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices,             TILESX * TILESY*sizeof(int)));
+
+
    num_textures = 0;
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
@@ -353,10 +358,11 @@ int main(int argc, char **argv)
    		(float * ) texture_indices,
 			num_textures,
 			TILESX * TILESYA); // number of rows - multiple of 4
+
+
    // just allocate
    checkCudaErrors(cudaMalloc((void **)&gpu_woi,               4 * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&gpu_twh,               2 * sizeof(float)));
-
    checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
    // number of border tiles
    // copy port indices to gpu
@@ -410,7 +416,7 @@ int main(int argc, char **argv)
 	float * corr_img; //  = (float *)malloc(corr_img_size * sizeof(float));
 	float * cpu_corr; //  = (float *)malloc(rslt_corr_size * sizeof(float));
 	float * cpu_corr_td;
-	int * cpu_corr_indices; //  = (int *) malloc(num_corr_indices * sizeof(int));
+	int *   cpu_corr_indices; //  = (int *) malloc(num_corr_indices * sizeof(int));



@@ -540,16 +546,6 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerGEOM);
    		sdkStartTimer(&timerGEOM);
    	}
-/*
-    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
-    			tpParams.num_cams,                // int                  num_cams,
-    			gpu_tasks,               // struct tp_task     * gpu_tasks,
-				tp_task_size,            // int                  num_tiles,          // number of tiles in task list
-				gpu_geometry_correction, //	struct gc          * gpu_geometry_correction,
-				gpu_correction_vector,   //	struct corr_vector * gpu_correction_vector,
-				gpu_rByRDist,            //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
-				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
-				*/
    	calculate_tiles_offsets<<<1,1>>> (
    			1,                       // int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
 				tpParams.num_cams,                // int                  num_cams,
@@ -778,10 +774,10 @@ int main(int argc, char **argv)
    	}
    	correlate2D<<<1,1>>>(
    			tpParams.num_cams,                      // int               num_cams,
-				TpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
-				TpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
-				TpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
-				TpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
+				tpParams.sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				tpParams.sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				tpParams.sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				tpParams.sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
 				gpu_clt,                    // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				tpParams.num_colors,                 // int               colors,             // number of colors (3/1)
 				tpParams.color_weights[0], // 0.25,  // float             scale0,             // scale for R
@@ -886,6 +882,9 @@ int main(int argc, char **argv)
    free (corr_img);
 #endif // ifndef NOCORR

+
+
+
 #ifndef NOCORR_TD
 //    cudaProfilerStart();
 // testing corr
@@ -1019,7 +1018,7 @@ int main(int argc, char **argv)
 //    int rslt_corr_size =   num_corrs * corr_size * corr_size;
 //    float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));

-	rslt_corr_size =   num_corrs * corr_size * corr_size;
+	rslt_corr_size =   num_corrs * corr_length * corr_length;
 	corr_img_size = num_corr_indices * 16*16; // NAN
 	corr_img = (float *)malloc(corr_img_size * sizeof(float));
 	cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
@@ -1029,10 +1028,10 @@ int main(int argc, char **argv)

    checkCudaErrors(cudaMemcpy2D(
    		cpu_corr,
-			(corr_size * corr_size) * sizeof(float),
+			(corr_length * corr_length) * sizeof(float),
 			gpu_corrs,
 			dstride_corr,
-			(corr_size * corr_size) * sizeof(float),
+			(corr_length * corr_length) * sizeof(float),
 			num_corrs,
 			cudaMemcpyDeviceToHost));
    //    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        num_pairs * TILESX * TILESY*sizeof(int)));
@@ -1056,13 +1055,13 @@ int main(int argc, char **argv)
    	int ty = ctt / TILESX;
    	int tx = ctt % TILESX;
    	//		int src_offs0 = ict * tpParams.num_pairs * corr_size * corr_size;
-    	int src_offs0 = ict * corr_size * corr_size;
+    	int src_offs0 = ict * corr_length * corr_length;
    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);

-    	for (int iy = 0; iy < corr_size; iy++){
-    		int src_offs = src_offs0 + iy * corr_size; // ict * tpParams.num_pairs * corr_size * corr_size;
+    	for (int iy = 0; iy < corr_length; iy++){
+    		int src_offs = src_offs0 + iy * corr_length; // ict * tpParams.num_pairs * corr_size * corr_size;
    		int dst_offs = dst_offs0 + iy * (TILESX * 16);
-    		for (int ix = 0; ix < corr_size; ix++){
+    		for (int ix = 0; ix < corr_length; ix++){
    			corr_img[dst_offs++] = cpu_corr[src_offs++];
    		}
    	}

--- a/src/tp_paths.h
+++ b/src/tp_paths.h
@@ -33,6 +33,9 @@ public:
 		const char* rByRDist_file;
 		const char* correction_vector_file;
 		const char* geometry_correction_file;
+		const char* result_interscene_td =      "clt/aux_interscene-TD.raw";
+		const char* result_intrascene_td =      "clt/aux_intrascene-TD.raw";
+		const char* result_interscene_indices = "clt/aux_inter-indices.raw";

 private:
 	    const char * m_kernel_file_lwir[16] ={