/*
 * TpHostGpu.h
 *
 *  Created on: Apr 2, 2025
 *      Author: elphel
 */

#ifndef SRC_TPHOSTGPU_H_
#define SRC_TPHOSTGPU_H_

#include <algorithm> // std::max
#include <cstdlib>   // malloc

#include "geometry_correction.h"
#include "TpParams.h"
#include "TileProcessor.h"

/**
 * Holder of the host-side state, raw buffer pointers and test entry points
 * declared below.
 *
 * The object keeps references to externally provided TpParams / TpPaths (it
 * does not own them; they must outlive this object) and a number of raw
 * pointers. m_host_kern_buf is malloc()'ed in the constructor; the other
 * pointers start as nullptr and are presumably allocated by the set*()
 * methods and released by ~TpHostGpu() (both defined outside this header).
 *
 * Because of those raw owning pointers the class is non-copyable: a
 * member-wise copy would make two objects refer to the same allocations.
 *
 * NOTE(review): the numbers in the trailing comments of the method
 * declarations (e.g. "233-258") are kept from the original; they look like
 * line ranges of the code these methods were extracted from - confirm.
 */
class TpHostGpu{
	static constexpr int m_num_cams_lwir =   16;
	static constexpr int m_num_cams_rgb =     4;
	// Per-camera arrays below are always sized for the larger configuration
	// (16 elements); RGB uses only the first 4.
	static constexpr int max_num_cams {std::max(m_num_cams_rgb,m_num_cams_lwir)};

public:
	// Non-owning references, bound once in the constructor.
	TpParams& m_tpParams;
	TpPaths& m_tpPaths;
private:
	// no need to free (stored inside the object itself)
	float               m_tile_coords_h        [max_num_cams][TpParams::tilesx*TpParams::tilesy][2]; // [TILESX * TILESY][2];
	struct gc           m_fgeometry_correction;
	int                 m_correction_vector_length{};
	int                 m_rByRDist_length{};
	int                 m_texture_indices [TpParams::tilesx * TpParams::tilesya]; //     [TILESX*TILESYA];
	int                 m_num_textures{};
	int                 m_tile_texture_layers{};
	int                 m_tile_texture_size{};

	int                 m_rgba_width{};  //  =   (TILESX+1) * DTT_SIZE;
	int                 m_rgba_height{}; //  =  (TILESY+1) * DTT_SIZE;
	int                 m_rbga_slices{}; //  =  tpParams.texture_colors + 1; // 4/1


	// need to free
	float *             m_host_kern_buf{};      // malloc()'ed in the constructor: kern_size floats
	float *             m_ftask_data{};
	float *             m_ftask_data1{};
	float *             m_gpu_ftasks{};
	int *               m_gpu_active_tiles{};   // tasks
	int *               m_gpu_num_active{};     // tasks
	int *               m_gpu_num_corr_tiles{}; // correlations


	// host memory
	// dstride* - size in bytes to be passed to the GPU kernels
	size_t              dstride{};
	size_t              dstride_rslt{};          // in bytes !
	size_t              dstride_corr{};          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
	// in the future, dstride_corr can reuse that of dstride_corr_td?
	size_t              dstride_corr_td{};       // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
	size_t              dstride_corr_combo{};    // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
	size_t              dstride_corr_combo_td{}; // in bytes ! for one 2d phase correlation (padded 4x8x8x4 bytes)
	size_t              dstride_textures{};      // in bytes ! for one rgba/ya 16x16 tile
	size_t              dstride_textures_rbga{}; // in bytes ! for one rgba/ya 16x16 tile

	// Per-camera pointer tables kept on the host (one entry per camera, max_num_cams entries).
	float *             m_gpu_kernels_h[max_num_cams]{};
	struct CltExtra *   m_gpu_kernel_offsets_h[max_num_cams]{};
	float *             m_gpu_images_h[max_num_cams]{};
	float *             m_gpu_clt_h[max_num_cams]{};
	float *             m_gpu_corr_images_h[max_num_cams]{};
	float *             m_correction_vector{};
	float *             m_rByRDist{};
	int   *             m_gpu_texture_indices{};



	// GPU memory
	float **            m_gpu_kernels{};
	struct CltExtra  ** m_gpu_kernel_offsets{};
	float **            m_gpu_images{};
	float **            m_gpu_clt{};
	float **            m_gpu_corr_images{};

	struct gc          * m_gpu_geometry_correction{};
	struct corr_vector * m_gpu_correction_vector{};
	float              * m_gpu_rByRDist{};
	struct trot_deriv  * m_gpu_rot_deriv{};

	// correlations device memory
	float              * m_gpu_corrs{};               // correlation tiles (per tile, per pair) in pixel domain
	float              * m_gpu_corrs_td{};            // correlation tiles (per tile, per pair) in transform domain
	float              * m_gpu_corrs_combo{};         // correlation tiles combined (1 per tile), pixel domain
	float              * m_gpu_corrs_combo_td{};      // correlation tiles combined (1 per tile), transform domain
	int                * m_gpu_corr_indices{};        // shared by gpu_corrs gpu_corrs_td
	int                * m_gpu_corrs_combo_indices{}; // shared by gpu_corrs_combo and gpu_corrs_combo_td

	float *             m_gpu_textures{};
	float *             m_gpu_diff_rgb_combo{};
	float *             m_gpu_textures_rbga{};

	int   *             m_gpu_woi{};
	int   *             m_gpu_twh{};
	int   *             m_gpu_num_texture_tiles{};

	float *             m_gpu_port_offsets{};
	float *             m_gpu_color_weights{};
	float *             m_gpu_generate_RBGA_params{};

public:
	/**
	 * Binds the parameter/path objects and allocates the host kernel buffer
	 * (tpParams.kern_size floats) with malloc().
	 * NOTE(review): the malloc() result is not checked here; m_host_kern_buf
	 * is nullptr if the allocation fails.
	 */
	TpHostGpu(TpParams& tpParams, TpPaths& tpPaths)
	:m_tpParams{tpParams}
	,m_tpPaths{tpPaths}
	,m_host_kern_buf{static_cast<float *>(malloc(tpParams.kern_size * sizeof(float)))}
	{}
	~TpHostGpu();

	// Non-copyable: the object holds raw pointers to buffers it allocates
	// (see "need to free" above); a member-wise copy would share them
	// between two objects.
	TpHostGpu(const TpHostGpu&) = delete;
	TpHostGpu& operator=(const TpHostGpu&) = delete;

	void allTests(
			int num_runs,
			int image_dx,                 // 2
			int image_dy,                 // 0
			const float target_disparity, // DBG_DISPARITY == 0.0
			const float scale,            // 0.0
			int quad_combine,
			int use_dp,
			int debug);
	void setImageKernels();                                                 // 233-258 (overlap)
	void setCltBuffers();                                                   // 246
	void setCorrImgBuffers();                                               // 252
	void setImgBuffers();                                                   // 283-292
	void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy);    // 1171-1188 SHOULD be called before testCorrelate2DInterSelf
	void setGeometryCorrectionBuffers();                                    // 207-231
	void setCorrelationBuffers();                                           // 260-281 , 332-333
	void setTasks(const float target_disparity, const float scale);         // 129, 302-325
	void setTextures();                                                     // 337-348, ??
	void setRGBA();                                                         // 377 - 390

	trot_deriv testRotMatrices   (int num_runs);                            // 420
	void testReverseDistortions  (int num_runs);                            // 468
	void testGeomCorrect         (int num_runs);                            // 534
	void testConvertDirect       (int num_runs);                            // 608
//	void testImclt               (int num_runs);                            // 682 // not implemented
	void testImcltRbgAll         (int num_runs);                            // 701

	void testCorrelate2DIntra    (int num_runs);                            // 762 - 885
	void testCorrelate2DIntraTD  (int num_runs, int quad_combine);          // 886 - 1123
	//void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy);  // 1171-1188
	void testCorrelate2DInterSelf(int num_runs);                            // 1136 - 1411
	void testTextures            (int num_runs, int use_dp, int debug);     // 1422-1664
	void testTexturesRGBA        (int num_runs, int use_dp, int debug);     // 1669-1810


private:
	void saveClt(const char ** paths,  const char * prompt, float **     gpu_clt_h);
	void saveRgb(const char ** paths,  const char * prompt, float **     gpu_corr_images_h);


	// for both intra and inter!
	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);

	float * getCorrImg  (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr,   int num_sel_sensors);
	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
	// NOTE(review): this used to be annotated "not a member", but it is
	// declared here as a private member function.
	void generate_RBGA_host(
			int                num_cams,           // number of cameras used
			// Parameters to generate texture tasks
			float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p//		struct tp_task   * gpu_tasks,
			int                num_tiles,          // number of tiles in task list
			// declare arrays in device code?
			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
			int              * gpu_num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
			int              * gpu_woi,                // x,y,width,height of the woi
			int                width,  // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
			int                height, // <= TILES-Y, use for faster processing of LWIR images
			// Parameters for the texture generation
			float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
			// TODO: use geometry_correction rXY !
			struct gc       * gpu_geometry_correction,
			int               colors,             // number of colors (3/1)
			int               is_lwir,            // do not perform shot correction
			const float       cpu_params[5],      // mitigating CUDA_ERROR_INVALID_PTX
			const float       weights[3],         // scale for R,B,G should be host_array, not gpu
			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
			int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
			const int         texture_rbga_stride,     // in floats
			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles

	// Release helpers, one overload per pointer type; the pointer is taken by
	// reference. Presumably hfree() releases host memory, gfree() releases GPU
	// memory, and both reset the pointer - confirm against the definitions.
	void hfree(float *& p); // {if (p) free (p);}
	void hfree(struct CltExtra *& p);
	void gfree(float *& p);
	void gfree(int *& p);
	void gfree(struct CltExtra *& p);

	void gfree(struct gc *& p);
	void gfree(struct corr_vector *& p);
	void gfree(struct trot_deriv *& p);
	void gfree(float **& p);
	void gfree(struct CltExtra **& p);

};



#endif /* SRC_TPHOSTGPU_H_ */
