/*
 * utils.cu
 *
 *  Created on: Mar 25, 2025
 *      Author: elphel
 */
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include "tp_utils.h"
#include "dtt8x8.h" // for DTT_SIZE * DTT_SIZE21
#include "tp_defines.h" // for TEXTURE_THREADS_PER_TILE

/**
 * Allocate a float buffer on the GPU and fill its beginning from host memory.
 *
 * @param kernel_host  source data in host memory (copied HostToDevice)
 * @param size         number of floats copied from kernel_host
 * @param full_size    number of floats allocated on the device
 * @return device pointer to the new allocation. Only the first `size` floats
 *         are written here; the remaining (full_size - size) floats are left
 *         as cudaMalloc() returned them.
 *
 * No check is made that size <= full_size - that is up to the caller.
 * CUDA return codes are passed to checkCudaErrors().
 */
float * copyalloc_kernel_gpu(float * kernel_host,
		                int size, // size in floats
						int full_size)
{
	const size_t alloc_bytes = full_size * sizeof(float);
	const size_t copy_bytes  = size * sizeof(float);
	float *device_buf;
	checkCudaErrors(cudaMalloc((void **)&device_buf, alloc_bytes));
	checkCudaErrors(cudaMemcpy(device_buf, kernel_host, copy_bytes, cudaMemcpyHostToDevice));
	return device_buf;
}

/**
 * Convenience overload: allocate exactly `size` floats on the GPU and copy
 * all of them from kernel_host.
 *
 * @param kernel_host  source data in host memory
 * @param size         number of floats to allocate and copy
 * @return device pointer returned by the three-argument overload
 */
float * copyalloc_kernel_gpu(float * kernel_host,
		                int size)
{
	// allocation size and copy size are the same here
	const int full_size = size;
	return copyalloc_kernel_gpu(kernel_host, size, full_size);
}



/**
 * Copy `size` floats from device memory to host memory, allocating the host
 * buffer when none is supplied.
 *
 * @param gpu_data  source pointer in device memory
 * @param cpu_data  destination host buffer; if null, a buffer of
 *                  size*sizeof(float) bytes is obtained with malloc()
 * @param size      number of floats to copy
 * @return the host buffer that received the data (cpu_data, or the newly
 *         malloc'ed buffer, which the caller then has to free())
 *
 * The malloc() result is not checked here; CUDA return codes are passed to
 * checkCudaErrors().
 */
float * alloccopy_from_gpu(
		float * gpu_data,
		float * cpu_data, // if null, will allocate
		int size)
{
	float * host_buf = cpu_data;
	if (host_buf == 0) {
		host_buf = (float *)malloc(size*sizeof(float));
	}
	const size_t num_bytes = size * sizeof(float);
	checkCudaErrors(cudaMemcpy(host_buf, gpu_data, num_bytes, cudaMemcpyDeviceToHost));
	return host_buf;
}


/**
 * Allocate an uninitialized float buffer in device memory.
 *
 * @param size  number of floats to allocate
 * @return device pointer obtained from cudaMalloc(); the cudaMalloc() return
 *         code is passed to checkCudaErrors()
 */
float * alloc_kernel_gpu(int size) // size in floats
{
	const size_t num_bytes = size * sizeof(float);
	float *device_buf;
	checkCudaErrors(cudaMalloc((void **)&device_buf, num_bytes));
	return device_buf;
}


/**
 * Upload an array of float pointers to the GPU.
 *
 * @param gpu_pointer  array in HOST memory holding `size` float pointers
 *                     (it is the source of a HostToDevice copy)
 * @param size         number of pointers in the array (one per camera,
 *                     according to the original note)
 * @return device pointer to a newly allocated copy of the pointer array
 *
 * CUDA return codes are passed to checkCudaErrors().
 */
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
		                int size) // number of entries (cameras)
{
	const size_t table_bytes = size * sizeof(float*);
	float ** device_table;
	checkCudaErrors(cudaMalloc((void **)&device_table, table_bytes));
	checkCudaErrors(cudaMemcpy(device_table, gpu_pointer, table_bytes, cudaMemcpyHostToDevice));
	return device_table;
}

// image-related
// shift image in-place, repeat lines/columns
/**
 * Shift an image in place by (dx, dy) pixels.
 *
 * The shift is carried out as a sequence of single-step moves. Rows/columns
 * that have no source inside the image are not written, so the border
 * rows/columns end up repeated.
 *
 * @param image   row-major pixel array of width*height floats, modified in place
 * @param width   row length in pixels
 * @param height  number of rows
 * @param bayer   non-zero: move in steps of 2 pixels; dx and dy are first
 *                rounded down (toward negative infinity) to an even value
 * @param dx      horizontal shift: > 0 moves the content right, < 0 left
 * @param dy      vertical shift: > 0 moves the content down, < 0 up
 */
void shift_image (
		float * image,
        int width,
		int height,
		int bayer,
		int dx,
		int dy)
{
	int step = 1;
	if (bayer){
		step = 2;
		dx &= -2; // clear the low bit: 3 -> 2, -1 -> -2
		dy &= -2;
	}
	// vertical shift dy > 0 - down
	for (int m = 0; m < dy; m+= step) { // only if dy > 0 (down)
		// Walk the rows bottom-up so every source row is read before it is
		// overwritten. (The row index has to decrease: incrementing it here
		// ran past the last row and accessed memory outside the image.)
		for (int y = height - 1; y >= step; y--){
			float * dp = image + (y * width);
			float * sp = dp - step * width;
			for (int x = 0; x < width; x++){
				(*dp++) = (*sp++);
			}
		}
	}

	// vertical shift dy < 0 - up
	for (int m = 0; m > dy; m-= step) { // only if dy < 0 (up)
		// top-down: the source row is below the destination row
		for (int y = 0; y < height - step; y++){
			float * dp = image + (y * width);
			float * sp = dp + step * width;
			for (int x = 0; x < width; x++){
				(*dp++) = (*sp++);
			}
		}
	}

	// horizontal shift dx > 0 - right
	for (int m = 0; m < dx; m+= step) { // only if dx > 0 (right)
		for (int y = 0; y < height; y++){
			// start at the last pixel of the row and move towards the first
			float * dp = image + (y * width) + width - 1;
			float * sp = dp - step;
			for (int x = 0; x < (width - step); x++){
				(*dp--) = (*sp--);
			}
		}
	}

	// horizontal shift dx < 0 - left
	for (int m = 0; m > dx; m-= step) { // only if dx < 0 (left)
		for (int y = 0; y < height; y++){
			float * dp = image + (y * width);
			float * sp = dp + step;
			for (int x = 0; x < (width - step); x++){
				(*dp++) = (*sp++);
			}
		}
	}
}

/**
 * Copy a tightly packed host image into an existing pitched device image.
 *
 * @param image_host  source image in host memory, rows of `width` floats
 *                    stored back to back (source pitch = width*sizeof(float))
 * @param image_gpu   destination image in device memory
 * @param dstride     destination row pitch. It is handed to cudaMemcpy2D()
 *                    unchanged, i.e. it is used as a pitch in BYTES (the
 *                    value cudaMallocPitch() reports), not in floats.
 * @param width       image width in floats
 * @param height      number of rows
 *
 * The cudaMemcpy2D() return code is passed to checkCudaErrors().
 */
void update_image_gpu(
		float * image_host,
		float * image_gpu,
		size_t  dstride, // used as the cudaMemcpy2D destination pitch (bytes)
		int     width,
		int     height){
	// host rows are contiguous, so source pitch equals the copied row width
	const size_t row_bytes = width * sizeof(float);
	checkCudaErrors(cudaMemcpy2D(
			image_gpu,  // dst
			dstride,    // dst pitch
			image_host, // src
			row_bytes,  // src pitch
			row_bytes,  // bytes copied per row
			height,     // rows
			cudaMemcpyHostToDevice));
}

/**
 * Allocate a pitched image on the GPU and fill it from a host image.
 *
 * @param image_host  source image in host memory, passed on to
 *                    update_image_gpu()
 * @param dstride     output: receives the row pitch chosen by
 *                    cudaMallocPitch() (which reports it in bytes)
 * @param width       image width in floats
 * @param height      number of rows
 * @return device pointer to the newly allocated image
 *
 * The cudaMallocPitch() return code is passed to checkCudaErrors().
 */
float * copyalloc_image_gpu(
		float * image_host,
		size_t* dstride, // out: pitch as returned by cudaMallocPitch
		int width,
		int height)
{
	const size_t row_bytes = width * sizeof(float);
	float *device_image;
	checkCudaErrors(cudaMallocPitch((void **)&device_image, dstride, row_bytes, height));
	// upload using the pitch that was just written to *dstride
	update_image_gpu(image_host, device_image, *dstride, width, height);
	return device_image;
}

/**
 * Allocate an uninitialized pitched image in device memory.
 *
 * @param dstride  output: receives the row pitch chosen by cudaMallocPitch(),
 *                 in bytes
 * @param width    image width in floats
 * @param height   number of rows
 * @return device pointer to the newly allocated image
 *
 * The cudaMallocPitch() return code is passed to checkCudaErrors().
 */
float * alloc_image_gpu(size_t* dstride, // in bytes!!
		                int width,
						int height)
{
	const size_t row_bytes = width * sizeof(float);
	float *device_image;
	checkCudaErrors(cudaMallocPitch((void **)&device_image, dstride, row_bytes, height));
	return device_image;
}

// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
/**
 * Fill a dct_size x dct_size low-pass filter table.
 *
 * For sigma == 0 the table is a unit impulse: lpf[0] = 1, all other entries 0.
 * Otherwise entry (i, j) is exp(-(i*i + j*j) / (2*sigma)), and the whole
 * table is then divided by
 *     sum over (i, j) of lpf[i][j] * cos(pi*i/(2*dct_size)) * cos(pi*j/(2*dct_size)) * k
 * where k doubles for each of i, j that is non-zero (1, 2 or 4).
 *
 * @param lpf       output array of dct_size*dct_size floats, row-major
 * @param sigma     filter parameter; note it enters the exponent as 2*sigma,
 *                  not 2*sigma*sigma
 * @param dct_size  table side length
 */
void set_clt_lpf(
		float * lpf,    // size*size array to be filled out
		float   sigma,
		const int     dct_size)
{
	const int num_coeffs = dct_size * dct_size;

	// degenerate case: no filtering, pass-through impulse
	if (sigma == 0.0f) {
		lpf[0] = 1.0f;
		for (int k = 1; k < num_coeffs; k++){
			lpf[k] = 0.0;
		}
		return;
	}

	// unnormalized coefficients
	for (int row = 0; row < dct_size; row++){
		float * lpf_row = lpf + row*dct_size;
		for (int col = 0; col < dct_size; col++){
			lpf_row[col] = exp(-(row*row+col*col)/(2*sigma));
		}
	}

	// normalization factor (accumulated in double)
	double norm = 0;
	for (int row = 0; row < dct_size; row++){
		const double cos_row = cos(M_PI*row/(2*dct_size));
		for (int col = 0; col < dct_size; col++){
			double term = lpf[row*dct_size+col];
			term *= cos_row*cos(M_PI*col/(2*dct_size));
			if (row > 0) term *= 2.0;
			if (col > 0) term *= 2.0;
			norm += term;
		}
	}

	for (int k = 0; k < num_coeffs; k++){
		lpf[k] /= norm;
	}
}

/**
 * Compute the layout and total size of a sequence of float arrays placed
 * back to back (the array names below are taken from the original notes):
 *
 *   [0] mclt_tiles        [num_cams][num_colors][2*DTT_SIZE][DTT_SIZE21]
 *   [1] clt_tiles         [num_cams][num_colors][4][DTT_SIZE][DTT_SIZE1]
 *   [2] mclt_tmp / rgbaw  one slot sized for the larger of the two
 *   [3] port_offsets      [num_cams][2]
 *   [4] ports_rgb_shared  [num_colors][num_cams]
 *   [5] max_diff_shared   [num_cams]
 *   [6] max_diff_tmp      [num_cams][TEXTURE_THREADS_PER_TILE]
 *   [7] ports_rgb_tmp     [num_colors][num_cams][TEXTURE_THREADS_PER_TILE]
 *
 * @param num_cams    actual number of cameras
 * @param num_colors  actual number of colors: 3 for RGB, 1 for LWIR/mono
 * @param offsets     optional (may be null). When given it must have room for
 *                    9 ints: offsets[0..7] receive the start of each array
 *                    above, in floats, and offsets[8] the total, in floats.
 * @return total size of all arrays in bytes
 */
int host_get_textures_shared_size( // in bytes
	    int                num_cams,     // actual number of cameras
	    int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
		int *              offsets){     // in floats
	// sizes of the individual arrays, in floats
	const int mclt_tiles_floats = num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21;
	const int clt_tiles_floats  = num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;
	// mclt_tmp [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21] shares its slot with
	// rgbaw [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
	const int mclt_tmp_floats   = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;
	const int rgbaw_floats      = (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21;
	const int tmp_slot_floats   = (rgbaw_floats > mclt_tmp_floats) ? rgbaw_floats : mclt_tmp_floats;
	const int port_offs_floats  = num_cams * 2;
	const int ports_rgb_floats  = num_colors * num_cams;
	const int max_diff_floats   = num_cams;
	const int max_diff_tmp_floats  = num_cams * TEXTURE_THREADS_PER_TILE;
	const int ports_rgb_tmp_floats = num_colors * num_cams *  TEXTURE_THREADS_PER_TILE;

	const int section_floats[] = {
			mclt_tiles_floats,
			clt_tiles_floats,
			tmp_slot_floats,
			port_offs_floats,
			ports_rgb_floats,
			max_diff_floats,
			max_diff_tmp_floats,
			ports_rgb_tmp_floats};
	const int num_sections = sizeof(section_floats) / sizeof(section_floats[0]);

	// running prefix sum: each section starts where the previous one ended
	int total_floats = 0;
	for (int n = 0; n < num_sections; n++) {
		if (offsets) offsets[n] = total_floats;
		total_floats += section_floats[n];
	}
	if (offsets) offsets[num_sections] = total_floats; // end marker == total
	return sizeof(float) * total_floats;
}




