Refactoring, moving functions to tp_utils, tp_files<.cu,.h>

821b753a · Andrey Filippov · de3c497a · 821b753a · 821b753a · 821b753a
Commit 821b753a authored Mar 26, 2025 by Andrey Filippov
7 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1146,7 +1146,7 @@ extern "C" __global__ void combine_inter(     // combine per-senor interscene co
 	int corr_in_block = threadIdx.y;
 	int itile = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // correlation tile index
 	int corr_offset = itile * (num_sel_sensors + 1); // index of the first correlation for this task;
-	if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
+	if (corr_offset >= (*num_corr_tiles - num_sel_sensors)) { // was 	if (corr_offset >= (num_corr_tiles - num_sel_sensors)) {
 		return;
 	}
 //    __syncthreads();// __syncwarp();
@@ -2058,7 +2058,8 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
-		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh)
 {
 	float             min_shot = params[0];           // 10.0
 	float             scale_shot = params[1];         // 3.0
@@ -2166,12 +2167,11 @@ extern "C" __global__ void generate_RBGA(
 	 if (threadIdx.x == 0) {
        */
-		__device__ int twh[2];
+///		__device__ int twh[2];
 		update_woi<<<1,1, 0, cudaStreamTailLaunch>>>(
 				texture_slices, // int                texture_slices,
 				woi,            // int              *                  // min_x, min_y, max_x, max_y input, not modified, max_x - not used
 				twh);           // int              * twh)  // 2-element in device global memory
-		);
 		// next kernels will see woi as {x,y,width,height}
 		    dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -216,5 +216,7 @@ extern "C" __global__ void generate_RBGA(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
-		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int             * twh);
--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -40,7 +40,7 @@
 #define SAVE_CLT
 #define NO_DP
-/// #define CORR_INTER_SELF 1
+#define CORR_INTER_SELF 1
 #include <stdio.h>
@@ -58,6 +58,8 @@
 #include "dtt8x8.h"
 #include "geometry_correction.h"
 #include "TileProcessor.cuh"
+#include "tp_utils.h"
+#include "tp_files.h"
 #if TEST_LWIR
 	#define IMG_WIDTH                   640
@@ -78,235 +80,6 @@
 #define TILESYA       ((TILESY +3) & (~3))
-float * copyalloc_kernel_gpu(float * kernel_host,
-		                int size, // size in floats
-						int full_size)
-{
-	float *kernel_gpu;
-    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
-    checkCudaErrors(cudaMemcpy( // segfault
-    		kernel_gpu,
-    		kernel_host,
-			size * sizeof(float),
-            cudaMemcpyHostToDevice));
-    return kernel_gpu;
-}
-float * copyalloc_kernel_gpu(float * kernel_host,
-		                int size)
-{
-	return copyalloc_kernel_gpu(kernel_host,
-            size, // size in floats
-			size);
-}
-float * alloccopy_from_gpu(
-		float * gpu_data,
-		float * cpu_data, // if null, will allocate
-		int size)
-{
-	if (!cpu_data) {
-		cpu_data = (float *)malloc(size*sizeof(float));
-	}
-	checkCudaErrors(cudaMemcpy( // segfault
-			cpu_data,
-			gpu_data,
-			size * sizeof(float),
-			cudaMemcpyDeviceToHost));
-	return cpu_data;
-}
-float * alloc_kernel_gpu(int size) // size in floats
-{
-	float *kernel_gpu;
-    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
-    return kernel_gpu;
-}
-float ** copyalloc_pointers_gpu(float ** gpu_pointer,
-		                int size) // number of entries (cameras)
-{
-	float ** gpu_pointer_to_gpu_pointers;
-    checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
-    checkCudaErrors(cudaMemcpy(
-    		gpu_pointer_to_gpu_pointers,
-			gpu_pointer,
-			size * sizeof(float*),
-            cudaMemcpyHostToDevice));
-    return gpu_pointer_to_gpu_pointers;
-}
-// shift image in-place, repeat lines/columns
-void shift_image (
-		float * image,
-        int width,
-		int height,
-		int bayer,
-		int dx,
-		int dy)
-{
-	int step = 1;
-	if (bayer){
-		step = 2;
-		dx &= -2;
-		dy &= -2;
-	}
-	// vertical shift dy>0 - down, dy < 0 - up
-	for (int m = 0; m < dy; m+= step) { // only if dy > 0 (down)
-		for (int y = height - 1; y >= step; y++){
-			float * dp = image + (y * width);
-			float * sp = dp - step * width;
-			for (int x = 0; x < width; x++){
-				(*dp++) = (*sp++);
-			}
-		}
-	}
-	// vertical shift dy < 0 - up
-	for (int m = 0; m > dy; m-= step) { // only if dy < 0 (up)
-		for (int y = 0; y < height - step; y++){
-			float * dp = image + (y * width);
-			float * sp = dp + step * width;
-			for (int x = 0; x < width; x++){
-				(*dp++) = (*sp++);
-			}
-		}
-	}
-	// horizontal shift dx > 0 - right, dx < 0 - left
-	for (int m = 0; m < dx; m+= step) { // only if dx > 0 (right)
-		for (int y = 0; y < height; y++){
-			float * dp = image + (y * width) + width - 1;
-			float * sp = dp - step;
-			for (int x = 0; x < (width - step); x++){
-				(*dp--) = (*sp--);
-			}
-		}
-	}
-	// horizontal shift dx < 0 - left
-	for (int m = 0; m > dx; m-= step) { // only if dx < 0 (left)
-		for (int y = 0; y < height; y++){
-			float * dp = image + (y * width);
-			float * sp = dp + step;
-			for (int x = 0; x < (width - step); x++){
-				(*dp++) = (*sp++);
-			}
-		}
-	}
-}
-void update_image_gpu(
-		float * image_host,
-		float * image_gpu,
-		size_t  dstride, // in floats !
-		int     width,
-		int     height){
-    checkCudaErrors(cudaMemcpy2D(
-    		image_gpu,
-            dstride, //  * sizeof(float),
-			image_host,
-			width * sizeof(float), // make in 16*n?
-            width * sizeof(float),
-			height,
-			cudaMemcpyHostToDevice));
-}
-float * copyalloc_image_gpu(
-		float * image_host,
-		size_t* dstride, // in floats !
-		int width,
-		int height)
-{
-	float *image_gpu;
-    checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
-    update_image_gpu(
-    		image_host,
-    		image_gpu,
-    		*dstride, // in floats !
-    		width,
-    		height);
-    /*
-    checkCudaErrors(cudaMemcpy2D(
-    		image_gpu,
-            *dstride, //  * sizeof(float),
-			image_host,
-			width * sizeof(float), // make in 16*n?
-            width * sizeof(float),
-			height,
-			cudaMemcpyHostToDevice));
-	*/
-    return image_gpu;
-}
-float * alloc_image_gpu(size_t* dstride, // in bytes!!
-		                int width,
-						int height)
-{
-	float *image_gpu;
-    checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
-    return image_gpu;
-}
-int get_file_size(std::string filename) // path to file
-{
-    FILE *p_file = NULL;
-    p_file = fopen(filename.c_str(),"rb");
-    fseek(p_file,0,SEEK_END);
-    int size = ftell(p_file);
-    fclose(p_file);
-    return size;
-}
-int readFloatsFromFile(float *       data, // allocated array
-					   const char *  path) // file path
-{
-    printf("readFloatsFromFile(%s)\n", path);
-    int fsize = get_file_size(path);
-    std::ifstream input(path, std::ios::binary );
-    // copies all data into buffer
-    std::vector<char> buffer((
-            std::istreambuf_iterator<char>(input)),
-            (std::istreambuf_iterator<char>()));
-    std::copy( buffer.begin(), buffer.end(), (char *) data);
-    printf("---- Bytes read: %d from %s\n", fsize, path);
-	return 0;
-}
-float * readAllFloatsFromFile(const char *  path,
-		int * len_in_floats) //
-{
-    int fsize = get_file_size(path);
-    float * data = (float *) malloc(fsize);
-    std::ifstream input(path, std::ios::binary );
-    std::vector<char> buffer((
-            std::istreambuf_iterator<char>(input)),
-            (std::istreambuf_iterator<char>()));
-    std::copy( buffer.begin(), buffer.end(), (char *) data);
-    printf("---- Bytes read: %d from %s\n", fsize, path);
-    * len_in_floats = fsize/sizeof(float);
-    return data;
-}
-int writeFloatsToFile(float *       data, // allocated array
-		               int           size, // length in elements
-					   const char *  path) // file path
-{
-	std::ofstream ofile(path, std::ios::binary);
-	ofile.write((char *) data, size * sizeof(float));
-	return 0;
-}
 // Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
 void set_clt_lpf(
 		float * lpf,    // size*size array to be filled out
@@ -978,6 +751,7 @@ int main(int argc, char **argv)
    float            * gpu_textures_rbga;
    int              * gpu_texture_indices;
    int              * gpu_woi;
+    int              * gpu_twh;
    int              * gpu_num_texture_tiles;
    float            * gpu_port_offsets;
    float            * gpu_color_weights;
@@ -1173,6 +947,8 @@ int main(int argc, char **argv)
 			TILESX * TILESYA); // number of rows - multiple of 4
    // just allocate
    checkCudaErrors(cudaMalloc((void **)&gpu_woi,               4 * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&gpu_twh,               2 * sizeof(float)));
    checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
    // number of border tiles
@@ -2312,7 +2088,7 @@ int main(int argc, char **argv)
    						num_cams,                        // 	int               num_cams,           // number of cameras used
 							(int *) 0,                       // int             * woi,                // x, y, width,height
 							gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-							cpu_pnum_texture_tiles,          // *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+							gpu_pnum_texture_tiles, /// cpu_pnum_texture_tiles,          // *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
 							0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
 							gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 							gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
@@ -2561,7 +2337,8 @@ int main(int argc, char **argv)
 	            1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
 				keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
 				dstride_textures_rbga/sizeof(float), // 	const size_t      texture_rbga_stride,     // in floats
-				gpu_textures_rbga);     // 	float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				gpu_textures_rbga,     // 	float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+				gpu_twh);              // int             * twh)
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -2673,6 +2450,7 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaFree(gpu_textures_rbga));
 	checkCudaErrors(cudaFree(gpu_diff_rgb_combo));
 	checkCudaErrors(cudaFree(gpu_woi));
+	checkCudaErrors(cudaFree(gpu_twh));
 	checkCudaErrors(cudaFree(gpu_num_texture_tiles));
 	checkCudaErrors(cudaFree(gpu_geometry_correction));
    checkCudaErrors(cudaFree(gpu_correction_vector));

--- a/src/tp_files.cu
+++ b/src/tp_files.cu
+/*
+ * tp_files.cu
+ *
+ *  Created on: Mar 25, 2025
+ *      Author: elphel
+ */
+//#include <cuda_runtime.h>
+//#include <helper_cuda.h>
+//#include <stdio.h>
+//#include <stdlib.h>
+//#include <fstream>
+//#include <iterator>
+//#include <vector>
+//#include "tp_utils.h"
+#include "tp_files.h"
+int get_file_size(std::string filename) // path to file
+{
+    FILE *p_file = NULL;
+    p_file = fopen(filename.c_str(),"rb");
+    fseek(p_file,0,SEEK_END);
+    int size = ftell(p_file);
+    fclose(p_file);
+    return size;
+}
+int readFloatsFromFile(float *       data, // allocated array
+					   const char *  path) // file path
+{
+    printf("readFloatsFromFile(%s)\n", path);
+    int fsize = get_file_size(path);
+    std::ifstream input(path, std::ios::binary );
+    // copies all data into buffer
+    std::vector<char> buffer((
+            std::istreambuf_iterator<char>(input)),
+            (std::istreambuf_iterator<char>()));
+    std::copy( buffer.begin(), buffer.end(), (char *) data);
+    printf("---- Bytes read: %d from %s\n", fsize, path);
+	return 0;
+}
+float * readAllFloatsFromFile(const char *  path,
+		int * len_in_floats) //
+{
+    int fsize = get_file_size(path);
+    float * data = (float *) malloc(fsize);
+    std::ifstream input(path, std::ios::binary );
+    std::vector<char> buffer((
+            std::istreambuf_iterator<char>(input)),
+            (std::istreambuf_iterator<char>()));
+    std::copy( buffer.begin(), buffer.end(), (char *) data);
+    printf("---- Bytes read: %d from %s\n", fsize, path);
+    * len_in_floats = fsize/sizeof(float);
+    return data;
+}
+int writeFloatsToFile(float *       data, // allocated array
+		               int           size, // length in elements
+					   const char *  path) // file path
+{
+	std::ofstream ofile(path, std::ios::binary);
+	ofile.write((char *) data, size * sizeof(float));
+	return 0;
+}
--- a/src/tp_files.h
+++ b/src/tp_files.h
+/*
+ * tp_files.h
+ *
+ *  Created on: Mar 25, 2025
+ *      Author: elphel
+ */
+#ifndef SRC_TP_FILES_H_
+#define SRC_TP_FILES_H_
+#include <fstream>
+#include <iterator>
+#include <vector>
+int get_file_size(std::string filename); // path to file
+int readFloatsFromFile(float *       data, // allocated array
+					   const char *  path); // file path
+float * readAllFloatsFromFile(const char *  path,
+		int * len_in_floats);
+int writeFloatsToFile(float *       data, // allocated array
+		               int           size, // length in elements
+					   const char *  path); //  file path
+#endif /* SRC_TP_FILES_H_ */
--- a/src/tp_utils.cu
+++ b/src/tp_utils.cu
+/*
+ * utils.cu
+ *
+ *  Created on: Mar 25, 2025
+ *      Author: elphel
+ */
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include "tp_utils.h"
+float * copyalloc_kernel_gpu(float * kernel_host,
+		                int size, // size in floats
+						int full_size)
+{
+	float *kernel_gpu;
+    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
+    checkCudaErrors(cudaMemcpy( // segfault
+    		kernel_gpu,
+    		kernel_host,
+			size * sizeof(float),
+            cudaMemcpyHostToDevice));
+    return kernel_gpu;
+}
+float * copyalloc_kernel_gpu(float * kernel_host,
+		                int size)
+{
+	return copyalloc_kernel_gpu(kernel_host,
+            size, // size in floats
+			size);
+}
+float * alloccopy_from_gpu(
+		float * gpu_data,
+		float * cpu_data, // if null, will allocate
+		int size)
+{
+	if (!cpu_data) {
+		cpu_data = (float *)malloc(size*sizeof(float));
+	}
+	checkCudaErrors(cudaMemcpy( // segfault
+			cpu_data,
+			gpu_data,
+			size * sizeof(float),
+			cudaMemcpyDeviceToHost));
+	return cpu_data;
+}
+float * alloc_kernel_gpu(int size) // size in floats
+{
+	float *kernel_gpu;
+    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
+    return kernel_gpu;
+}
+float ** copyalloc_pointers_gpu(float ** gpu_pointer,
+		                int size) // number of entries (cameras)
+{
+	float ** gpu_pointer_to_gpu_pointers;
+    checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
+    checkCudaErrors(cudaMemcpy(
+    		gpu_pointer_to_gpu_pointers,
+			gpu_pointer,
+			size * sizeof(float*),
+            cudaMemcpyHostToDevice));
+    return gpu_pointer_to_gpu_pointers;
+}
+// image-related
+// shift image in-place, repeat lines/columns
+void shift_image (
+		float * image,
+        int width,
+		int height,
+		int bayer,
+		int dx,
+		int dy)
+{
+	int step = 1;
+	if (bayer){
+		step = 2;
+		dx &= -2;
+		dy &= -2;
+	}
+	// vertical shift dy>0 - down, dy < 0 - up
+	for (int m = 0; m < dy; m+= step) { // only if dy > 0 (down)
+		for (int y = height - 1; y >= step; y++){
+			float * dp = image + (y * width);
+			float * sp = dp - step * width;
+			for (int x = 0; x < width; x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+	// vertical shift dy < 0 - up
+	for (int m = 0; m > dy; m-= step) { // only if dy < 0 (up)
+		for (int y = 0; y < height - step; y++){
+			float * dp = image + (y * width);
+			float * sp = dp + step * width;
+			for (int x = 0; x < width; x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+	// horizontal shift dx > 0 - right, dx < 0 - left
+	for (int m = 0; m < dx; m+= step) { // only if dx > 0 (right)
+		for (int y = 0; y < height; y++){
+			float * dp = image + (y * width) + width - 1;
+			float * sp = dp - step;
+			for (int x = 0; x < (width - step); x++){
+				(*dp--) = (*sp--);
+			}
+		}
+	}
+	// horizontal shift dx < 0 - left
+	for (int m = 0; m > dx; m-= step) { // only if dx < 0 (left)
+		for (int y = 0; y < height; y++){
+			float * dp = image + (y * width);
+			float * sp = dp + step;
+			for (int x = 0; x < (width - step); x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+}
+void update_image_gpu(
+		float * image_host,
+		float * image_gpu,
+		size_t  dstride, // in floats !
+		int     width,
+		int     height){
+    checkCudaErrors(cudaMemcpy2D(
+    		image_gpu,
+            dstride, //  * sizeof(float),
+			image_host,
+			width * sizeof(float), // make in 16*n?
+            width * sizeof(float),
+			height,
+			cudaMemcpyHostToDevice));
+}
+float * copyalloc_image_gpu(
+		float * image_host,
+		size_t* dstride, // in floats !
+		int width,
+		int height)
+{
+	float *image_gpu;
+    checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
+    update_image_gpu(
+    		image_host,
+    		image_gpu,
+    		*dstride, // in floats !
+    		width,
+    		height);
+    return image_gpu;
+}
+float * alloc_image_gpu(size_t* dstride, // in bytes!!
+		                int width,
+						int height)
+{
+	float *image_gpu;
+    checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
+    return image_gpu;
+}
--- a/src/tp_utils.h
+++ b/src/tp_utils.h
+/*
+ * tp_utils.h
+ *
+ *  Created on: Mar 25, 2025
+ *      Author: elphel
+ */
+#ifndef SRC_TP_UTILS_H_
+#define SRC_TP_UTILS_H_
+float * copyalloc_kernel_gpu(float * kernel_host,
+		                int size, // size in floats
+						int full_size);
+float * copyalloc_kernel_gpu(float * kernel_host,
+		                int size);
+float * alloccopy_from_gpu(
+		float * gpu_data,
+		float * cpu_data, // if null, will allocate
+		int size);
+float * alloc_kernel_gpu(int size); // size in floats
+float ** copyalloc_pointers_gpu(float ** gpu_pointer,
+		                int size); // number of entries (cameras)
+// image-related
+// shift image in-place, repeat lines/columns
+void shift_image (
+		float * image,
+        int width,
+		int height,
+		int bayer,
+		int dx,
+		int dy);
+void update_image_gpu(
+		float * image_host,
+		float * image_gpu,
+		size_t  dstride, // in floats !
+		int     width,
+		int     height);
+float * copyalloc_image_gpu(
+		float * image_host,
+		size_t* dstride, // in floats !
+		int width,
+		int height);
+float * alloc_image_gpu(size_t* dstride, // in bytes!!
+		                int width,
+						int height);
+#endif /* SRC_TP_UTILS_H_ */