debugged initially with jcuda

d8e9a454 · Andrey Filippov · 414f6351 · d8e9a454 · d8e9a454 · d8e9a454
Commit d8e9a454 authored Apr 15, 2025 by Andrey Filippov
6 changed files
--- a/src/TpHostGpu.cu
+++ b/src/TpHostGpu.cu
@@ -5,15 +5,21 @@
 *      Author: elphel
 */
 #include <stdexcept>
-#include <helper_cuda.h>  // for checkCudaErrors
+//#include <driver_types.h> // was not needed before, only for indexes - needs __DRIVER_TYPES_H__
-#include <cuda_runtime.h> // cudaFree
+#include <cstdlib>
+#include <cstdio>
+#include <cuda_runtime.h>     // cudaFree
+#include <helper_cuda.h>      // for checkCudaErrors
 #include <helper_functions.h> // timer functions
 //#include "TpParams.h"   // TpHostGpu.h has it
 #include "tp_paths.h"
 #include "tp_files.h"
 #include "tp_utils.h"       // for copyalloc_kernel_gpu
-#include "GenerateRgbaHost.h"
+//#include "GenerateRgbaHost.h"
 #include "TpHostGpu.h"
 #define MY_EXCEPTION(aMessage) \
@@ -79,6 +85,7 @@ void TpHostGpu::allTests(
 		int image_dy,
 		const float target_disparity,
 		const float scale,
+		const float fat_zero,         // 1000.0
 		int quad_combine,
 		int use_dp,
 		int debug){
@@ -100,10 +107,10 @@ void TpHostGpu::allTests(
 	testConvertDirect       (num_runs);                                // 608
 	testImcltRbgAll         (num_runs);                                // 701
-	testCorrelate2DIntra    (num_runs);                                // 762 - 885
+	testCorrelate2DIntra    (num_runs, fat_zero);                                // 762 - 885
-	testCorrelate2DIntraTD  (num_runs, quad_combine);                  // 886 - 1123
+	testCorrelate2DIntraTD  (num_runs, fat_zero, quad_combine);                  // 886 - 1123
 	setImgBuffersShifted(is_bayer, image_dx, image_dy);                // 1171-1188
-	testCorrelate2DInterSelf(num_runs);                                // 1136 - 1411
+	testCorrelate2DInterSelf(num_runs, fat_zero);                                // 1136 - 1411
 	testTextures            (num_runs, use_dp, debug);                 // 1422-1664
 	testTexturesRGBA        (num_runs, use_dp, debug);                 // 1669-1810
 	return;
@@ -696,7 +703,7 @@ void TpHostGpu::testImcltRbgAll         (int num_runs){ // 701
    		m_gpu_corr_images_h); // float **     gpu_corr_images_h){
 }
-void TpHostGpu::testCorrelate2DIntra(int num_runs){
+void TpHostGpu::testCorrelate2DIntra(int num_runs, float fat_zero){
 	int  num_corr_indices = m_tpParams.num_pairs * m_tpParams.num_tiles;
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 =            m_tpParams.debug_tile ? 0 : -1;
@@ -730,7 +737,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 				m_tpParams.color_weights[0], // 0.25,  // float             scale0,             // scale for R
 				m_tpParams.color_weights[1], // 0.25,  // float             scale1,             // scale for B
 				m_tpParams.color_weights[2], // 0.5,   // float             scale2,             // scale for G
-				m_tpParams.fat_zero * m_tpParams.fat_zero,                // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,         // float             fat_zero2,           // here - absolute
 				m_gpu_ftasks,                // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				m_tpParams.tp_tasks_size,    // int               num_tiles) // number of tiles in task
 				m_tpParams.tilesx,           // int               tilesx,             // number of tile rows
@@ -764,7 +771,7 @@ void TpHostGpu::testCorrelate2DIntra(int num_runs){
 			16);                        //int          num_sel_sensors) { // only for interscene
 }
-void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886 - 1123
+void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, float fat_zero, int quad_combine){ // 886 - 1123
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 =            m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -817,7 +824,7 @@ void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886
 				m_tpParams.color_weights[0],   // 0.25,  // float             scale0,             // scale for R
 				m_tpParams.color_weights[1],   // 0.25,  // float             scale1,             // scale for B
 				m_tpParams.color_weights[2],   // 0.5,   // float             scale2,             // scale for G
-				m_tpParams.fat_zero * m_tpParams.fat_zero,                // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,           // float             fat_zero2,           // here - absolute
 				m_gpu_ftasks,                  // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				m_tpParams.tp_tasks_size,      // int               num_tiles) // number of tiles in task
 				m_tpParams.tilesx,             // int               tilesx,             // number of tile rows
@@ -856,7 +863,7 @@ void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886
 					(float *) 0,                               // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr_combo/sizeof(float),          // const size_t      corr_stride,        // in floats
 					m_gpu_corrs_combo,                         // float           * gpu_corrs,          // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					fat_zero * fat_zero,                       // float             fat_zero2,           // here - absolute
 					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
 			printf("corr2D_combine pass: %d\n",i);
 		}else { // if (quad_combine) {
@@ -868,7 +875,7 @@ void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886
 					(float *) 0, // corr_weights,              // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
 					dstride_corr/sizeof(float),                // const size_t      corr_stride,        // in floats
 					m_gpu_corrs,                               // float           * gpu_corrs,          // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					fat_zero * fat_zero,                       // float             fat_zero2,           // here - absolute
 					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
 		} // if (quad_combine) {
@@ -971,7 +978,7 @@ void TpHostGpu::testCorrelate2DIntraTD  (int num_runs, int quad_combine){ // 886
 	} // if (quad_combine) {
 }
-void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
+void TpHostGpu::testCorrelate2DInterSelf(int num_runs, float fat_zero){ // 889
 	int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
 	int i0 =            m_tpParams.debug_tile ? 0 : -1;
 	// check/replace names
@@ -1087,7 +1094,7 @@ void TpHostGpu::testCorrelate2DInterSelf(int num_runs){ // 889
    				(float *) 0, // corr_weights,              // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
    				dstride_corr/sizeof(float),                // const size_t      corr_stride,        // in floats
 					m_gpu_corrs,                               // float           * gpu_corrs,          // correlation output data (pixel domain)
-					m_tpParams.fat_zero * m_tpParams.fat_zero, // float             fat_zero2,           // here - absolute
+					fat_zero * fat_zero,                       // float             fat_zero2,           // here - absolute
 					m_tpParams.corr_out_rad);                  // int               corr_radius);        // radius of the output correlation (7 for 15x15)
     	getLastCudaError("Kernel failure:corr2D_normalize");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -1440,6 +1447,8 @@ void TpHostGpu::testTexturesRGBA        (
 			printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size, m_tpParams.num_cams, m_tpParams.texture_colors);
 			cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_size); // 60000); // 5536); // for CC 7.5
+			// was not here - next line
+			cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
 			generate_RBGA<<<1,1>>> (
 					m_tpParams.num_cams,              // int                num_cams,           // number of cameras used
 					// Parameters to generate texture tasks
@@ -1788,6 +1797,8 @@ float * TpHostGpu::getCorrTdImg(
 	return corr_img;
 }
+//void TpHostGpu::generate_RBGA_host(
+// static // https://stackoverflow.com/questions/15725922/static-function-a-storage-class-may-not-be-specified-here
 void TpHostGpu::generate_RBGA_host(
 		int                num_cams,           // number of cameras used
 		// Parameters to generate texture tasks
@@ -2031,7 +2042,6 @@ void TpHostGpu::generate_RBGA_host(
 				pntt, // ntt,                    // int *             num_texture_tiles,  // number of texture tiles to process
 				ti_offset,                       //                gpu_texture_indices_offset,// add to gpu_texture_indices
 				gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-				//				 gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 				gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 				colors,                          // int               colors,             // number of colors (3/1)
 				is_lwir,                         // int               is_lwir,            // do not perform shot correction

--- a/src/TpHostGpu.h
+++ b/src/TpHostGpu.h
@@ -117,6 +117,7 @@ public:
 			int image_dy,                 // 0
 			const float target_disparity, // DBG_DISPARITY == 0.0
 			const float scale,            // 0.0
+			const float fat_zero,         // 1000.0
 			int quad_combine,
 			int use_dp,
 			int debug);
@@ -138,27 +139,13 @@ public:
 //	void testImclt               (int num_runs);                            // 682 // not implemented
 	void testImcltRbgAll         (int num_runs);                            // 701
-	void testCorrelate2DIntra    (int num_runs);                            // 762 - 885
+	void testCorrelate2DIntra    (int num_runs, float fat_zero);                            // 762 - 885
-	void testCorrelate2DIntraTD  (int num_runs, int quad_combine);          // 886 - 1123
+	void testCorrelate2DIntraTD  (int num_runs, float fat_zero, int quad_combine);          // 886 - 1123
 	//void setImgBuffersShifted(int is_bayer, int image_dx, int image_dy);  // 1171-1188
-	void testCorrelate2DInterSelf(int num_runs);                            // 1136 - 1411
+	void testCorrelate2DInterSelf(int num_runs, float fat_zero);                            // 1136 - 1411
 	void testTextures            (int num_runs, int use_dp, int debug);     // 1422-1664
 	void testTexturesRGBA        (int num_runs, int use_dp, int debug);      // 1669-1810
+	static void generate_RBGA_host( // not a member
-private:
-	void saveClt(const char ** paths,  const char * prompt, float **     gpu_clt_h);
-	void saveRgb(const char ** paths,  const char * prompt, float **     gpu_corr_images_h);
-	// for both intra and inter!
-	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
-	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
-	float * getCorrImg  (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr,   int num_sel_sensors);
-	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
-	void generate_RBGA_host( // not a member
 			int                num_cams,           // number of cameras used
 			// Parameters to generate texture tasks
 			float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16p//		struct tp_task   * gpu_tasks,
@@ -182,6 +169,19 @@ private:
 			const int         texture_rbga_stride,     // in floats
 			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+private:
+	void saveClt(const char ** paths,  const char * prompt, float **     gpu_clt_h);
+	void saveRgb(const char ** paths,  const char * prompt, float **     gpu_corr_images_h);
+	// for both intra and inter!
+	void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float  * gpu_corrs_td,  int * gpu_corr_indices, int num_sel_sensors);
+	void saveInterCorrIndicesFile(const char * path, const char * prompt, int num_corr_indices, int * gpu_corr_indices, int num_sel_sensors);
+	float * getCorrImg  (int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr,   int num_sel_sensors);
+	float * getCorrTdImg(int corr_img_size, int num_corr_indices, int * cpu_corr_indices, float * cpu_corr_td, int num_sel_sensors);
 	void hfree(float *& p); // {if (p) free (p);}
 	void hfree(struct CltExtra *& p);
 	void gfree(float *& p);
@@ -193,9 +193,9 @@ private:
 	void gfree(struct trot_deriv *& p);
 	void gfree(float **& p);
 	void gfree(struct CltExtra **& p);
 };
 #endif /* SRC_TPHOSTGPU_H_ */
--- a/src/TpParams.h
+++ b/src/TpParams.h
@@ -49,7 +49,7 @@ public:
 	static constexpr int tp_task_centerxy_offset =  TP_TASK_CENTERXY_OFFSET;//  3
 	static constexpr int tp_task_scale_offset =     TP_TASK_SCALE_OFFSET;//     5
 	static constexpr int tp_task_xy_offset =        TP_TASK_XY_OFFSET;//        6
-	static constexpr float fat_zero =               1000.0f; // 300.0f; // 30.0;
+//	static constexpr float fat_zero =               1000.0f; // 300.0f; // 30.0;
    static constexpr int convert_direct_indexing_threads =      CONVERT_DIRECT_INDEXING_THREADS; //
    static constexpr int convert_direct_indexing_threads_log2 = CONVERT_DIRECT_INDEXING_THREADS_LOG2; //

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -112,12 +112,23 @@ __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
 		}
 };
+// TODO: Make offsets calculate in compile time, to avoid NVRTC(in java): " error: dynamic initialization is not supported for a __constant__ variable"
+__constant__ int angles_offsets [4] {15,0,30,30};
+/*
+__constant__ int angles_offsets [4]  {
+		(int) (offsetof4(corr_vector, azimuth)),
+		(int) (offsetof4(corr_vector, tilt)),
+		(int) (offsetof4(corr_vector, roll)),
+		(int) (offsetof4(corr_vector, roll))};
+*/
+/*
+ __constant__ int angles_offsets [4] = {
+		(int) (offsetof(corr_vector, azimuth)/sizeof(float)),
+		(int) (offsetof(corr_vector, tilt)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float)),
+		(int) (offsetof(corr_vector, roll)   /sizeof(float))};
-__constant__ int angles_offsets [4] = {
+ */
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
 __constant__ int mm_seq [3][3][3]={
 		{
 				{6,5,12}, // a_t * a_z -> tmp0

--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -51,6 +51,11 @@
    ((size_t)&(((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
+#ifndef offsetof4
+#define offsetof4(st, m) \
+    (((size_t)&(((st *)0)->m))>>2)
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
 #define SCENE_UNITS_SCALE  0.001 // meters from mm

--- a/src/test_tp.cu
+++ b/src/test_tp.cu