Commit 14d7689b authored by Andrey Filippov

More methods replacing flat code

parent dc090454
@@ -280,22 +280,318 @@ void TpHostGpu::setRGBA(){
(m_tpParams.num_colors + 1) * sizeof(float)));
}
-void TpHostGpu::testRotMatrices (int num_runs){ // 424
+trot_deriv TpHostGpu::testRotMatrices (int num_runs){ // 424
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
dim3 threads_rot (3,3,3);
dim3 grid_rot (m_tpParams.num_cams, 1, 1);
printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
StopWatchInterface *timerROT_MATRICES = 0;
sdkCreateTimer(&timerROT_MATRICES);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerROT_MATRICES);
sdkStartTimer(&timerROT_MATRICES);
}
calc_rot_deriv<<<grid_rot,threads_rot>>> (
m_tpParams.num_cams, // int num_cams,
m_gpu_correction_vector , // struct corr_vector * gpu_correction_vector,
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testRotMatrices pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerROT_MATRICES);
float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
sdkDeleteTimer(&timerROT_MATRICES);
printf("Average calc_rot_matrices run time =%f ms\n", avgTimeROT_MATRICES);
trot_deriv rot_deriv{};
checkCudaErrors(cudaMemcpy(
&rot_deriv,
m_gpu_rot_deriv,
sizeof(trot_deriv),
cudaMemcpyDeviceToHost));
return rot_deriv;
}
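// --- Illustrative aside (not part of this commit): the i0 = -1 / "if (i == 0)" pattern above
// runs one untimed warm-up pass before the timer is reset, so one-time startup costs stay out
// of the reported average. A minimal self-contained sketch of the same idiom using plain CUDA
// events instead of the helper StopWatchInterface; the kernel and launch shape are placeholders.
__global__ void sketch_kernel() {}                // stands in for calc_rot_deriv
static float sketch_avg_kernel_ms(int num_runs) {
    cudaEvent_t ev_start, ev_stop;
    cudaEventCreate(&ev_start);
    cudaEventCreate(&ev_stop);
    for (int i = -1; i < num_runs; i++) {         // i == -1 is the untimed warm-up pass
        if (i == 0) {                             // warm-up done: synchronize, then start timing
            cudaDeviceSynchronize();
            cudaEventRecord(ev_start);
        }
        sketch_kernel<<<1, 32>>>();
    }
    cudaEventRecord(ev_stop);
    cudaEventSynchronize(ev_stop);
    float total_ms = 0.0f;
    cudaEventElapsedTime(&total_ms, ev_start, ev_stop);
    cudaEventDestroy(ev_start);
    cudaEventDestroy(ev_stop);
    return total_ms / num_runs;                   // average per timed launch
}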
void TpHostGpu::testReverseDistortions (int num_runs){ // 468
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_rByRDist){
throw std::runtime_error("Error: m_rByRDist is not initialized ");
}
dim3 threads_rd(3,3,3);
dim3 grid_rd (NUM_CAMS, 1, 1); // can get rid of NUM_CAMS
printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
sdkCreateTimer(&timerREVERSE_DISTORTIONS);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerREVERSE_DISTORTIONS);
sdkStartTimer(&timerREVERSE_DISTORTIONS);
}
calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_rByRDist);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("testReverseDistortions pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerREVERSE_DISTORTIONS);
float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
printf("Average calcReverseDistortionTable run time =%f ms\n", avgTimeREVERSE_DISTORTIONS);
float * rByRDist_gen = (float *) malloc(m_tpParams.rbyrdist_len * sizeof(float));
checkCudaErrors(cudaMemcpy(
rByRDist_gen,
m_gpu_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyDeviceToHost));
float max_err = 0;
for (int i = 0; i < m_tpParams.rbyrdist_len; i++){
float err = fabsf(rByRDist_gen[i] - m_rByRDist[i]);
if (err > max_err){
max_err = err;
}
}
printf("Maximal rByRDist error = %f\n",max_err);
free (rByRDist_gen);
// temporarily restore
if (0) {
checkCudaErrors(cudaMemcpy(
m_gpu_rByRDist,
m_rByRDist,
m_tpParams.rbyrdist_len * sizeof(float),
cudaMemcpyHostToDevice));
}
}
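// --- Illustrative aside (not part of this commit): the read-back-and-compare block above is a
// pattern the other tests could reuse; a small host-side helper, sketched here under the
// assumption that both tables are plain float arrays of equal length, would factor it out.
#include <cmath>   // for fabsf (normally placed at the top of the file)
static float sketch_max_abs_error(const float *generated, const float *reference, int len) {
    float max_err = 0.0f;
    for (int i = 0; i < len; i++) {
        float err = fabsf(generated[i] - reference[i]);   // element-wise absolute difference
        if (err > max_err) {
            max_err = err;
        }
    }
    return max_err;
}
// Usage in the test above would be:
// max_err = sketch_max_abs_error(rByRDist_gen, m_rByRDist, m_tpParams.rbyrdist_len);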
void TpHostGpu::testGeomCorrect (int num_runs){ // 534
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
if (!m_gpu_correction_vector){
throw std::runtime_error("Error: m_gpu_correction_vector is not initialized ");
}
if (!m_gpu_rByRDist){
throw std::runtime_error("Error: m_gpu_rByRDist is not initialized ");
}
if (!m_gpu_rot_deriv){
throw std::runtime_error("Error: m_gpu_rot_deriv is not initialized ");
}
if (!m_ftask_data){
throw std::runtime_error("Error: m_ftask_data is not initialized ");
}
if (!m_ftask_data1){
throw std::runtime_error("Error: m_ftask_data1 is not initialized ");
}
dim3 threads_geom(m_tpParams.num_cams, m_tpParams.tiles_per_block_geom, 1);
dim3 grid_geom ((m_tpParams.tp_tasks_size + m_tpParams.tiles_per_block_geom-1)/m_tpParams.tiles_per_block_geom, 1, 1);
printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
StopWatchInterface *timerGEOM = 0;
sdkCreateTimer(&timerGEOM);
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerGEOM);
sdkStartTimer(&timerGEOM);
}
calculate_tiles_offsets<<<1,1>>> (
1, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
m_tpParams.num_cams, // int num_cams,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
m_tpParams.tp_tasks_size, // int num_tiles, // number of tiles in task list
m_gpu_geometry_correction, // struct gc * gpu_geometry_correction,
m_gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
m_gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
m_gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerGEOM);
float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
sdkDeleteTimer(&timerGEOM);
printf("Average TextureList run time =%f ms\n", avgTimeGEOM);
checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
m_ftask_data1,
m_gpu_ftasks,
m_tpParams.tp_tasks_size * m_tpParams.task_size *sizeof(float),
cudaMemcpyDeviceToHost));
//task_size
#if 0 // for manual browsing
struct tp_task * old_task = &task_data [DBG_TILE];
struct tp_task * new_task = &task_data1[DBG_TILE];
#endif
if( m_tpParams.debug_tile) {
printf("old_task txy = 0x%x\n", *(int *) (m_ftask_data + m_tpParams.tp_tasks_size * m_tpParams.dbg_tile + 1)) ; // task_data [DBG_TILE].txy);
printf("new_task txy = 0x%x\n", *(int *) (m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + 1)) ; // task_data1[DBG_TILE].txy);
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++){
printf("camera %d pX old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 0)));
printf("camera %d pY old %f new %f diff = %f\n", ncam,
*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1),
(*(m_ftask_data + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)) -
(*(m_ftask_data1 + m_tpParams.task_size * m_tpParams.dbg_tile + m_tpParams.tp_task_xy_offset + 2*ncam + 1)));
}
}
}
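// --- Illustrative aside (not part of this commit): grid_geom above is the usual ceiling
// division of the task count by tiles_per_block_geom; a tiny helper makes that intent explicit.
// The name below is a placeholder for illustration only.
static inline int sketch_ceil_div(int num_items, int items_per_block) {
    return (num_items + items_per_block - 1) / items_per_block;   // smallest block count that covers all items
}
// e.g. dim3 grid_geom(sketch_ceil_div(m_tpParams.tp_tasks_size, m_tpParams.tiles_per_block_geom), 1, 1);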
void TpHostGpu::testConvertDirect (int num_runs){ // 608
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_kernel_offsets){
throw std::runtime_error("Error: m_gpu_kernel_offsets is not initialized ");
}
if (!m_gpu_kernels){
throw std::runtime_error("Error: m_gpu_kernels is not initialized ");
}
if (!m_gpu_images){
throw std::runtime_error("Error: m_gpu_images is not initialized ");
}
if (!m_gpu_ftasks){
throw std::runtime_error("Error: m_gpu_ftasks is not initialized ");
}
if (!m_gpu_clt){
throw std::runtime_error("Error: m_gpu_clt is not initialized ");
}
if (!m_gpu_active_tiles){
throw std::runtime_error("Error: m_gpu_active_tiles is not initialized ");
}
if (!m_gpu_num_active){
throw std::runtime_error("Error: m_gpu_num_active is not initialized ");
}
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(1, 1, 1);
dim3 grid_tp(1, 1, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) m_gpu_kernel_offsets; // [tpParams.num_cams] [NUM_CAMS];
for (int i = i0; i < numIterations; i++) {
if (i == 0) {
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
m_tpParams.num_cams, // int num_cams, // actual number of cameras
m_tpParams.num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
m_gpu_kernels, // float ** gpu_kernels,
m_gpu_images, // float ** gpu_images,
m_gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
m_gpu_clt, // float ** gpu_clt, // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
m_tpParams.tp_tasks_size,// int num_tiles) // number of tiles in task
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
m_tpParams.img_width, // IMG_WIDTH, // int woi_width,
m_tpParams.img_height, // IMG_HEIGHT, // int woi_height,
0, // m_tpParams.kernels_hor, // KERNELS_HOR, // int kernels_hor,
m_tpParams.kernels_hor, // KERNELS_VERT, // int kernels_vert);
m_gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
m_gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
m_tpParams.tilesx); // TILESX); // int tilesx)
printf("HOST: convert_direct() done\n");
getLastCudaError("Kernel execution failed");
printf("HOST: convert_direct() done - 1\n");
checkCudaErrors(cudaDeviceSynchronize());
printf("HOST: convert_direct() done - 2\n");
// printf("%d\n",i);
}
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
int num_active_tiles; // calculated by convert_direct
checkCudaErrors(cudaMemcpy(
&num_active_tiles,
m_gpu_num_active, // make it local?
sizeof(int),
cudaMemcpyDeviceToHost));
printf("Run time =%f ms, num active tiles = %d\n", avgTime, num_active_tiles);
saveClt(
m_tpPaths.ports_clt_file, // const char ** paths, // tpPaths.ports_clt_file
"CLT data", // const char * prompt, // "CLT data"
m_gpu_clt_h); // float ** gpu_clt_h);
}
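// --- Illustrative aside (not part of this commit): the dstride/sizeof(float) argument above
// follows the usual cudaMallocPitch convention - the pitch comes back in bytes, while a kernel
// walking a float image needs the stride in elements. A minimal self-contained sketch of that
// conversion; the image dimensions are arbitrary placeholders.
static void sketch_pitched_image() {
    float *d_img = nullptr;
    size_t pitch_bytes = 0;                                  // row stride in bytes, chosen by the driver
    const int width = 640, height = 480;
    checkCudaErrors(cudaMallocPitch((void**)&d_img, &pitch_bytes, width * sizeof(float), height));
    size_t pitch_floats = pitch_bytes / sizeof(float);       // stride as a float* kernel sees it
    printf("pitch: %zu bytes = %zu floats per row\n", pitch_bytes, pitch_floats);
    checkCudaErrors(cudaFree(d_img));
}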
void TpHostGpu::saveClt(
const char ** paths, // tpPaths.ports_clt_file
const char * prompt, // "CLT data"
float ** gpu_clt_h){ // m_gpu_clt_h
if (!paths) return;
int rslt_size = (m_tpParams.tilesy * m_tpParams.tilesx * m_tpParams.num_colors * 4 * m_tpParams.dtt_size * m_tpParams.dtt_size);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < m_tpParams.num_cams; ncam++) if (paths[ncam]){
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
printf("Writing CLT data to %s\n", paths[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
paths[ncam]); // const char * path) // file path
}
free(cpu_clt);
}
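// --- Illustrative aside (not part of this commit): writeFloatsToFile() belongs to this code
// base and its format is not shown in this diff. A hypothetical stand-in with the same shape
// of interface, assuming a raw binary dump of floats, could look like this
// (relies on <cstdio>, which the file already uses for printf):
static int sketch_write_floats(const float *data, int size, const char *path) {
    FILE *f = fopen(path, "wb");                      // raw float dump (assumed format)
    if (!f) {
        return -1;                                    // could not open the output file
    }
    size_t written = fwrite(data, sizeof(float), (size_t)size, f);
    fclose(f);
    return (written == (size_t)size) ? 0 : -1;        // 0 on success
}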
void TpHostGpu::testImclt (int num_runs){ // 682
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testImcltRbgAll (int num_runs){ // 701
int numIterations = m_tpParams.debug_tile ? 1 : num_runs;
int i0 = m_tpParams.debug_tile ? 0 : -1;
if (!m_gpu_geometry_correction){
throw std::runtime_error("Error: m_gpu_geometry_correction is not initialized ");
}
/*
*/
}
void TpHostGpu::testCorrelate2DIntra(int num_runs){
@@ -122,7 +122,7 @@ public:
void setTextures();
void setRGBA();
-void testRotMatrices (int num_runs); // 424
+trot_deriv testRotMatrices (int num_runs); // 424
void testReverseDistortions (int num_runs); // 468
void testGeomCorrect (int num_runs); // 534
void testConvertDirect (int num_runs); // 608
@@ -133,6 +133,7 @@ public:
void testCorrelate2DIntra (int num_runs);
void testCorrelate2DInterSelf(int num_runs);
void saveClt(const char ** paths, const char * prompt, float ** gpu_clt_h);
// for both intra and inter!
void saveIntraCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs, int * gpu_corr_indices, int num_sel_sensors);
void saveInterCorrFile(const char * path, const char * prompt, int num_corrs, int num_corr_indices, float * gpu_corrs_td, int * gpu_corr_indices, int num_sel_sensors);
@@ -24,6 +24,9 @@ public:
static constexpr int kernels_hor = KERNELS_HOR;
static constexpr int kernels_vert = KERNELS_VERT;
static constexpr int rbyrdist_len = RBYRDIST_LEN;
static constexpr int tiles_per_block_geom = TILES_PER_BLOCK_GEOM;
//
static constexpr int task_inter_en = TASK_INTER_EN; // 10 // Task bit to enable interscene correlation
static constexpr int task_corr_en = TASK_CORR_EN; // 9 // Task bit to enable intrascene correlation (pairs defined separately)
static constexpr int task_text_en = TASK_TEXT_EN; // 8 // task bit to enable texture generation
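// --- Illustrative aside (not part of this commit): these *_en constants are bit positions in
// the per-tile task word, so individual features are enabled and tested with shifts and masks.
// A sketch of that use; the helper name is a placeholder.
static inline bool sketch_task_bit_set(int task_word, int bit_index) {
    return (task_word & (1 << bit_index)) != 0;      // test one task-enable bit
}
// e.g. sketch_task_bit_set(task_word, task_corr_en) asks whether intrascene correlation is enabled.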
@@ -50,8 +53,10 @@ public:
#ifdef DBG_TILE
static constexpr int debug_tile{1};
static constexpr int dbg_tile{DBG_TILE};
#else
static constexpr int debug_tile{0};
static constexpr int dbg_tile{-1};
#endif
private: