refactoring for CDP2

de3c497a · Andrey Filippov · f8839287 · de3c497a · de3c497a · de3c497a
Commit de3c497a authored Feb 18, 2025 by Andrey Filippov
Expand all Hide whitespace changes
Inline Side-by-side

Showing with 304 additions and 151 deletions

TileProcessor.cuh src/TileProcessor.cuh +282 -142

geometry_correction.cu src/geometry_correction.cu +3 -3

test_tp.cu src/test_tp.cu +19 -6

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -40,6 +40,9 @@
 	#include "tp_defines.h"
 	#include "dtt8x8.h"
 	#include "geometry_correction.h"
+
+//	#include "TileProcessor.h"
+
 #endif // #ifndef JCUDA

 #ifndef get_task_size
@@ -337,9 +340,6 @@ extern "C" __global__ void calculate_tiles_offsets(
 				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);

 	}
-//	__syncthreads();// __syncwarp();
-//	cudaDeviceSynchronize();
-//	cudaDeviceSynchronize();
 }



--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -33,9 +33,9 @@
 // all of the next 5 were disabled
 #define NOCORR
 #define NOCORR_TD
-#define NOTEXTURES
-#define NOTEXTURE_RGBA
-#define NOTEXTURE_RGBAXXX
+//#define NOTEXTURES
+//#define NOTEXTURE_RGBA
+//#define NOTEXTURE_RGBAXXX


 #define SAVE_CLT
@@ -574,11 +574,23 @@ void generate_RBGA_host(

 		 int border_tile =  pass >> 2;
 		 int ntt = *(cpu_num_texture_tiles + ((pass & 3) << 1) + border_tile);
+		 int *pntt = gpu_num_texture_tiles + ((pass & 3) << 1) + border_tile;
 		 dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // TEXTURE_TILES_PER_BLOCK = 1
+
+		 /* before CDP2
 		 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
 		 if (border_tile){
 			 ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
 		 }
+		 */
+		 // for CDP2
+		 int ti_offset = (pass & 3) * (width * (tilesya >> 2)); //  (TILES-X * (TILES-YA >> 2));  // 1/4
+		 if (border_tile){
+//	    	ti_offset += width * (tilesya >> 2) - ntt; // TILES-X * (TILES-YA >> 2) - ntt;
+			 ti_offset += width * (tilesya >> 2); // TILES-X * (TILES-YA >> 2); ntt is no longer subtracted here
+			 ti_offset = - ti_offset; // does not depend on results of the previous kernel, but is negative
+		 }
+
 #ifdef DEBUG8A
 		 printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
 				 pass, border_tile,ti_offset, ntt);
@@ -604,7 +616,7 @@ void generate_RBGA_host(
 				 num_cams,                        // int               num_cams,           // number of cameras used
 				 gpu_woi,                             // int             * woi,                // x, y, width,height
 				 gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				 ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
+				 pntt, // ntt,                    // int *             num_texture_tiles,  // pointer to the number of texture tiles to process
 				 ti_offset,                       //                gpu_texture_indices_offset,// add to gpu_texture_indices
 				 gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 //				 gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
@@ -1451,10 +1463,11 @@ int main(int argc, char **argv)
 				gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
    			gpu_num_active, //);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
 				TILESX); // int                tilesx)
-
-
+        printf("HOST: convert_direct() done\n");
    	getLastCudaError("Kernel execution failed");
+        printf("HOST: convert_direct() done - 1\n");
    	checkCudaErrors(cudaDeviceSynchronize());
+        printf("HOST: convert_direct() done - 2\n");
 //    	printf("%d\n",i);
    }
    sdkStopTimer(&timerTP);