debugging

cde525c8 · Andrey Filippov · 40239aff · cde525c8 · cde525c8 · cde525c8
Commit cde525c8 authored Nov 26, 2021 by Andrey Filippov
5 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1031,7 +1031,11 @@ __global__ void index_direct(
 __global__ void index_correlate(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,           // unused bits should be 0
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int                num_tiles,         // number of tiles in task
@@ -1161,7 +1165,11 @@ __device__ int get_textures_shared_size( // in bytes
 */
 extern "C" __global__ void correlate2D(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
@@ -1184,29 +1192,34 @@ extern "C" __global__ void correlate2D(
 		 *pnum_corr_tiles = 0;
 		 index_correlate<<<blocks0,threads0>>>(
 				 num_cams,            // int               num_cams,
-				 sel_pairs,           // int *             sel_pairs,
+//				 sel_pairs,           // int *             sel_pairs,
+				 sel_pairs0,          // int               sel_pairs0,
+				 sel_pairs1,          // int               sel_pairs1,
+				 sel_pairs2,          // int               sel_pairs2,
+				 sel_pairs3,          // int               sel_pairs3,
 				 gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				 gpu_tasks,           // struct tp_task   * gpu_tasks,
+				 //				 gpu_tasks,           // struct tp_task   * gpu_tasks,
 				 num_tiles,           // int                num_tiles,          // number of tiles in task
 				 tilesx,              // int                width,                // number of tiles in a row
 				 gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 				 pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
 		 cudaDeviceSynchronize();
-		    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+		 dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-		    dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+		 dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-		        correlate2D_inner<<<grid_corr,threads_corr>>>(
+		 correlate2D_inner<<<grid_corr,threads_corr>>>(
-						num_cams,           // int               num_cams,
+				 num_cams,           // int               num_cams,
-		        		gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				 gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-						colors,             // int               colors,             // number of colors (3/1)
+				 colors,             // int               colors,             // number of colors (3/1)
-						scale0,             // float             scale0,             // scale for R
+				 scale0,             // float             scale0,             // scale for R
-						scale1,             // float             scale1,             // scale for B
+				 scale1,             // float             scale1,             // scale for B
-						scale2,             // float             scale2,             // scale for G
+				 scale2,             // float             scale2,             // scale for G
-						fat_zero,           // float             fat_zero,           // here - absolute
+				 fat_zero,           // float             fat_zero,           // here - absolute
-		        		*pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
+				 *pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
-		        		gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
+				 gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
-						corr_stride,        // const size_t      corr_stride,        // in floats
+				 corr_stride,        // const size_t      corr_stride,        // in floats
-						corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+				 corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
-		        		gpu_corrs);         // float           * gpu_corrs);         // correlation output data
+				 gpu_corrs);         // float           * gpu_corrs);         // correlation output data
 	 }
 }
@@ -2493,7 +2506,11 @@ __global__ void create_nonoverlap_list(
 */
 __global__ void index_correlate(
 		int               num_cams,
-		int *             sel_pairs,           // unused bits should be 0
+//		int *             sel_pairs,           // unused bits should be 0
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int                num_tiles,         // number of tiles in task
@@ -2505,6 +2522,7 @@ __global__ void index_correlate(
 	if (num_tile >= num_tiles){
 		return;
 	}
+    int sel_pairs[] = {sel_pairs0, sel_pairs1, sel_pairs2, sel_pairs3};
 	//	int task_size = get_task_size(num_cams);
 	int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
 	if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -65,7 +65,11 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 extern "C" __global__ void correlate2D(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -37,11 +37,16 @@
 */
 #ifndef JCUDA
-#include "tp_defines.h"
+	#include "tp_defines.h"
-#include "dtt8x8.h"
+	#include "dtt8x8.h"
-#include "geometry_correction.h"
+	#include "geometry_correction.h"
 #endif // #ifndef JCUDA
+#ifndef get_task_size
+	#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#endif
 // Using NUM_CAMS threads per tile
 #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
 ///#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -41,14 +41,22 @@
 #ifndef JCUDA
 #include <stdio.h>
 #define THREADSX              (DTT_SIZE)
-#define TEST_LWIR                      1
+#define TEST_LWIR                      0
 #define NUM_CAMS                      16 // now maximal number of cameras
-#define NUM_PAIRS                      6
+//#define NUM_PAIRS                      6
 #define NUM_COLORS                     1 //3
-#define IMG_WIDTH                   2592
+// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
-#define IMG_HEIGHT                  1936
+#if TEST_LWIR
-#define KERNELS_HOR                  164
+	#define IMG_WIDTH                   640
-#define KERNELS_VERT                 123
+	#define IMG_HEIGHT                  512
+	#define KERNELS_HOR                 82 // 80+2
+	#define KERNELS_VERT                66 // 64+2
+#else
+	#define IMG_WIDTH                  2592
+	#define IMG_HEIGHT                 1936
+	#define KERNELS_HOR                 164  // 2592 / 16 + 2
+	#define KERNELS_VERT                123  // 1936 / 16 + 2
+#endif
 #define KERNELS_LSTEP                  4
 #define THREADS_PER_TILE               8
 #define TILES_PER_BLOCK                4
@@ -73,7 +81,7 @@
 #define TASK_TEXTURE_W_BIT             3 // Texture with West  neighbor
 #define TASK_TEXTURE_BIT               3 // bit to request texture calculation int task field of struct tp_task
 #define LIST_TEXTURE_BIT               7 // bit to request texture calculation
-#define CORR_OUT_RAD                   4
+#define CORR_OUT_RAD                   7 // full tile (15x15), was 4 (9x9)
 #define FAT_ZERO_WEIGHT                0.0001 // add to port weights to avoid nan
 #define THREADS_DYNAMIC_BITS           5 // treads in block for CDP creation of the texture list