debugging

cde525c8 · Andrey Filippov · 40239aff · cde525c8 · cde525c8 · cde525c8
Commit cde525c8 authored Nov 26, 2021 by Andrey Filippov
5 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1031,7 +1031,11 @@ __global__ void index_direct(
 __global__ void index_correlate(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,           // unused bits should be 0
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int                num_tiles,         // number of tiles in task
@@ -1161,7 +1165,11 @@ __device__ int get_textures_shared_size( // in bytes
 */
 extern "C" __global__ void correlate2D(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
@@ -1184,29 +1192,34 @@ extern "C" __global__ void correlate2D(
 		 *pnum_corr_tiles = 0;
 		 index_correlate<<<blocks0,threads0>>>(
 				 num_cams,            // int               num_cams,
-				 sel_pairs,           // int *             sel_pairs,
+//				 sel_pairs,           // int *             sel_pairs,
+				 sel_pairs0,          // int               sel_pairs0,
+				 sel_pairs1,          // int               sel_pairs1,
+				 sel_pairs2,          // int               sel_pairs2,
+				 sel_pairs3,          // int               sel_pairs3,
 				 gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				 gpu_tasks,           // struct tp_task   * gpu_tasks,
+				 //				 gpu_tasks,           // struct tp_task   * gpu_tasks,
 				 num_tiles,           // int                num_tiles,          // number of tiles in task
 				 tilesx,              // int                width,                // number of tiles in a row
 				 gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
 				 pnum_corr_tiles);    // int *              pnum_corr_tiles);   // pointer to the length of correlation tasks array
 		 cudaDeviceSynchronize();
-		    dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
+		 dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
-		    dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
+		 dim3 grid_corr((*pnum_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
-		        correlate2D_inner<<<grid_corr,threads_corr>>>(
+		 correlate2D_inner<<<grid_corr,threads_corr>>>(
-						num_cams,           // int               num_cams,
+				 num_cams,           // int               num_cams,
-		        		gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				 gpu_clt,            // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-						colors,             // int               colors,             // number of colors (3/1)
+				 colors,             // int               colors,             // number of colors (3/1)
-						scale0,             // float             scale0,             // scale for R
+				 scale0,             // float             scale0,             // scale for R
-						scale1,             // float             scale1,             // scale for B
+				 scale1,             // float             scale1,             // scale for B
-						scale2,             // float             scale2,             // scale for G
+				 scale2,             // float             scale2,             // scale for G
-						fat_zero,           // float             fat_zero,           // here - absolute
+				 fat_zero,           // float             fat_zero,           // here - absolute
-		        		*pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
+				 *pnum_corr_tiles,   // size_t            num_corr_tiles,     // number of correlation tiles to process
-		        		gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
+				 gpu_corr_indices,   //  int             * gpu_corr_indices,  // packed tile+pair
-						corr_stride,        // const size_t      corr_stride,        // in floats
+				 corr_stride,        // const size_t      corr_stride,        // in floats
-						corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
+				 corr_radius,        // int               corr_radius,        // radius of the output correlation (7 for 15x15)
-		        		gpu_corrs);         // float           * gpu_corrs);         // correlation output data
+				 gpu_corrs);         // float           * gpu_corrs);         // correlation output data
 	 }
 }
@@ -2493,7 +2506,11 @@ __global__ void create_nonoverlap_list(
 */
 __global__ void index_correlate(
 		int               num_cams,
-		int *             sel_pairs,           // unused bits should be 0
+//		int *             sel_pairs,           // unused bits should be 0
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int                num_tiles,         // number of tiles in task
@@ -2505,6 +2522,7 @@ __global__ void index_correlate(
 	if (num_tile >= num_tiles){
 		return;
 	}
+    int sel_pairs[] = {sel_pairs0, sel_pairs1, sel_pairs2, sel_pairs3};
 	//	int task_size = get_task_size(num_cams);
 	int task_task =get_task_task(num_tile, gpu_ftasks, num_cams);
 	if (((task_task >> TASK_CORR_BITS) & 1) == 0){ // needs correlation. Maybe just check task_task != 0?

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -65,7 +65,11 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 extern "C" __global__ void correlate2D(
 		int               num_cams,
-		int *             sel_pairs,
+//		int *             sel_pairs,
+		int               sel_pairs0,
+		int               sel_pairs1,
+		int               sel_pairs2,
+		int               sel_pairs3,
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -37,11 +37,16 @@
 */
 #ifndef JCUDA
-#include "tp_defines.h"
+	#include "tp_defines.h"
-#include "dtt8x8.h"
+	#include "dtt8x8.h"
-#include "geometry_correction.h"
+	#include "geometry_correction.h"
 #endif // #ifndef JCUDA
+#ifndef get_task_size
+	#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#endif
 // Using NUM_CAMS threads per tile
 #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
 ///#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -30,6 +30,12 @@
 ** -----------------------------------------------------------------------------**
 */
+#define NOCORR
+#define NOCORR_TD
+#define NOTEXTURES
+#define NOTEXTURE_RGBA
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
@@ -297,11 +303,29 @@ int main(int argc, char **argv)
    const char* rByRDist_file =            "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist";
    const char* correction_vector_file =   "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector";
    const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction";
+	int sel_pairs[4];
+#if TEST_LWIR
+    // testing with 16 LWIR
+    int num_cams =   16;
+    int num_colors = 1;
+    sel_pairs[0] =   0xffffffff;
+    sel_pairs[1] =   0xffffffff;
+    sel_pairs[2] =   0xffffffff;
+    sel_pairs[3] =   0x00ffffff;
+    int num_pairs = 120;
+#else
    // testing with quad RGB
    int num_cams =   4;
    int num_colors = 3;
-	int task_size = sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
+    sel_pairs[0] = 0x3f;
+    sel_pairs[1] =   0;
+    sel_pairs[2] =   0;
+    sel_pairs[3] =   0;
+    int num_pairs =  6;
+#endif
+	int task_size = get_task_size(num_cams); // sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
 	// FIXME: update to use new correlations and num_cams
    float port_offsets4[4][2] =  {// used only in textures to scale differences
@@ -318,7 +342,6 @@ int main(int argc, char **argv)
    	}
    } else {
 		 for (int ncam = 0; ncam < num_cams; ncam++) {
-//			 double alpha = 2 * Math.PI * (i + (topis0 ? 0 : 0.5))/num_sensors;
 			 double alpha = 2 * M_PI * (ncam) /num_cams; // math.h
 			 port_offsets[ncam][0] =  0.5 * sin((alpha));
 			 port_offsets[ncam][1] = -0.5 * cos((alpha));
@@ -338,8 +361,8 @@ int main(int argc, char **argv)
    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
 // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
-    static struct tp_task     task_data   [TILESX*TILESY]; // maximal length - each tile
+///    static struct tp_task     task_data   [TILESX*TILESY]; // maximal length - each tile
-    static struct tp_task     task_data1  [TILESX*TILESY]; // maximal length - each tile
+///    static struct tp_task     task_data1  [TILESX*TILESY]; // maximal length - each tile
    float * ftask_data  = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
    float * ftask_data1  = (float *) malloc(TILESX * TILESY * task_size * sizeof(float));
@@ -387,8 +410,8 @@ int main(int argc, char **argv)
    // GPU pointers to GPU memory
-    struct tp_task  * gpu_tasks;  // TODO: ***** remove ! ****
+///    struct tp_task  * gpu_tasks;  // TODO: ***** remove ! **** DONE
-    float *           gpu_ftasks; // TODO: ***** allocate ! ****
+    float *           gpu_ftasks; // TODO: ***** allocate ! **** DONE
    int *             gpu_active_tiles;
    int *             gpu_num_active;
    int *             gpu_num_corr_tiles;
@@ -473,13 +496,13 @@ int main(int argc, char **argv)
    gpu_corrs = alloc_image_gpu(
    		&dstride_corr,                  // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
 			CORR_SIZE,                      // int width,
-			NUM_PAIRS * TILESX * TILESY);   // int height);
+			num_pairs * TILESX * TILESY);   // int height);
    // read channel images (assuming host_kern_buf size > image size, reusing it)
 // allocate all other correlation data, some may be
    gpu_corrs_td = alloc_image_gpu(
    		&dstride_corr_td,               // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
 			4 * DTT_SIZE * DTT_SIZE,         // int width,
-			NUM_PAIRS * TILESX * TILESY);    // int height);
+			num_pairs * TILESX * TILESY);    // int height);
    gpu_corrs_combo = alloc_image_gpu(
    		&dstride_corr_combo,             // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
@@ -534,7 +557,8 @@ int main(int argc, char **argv)
    for (int ty = 0; ty < TILESY; ty++){
        for (int tx = 0; tx < TILESX; tx++){
            int nt = ty * TILESX + tx;
-            int task_task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
+//            int task_task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
+            int task_task = 0xf | (1 << TASK_CORR_BITS); // just 1 bit, correlation selection is defined by common corr_sel bits
            int task_txy = tx + (ty << 16);
            float task_target_disparity = DBG_DISPARITY;
            float * tp = ftask_data + task_size * nt;
@@ -548,7 +572,7 @@ int main(int argc, char **argv)
        }
    }
-    int tp_task_size =  sizeof(ftask_data)/sizeof(float)/task_size;
+    int tp_task_size =  TILESX * TILESY; // sizeof(ftask_data)/sizeof(float)/task_size; // number of task tiles
    int num_active_tiles; // will be calculated by convert_direct
@@ -574,8 +598,9 @@ int main(int argc, char **argv)
 #endif
    // segfault in the next
-    gpu_tasks =  (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
+///    gpu_tasks =  (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
-    gpu_ftasks = (float  *) copyalloc_kernel_gpu((float * ) &ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
+//    gpu_ftasks = (float  *) copyalloc_kernel_gpu((float * ) &ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
+    gpu_ftasks = (float  *) copyalloc_kernel_gpu(ftask_data, tp_task_size * task_size); // (sizeof(struct tp_task)/sizeof(float)));
    // build corr_indices - not needed anymore?
    /*
@@ -599,7 +624,7 @@ int main(int argc, char **argv)
 			NUM_PAIRS * TILESX * TILESY);
    */
    // just allocate
-    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        NUM_PAIRS * TILESX * TILESY*sizeof(int)));
+    checkCudaErrors (cudaMalloc((void **)&gpu_corr_indices,        num_pairs * TILESX * TILESY*sizeof(int)));
    checkCudaErrors (cudaMalloc((void **)&gpu_corrs_combo_indices,             TILESX * TILESY*sizeof(int)));
 //
@@ -849,12 +874,20 @@ int main(int argc, char **argv)
    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
    sdkDeleteTimer(&timerGEOM);
    printf("Average TextureList run time =%f ms\n",  avgTimeGEOM);
+/*
 	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
 			&task_data1,
 			gpu_tasks,
 			tp_task_size * sizeof(struct tp_task),
 			cudaMemcpyDeviceToHost));
+*/
+	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
+			ftask_data1,
+			gpu_ftasks,
+			tp_task_size * task_size *sizeof(float),
+			cudaMemcpyDeviceToHost));
+//task_size
 #if 0 // for manual browsing
 	struct tp_task * old_task = &task_data [DBG_TILE];
 	struct tp_task * new_task = &task_data1[DBG_TILE];
@@ -1079,10 +1112,13 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerCORR);
    		sdkStartTimer(&timerCORR);
    	}
-    	// FIXME: update to provide sel_pairs
    	correlate2D<<<1,1>>>(
    			num_cams,                      // int               num_cams,
-				0,                             // int *             sel_pairs,           // unused bits should be 0
+//				0,                             // int *             sel_pairs,           // unused bits should be 0
+				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
 				gpu_clt,                    // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				3,                          // int               colors,             // number of colors (3/1)
 				color_weights[0], // 0.25,  // float             scale0,             // scale for R
@@ -1156,8 +1192,11 @@ int main(int argc, char **argv)
    	// FIXME: provide sel_pairs
        correlate2D<<<1,1>>>( // output TD tiles, no normalization
        		num_cams,                      // int               num_cams,
-        		0,                             // int *             sel_pairs,           // unused bits should be 0
+				//				0,                             // int *             sel_pairs,           // unused bits should be 0
+				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
+				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
+				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
+				sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
        		gpu_clt,                       // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 				3,                             // int               colors,             // number of colors (3/1)
 				color_weights[0], // 0.25,     // float             scale0,             // scale for R
@@ -1182,11 +1221,11 @@ int main(int argc, char **argv)
    			gpu_num_corr_tiles,
    			sizeof(int),
    			cudaMemcpyDeviceToHost));
-    	num_corr_combo = num_corrs/NUM_PAIRS;
+    	num_corr_combo = num_corrs/num_pairs;
    	corr2D_combine<<<1,1>>>( // Combine quad (2 hor, 2 vert) pairs
    			num_corr_combo, // tp_task_size,     // int               num_tiles,          // number of tiles to process (each with num_pairs)
-				NUM_PAIRS,                           // int               num_pairs,          // num pairs per tile (should be the same)
+				num_pairs,                           // int               num_pairs,          // num pairs per tile (should be the same)
    			1,                                   // int               init_output,        // !=0 - reset output tiles to zero before accumulating
    			0x0f,                                // int               pairs_mask,         // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
 				gpu_corr_indices,                    // int             * gpu_corr_indices,   // packed tile+pair
@@ -1464,7 +1503,8 @@ int main(int argc, char **argv)
    	checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
    	checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
    }
-	checkCudaErrors(cudaFree(gpu_tasks));
+//	checkCudaErrors(cudaFree(gpu_tasks));
+	checkCudaErrors(cudaFree(gpu_ftasks));
 	checkCudaErrors(cudaFree(gpu_active_tiles));
 	checkCudaErrors(cudaFree(gpu_num_active));
 	checkCudaErrors(cudaFree(gpu_kernels));

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -41,14 +41,22 @@
 #ifndef JCUDA
 #include <stdio.h>
 #define THREADSX              (DTT_SIZE)
-#define TEST_LWIR                      1
+#define TEST_LWIR                      0
 #define NUM_CAMS                      16 // now maximal number of cameras
-#define NUM_PAIRS                      6
+//#define NUM_PAIRS                      6
 #define NUM_COLORS                     1 //3
-#define IMG_WIDTH                   2592
+// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
-#define IMG_HEIGHT                  1936
+#if TEST_LWIR
-#define KERNELS_HOR                  164
+	#define IMG_WIDTH                   640
-#define KERNELS_VERT                 123
+	#define IMG_HEIGHT                  512
+	#define KERNELS_HOR                 82 // 80+2
+	#define KERNELS_VERT                66 // 64+2
+#else
+	#define IMG_WIDTH                  2592
+	#define IMG_HEIGHT                 1936
+	#define KERNELS_HOR                 164  // 2592 / 16 + 2
+	#define KERNELS_VERT                123  // 1936 / 16 + 2
+#endif
 #define KERNELS_LSTEP                  4
 #define THREADS_PER_TILE               8
 #define TILES_PER_BLOCK                4
@@ -73,7 +81,7 @@
 #define TASK_TEXTURE_W_BIT             3 // Texture with West  neighbor
 #define TASK_TEXTURE_BIT               3 // bit to request texture calculation int task field of struct tp_task
 #define LIST_TEXTURE_BIT               7 // bit to request texture calculation
-#define CORR_OUT_RAD                   4
+#define CORR_OUT_RAD                   7 // full tile (15x15), was 4 (9x9)
 #define FAT_ZERO_WEIGHT                0.0001 // add to port weights to avoid nan
 #define THREADS_DYNAMIC_BITS           5 // treads in block for CDP creation of the texture list