Mitigating bug, could not resolve

35b71554 · Andrey Filippov · 4398007c · 35b71554 · 35b71554 · 35b71554
Commit 35b71554 authored Dec 19, 2021 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 72 additions and 91 deletions

TileProcessor.cuh src/TileProcessor.cuh +27 -22

TileProcessor.h src/TileProcessor.h +2 -1

test_tp.cu src/test_tp.cu +31 -42

tp_defines.h src/tp_defines.h +12 -26

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1068,7 +1068,6 @@ __global__ void convert_correct_tiles(
 		int                kernels_vert, //);
 		int                tilesx);

-
 extern "C" __global__ void correlate2D_inner(
 		int               num_cams,
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -1077,11 +1076,11 @@ extern "C" __global__ void correlate2D_inner(
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero2,          // here - absolute
-		size_t            num_corr_tiles,     // number of correlation tiles to process
+		int            num_corr_tiles,     // number of correlation tiles to process
 		int             * gpu_corr_indices,   // packed tile+pair
-		const size_t      corr_stride,        // in floats
-		int               corr_radius,        // radius of the output correlation (7 for 15x15)
-		float           * gpu_corrs);          // correlation output data
+		size_t      corr_stride,        // in floats
+		int               corr_radius0,        // radius of the output correlation (7 for 15x15)
+		float           * gpu_corrs);          // correlation output data (either pixel domain or transform domain

 extern "C" __global__ void corr2D_normalize_inner(
 		int               num_corr_tiles,     // number of correlation tiles to process
@@ -1177,12 +1176,12 @@ extern "C" __global__ void correlate2D(
 		float             scale2,            // scale for G
 		float             fat_zero2,           // here - absolute
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
 		int               num_tiles,          // number of tiles in task
 		int               tilesx,             // number of tile rows
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
-		const size_t      corr_stride,        // in floats
+//		const size_t      corr_stride,        // in floats
+		size_t      corr_stride,        // in floats
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs)          // correlation output data
 {
@@ -1247,12 +1246,16 @@ extern "C" __global__ void correlate2D_inner(
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero2,          // here - absolute
-		size_t            num_corr_tiles,     // number of correlation tiles to process
+		int            num_corr_tiles,     // number of correlation tiles to process
 		int             * gpu_corr_indices,   // packed tile+pair
-		const size_t      corr_stride,        // in floats
-		int               corr_radius,        // radius of the output correlation (7 for 15x15)
+		size_t         corr_stride,        // in floats
+		int               corr_radius0,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs)          // correlation output data (either pixel domain or transform domain
 {
+//	int corr_radius = corr_radius0 & 0x1f;// minimal "bad"
+//	int corr_radius = corr_radius0 & 0xf; // maximal "good"
+	int corr_radius = corr_radius0 & 0x7; // actual never >7. Still did not understand where is the problem,
+	// providing literal "7" in the call does not fix the problem
 	float scales[3] = {scale0, scale1, scale2};
 	int corr_in_block = threadIdx.y;
 	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block; // 4
@@ -1268,8 +1271,6 @@ extern "C" __global__ void correlate2D_inner(
 	if (corr_pair > pair_list_len){
 		return; // BUG - should not happen
 	}
-//	int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
-//	int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
 	int cam1 = all_pairs[pair_list_start + corr_pair][0]; // number of the first camera in a pair
 	int cam2 = all_pairs[pair_list_start + corr_pair][1]; // number of the second camera in a pair
    __syncthreads();// __syncwarp();
@@ -1281,6 +1282,7 @@ extern "C" __global__ void correlate2D_inner(
    float * clt_corr =  ((float *) clt_corrs) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
    float * mclt_corr = ((float *) mlt_corrs) +  corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
    resetCorrelation(clt_corr);
+    __syncthreads(); /// ***** Was not here: probably not needed
    for (int color = 0; color < colors; color++){
        // copy clt (frequency domain data)
        float * clt_tile1 = ((float *) clt_tiles1) +  corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
@@ -1380,13 +1382,14 @@ extern "C" __global__ void correlate2D_inner(
 #endif
        } // if (color == 1){ // LPF only after B (nothing in mono)
    } // for (int color = 0; color < colors; color++){
-
+//	corr_radius = 7;
    // Skip normalization, lpf, inverse correction and unfolding if Transform Domain output is required
    if (corr_radius > 0) {
    	normalizeTileAmplitude(
    			clt_corr,   // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
 				fat_zero2); // float fat_zero2 ) // fat zero is absolute, scale it outside
    	// Low Pass Filter from constant area (is it possible to replace?)
+        __syncthreads(); /// ***** Was not here: probably not needed

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -1408,8 +1411,6 @@ extern "C" __global__ void correlate2D_inner(
 #endif
 #endif

-
-
    	float *clt = clt_corr + threadIdx.x;
 #pragma unroll
    	for (int q = 0; q < 4; q++){
@@ -1422,7 +1423,7 @@ extern "C" __global__ void correlate2D_inner(
    		}
    	}
    	__syncthreads();// __syncwarp();
-
+//    	corr_radius = 7;

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -1434,6 +1435,7 @@ extern "C" __global__ void correlate2D_inner(
 #endif
 #endif
    	dttii_2d(clt_corr);
+//    has __syncthreads() inside

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -1445,12 +1447,12 @@ extern "C" __global__ void correlate2D_inner(

 #endif
 #endif
-
+    	__syncthreads();
+//    	corr_radius = 7;
    	corrUnfoldTile(
    			corr_radius, // int corr_radius,
 				(float *) clt_corr,  // float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
 				(float *) mclt_corr); // float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
-
    	__syncthreads();

 #ifdef DBG_TILE
@@ -1467,14 +1469,14 @@ extern "C" __global__ void correlate2D_inner(
 #endif

    	// copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
-    	int size2r1 = 2 * corr_radius + 1;
-    	int len2r1x2r1 = size2r1 * size2r1;
+    	int size2r1 = 2 * corr_radius + 1;  //  15 for full corr tile
+    	int len2r1x2r1 = size2r1 * size2r1; // 225 for full corr tile
    	int corr_tile_offset =  + corr_stride * corr_num;
    	float *mem_corr = gpu_corrs + corr_tile_offset;
 #pragma unroll
    	//    for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
    	for (int offs = threadIdx.x; offs < len2r1x2r1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
-    		mem_corr[offs] = mclt_corr[offs];
+    		mem_corr[offs] = mclt_corr[offs]; // copy OK
    	}

    	__syncthreads();
@@ -1751,6 +1753,9 @@ extern "C" __global__ void corr2D_normalize_inner(
 		float             fat_zero2,          // here - absolute, squared
 		int               corr_radius)        // radius of the output correlation (7 for 15x15)
 {
+	corr_radius &= 0x7; // actual never >7. Still did not understand where is the problem,
+	// providing literal "7" in the call does not fix the problem
+
 	int corr_in_block = threadIdx.y;
 	int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK_NORMALIZE + corr_in_block; // 4
 	if (corr_num >= num_corr_tiles){

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -82,7 +82,8 @@ extern "C" __global__ void correlate2D(
 		int               tilesx,             // number of tile rows
 		int             * gpu_corr_indices,   // packed tile+pair
 		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
-		const size_t      corr_stride,        // in floats
+		size_t      corr_stride,        // in floats
+//		int               corr_stride,        // in floats
 		int               corr_radius,        // radius of the output correlation (7 for 15x15)
 		float           * gpu_corrs);          // correlation output data


--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -30,7 +30,7 @@
 ** -----------------------------------------------------------------------------**
 */

-#define NOCORR
+// #define NOCORR
 //#define NOCORR_TD
 //#define NOTEXTURES_HOST
 #define NOTEXTURES
@@ -53,6 +53,25 @@
 #include "geometry_correction.h"
 #include "TileProcessor.cuh"

+#if TEST_LWIR
+	#define IMG_WIDTH                   640
+	#define IMG_HEIGHT                  512
+	#define KERNELS_HOR                 82 // 80+2
+	#define KERNELS_VERT                66 // 64+2
+#else
+	#define IMG_WIDTH                  2592
+	#define IMG_HEIGHT                 1936
+	#define KERNELS_HOR                 164  // 2592 / 16 + 2
+	#define KERNELS_VERT                123  // 1936 / 16 + 2
+#endif
+#define CORR_OUT_RAD                   7 // full tile (15x15), was 4 (9x9)
+#define DBG_DISPARITY                  0.0 // 56.0//   0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
+// only used in C++ test
+#define TILESX        (IMG_WIDTH / DTT_SIZE)
+#define TILESY        (IMG_HEIGHT / DTT_SIZE)
+#define TILESYA       ((TILESY +3) & (~3))
+
+

 float * copyalloc_kernel_gpu(float * kernel_host,
 		                int size, // size in floats
@@ -1259,23 +1278,9 @@ int main(int argc, char **argv)
 	struct tp_task * old_task = &task_data [DBG_TILE];
 	struct tp_task * new_task = &task_data1[DBG_TILE];
 #endif
-//    printf("old_task txy = 0x%x\n",  task_data [DBG_TILE].txy);
-//    printf("new_task txy = 0x%x\n",  task_data1[DBG_TILE].txy);
+#ifdef 	DBG_TILE
    printf("old_task txy = 0x%x\n", *(int *) (ftask_data + task_size * DBG_TILE + 1)) ; // task_data [DBG_TILE].txy);
    printf("new_task txy = 0x%x\n", *(int *) (ftask_data1 + task_size * DBG_TILE + 1)) ; // task_data1[DBG_TILE].txy);
-
-    /*
-    for (int ncam = 0; ncam < NUM_CAMS; ncam++){
-        printf("camera %d pX old %f new %f diff = %f\n", ncam,
-        		task_data [DBG_TILE].xy[ncam][0],  task_data1[DBG_TILE].xy[ncam][0],
-				task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
-        printf("camera %d pY old %f new %f diff = %f\n", ncam,
-        		task_data [DBG_TILE].xy[ncam][1],  task_data1[DBG_TILE].xy[ncam][1],
-				task_data [DBG_TILE].xy[ncam][1]-  task_data1[DBG_TILE].xy[ncam][1]);
-    }
-    */
-
-    //  for (int ncam = 0; ncam < NUM_CAMS; ncam++){
        for (int ncam = 0; ncam < num_cams; ncam++){
            printf("camera %d pX old %f new %f diff = %f\n", ncam,
            		 *(ftask_data  + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 0),
@@ -1288,16 +1293,7 @@ int main(int argc, char **argv)
           		 (*(ftask_data + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)) -
           		 (*(ftask_data1 + task_size * DBG_TILE + tp_task_xy_offset + 2*ncam + 1)));
        }
-
-
-#if 0
-    // temporarily restore tasks
-    checkCudaErrors(cudaMemcpy(
-    		gpu_tasks,
-			&task_data,
-			tp_task_size * sizeof(struct tp_task),
-            cudaMemcpyHostToDevice));
-#endif
+#endif //#ifdef 	DBG_TILE
 #endif // TEST_GEOM_CORR


@@ -1305,13 +1301,8 @@ int main(int argc, char **argv)
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);

-#if 0
-    dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
-    dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
-#else
    dim3 threads_tp(1, 1, 1);
    dim3 grid_tp(1, 1, 1);
-#endif
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);

@@ -1333,7 +1324,6 @@ int main(int argc, char **argv)
 				gpu_kernels,           // float           ** gpu_kernels,
 				gpu_images,            // float           ** gpu_images,
 				gpu_ftasks,            // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				gpu_tasks,             // struct tp_task   * gpu_tasks,
 				gpu_clt,               // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
 				dstride/sizeof(float), // size_t             dstride, // for gpu_images
 				tp_task_size,          // int                num_tiles) // number of tiles in task
@@ -1373,12 +1363,10 @@ int main(int argc, char **argv)
 				gpu_clt_h[ncam],
 				rslt_size * sizeof(float),
    			cudaMemcpyDeviceToHost));
-//#ifndef DBG_TILE
        printf("Writing CLT data to %s\n",  ports_clt_file[ncam]);
    	writeFloatsToFile(cpu_clt, // float *       data, // allocated array
    			rslt_size, // int           size, // length in elements
 				ports_clt_file[ncam]); // 			   const char *  path) // file path
-//#endif
    }
 #endif

@@ -1453,14 +1441,11 @@ int main(int argc, char **argv)
 //				3* (IMG_HEIGHT + DTT_SIZE),
 				num_colors* (IMG_HEIGHT + DTT_SIZE),
    			cudaMemcpyDeviceToHost));
-
-///#ifndef DBG_TILE
        printf("Writing RBG data to %s\n",  result_rbg_file[ncam]);
    	writeFloatsToFile( // will have margins
    			cpu_corr_image, // float *       data, // allocated array
 				rslt_img_size, // int           size, // length in elements
 				result_rbg_file[ncam]); // 			   const char *  path) // file path
-///#endif
    }

    free(cpu_corr_image);
@@ -1482,7 +1467,6 @@ int main(int argc, char **argv)
    	}
    	correlate2D<<<1,1>>>(
    			num_cams,                      // int               num_cams,
-//				0,                             // int *             sel_pairs,           // unused bits should be 0
 				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
 				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
 				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
@@ -1778,7 +1762,7 @@ int main(int argc, char **argv)

 #ifndef NSAVE_CORR
    printf("Writing phase correlation data to %s, width = %d, height=%d, slices=%d, length=%ld bytes\n",
-    		result_corr_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
+    		result_corr_td_norm_file, (TILESX*16),(TILESYA*16), num_pairs, (corr_img_size * sizeof(float)) ) ;
    writeFloatsToFile(
    		corr_img,                  // float *       data, // allocated array
 			corr_img_size,             // int           size, // length in elements
@@ -1985,8 +1969,7 @@ int main(int argc, char **argv)
    		    				cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo,    // float *       data, // allocated array
    							diff_rgb_combo_size,    // int           size, // length in elements
    							result_diff_rgb_combo_file); // 			   const char *  path) // file path
-
-    		//DBG_TILE
+#ifdef 	DBG_TILE
    		#ifdef DEBUG10
    		    		int texture_offset = DBG_TILE * tile_texture_size;
    		    		int chn = 0;
@@ -2002,6 +1985,7 @@ int main(int argc, char **argv)
    		    			}
    		    		}
    		#endif // DEBUG9
+#endif //#ifdef 	DBG_TILE
    		#endif
    		    		free(cpu_textures);
    		    		free (cpu_diff_rgb_combo);
@@ -2108,7 +2092,7 @@ int main(int argc, char **argv)
 					diff_rgb_combo_size,    // int           size, // length in elements
 					result_diff_rgb_combo_file); // 			   const char *  path) // file path

-//DBG_TILE
+#ifdef 	DBG_TILE
 #ifdef DEBUG10
    		int texture_offset = DBG_TILE * tile_texture_size;
    		int chn = 0;
@@ -2124,6 +2108,8 @@ int main(int argc, char **argv)
    			}
    		}
 #endif // DEBUG9
+#endif //#ifdef 	DBG_TILE
+
 #endif
    		free(cpu_textures);
    		free (cpu_diff_rgb_combo);
@@ -2243,6 +2229,8 @@ int main(int argc, char **argv)
 			rslt_rgba_size,    // int           size, // length in elements
 			result_textures_rgba_file); // 			   const char *  path) // file path
 #endif
+
+#ifdef 	DBG_TILE
 #ifdef DEBUG11
    int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width  + (DBG_TILE_X - cpu_woi[0]);
    for (int chn = 0; chn < rbga_slices; chn++){
@@ -2257,6 +2245,7 @@ int main(int argc, char **argv)
    	}
    }
 #endif // DEBUG11
+#endif //#ifdef 	DBG_TILE
    free(cpu_textures_rgba);
 #endif // ifndef NOTEXTURES


--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -41,22 +41,11 @@
 #ifndef JCUDA
 #include <stdio.h>
 #define THREADSX              (DTT_SIZE)
-#define TEST_LWIR                      1
 #define NUM_CAMS                      16 // now maximal number of cameras
 //#define NUM_PAIRS                      6
 //#define NUM_COLORS                     1 //3
 // kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
-#if TEST_LWIR
-	#define IMG_WIDTH                   640
-	#define IMG_HEIGHT                  512
-	#define KERNELS_HOR                 82 // 80+2
-	#define KERNELS_VERT                66 // 64+2
-#else
-	#define IMG_WIDTH                  2592
-	#define IMG_HEIGHT                 1936
-	#define KERNELS_HOR                 164  // 2592 / 16 + 2
-	#define KERNELS_VERT                123  // 1936 / 16 + 2
-#endif
+#define TEST_LWIR                      1
 #define KERNELS_LSTEP                  4
 #define THREADS_PER_TILE               8
 #define TILES_PER_BLOCK                4
@@ -79,30 +68,22 @@
 #define TASK_TEXTURE_E_BIT             1 // Texture with East  neighbor
 #define TASK_TEXTURE_S_BIT             2 // Texture with South neighbor
 #define TASK_TEXTURE_W_BIT             3 // Texture with West  neighbor
-#define TASK_TEXTURE_BIT               3 // bit to request texture calculation int task field of struct tp_task
+//#define TASK_TEXTURE_BIT               3 // bit to request texture calculation int task field of struct tp_task
 #define LIST_TEXTURE_BIT               7 // bit to request texture calculation
-#define CORR_OUT_RAD                   7 // full tile (15x15), was 4 (9x9)
+//#define CORR_OUT_RAD                 7 // full tile (15x15), was 4 (9x9)
 #define FAT_ZERO_WEIGHT                0.0001 // add to port weights to avoid nan

 #define THREADS_DYNAMIC_BITS           5 // treads in block for CDP creation of the texture list

-#define DBG_DISPARITY                  0.0 // 56.0//   0.0 // 56.0 // disparity for which to calculate offsets (not needed in Java)
 #define RBYRDIST_LEN                5001   // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
 #define RBYRDIST_STEP                  0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
 #define TILES_PER_BLOCK_GEOM          (32/NUM_CAMS)   // each tile has NUM_CAMS threads

-// only used in C++ test
-#define TILESX        (IMG_WIDTH / DTT_SIZE)
-#define TILESY        (IMG_HEIGHT / DTT_SIZE)
-#define TILESYA       ((TILESY +3) & (~3))
-
-


-#define DEBUG_OOB1 1
-
+#ifdef 	DEBUG_ANY
+//#define DEBUG_OOB1 1
 // Use CORR_OUT_RAD for the correlation output
-
 //#define DBG_TILE_X     40
 //#define DBG_TILE_Y     80
 #if TEST_LWIR
@@ -119,6 +100,8 @@

 //#undef HAS_PRINTF
 #define HAS_PRINTF
+
+
 //7
 //#define DEBUG1 1
 //#define DEBUG2 1
@@ -146,8 +129,8 @@


 #if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
-#define DEBUG20 1 // Geometry Correction
-#define DEBUG21 1 // Geometry Correction
+//#define DEBUG20 1 // Geometry Correction
+//#define DEBUG21 1 // Geometry Correction
 //#define DEBUG210 1
 ////#define DEBUG30 1
 //#define DEBUG22 1
@@ -155,5 +138,8 @@

 #endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)

+#endif //#ifdef 	DEBUG_ANY
+
+
 #endif //#ifndef JCUDA