More debugging

c1955bd8 · Andrey Filippov · 7d01b009 · c1955bd8 · c1955bd8 · c1955bd8
Commit c1955bd8 authored Jun 19, 2022 by Andrey Filippov
5 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -65,8 +65,11 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_kernels;
 	private CUdeviceptr gpu_kernel_offsets;
 	private CUdeviceptr gpu_bayer;
-//	private CUdeviceptr gpu_tasks;
-	private CUdeviceptr gpu_ftasks;
+	private CUdeviceptr gpu_ftasks=null;
+	private int gpu_ftasks_len = 0;
+	private CUdeviceptr gpu_active_tiles=null; // will be re-allocated with gpu_ftasks
 	private CUdeviceptr gpu_corrs;
 	private CUdeviceptr gpu_corr_weights;
 	private CUdeviceptr gpu_corrs_td;
@@ -98,7 +101,7 @@ public class GpuQuad{ // quad camera description
 	private CUdeviceptr gpu_rot_deriv;
 	private CUdeviceptr gpu_geometry_correction;
 	private CUdeviceptr gpu_rByRDist;
-	private CUdeviceptr gpu_active_tiles;
+//	private CUdeviceptr gpu_active_tiles;
 	private CUdeviceptr gpu_num_active_tiles;
 	private int mclt_stride;
 	private int corr_stride;
@@ -274,8 +277,7 @@ public class GpuQuad{ // quad camera description
 		gpu_kernels =             new CUdeviceptr();
 		gpu_kernel_offsets =      new CUdeviceptr();
 		gpu_bayer =               new CUdeviceptr();
-//		gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT
+///		gpu_ftasks =              new CUdeviceptr(); //  allocate tilesX * tilesY * getTaskSize() * Sizeof.FLOAT
-		gpu_ftasks =              new CUdeviceptr(); //  allocate tilesX * tilesY * getTaskSize() * Sizeof.FLOAT
 		gpu_corrs =               new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.FLOAT
 		gpu_corr_weights =        new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT
@@ -284,8 +286,6 @@ public class GpuQuad{ // quad camera description
 		gpu_corrs_combo_td =      new CUdeviceptr(); //  allocate tilesX * tilesY *             4 * DTT_SIZE * DTT_SIZE * Sizeof.FLOAT
 		gpu_textures =            new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
-///		gpu_clt =                 new CUdeviceptr();
-///		gpu_4_images =            new CUdeviceptr();
 		gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
 		// May add separate gpu_corr_indices_td here
 		gpu_corr_combo_indices =  new CUdeviceptr(); //  allocate tilesX * tilesY * 1 * Sizeof.FLOAT            
@@ -309,7 +309,7 @@ public class GpuQuad{ // quad camera description
 		gpu_geometry_correction=  new CUdeviceptr();
 		gpu_rByRDist=             new CUdeviceptr(); //  calculated once for the camera distortion model in CPU (move to GPU?)
-		gpu_active_tiles =        new CUdeviceptr(); //  TILESX*TILESY*sizeof(int)
+///		gpu_active_tiles =        new CUdeviceptr(); //  TILESX*TILESY*sizeof(int)
 		gpu_num_active_tiles =    new CUdeviceptr(); //  1 int
 		// Init data arrays for all kernels
@@ -330,19 +330,6 @@ public class GpuQuad{ // quad camera description
 					Sizeof.FLOAT);            // int ElementSizeBytes)
 			mclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
-// Maybe move _bayer to use variable width/height as gpu_clt, gpu_corr_images_h	
-			/*
-			gpu_corr_images_h[ncam] =  new CUdeviceptr();
-			cuMemAllocPitch (
-					gpu_corr_images_h[ncam],               // CUdeviceptr dptr,
-					device_stride,                         // long[] pPitch,
-					(img_width + GPUTileProcessor.DTT_SIZE) * Sizeof.FLOAT, // long WidthInBytes,
-					3*(img_height + GPUTileProcessor.DTT_SIZE),// long Height,
-					Sizeof.FLOAT);            // int ElementSizeBytes)
-			imclt_stride = (int)(device_stride[0] / Sizeof.FLOAT);
-			*/
-///			gpu_clt_h[ncam] = new CUdeviceptr();
-///			cuMemAlloc(gpu_clt_h[ncam],tilesY * tilesX * num_colors * 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE * Sizeof.FLOAT ); //     public static int cuMemAlloc(CUdeviceptr dptr, long bytesize)
 		}
 		// now create device arrays pointers
 		if (Sizeof.POINTER != Sizeof.LONG) {
@@ -353,14 +340,10 @@ public class GpuQuad{ // quad camera description
 		cuMemAlloc(gpu_kernels,        num_cams * Sizeof.POINTER);
 		cuMemAlloc(gpu_kernel_offsets, num_cams * Sizeof.POINTER);
 		cuMemAlloc(gpu_bayer,          num_cams * Sizeof.POINTER);
-///		cuMemAlloc(gpu_clt,            num_cams * Sizeof.POINTER);
-///		cuMemAlloc(gpu_4_images,       num_cams * Sizeof.POINTER);
 		long [] gpu_kernels_l =        new long [num_cams];
 		long [] gpu_kernel_offsets_l = new long [num_cams];
 		long [] gpu_bayer_l =          new long [num_cams];
-///		long [] gpu_clt_l =            new long [num_cams];
-///		long [] gpu_4_images_l =       new long [num_cams];
 		for (int ncam = 0; ncam < num_cams; ncam++) gpu_kernels_l[ncam] =        GPUTileProcessor.getPointerAddress(gpu_kernels_h[ncam]);
 		cuMemcpyHtoD(gpu_kernels, Pointer.to(gpu_kernels_l),                     num_cams * Sizeof.POINTER);
@@ -371,23 +354,16 @@ public class GpuQuad{ // quad camera description
 		for (int ncam = 0; ncam < num_cams; ncam++) gpu_bayer_l[ncam] =          GPUTileProcessor.getPointerAddress(gpu_bayer_h[ncam]);
 		cuMemcpyHtoD(gpu_bayer, Pointer.to(gpu_bayer_l),                         num_cams * Sizeof.POINTER);
-///		for (int ncam = 0; ncam < num_cams; ncam++) gpu_clt_l[ncam] =            GPUTileProcessor.getPointerAddress(gpu_clt_h[ncam]);
-///		cuMemcpyHtoD(gpu_clt, Pointer.to(gpu_clt_l),                             num_cams * Sizeof.POINTER);
-///		for (int ncam = 0; ncam < num_cams; ncam++) gpu_4_images_l[ncam] =       GPUTileProcessor.getPointerAddress(gpu_corr_images_h[ncam]);
-///		cuMemcpyHtoD(gpu_4_images, Pointer.to(gpu_4_images_l),                   num_cams * Sizeof.POINTER);
 		// Set GeometryCorrection data
 		cuMemAlloc(gpu_geometry_correction,      GeometryCorrection.arrayLength(GPUTileProcessor.MAX_NUM_CAMS) * Sizeof.FLOAT); // always maximal number of cameras (sparse)
 		cuMemAlloc(gpu_rByRDist,                 GPUTileProcessor.RBYRDIST_LEN *  Sizeof.FLOAT);
 		cuMemAlloc(gpu_rot_deriv,                5*GPUTileProcessor.MAX_NUM_CAMS*3*3 * Sizeof.FLOAT); // always maximal number of cameras (sparse)
-		//        	cuMemAlloc(gpu_correction_vector,        CorrVector.LENGTH * Sizeof.FLOAT);
 		cuMemAlloc(gpu_correction_vector,        GPUTileProcessor.CORR_VECTOR_MAX_LENGTH * Sizeof.FLOAT); // update CORR_VECTOR_LENGTH to fit 
 		// Set task array
-//		cuMemAlloc(gpu_tasks,      tilesX * tilesY * GPUTileProcessor.TPTASK_SIZE * Sizeof.FLOAT);
+//		cuMemAlloc(gpu_ftasks,      tilesX * tilesY * getTaskSize() * Sizeof.FLOAT);
-		cuMemAlloc(gpu_ftasks,      tilesX * tilesY * getTaskSize() * Sizeof.FLOAT);
+//		checkAllocateGpuFtasks((tilesX+4) * (tilesY+4)); // +4 - "a little more" (will reallocate if needed)
+		checkAllocateGpuFtasks(tilesX * tilesY); // +4 - "a little more" (will reallocate if needed)
 		//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
 		// Set corrs array
 		int num_pairs = Correlation2d.getNumPairs(quadCLT.getNumSensors());
@@ -412,8 +388,8 @@ public class GpuQuad{ // quad camera description
 		cuMemAlloc(gpu_num_texture_ovlp,                  8 * Sizeof.FLOAT);
 		cuMemAlloc(gpu_texture_indices_len,               1 * Sizeof.FLOAT);
+// will be dynamically allocated with gpu_ftasks
-		cuMemAlloc(gpu_active_tiles,        tilesX * tilesY * Sizeof.FLOAT);
+//		cuMemAlloc(gpu_active_tiles,        tilesX * tilesY * Sizeof.FLOAT);
 		cuMemAlloc(gpu_num_active_tiles,                  1 * Sizeof.FLOAT);
 		cuMemAlloc(gpu_corr_weights, num_pairs* tilesX * tilesY * Sizeof.FLOAT);
@@ -473,6 +449,33 @@ public class GpuQuad{ // quad camera description
 				Sizeof.FLOAT);                         // int ElementSizeBytes)
 		texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
 	}
+	/**
+	 * Make sure that the GPU buffers gpu_ftasks and gpu_active_tiles can hold at least
+	 * max_tasks entries. Both buffers are re-allocated together when the current capacity
+	 * (gpu_ftasks_len) is insufficient; they are never shrunk.
+	 *
+	 * The requested number is rounded up to a multiple of 0x80. This was introduced as an
+	 * attempted workaround after
+	 *   jcuda.CudaException: CUDA_ERROR_MISALIGNED_ADDRESS
+	 *     at jcuda.driver.JCudaDriver.checkResult(JCudaDriver.java:396)
+	 *     at jcuda.driver.JCudaDriver.cuCtxSynchronize(JCudaDriver.java:2426)
+	 *     at com.elphel.imagej.gpu.GpuQuad.execConvertDirect(GpuQuad.java:1596)
+	 *
+	 * The fields are cleared before the old memory is released and are only set again
+	 * after the corresponding cuMemAlloc() returned, so an exception thrown by JCuda can
+	 * not leave a freed (or never allocated) pointer behind a non-zero gpu_ftasks_len.
+	 *
+	 * @param max_tasks minimal number of tasks the buffers have to accommodate
+	 */
+	private void checkAllocateGpuFtasks(int max_tasks) {
+		max_tasks = (max_tasks + 0x7f) & ~0x7f; // round up to a multiple of 0x80
+		if (max_tasks <= gpu_ftasks_len) {
+			return; // existing buffers are large enough
+		}
+		// From here on the old buffers are considered gone: capacity is 0 until both
+		// new allocations succeed.
+		gpu_ftasks_len = 0;
+		if (gpu_ftasks != null) {
+			CUdeviceptr old_ftasks = gpu_ftasks;
+			gpu_ftasks = null;
+			cuMemFree (old_ftasks);
+		}
+		if (gpu_active_tiles != null) {
+			CUdeviceptr old_active_tiles = gpu_active_tiles;
+			gpu_active_tiles = null;
+			cuMemFree (old_active_tiles);
+		}
+		// Sizes are calculated as long (cuMemAlloc() takes a long byte count) to avoid
+		// int overflow of the product.
+		CUdeviceptr new_ftasks = new CUdeviceptr();
+		cuMemAlloc(new_ftasks, (long) max_tasks * getTaskSize() * Sizeof.FLOAT);
+		gpu_ftasks = new_ftasks;
+		CUdeviceptr new_active_tiles = new CUdeviceptr();
+		cuMemAlloc(new_active_tiles, (long) max_tasks * Sizeof.FLOAT);
+		gpu_active_tiles = new_active_tiles;
+		gpu_ftasks_len = max_tasks; // publish the new capacity only after both allocations
+	}
 	public int getTilesX() {
 		return getImageWidth() / GPUTileProcessor.DTT_SIZE;
 	}
@@ -619,7 +622,8 @@ public class GpuQuad{ // quad camera description
 			)
 	{
 		if (verify) checkTasks(tile_tasks);
-		num_task_tiles = tile_tasks.length;
+		num_task_tiles = tile_tasks.length; // only place that modifies num_task_tiles and may require allocation
+		checkAllocateGpuFtasks(num_task_tiles); 
 		int task_size = getTaskSize();
 		float [] ftasks = new float [task_size * num_task_tiles];
 		for (int i = 0; i < num_task_tiles; i++) {
@@ -645,7 +649,7 @@ public class GpuQuad{ // quad camera description
 			boolean use_aux    // while is it in class member? - just to be able to free
 			)
 	{
-		num_task_tiles = tile_tasks.length;
+		num_task_tiles = tile_tasks.length; // does not require re-allocation, as tile_tasks should be already set to GPU
 		int task_size = getTaskSize();
 		float [] ftasks = new float [task_size * num_task_tiles];
 		cuMemcpyDtoH(Pointer.to(ftasks), gpu_ftasks, task_size * num_task_tiles * Sizeof.FLOAT);

--- a/src/main/java/com/elphel/imagej/tileprocessor/IntersceneMatchParameters.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/IntersceneMatchParameters.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/OpticalFlow.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/QuadCLT.java
@@ -221,9 +221,10 @@ public class QuadCLT extends QuadCLTCPU {
 	 */
 	public static double [] removeDisparityOutliersByLMA(
 			final double [][] dls,
-			final double      max_strength,  // do not touch stronger
+			final double      max_strength,        // do not touch stronger
 			final double      diff_from_lma_pos,   // Difference from farthest FG objects (OK to have large, e.g. 100)
 			final double      diff_from_lma_neg,   // Difference from nearest BG objects (small, as FG are usually more visible)
+			final int         search_radius,       // Search farther if no LMA neighbor is found closer. Original value - 1 (8 neighbors)
 			final boolean     remove_no_lma_neib,  // remove without LMA neighbors
 			final int         width,               //tilesX
 			final int         threadsMax,
@@ -238,7 +239,7 @@ public class QuadCLT extends QuadCLTCPU {
 		final Thread[] threads = ImageDtt.newThreadArray(threadsMax);
 		final AtomicInteger ai = new AtomicInteger(0);
 		final AtomicInteger anum_updated = new AtomicInteger(0);
-		final int dbg_tile = 1235;
+		final int dbg_tile = 1944;
 		anum_updated.set(0);
 		for (int ithread = 0; ithread < threads.length; ithread++) {
 			threads[ithread] = new Thread() {
@@ -250,22 +251,28 @@ public class QuadCLT extends QuadCLTCPU {
 						if (Double.isNaN(disparity_lma[nTile]) && !Double.isNaN(disparity[nTile]) && (strength[nTile] < max_strength)) {
 							double best_fit_pos = Double.NaN; // Closest higher disparity than this
 							double best_fit_neg = Double.NaN; // Closest lower disparity than this
-							for (int dir = 0; dir < 8; dir++) {
+							for (int rad = 1; rad <= search_radius; rad++) {
-								int ineib = tn.getNeibIndex(nTile, dir);
+								int numdir = TileNeibs.getNumDirs(rad);
-								if (    (ineib >= 0) &&
+								for (int dir = 0; dir < numdir; dir++) {
-										!Double.isNaN(disparity_lma[ineib]) &&
+									int ineib = tn.getNeibIndexRadius(nTile, dir, rad);
-										!Double.isNaN(disparity[ineib])) {
+									if (    (ineib >= 0) &&
-									double d = disparity[nTile] - disparity_lma[ineib];
+											!Double.isNaN(disparity_lma[ineib]) &&
-									if (d > 0) {
+											!Double.isNaN(disparity[ineib])) {
-										if (!(d >= best_fit_neg)) {
+										double d = disparity[nTile] - disparity_lma[ineib];
-											best_fit_neg = d;
+										if (d > 0) {
-										}
+											if (!(d >= best_fit_neg)) {
-									} else {
+												best_fit_neg = d;
-										if (!(-d >= best_fit_pos)) {
+											}
-											best_fit_neg = -d;
+										} else {
+											if (!(-d >= best_fit_pos)) {
+												best_fit_neg = -d;
+											}
 										}
 									}
 								}
+								if (!(Double.isNaN(best_fit_pos) && Double.isNaN(best_fit_neg))) {
+									break;
+								}
 							}
 							if (    (best_fit_neg > diff_from_lma_neg) ||
 									(best_fit_pos > diff_from_lma_pos) ||
@@ -2064,7 +2071,7 @@ public class QuadCLT extends QuadCLTCPU {
 	            null,                         // final boolean []          selection, // may be null, if not null do not  process unselected tiles
 	            scene.getErsCorrection(),     // final GeometryCorrection  geometryCorrection,
 	            0.0,                          // final double              disparity_corr,
-	            0, // margin,                 // final int                 margin,      // do not use tiles if their centers are closer to the edges
+	            -1, // 0, // margin,                 // final int                 margin,      // do not use tiles if their centers are closer to the edges
 	            null,                         // final boolean []          valid_tiles,            
 	            threadsMax);                  // final int                 threadsMax)  // maximal number of threads to launch
 	    scene.saveQuadClt(); // to re-load new set of Bayer images to the GPU (do nothing for CPU) and Geometry

--- a/src/main/java/com/elphel/imagej/tileprocessor/TileNeibs.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TileNeibs.java
@@ -140,6 +140,70 @@ public class TileNeibs{
 		default: return indx;
 		}
 	}
+	/**
+	 * Number of directions available at the specified radius.
+	 * @param radius "distance" from the start point
+	 * @return 0 for negative radius, 1 for radius == 0, 8 * radius otherwise
+	 */
+	public static int getNumDirs(int radius) {
+		if (radius > 0) {
+			return 8 * radius;
+		}
+		return (radius == 0) ? 1 : 0;
+	}
+	/**
+	 * Get 2d element index after step of variable radius:
+	 * radius==1 - same as getNeibIndex(int indx, int dir), 8 directions
+	 * radius==2 - 16 directions (5x5 square), 0 - still up, north
+	 * radius==3 - 24 directions (7x7 square)
+	 * ...
+	 * For radius >= 2 the directions enumerate the perimeter of the
+	 * (2*radius+1) x (2*radius+1) square around the start point. Direction is taken
+	 * modulo 8 * radius, negative values wrap around too (-1 is the last direction).
+	 * Any radius below 2 is delegated to getNeibIndex(indx, dir) unchanged.
+	 * @param indx start index
+	 * @param dir step direction (CW from up)
+	 * @param radius - "distance" from the start point
+	 * @return new index or -1 if leaving array in any direction
+	 */
+	public int getNeibIndexRadius(int indx, int dir, int radius) {
+		if (radius < 2) {
+			return  getNeibIndex(indx, dir);
+		}
+		final int num_dirs = 8 * radius;
+		if (dir > num_dirs) {
+			// Diagnostic only, the direction is still wrapped below.
+			// dir == num_dirs wraps to direction 0 without this message.
+			System.out.println("getNeibIndexRadius(): indx="+indx+", dir="+dir+", radius="+radius);
+		}
+		// Shift by radius so that dr == 0 is the top-left corner of the square and
+		// dir == 0 lands in the middle of the top side (straight up).
+		int dr = (dir + radius) % num_dirs;
+		if (dr < 0) {
+			dr += num_dirs; // Java % keeps the sign of the dividend
+		}
+		int quad = dr / (2 * radius); // side of the square, 0..3
+		int side = dr % (2 * radius); // position along that side
+		int y = indx / sizeX;
+		int x = indx % sizeX;
+		switch (quad) {
+		case 0: // top side, x increasing
+			x = x - radius + side;
+			y = y - radius;
+			break;
+		case 1: // right side, y increasing
+			x = x + radius;
+			y = y - radius + side;
+			break;
+		case 2: // bottom side, x decreasing
+			x = x + radius - side;
+			y = y + radius;
+			break;
+		case 3: // left side, y decreasing
+			x = x - radius;
+			y = y + radius - side;
+			break;
+		default: // not reachable after normalization of dr
+			return -1;
+		}
+		if ((x >= 0) && (y >= 0) && (x < sizeX) && (y < sizeY)) {
+			return x + sizeX*y;
+		} else {
+			return -1;
+		}
+	}
 	/**
 	 * Get 2d element index after step N, NE, ... NW. Returns -1 if leaving array
 	 * And 2 steps for dir = 8(N), 9(NNE),..23(NNW)