Claude: refineMotionVectors() — multithreaded masking + pre-computed kernel

- Pre-compute integer-pixel raised-cosine mask kernel once before nseq loop - Allocate fpixels_masked[nframes][width*height] once; clear per nseq with Arrays.fill - Restructure masking: parallel outer loop over frames (nFr via AtomicInteger), inner loop over tiles — no write contention (non-overlapping tile guarantee) - Pre-extract tile center/velocity into arrays before thread launch - Replace new ImagePlus(FloatProcessor) debug calls with ShowDoubleFloatArrays.showArrays() - Fix "centre" -> "center" in comments; naming: nFr for ai.getAndIncrement(), ntile for plain for

Claude: refineMotionVectors() — multithreaded masking + pre-computed kernel
- Pre-compute integer-pixel raised-cosine mask kernel once before nseq loop - Allocate fpixels_masked[nframes][width*height] once; clear per nseq with Arrays.fill - Restructure masking: parallel outer loop over frames (nFr via AtomicInteger), inner loop over tiles — no write contention (non-overlapping tile guarantee) - Pre-extract tile center/velocity into arrays before thread launch - Replace new ImagePlus(FloatProcessor) debug calls with ShowDoubleFloatArrays.showArrays() - Fix "centre" -> "center" in comments; naming: nFr for ai.getAndIncrement(), ntile for plain for
872107bf · Andrey Filippov · c76920b7 · 872107bf
Commit 872107bf authored May 06, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 102 additions and 88 deletions

CuasMotion.java src/main/java/com/elphel/imagej/cuas/CuasMotion.java +102 -88

No files found.
--- a/src/main/java/com/elphel/imagej/cuas/CuasMotion.java
+++ b/src/main/java/com/elphel/imagej/cuas/CuasMotion.java
@@ -2309,34 +2309,36 @@ public class CuasMotion {
 		final int    n_recenter  = clt_parameters.imp.cuas_n_recenter;
 		final double rstr        = clt_parameters.imp.cuas_rstr;

-		// Boosted pair count, centred on frame_center
+		// Boosted pair count, centered on frame_center
 		final int corr_pairs_ref = (int) Math.round(2 * half_accum_range * recalc_mv_boost);

 		// Hard-coded debug selectors: set >= 0 to enable per-scan/per-tile visualisation
-		final int dbg_nseq = -1;
-		final int dbg_tile = -1;
+		final int dbg_nseq = 20; // -1;
+		final int dbg_tile = 51+38*80; // -1;

-		// Show the mask weight profile once
-		if ((dbg_nseq >= 0 || dbg_tile >= 0) && debugLevel >= 0) {
-			int r1i = (int) Math.ceil(recalc_mv_r1);
-			int msize = 2 * r1i + 1;
-			float[] mask_img = new float[msize * msize];
+		// Pre-compute integer-pixel raised-cosine mask kernel once (shared across all nseq)
+		final int r1i = (int) Math.ceil(recalc_mv_r1);
+		final int mside = 2 * r1i + 1;
+		final float[] mask_kernel = new float[mside * mside];
 		for (int dy = -r1i; dy <= r1i; dy++) {
 			for (int dx = -r1i; dx <= r1i; dx++) {
-					double r = Math.sqrt((double)(dx * dx + dy * dy));
-					double w;
-					if (r <= recalc_mv_r0)      w = 1.0;
-					else if (r >= recalc_mv_r1) w = 0.0;
-					else w = 0.5 * (Math.cos(Math.PI * (r - recalc_mv_r0) / (recalc_mv_r1 - recalc_mv_r0)) + 1.0);
-					mask_img[(dy + r1i) * msize + (dx + r1i)] = (float) w;
+				double r = Math.sqrt(dx * dx + dy * dy);
+				float w;
+				if      (r <= recalc_mv_r0) w = 1.0f;
+				else if (r >= recalc_mv_r1) w = 0.0f;
+				else w = (float)(0.5 * (Math.cos(Math.PI * (r - recalc_mv_r0) / (recalc_mv_r1 - recalc_mv_r0)) + 1.0));
+				mask_kernel[(dy + r1i) * mside + (dx + r1i)] = w;
 			}
 		}
-			new ImagePlus("refineMotionVectors-mask-r0_" + recalc_mv_r0 + "-r1_" + recalc_mv_r1,
-					new FloatProcessor(msize, msize, mask_img)).show();
+		if ((dbg_nseq >= 0 || dbg_tile >= 0) && debugLevel >= 0) {
+			ShowDoubleFloatArrays.showArrays(mask_kernel.clone(), mside, mside,
+					"refineMotionVectors-mask-r0_" + recalc_mv_r0 + "-r1_" + recalc_mv_r1);
 		}

-		final int r1i = (int) Math.ceil(recalc_mv_r1) + 1;   // pixel search radius
-		float[][] fpixels_masked = new float[nframes][];       // lazy-allocated per nseq
+		// Allocate staging array once; each nseq clears only the frames it needs
+		final float[][] fpixels_masked = new float[nframes][width * height];
+		final Thread[] threads = ImageDtt.newThreadArray();
+		final AtomicInteger ai = new AtomicInteger(0);

 		for (int nseq = 0; nseq < targets_nonoverlap.length; nseq++) {
 			// Skip scan positions where no target has a known centroid yet
@@ -2351,7 +2353,7 @@ public class CuasMotion {
 			if (!has_centered) continue;

 			int frame_center = frame0 + nseq * corr_inc;
-			// Centre the boosted correlation window on frame_center
+			// Center the boosted correlation window on frame_center
 			int frame0_ref  = frame_center - corr_pairs_ref / 2;
 			int frame1_ref  = frame0_ref + corr_offset;

@@ -2361,58 +2363,70 @@ public class CuasMotion {
 			int fmax_alloc = Math.min(nframes - 1, frame1_ref + corr_pairs_ref - 1);
 			if (fmin_alloc > fmax_alloc) continue;  // no valid pairs in range

-			// Reset masked-frame array for this scan, allocate zero arrays for the window
-			for (int f = 0; f < nframes; f++) fpixels_masked[f] = null;
-			for (int f = fmin_alloc; f <= fmax_alloc; f++) {
-				fpixels_masked[f] = new float[width * height];
-			}
-
-			// Apply the raised-cosine mask around each target, tracking its motion
-			for (int ntile = 0; ntile < targets_nonoverlap[nseq].length; ntile++) {
+			// Clear staging frames for this scan position
+			for (int f = fmin_alloc; f <= fmax_alloc; f++) Arrays.fill(fpixels_masked[f], 0.0f);
+
+			// Pre-extract tile center/velocity for thread-safe parallel access
+			final int ntiles = targets_nonoverlap[nseq].length;
+			final boolean[] tile_valid = new boolean[ntiles];
+			final double[]  tcx0 = new double[ntiles];
+			final double[]  tcy0 = new double[ntiles];
+			final double[]  tvx  = new double[ntiles];
+			final double[]  tvy  = new double[ntiles];
+			for (int ntile = 0; ntile < ntiles; ntile++) {
 				double[] target = targets_nonoverlap[nseq][ntile];
 				if (target == null || Double.isNaN(target[CuasMotionLMA.RSLT_X])) continue;
-				int tileX = ntile % tilesX;
-				int tileY = ntile / tilesX;
-				// Target centre in image pixels at frame_center
-				double cx0 = tileX * tileSize + tileSize / 2.0 + target[CuasMotionLMA.RSLT_X];
-				double cy0 = tileY * tileSize + tileSize / 2.0 + target[CuasMotionLMA.RSLT_Y];
-				double vx  = target[CuasMotionLMA.RSLT_VX];  // pixels per corr_offset frames
-				double vy  = target[CuasMotionLMA.RSLT_VY];
+				tile_valid[ntile] = true;
+				tcx0[ntile] = (ntile % tilesX) * tileSize + tileSize / 2.0 + target[CuasMotionLMA.RSLT_X];
+				tcy0[ntile] = (ntile / tilesX) * tileSize + tileSize / 2.0 + target[CuasMotionLMA.RSLT_Y];
+				tvx[ntile]  = target[CuasMotionLMA.RSLT_VX];
+				tvy[ntile]  = target[CuasMotionLMA.RSLT_VY];
+			}

-				for (int f = fmin_alloc; f <= fmax_alloc; f++) {
-					double offset_scale = (double)(f - frame_center) / corr_offset;
-					double cx = cx0 + vx * offset_scale;
-					double cy = cy0 + vy * offset_scale;
-					int icx = (int) Math.round(cx);
-					int icy = (int) Math.round(cy);
-					double dcx = cx - icx;
-					double dcy = cy - icy;
+			// Parallel over frames: each thread applies all tile masks to one frame
+			final int fmin_f  = fmin_alloc;
+			final int frange  = fmax_alloc - fmin_f + 1;
+			final int fcenter = frame_center;
+			ai.set(0);
+			for (int ithread = 0; ithread < threads.length; ithread++) {
+				threads[ithread] = new Thread() {
+					public void run() {
+						for (int nFr = ai.getAndIncrement(); nFr < frange; nFr = ai.getAndIncrement()) {
+							int f = fmin_f + nFr;
 							float[] src = fpixels[f];
 							float[] dst = fpixels_masked[f];
+							double offset_scale = (double)(f - fcenter) / corr_offset;
+							for (int ntile = 0; ntile < ntiles; ntile++) {
+								if (!tile_valid[ntile]) continue;
+								double cx = tcx0[ntile] + tvx[ntile] * offset_scale;
+								double cy = tcy0[ntile] + tvy[ntile] * offset_scale;
+								int icx = (int) Math.round(cx);
+								int icy = (int) Math.round(cy);
 								for (int dy = -r1i; dy <= r1i; dy++) {
 									int py = icy + dy;
 									if (py < 0 || py >= height) continue;
 									for (int dx = -r1i; dx <= r1i; dx++) {
 										int px = icx + dx;
 										if (px < 0 || px >= width) continue;
-							double r = Math.sqrt((dx - dcx) * (dx - dcx) + (dy - dcy) * (dy - dcy));
-							if (r >= recalc_mv_r1) continue;
-							double w = (r <= recalc_mv_r0) ? 1.0 :
-								0.5 * (Math.cos(Math.PI * (r - recalc_mv_r0) / (recalc_mv_r1 - recalc_mv_r0)) + 1.0);
-							dst[py * width + px] += (float)(w * src[py * width + px]);
+										float w = mask_kernel[(dy + r1i) * mside + (dx + r1i)];
+										if (w == 0.0f) continue;
+										dst[py * width + px] += w * src[py * width + px];
+									}
 								}
 							}
 						}
 					}
+				};
+			}
+			ImageDtt.startAndJoin(threads);

 			// Debug: show the masked source frame at frame_center for this scan
 			if (nseq == dbg_nseq && debugLevel >= 0) {
 				int f_show = Math.max(fmin_alloc, Math.min(fmax_alloc, frame_center));
-				new ImagePlus("refineMotionVectors-masked-nseq" + nseq + "-f" + f_show,
-						new FloatProcessor(width, height, fpixels_masked[f_show].clone())).show();
-				// Also show the same frame unmasked for comparison
-				new ImagePlus("refineMotionVectors-source-nseq" + nseq + "-f" + f_show,
-						new FloatProcessor(width, height, fpixels[f_show].clone())).show();
+				ShowDoubleFloatArrays.showArrays(fpixels_masked[f_show].clone(), width, height,
+						"refineMotionVectors-masked-nseq" + nseq + "-f" + f_show);
+				ShowDoubleFloatArrays.showArrays(fpixels[f_show], width, height,
+						"refineMotionVectors-source-nseq" + nseq + "-f" + f_show);
 			}

 			TDCorrTile[] tdCorrTiles = cuasMotion.correlatePairs(
@@ -2447,8 +2461,8 @@ public class CuasMotion {
 				int corr_side = 2 * GPUTileProcessor.DTT_SIZE - 1; // 15
 				float[] corr_img = new float[corr_side * corr_side];
 				for (int i = 0; i < corr_img.length; i++) corr_img[i] = (float) corr_tiles_pd[dbg_tile][i];
-				new ImagePlus("refineMotionVectors-corr2d-nseq" + nseq + "-tile" + dbg_tile,
-						new FloatProcessor(corr_side, corr_side, corr_img)).show();
+				ShowDoubleFloatArrays.showArrays(corr_img, corr_side, corr_side,
+						"refineMotionVectors-corr2d-nseq" + nseq + "-tile" + dbg_tile);
 			}

 			// Add differential MV to targets_nonoverlap in-place
@@ -8308,7 +8322,6 @@ public class CuasMotion {
 			
 			boolean save_filtered_low =  intermed_low &&  (niter < iter_show1);
 			boolean save_filtered_high = intermed_high && (niter < iter_show1);
-//			totals = getRemain(motion_sequence, target_sequence_multi, num_all, num_undef, num_good, num_bad);
 			totals = getRemain(target_sequence_multi, num_all, num_undef, num_good, num_bad);
 			if (totals[TOTALS_UNDEFINED] == 0) {
 				if (debugLevel > -4) System.out.println ("No undefined tiles left, breaking loop");
@@ -8400,6 +8413,26 @@ public class CuasMotion {
 				// show good and bad accumulated here too?
 			}

+			// Andrey 05/05/2026 moved here (earlier) so shiftAndRenderAccumulate() will use update motion vector
+			// By Claude on 05/05/2026 — re-correlate with spatial mask around known target
+			if (recalc_mv) {
+				refineMotionVectors(
+						clt_parameters,      // CLTParameters   clt_parameters,
+						batch_mode,          // boolean         batch_mode,
+						cuasMotion,          // CuasMotion      cuasMotion,
+						recalc_mv_boost,     // double          recalc_mv_boost,
+						recalc_mv_r0,        // double          recalc_mv_r0,
+						recalc_mv_r1,        // double          recalc_mv_r1,
+						fpixels_tum,         // float [][]      fpixels,
+						targets_nonoverlap,  // double [][][]   targets_nonoverlap,
+						frame0,              // int             frame0,
+						corr_inc,            // int             corr_inc,
+						half_accum_range,    // int             half_accum_range,
+						smooth,              // boolean         smooth,
+						corr_offset,         // int             corr_offset,
+						debugLevel);         // int             debugLevel)
+			}
+			
 			// perform new accumulations of shifted non-conflicting tiles
 			float [][] fpixels_accumulated = cuasMotion.shiftAndRenderAccumulate(
 					clt_parameters,   // CLTParameters       clt_parameters,
@@ -8443,25 +8476,6 @@ public class CuasMotion {
 			// targets_new will contain motion vectors, centroid, and LMA results combined
 			//save_filtered_high

-			// By Claude on 05/05/2026 — re-correlate with spatial mask around known target
-			if (recalc_mv) {
-				refineMotionVectors(
-						clt_parameters,      // CLTParameters   clt_parameters,
-						batch_mode,          // boolean         batch_mode,
-						cuasMotion,          // CuasMotion      cuasMotion,
-						recalc_mv_boost,     // double          recalc_mv_boost,
-						recalc_mv_r0,        // double          recalc_mv_r0,
-						recalc_mv_r1,        // double          recalc_mv_r1,
-						fpixels_tum,         // float [][]      fpixels,
-						targets_nonoverlap,  // double [][][]   targets_nonoverlap,
-						frame0,              // int             frame0,
-						corr_inc,            // int             corr_inc,
-						half_accum_range,    // int             half_accum_range,
-						smooth,              // boolean         smooth,
-						corr_offset,         // int             corr_offset,
-						debugLevel);         // int             debugLevel)
-			}
-
 			float [][] accum_debug = save_filtered_high? new float [num_corr_samples][]:null; //fpixels_accumulated.length]
 			boolean    keep_failed = false; // keep failed targets
 			double [][][][] targets_new_multi = 	getAccumulatedCoordinatesMulti(