CLAUDE: Add RT detector skeleton (com.elphel.imagej.cuas.rt)

Pure-Java prototype CuasRTDetector + RtParams parameters class + CuasRTLib JNA stub (activate when JNA added to pom.xml). No dependencies added; build is clean. See design discussion in attic/imagej-elphel-internal/handoffs/2026-05-21_RealTimeDesign.md. Co-authored-by: Claude <claude@elphel.com>

CLAUDE: Add RT detector skeleton (com.elphel.imagej.cuas.rt)
Pure-Java prototype CuasRTDetector + RtParams parameters class + CuasRTLib JNA stub (activate when JNA added to pom.xml). No dependencies added; build is clean. See design discussion in attic/imagej-elphel-internal/handoffs/2026-05-21_RealTimeDesign.md. Co-authored-by: Claude <claude@elphel.com>
1bf9e9e4 · Andrey Filippov · 9dacd956 · 1bf9e9e4 · 1bf9e9e4 · 1bf9e9e4
Commit 1bf9e9e4 authored May 21, 2026 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/cuas/rt/CuasRTDetector.java
+++ b/src/main/java/com/elphel/imagej/cuas/rt/CuasRTDetector.java
+/**
+ ** CuasRTDetector.java
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  CuasRTDetector.java is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ** -----------------------------------------------------------------------------**
+ */
+package com.elphel.imagej.cuas.rt;
+
+import java.util.ArrayList;
+
+/**
+ * Real-time CUAS target detector — pure-Java prototype (single temporal pyramid level).
+ *
+ * This is the reference implementation used for algorithm tuning before the
+ * C++/CUDA port in cuas_rt_gpu/src/rt_detector.cu.  Keep all methods as
+ * static pure functions (flat float[] arrays, no streams, no ArrayList in hot
+ * paths) so each method maps 1:1 to a C++ function and eventually a CUDA kernel.
+ *
+ * Architecture overview (single pyramid level):
+ *
+ *   fpixels[nFrames][width*height]          ← already temporally averaged/decimated
+ *            │
+ *            ▼
+ *   layer1()[nChan][width*height]            ← Layer 1: spatio-temporal matched filter
+ *            │                                  one response map per velocity channel
+ *            ▼
+ *   updateAccumChannel() × nChan             ← Layer 2: per-channel recurrent buffer
+ *     shift → mem*accum+obs*layer1 → leakyReLU
+ *            │
+ *            ▼
+ *   findPeaks() → float[] (x,y) pairs        ← Stage 2 trigger candidates
+ *
+ * Velocity channels:
+ *   nChan = (2*maxVel+1)^2, ivx/ivy in [-maxVel..+maxVel] px/sample.
+ *   chanIdx(ivx,ivy) = (ivy+maxVel)*nVel + (ivx+maxVel)  [ivy major, ivx minor]
+ *
+ * 50% overlap mode (recommended):
+ *   Run at stride = nFrames/2; consecutive windows share half their frames.
+ *   Reduces worst-case detection latency from nFrames to nFrames/2 samples.
+ *   The accumulation buffer's memCoeff absorbs the resulting double-counting smoothly.
+ *
+ * Validation against current software:
+ *   Feed the same fpixels[] used by refineMotionVectors() at a single target
+ *   location and compare findPeaks() output to the current detection result.
+ */
+public class CuasRTDetector {
+
+    // =========================================================================
+    // Layer 1: spatio-temporal matched filter
+    // =========================================================================
+
+    /**
+     * Layer 1 response for one velocity channel (ivx, ivy).
+     *
+     * For each output pixel (x,y), sums a 3x3 spatial neighborhood across
+     * all nFrames temporal layers.  Layer t is spatially offset by
+     * (ivx*dt, ivy*dt) where dt = t - tMid, so the filter "tracks" a
+     * hypothetical target moving at velocity (ivx,ivy) px/sample.
+     *
+     * Sub-pixel shift: currently nearest-integer.
+     * TODO: replace with bilinear interpolation for smoother velocity response.
+     *
+     * Result is ADDED into out[] so callers can accumulate across multiple calls.
+     *
+     * @param fpixels  [nFrames][width*height] input (temporally decimated)
+     * @param width    image width in pixels
+     * @param height   image height in pixels
+     * @param ivx      channel velocity X in [-maxVel..+maxVel]
+     * @param ivy      channel velocity Y in [-maxVel..+maxVel]
+     * @param out      [width*height] output array (accumulated into, not cleared here)
+     */
+    public static void layer1Channel(
+            float[][] fpixels,
+            int width, int height,
+            int ivx, int ivy,
+            float[] out)
+    {
+        int nFrames = fpixels.length;
+        int tMid    = nFrames / 2;
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                float sum = 0.0f;
+                for (int t = 0; t < nFrames; t++) {
+                    int dt = t - tMid;
+                    int sx = x - ivx * dt;
+                    int sy = y - ivy * dt;
+                    for (int dy = -1; dy <= 1; dy++) {
+                        for (int dx = -1; dx <= 1; dx++) {
+                            int px = sx + dx, py = sy + dy;
+                            if (px >= 0 && px < width && py >= 0 && py < height) {
+                                sum += fpixels[t][py * width + px];
+                            }
+                        }
+                    }
+                }
+                out[y * width + x] += sum;
+            }
+        }
+    }
+
+    /**
+     * Full Layer 1 output: compute response for all nChan velocity channels.
+     *
+     * @param fpixels  [nFrames][width*height]
+     * @param p        RT parameters
+     * @param width    image width
+     * @param height   image height
+     * @return         [nChan][width*height], indexed by p.chanIdx(ivx,ivy)
+     */
+    public static float[][] layer1(
+            float[][] fpixels,
+            RtParams p,
+            int width, int height)
+    {
+        int nChan = p.nChan();
+        float[][] out = new float[nChan][width * height];
+        for (int ivy = -p.maxVel; ivy <= p.maxVel; ivy++) {
+            for (int ivx = -p.maxVel; ivx <= p.maxVel; ivx++) {
+                layer1Channel(fpixels, width, height, ivx, ivy,
+                              out[p.chanIdx(ivx, ivy)]);
+            }
+        }
+        return out;
+    }
+
+    // =========================================================================
+    // Layer 2: recurrent accumulation buffer helpers
+    // =========================================================================
+
+    /**
+     * Integer spatial shift of buf[] by (vx, vy) pixels, in place.
+     * Pixels shifted outside the image boundary are zeroed.
+     *
+     * For non-integer velocities: replace with bilinear interpolation.
+     * The half-pixel grid case (|velocity| = 0.5) is a planned intermediate step.
+     *
+     * @param buf    [width*height] modified in place
+     * @param width  image width
+     * @param height image height
+     * @param vx     shift in X (positive = rightward)
+     * @param vy     shift in Y (positive = downward)
+     */
+    public static void shiftBuffer(float[] buf, int width, int height, int vx, int vy) {
+        float[] tmp = new float[buf.length];
+        for (int y = 0; y < height; y++) {
+            for (int x = 0; x < width; x++) {
+                int sx = x - vx, sy = y - vy;
+                if (sx >= 0 && sx < width && sy >= 0 && sy < height) {
+                    tmp[y * width + x] = buf[sy * width + sx];
+                }
+                // else tmp[y*width+x] stays 0
+            }
+        }
+        System.arraycopy(tmp, 0, buf, 0, buf.length);
+    }
+
+    /**
+     * Leaky ReLU activation applied in place.
+     * x >= 0 → x unchanged; x < 0 → x * slope.
+     * slope=1.0 gives linear (no competition); slope→0 approaches hard ReLU.
+     */
+    public static void leakyReLU(float[] buf, float slope) {
+        for (int i = 0; i < buf.length; i++) {
+            if (buf[i] < 0.0f) buf[i] *= slope;
+        }
+    }
+
+    /**
+     * Update one velocity channel's accumulation buffer with a new Layer 1 observation.
+     *
+     * Loop per frame:
+     *   1. Spatial shift accumBuf by (ivx, ivy)   — tracks the target
+     *   2. accumBuf = memCoeff*accumBuf + obsCoeff*layer1  — integrate
+     *   3. leakyReLU(accumBuf, leakySlope)         — suppress noise / control competition
+     *
+     * @param accumBuf  [width*height] persistent buffer, updated in place
+     * @param layer1    [width*height] new Layer 1 observation for this channel
+     * @param p         RT parameters
+     * @param width     image width
+     * @param height    image height
+     * @param ivx       channel velocity X
+     * @param ivy       channel velocity Y
+     */
+    public static void updateAccumChannel(
+            float[] accumBuf, float[] layer1,
+            RtParams p, int width, int height,
+            int ivx, int ivy)
+    {
+        shiftBuffer(accumBuf, width, height, ivx, ivy);
+        float mc = p.memCoeff, oc = p.obsCoeff;
+        for (int i = 0; i < accumBuf.length; i++) {
+            accumBuf[i] = mc * accumBuf[i] + oc * layer1[i];
+        }
+        leakyReLU(accumBuf, p.leakySlope);
+    }
+
+    /**
+     * Update all nChan accumulation buffers with new Layer 1 output.
+     *
+     * @param accumBufs [nChan][width*height] persistent state, updated in place
+     * @param layer1Out [nChan][width*height] from layer1()
+     * @param p         RT parameters
+     * @param width     image width
+     * @param height    image height
+     */
+    public static void updateAccum(
+            float[][] accumBufs, float[][] layer1Out,
+            RtParams p, int width, int height)
+    {
+        for (int ivy = -p.maxVel; ivy <= p.maxVel; ivy++) {
+            for (int ivx = -p.maxVel; ivx <= p.maxVel; ivx++) {
+                int ich = p.chanIdx(ivx, ivy);
+                updateAccumChannel(accumBufs[ich], layer1Out[ich],
+                                   p, width, height, ivx, ivy);
+            }
+        }
+    }
+
+    // =========================================================================
+    // Stage 2 trigger: peak detection
+    // =========================================================================
+
+    /**
+     * Cross-channel maximum projection: for each pixel, take the max value
+     * across all velocity channels in the accumulation buffer.
+     * This collapses [nChan][width*height] → [width*height].
+     *
+     * @param accumBufs [nChan][width*height]
+     * @param width     image width
+     * @param height    image height
+     * @return          [width*height] max projection
+     */
+    public static float[] maxProjection(float[][] accumBufs, int width, int height) {
+        float[] proj = new float[width * height];
+        for (float[] ch : accumBufs) {
+            for (int i = 0; i < proj.length; i++) {
+                if (ch[i] > proj[i]) proj[i] = ch[i];
+            }
+        }
+        return proj;
+    }
+
+    /**
+     * Detect peaks in a [width*height] map using 3x3 non-maximum suppression.
+     * A pixel is a peak if it is above thresh AND strictly greater than all 8 neighbors.
+     *
+     * Returns float[] of (x, y) pairs: [x0, y0, x1, y1, ...].
+     * Border pixels (1-pixel margin) are excluded.
+     *
+     * For Stage 2 trigger logic: before spawning a new Stage 2 worker, check
+     * the presence bitmask to avoid duplicating an already-tracked target.
+     *
+     * @param map    [width*height] projection (e.g. from maxProjection)
+     * @param p      RT parameters (uses p.peakThresh)
+     * @param width  image width
+     * @param height image height
+     * @return       float[] of (x,y) pairs, length = 2 * numPeaks
+     */
+    public static float[] findPeaks(float[] map, RtParams p, int width, int height) {
+        ArrayList<Float> peaks = new ArrayList<>();
+        for (int y = 1; y < height - 1; y++) {
+            for (int x = 1; x < width - 1; x++) {
+                float v = map[y * width + x];
+                if (v < p.peakThresh) continue;
+                boolean isMax = true;
+                outer:
+                for (int dy = -1; dy <= 1; dy++) {
+                    for (int dx = -1; dx <= 1; dx++) {
+                        if (dx == 0 && dy == 0) continue;
+                        if (map[(y + dy) * width + (x + dx)] >= v) {
+                            isMax = false;
+                            break outer;
+                        }
+                    }
+                }
+                if (isMax) {
+                    peaks.add((float) x);
+                    peaks.add((float) y);
+                }
+            }
+        }
+        float[] result = new float[peaks.size()];
+        for (int i = 0; i < result.length; i++) result[i] = peaks.get(i);
+        return result;
+    }
+
+    // =========================================================================
+    // Top-level single-level pipeline
+    // =========================================================================
+
+    /**
+     * Run the full single-pyramid-level detection pipeline on one input window.
+     *
+     * State (accumBufs) is kept between calls by the caller; it represents
+     * the recurrent memory across input windows.
+     *
+     * @param fpixels   [nFrames][width*height] input window (temporally decimated)
+     * @param accumBufs [nChan][width*height] persistent accumulation state (in/out)
+     * @param p         RT parameters
+     * @param width     image width
+     * @param height    image height
+     * @return          float[] of (x,y) peak pairs from this window
+     */
+    public static float[] processWindow(
+            float[][] fpixels,
+            float[][] accumBufs,
+            RtParams p,
+            int width, int height)
+    {
+        float[][] layer1Out = layer1(fpixels, p, width, height);
+        updateAccum(accumBufs, layer1Out, p, width, height);
+        float[] proj = maxProjection(accumBufs, width, height);
+        return findPeaks(proj, p, width, height);
+    }
+
+    /**
+     * Allocate fresh (zeroed) accumulation buffers for a new session.
+     *
+     * @param p      RT parameters
+     * @param width  image width
+     * @param height image height
+     * @return       [nChan][width*height] zero-initialized
+     */
+    public static float[][] allocAccum(RtParams p, int width, int height) {
+        return new float[p.nChan()][width * height];
+    }
+}
--- a/src/main/java/com/elphel/imagej/cuas/rt/CuasRTLib.java
+++ b/src/main/java/com/elphel/imagej/cuas/rt/CuasRTLib.java
+/**
+ ** CuasRTLib.java
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  CuasRTLib.java is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ** -----------------------------------------------------------------------------**
+ */
+package com.elphel.imagej.cuas.rt;
+
+/**
+ * JNA binding to libcuasrt.so — the native CUDA real-time detector library.
+ *
+ * STATUS: stub — activate when JNA is added to pom.xml.
+ *
+ * -------------------------------------------------------------------------
+ * TO ACTIVATE: add to pom.xml dependencies:
+ *
+ *   <dependency>
+ *     <groupId>net.java.dev.jna</groupId>
+ *     <artifactId>jna</artifactId>
+ *     <version>5.14.0</version>
+ *   </dependency>
+ *
+ * Then replace this class with the active interface (sketch below).
+ * -------------------------------------------------------------------------
+ *
+ * ACTIVE INTERFACE SKETCH (not compiled — JNA not yet on classpath):
+ *
+ *   import com.sun.jna.Library;
+ *   import com.sun.jna.Native;
+ *   import com.sun.jna.Pointer;
+ *   import com.sun.jna.Structure;
+ *   import java.util.Arrays;
+ *   import java.util.List;
+ *
+ *   // RtParamsJna: JNA Structure mapping to C RtParams in rt_lib.h
+ *   public static class RtParamsJna extends Structure {
+ *       public int   max_vel;
+ *       public int   width;
+ *       public int   height;
+ *       public float mem_coeff;
+ *       public float obs_coeff;
+ *       public float leaky_slope;
+ *       public float peak_thresh;
+ *
+ *       @Override protected List<String> getFieldOrder() {
+ *           return Arrays.asList("max_vel","width","height",
+ *                                "mem_coeff","obs_coeff","leaky_slope","peak_thresh");
+ *       }
+ *   }
+ *
+ *   public interface CuasRTLib extends Library {
+ *       CuasRTLib INSTANCE = Native.load("cuasrt", CuasRTLib.class);
+ *
+ *       // Create GPU context; returns opaque handle.
+ *       Pointer cuasrt_create(int width, int height, RtParamsJna p);
+ *
+ *       // Destroy GPU context and free all GPU memory.
+ *       void    cuasrt_destroy(Pointer ctx);
+ *
+ *       // Layer 1 matched filter.
+ *       // fpixels: [nframes * width * height], row-major, host memory.
+ *       // layer1Out: [n_chan * width * height] output, n_chan = (2*max_vel+1)^2.
+ *       void    cuasrt_layer1(Pointer ctx, float[] fpixels, int nframes, float[] layer1Out);
+ *
+ *       // Update accumulation buffer; returns number of peaks found.
+ *       // layer1Out:  [n_chan * width * height] from cuasrt_layer1.
+ *       // accumBuf:   [n_chan * width * height] persistent state (in/out).
+ *       // peaksXy:    output [(x,y) pairs], length >= maxPeaks*2.
+ *       int     cuasrt_update_accum(Pointer ctx,
+ *                                   float[] layer1Out, float[] accumBuf,
+ *                                   RtParamsJna p, float[] peaksXy, int maxPeaks);
+ *   }
+ *
+ * -------------------------------------------------------------------------
+ * LIBRARY PATH:
+ *   Set -Djna.library.path=/path/to/cuas_rt_gpu/build at JVM startup,
+ *   or place libcuasrt.so on LD_LIBRARY_PATH.
+ *   libcudart.so must also be accessible (ships with CUDA toolkit).
+ *
+ * CUDA CONTEXT:
+ *   libcuasrt.so creates its own CUDA context (device 0).
+ *   It runs independently of the JCuda tile-processor context.
+ *   When both pipelines run together, context sharing via cudaSetDevice
+ *   (CUDA 12 primary context) can be added without API changes.
+ * -------------------------------------------------------------------------
+ */
+public final class CuasRTLib {
+    // Placeholder — replace with active JNA interface once JNA is in pom.xml.
+    private CuasRTLib() {}
+}
--- a/src/main/java/com/elphel/imagej/cuas/rt/RtParams.java
+++ b/src/main/java/com/elphel/imagej/cuas/rt/RtParams.java
+/**
+ ** RtParams.java
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  RtParams.java is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ** -----------------------------------------------------------------------------**
+ */
+package com.elphel.imagej.cuas.rt;
+
+/**
+ * Runtime parameters for the real-time CUAS target detector.
+ *
+ * Mirrors the RtParams C struct in cuas_rt_gpu/src/rt_lib.h.
+ * When JNA is added to pom.xml this class will gain a JNA Structure counterpart
+ * (RtParamsJna extends com.sun.jna.Structure) for direct memory mapping.
+ *
+ * Velocity channels:
+ *   X and Y each span [-maxVel .. +maxVel] pixels/sample.
+ *   Total channels: nChan() = (2*maxVel+1)^2 = 25 for maxVel=2.
+ *   Channel index:  chanIdx(ivx,ivy) = (ivy+maxVel)*nVel() + (ivx+maxVel)
+ *   Units are pixels per input sample; input is already temporally
+ *   averaged/decimated for the specific pyramid level, so the same
+ *   maxVel applies at every pyramid level.
+ */
+public class RtParams {
+
+    /** Velocity range per axis [-maxVel .. +maxVel] px/sample. Start with 2; try 1 and 3. */
+    public int   maxVel      = 2;
+
+    /** Accumulation buffer memory coefficient (high persistence). */
+    public float memCoeff    = 0.9f;
+
+    /** New observation weight (typically 1 - memCoeff). */
+    public float obsCoeff    = 0.1f;
+
+    /**
+     * Leaky ReLU negative slope inside accumulation loop.
+     * Controls competition vs. pure linear accumulation:
+     *   slope=1.0 → linear (maximum SNR improvement, no competition)
+     *   slope→0   → hard ReLU (strong competition, earlier saturation)
+     * Tune experimentally; start near 1.0 and reduce toward 0.01.
+     */
+    public float leakySlope  = 0.1f;
+
+    /** Accumulation buffer peak detection threshold for Stage 2 trigger. */
+    public float peakThresh  = 0.5f;
+
+    public RtParams() {}
+
+    public RtParams(int maxVel, float memCoeff, float obsCoeff,
+                    float leakySlope, float peakThresh) {
+        this.maxVel     = maxVel;
+        this.memCoeff   = memCoeff;
+        this.obsCoeff   = obsCoeff;
+        this.leakySlope = leakySlope;
+        this.peakThresh = peakThresh;
+    }
+
+    /** Number of velocity channels per axis: 2*maxVel+1. */
+    public int nVel()  { return 2 * maxVel + 1; }
+
+    /** Total 2D velocity channels: nVel()^2. */
+    public int nChan() { int n = nVel(); return n * n; }
+
+    /**
+     * Flat channel index for velocity (ivx, ivy), both in [-maxVel..+maxVel].
+     * ivy is the major index (row), ivx is the minor index (column).
+     */
+    public int chanIdx(int ivx, int ivy) {
+        return (ivy + maxVel) * nVel() + (ivx + maxVel);
+    }
+
+    /** Recover ivx from channel index. */
+    public int ivxFromChan(int ich) { return (ich % nVel()) - maxVel; }
+
+    /** Recover ivy from channel index. */
+    public int ivyFromChan(int ich) { return (ich / nVel()) - maxVel; }
+
+    @Override
+    public String toString() {
+        return String.format(
+            "RtParams{maxVel=%d, nChan=%d, mem=%.3f, obs=%.3f, leaky=%.3f, thresh=%.3f}",
+            maxVel, nChan(), memCoeff, obsCoeff, leakySlope, peakThresh);
+    }
+}
--- a/src/main/java/com/elphel/imagej/cuas/rt/package-info.java
+++ b/src/main/java/com/elphel/imagej/cuas/rt/package-info.java
+/**
+ * Real-time CUAS target detector — skeleton package.
+ *
+ * Design discussion: attic/imagej-elphel-internal/handoffs/2026-05-21_RealTimeDesign.md
+ *
+ * Contents:
+ *   RtParams         — runtime parameters (mirrors RtParams C struct in libcuasrt.so)
+ *   CuasRTDetector   — pure-Java prototype; reference implementation for tuning
+ *   CuasRTLib        — JNA binding stub (activate when JNA added to pom.xml)
+ *
+ * Implementation order (next week):
+ *   1. Tune CuasRTDetector against real data at a single pyramid level
+ *   2. Port to C++/CUDA in cuas_rt_gpu, validate numerically against Java
+ *   3. Wire CuasRTLib JNA binding, run Java → libcuasrt.so end-to-end
+ *   4. Extend to full temporal pyramid
+ */
+package com.elphel.imagej.cuas.rt;