CLAUDE: Initial cuas_rt_gpu skeleton (libcuasrt.so)

b70571d6 · Andrey Filippov · b70571d6 · b70571d6 · b70571d6 · b70571d6
Commit b70571d6 authored May 21, 2026 by Andrey Filippov
Showing with 304 additions and 0 deletions

rt_defines.h src/rt_defines.h +61 -0

rt_detector.cu src/rt_detector.cu +0 -0

rt_detector.h src/rt_detector.h +76 -0

rt_lib.cu src/rt_lib.cu +50 -0

rt_lib.h src/rt_lib.h +117 -0

No files found.
--- a/src/rt_defines.h
+++ b/src/rt_defines.h
+/**
+ ** rt_defines.h
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** This file is part of cuas_rt_gpu (libcuasrt.so).
+ ** Analogous to tp_defines.h in tile_processor_gpu.
+ **
+ ** Unlike tp_defines.h, there is no JCUDA guard here: libcuasrt.so is
+ ** always pre-compiled with nvcc (not NVRTC).  Parameters that vary at
+ ** runtime are passed via RtParams (rt_lib.h), not via #define.
+ **
+ ** Only structural constants that affect kernel shared-memory layout
+ ** or loop bounds that MUST be compile-time constants live here.
+ **
+ ** -----------------------------------------------------------------------------
+ **
+ **  rt_defines.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ ** -----------------------------------------------------------------------------
+ */
+#pragma once
+#ifndef RT_DEFINES_H
+#define RT_DEFINES_H
+// Spatial kernel half-size for the Layer 1 matched filter.
+// A 3x3 neighborhood → half = 1 (full kernel side = 2 * RT_SPATIAL_HALF + 1).
+#define RT_SPATIAL_HALF         1
+// Largest RtParams.max_vel the pre-compiled kernels support.
+// Must be >= any value passed in RtParams.max_vel at runtime.
+// Determines the shared-memory allocation upper bound.
+// Rebuild the library if you need a larger value.
+#define RT_MAX_VEL_COMPILE      3
+// Maximum number of input frames per Layer 1 call (NFRAMES_MAX in RtDetector).
+// RtDetector::d_fpixels is sized for RT_NFRAMES_MAX frames, so nframes must not exceed it.
+// Increase if pyramid levels need longer windows.
+#define RT_NFRAMES_MAX         16
+// Maximum number of peaks returned per updateAccum call.
+// RtDetector::d_peaks_xy holds RT_MAX_PEAKS (x, y) pairs, i.e. RT_MAX_PEAKS * 2 floats.
+#define RT_MAX_PEAKS         1024
+// CUDA thread-block tile size for 2D kernels (Layer 1, accumulation update):
+// RT_BLOCK_X * RT_BLOCK_Y = 256 threads per block.
+#define RT_BLOCK_X             16
+#define RT_BLOCK_Y             16
+#ifdef RT_HAS_PRINTF
+#include <stdio.h>
+#endif
+#endif // RT_DEFINES_H
--- a/src/rt_detector.cu
+++ b/src/rt_detector.cu
--- a/src/rt_detector.h
+++ b/src/rt_detector.h
+/**
+ ** rt_detector.h — internal C++ class managing GPU state for one detector instance.
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** Not part of the extern "C" API; used only within libcuasrt.so.
+ ** Analogous to TpHostGpu in tile_processor_gpu.
+ **
+ ** Each method corresponds to a static method in CuasRTDetector.java.
+ ** Validate numerical agreement between Java prototype and C++ implementation
+ ** before tuning parameters.
+ **
+ ** -----------------------------------------------------------------------------
+ **
+ **  rt_detector.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ ** -----------------------------------------------------------------------------
+ */
+#pragma once
+#ifndef RT_DETECTOR_H
+#define RT_DETECTOR_H
+#include "rt_defines.h"
+#include "rt_lib.h"
+#include <cuda_runtime.h>
+/**
+ * GPU state and processing entry points for one detector instance.
+ *
+ * The object owns raw device pointers (see allocGpu() / freeGpu()), so it is
+ * deliberately non-copyable: an implicit copy would share the same device
+ * pointers with the original and both objects would try to release them.
+ */
+class RtDetector {
+public:
+    explicit RtDetector(const RtParams& p);
+    ~RtDetector();
+    // Owning raw device pointers: forbid copying to prevent double release.
+    RtDetector(const RtDetector&)            = delete;
+    RtDetector& operator=(const RtDetector&) = delete;
+    /**
+     * Layer 1 matched filter.
+     * h_fpixels:    host [nframes * width * height]
+     * nframes:      number of input frames (<= RT_NFRAMES_MAX)
+     * h_layer1_out: host [n_chan * width * height] output
+     */
+    void runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out);
+    /**
+     * Update accumulation buffers; detect and return peaks.
+     * h_layer1_out: host [n_chan * width * height] from runLayer1
+     * h_accum_buf:  host [n_chan * width * height] persistent state (in/out, caller owns)
+     * p:            current parameters (may differ from construction-time params)
+     * h_peaks_xy:   output [max_peaks * 2]
+     * max_peaks:    output capacity in (x, y) pairs; the device-side buffer
+     *               d_peaks_xy holds at most RT_MAX_PEAKS pairs
+     * Returns:      number of peaks found
+     */
+    int  updateAccum(const float* h_layer1_out,
+                     float*       h_accum_buf,
+                     const RtParams& p,
+                     float*       h_peaks_xy,
+                     int          max_peaks);
+private:
+    RtParams m_p;             // parameters supplied at construction
+    int      m_n_chan   = 0;  // (2*max_vel+1)^2
+    int      m_n_pixels = 0;  // width * height
+    // GPU memory (owned by this object)
+    float* d_fpixels    = nullptr;  // [RT_NFRAMES_MAX * n_pixels]
+    float* d_layer1_out = nullptr;  // [n_chan * n_pixels]
+    float* d_accum_chan = nullptr;  // [n_pixels] one channel at a time (upload/update/download)
+    float* d_proj       = nullptr;  // [n_pixels] cross-channel max projection
+    float* d_peaks_xy   = nullptr;  // [RT_MAX_PEAKS * 2]
+    int*   d_num_peaks  = nullptr;  // single int (atomic counter in k_find_peaks)
+    // Allocate / release the device buffers listed above.
+    // NOTE(review): presumably called from the constructor / destructor — confirm in rt_detector.cu.
+    void allocGpu();
+    void freeGpu();
+};
+#endif // RT_DETECTOR_H
--- a/src/rt_lib.cu
+++ b/src/rt_lib.cu
+/**
+ ** rt_lib.cu — extern "C" wrappers around RtDetector.
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** This is the JNA / C++ API surface.  All CUDA work is in rt_detector.cu.
+ ** Keep this file thin: just parameter validation and delegation to RtDetector.
+ **
+ ** -----------------------------------------------------------------------------
+ **
+ **  rt_lib.cu is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ ** -----------------------------------------------------------------------------
+ */
+#include "rt_lib.h"
+#include "rt_detector.h"
+#include <cstdio>
+extern "C" {
+/**
+ * Create a detector for width x height images.
+ *
+ * The width/height arguments override p->width / p->height in the internal copy
+ * of the parameters.
+ *
+ * Returns an opaque RtDetector handle, or nullptr (with a message on stderr) if
+ * p is null, the image size is not positive, max_vel is outside
+ * [0 .. RT_MAX_VEL_COMPILE], or detector construction throws.
+ */
+void* cuasrt_create(int width, int height, const RtParams* p) {
+    if (!p) { fprintf(stderr, "cuasrt_create: null params\n"); return nullptr; }
+    if (width <= 0 || height <= 0) {
+        fprintf(stderr, "cuasrt_create: invalid image size %d x %d\n", width, height);
+        return nullptr;
+    }
+    // Pre-compiled kernels are built for at most RT_MAX_VEL_COMPILE (rt_defines.h).
+    if (p->max_vel < 0 || p->max_vel > RT_MAX_VEL_COMPILE) {
+        fprintf(stderr, "cuasrt_create: max_vel=%d out of range [0..%d]\n",
+                p->max_vel, RT_MAX_VEL_COMPILE);
+        return nullptr;
+    }
+    RtParams rp  = *p;
+    rp.width     = width;
+    rp.height    = height;
+    // A C++ exception must never propagate through the extern "C" boundary
+    // (callers are C code or JNA); report failure as a null handle instead.
+    try {
+        return new RtDetector(rp);
+    } catch (...) {
+        fprintf(stderr, "cuasrt_create: detector construction failed\n");
+        return nullptr;
+    }
+}
+// Destroy a detector created by cuasrt_create. A null ctx is a no-op.
+void cuasrt_destroy(void* ctx) {
+    RtDetector* detector = static_cast<RtDetector*>(ctx);
+    delete detector;
+}
+/**
+ * Validate arguments and delegate to RtDetector::runLayer1.
+ *
+ * Returns without doing anything if any pointer is null. Also returns (with a
+ * message on stderr) if nframes is outside [1 .. RT_NFRAMES_MAX]: the detector's
+ * device frame buffer is sized for RT_NFRAMES_MAX frames.
+ */
+void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out) {
+    if (!ctx || !fpixels || !layer1_out) return;
+    if (nframes < 1 || nframes > RT_NFRAMES_MAX) {
+        fprintf(stderr, "cuasrt_layer1: nframes=%d out of range [1..%d]\n",
+                nframes, RT_NFRAMES_MAX);
+        return;
+    }
+    // Keep C++ exceptions from crossing the extern "C" boundary.
+    try {
+        static_cast<RtDetector*>(ctx)->runLayer1(fpixels, nframes, layer1_out);
+    } catch (...) {
+        fprintf(stderr, "cuasrt_layer1: runLayer1 failed\n");
+    }
+}
+/**
+ * Validate arguments and delegate to RtDetector::updateAccum.
+ *
+ * Returns 0 if any pointer is null, if max_peaks is negative, if p->max_vel is
+ * outside [0 .. RT_MAX_VEL_COMPILE], or if the detector throws.
+ * max_peaks is clamped to RT_MAX_PEAKS, the capacity of the detector's
+ * device-side peak buffer, so the result is in 0 .. min(max_peaks, RT_MAX_PEAKS).
+ */
+int cuasrt_update_accum(void* ctx,
+                         const float* layer1_out, float* accum_buf,
+                         const RtParams* p, float* peaks_xy, int max_peaks) {
+    if (!ctx || !layer1_out || !accum_buf || !p || !peaks_xy) return 0;
+    if (max_peaks < 0) {
+        fprintf(stderr, "cuasrt_update_accum: negative max_peaks=%d\n", max_peaks);
+        return 0;
+    }
+    // Never ask for more peaks than the device buffer can hold.
+    if (max_peaks > RT_MAX_PEAKS) max_peaks = RT_MAX_PEAKS;
+    // Pre-compiled kernels are built for at most RT_MAX_VEL_COMPILE (rt_defines.h).
+    if (p->max_vel < 0 || p->max_vel > RT_MAX_VEL_COMPILE) {
+        fprintf(stderr, "cuasrt_update_accum: max_vel=%d out of range [0..%d]\n",
+                p->max_vel, RT_MAX_VEL_COMPILE);
+        return 0;
+    }
+    // Keep C++ exceptions from crossing the extern "C" boundary.
+    try {
+        return static_cast<RtDetector*>(ctx)->updateAccum(layer1_out, accum_buf,
+                                                           *p, peaks_xy, max_peaks);
+    } catch (...) {
+        fprintf(stderr, "cuasrt_update_accum: updateAccum failed\n");
+        return 0;
+    }
+}
+} // extern "C"
--- a/src/rt_lib.h
+++ b/src/rt_lib.h
+/**
+ ** rt_lib.h — extern "C" public API for libcuasrt.so
+ **
+ ** Copyright (C) 2026 Elphel, Inc.
+ **
+ ** Called from:
+ **   Java    — via JNA (com.elphel.imagej.cuas.rt.CuasRTLib)
+ **   C++     — directly from standalone test/debug programs
+ **   Jetson  — C++ production binary (same API, ARM build of the .so)
+ **
+ ** Mirrors Java class RtParams (com.elphel.imagej.cuas.rt.RtParams).
+ ** Keep field names and order in sync with the JNA Structure mapping in
+ ** CuasRTLib.java (RtParamsJna extends com.sun.jna.Structure).
+ **
+ ** -----------------------------------------------------------------------------
+ **
+ **  rt_lib.h is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ ** -----------------------------------------------------------------------------
+ */
+#pragma once
+#ifndef RT_LIB_H
+#define RT_LIB_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * Runtime parameters for the RT detector.
+ * Passed by pointer; the library copies the struct internally.
+ *
+ * IMPORTANT: field order must match RtParamsJna.getFieldOrder() in CuasRTLib.java.
+ * Do not reorder, insert or remove fields without updating the Java mapping.
+ *
+ * The number of velocity channels derived from max_vel is
+ * n_chan = (2*max_vel+1)^2.
+ */
+typedef struct {
+    int   max_vel;       // velocity range [-max_vel .. +max_vel] px/sample; try 1, 2, 3
+                         // (must not exceed RT_MAX_VEL_COMPILE in rt_defines.h)
+    int   width;         // image width in pixels (overwritten by the width argument of cuasrt_create)
+    int   height;        // image height in pixels (overwritten by the height argument of cuasrt_create)
+    float mem_coeff;     // accumulation memory coefficient (e.g. 0.9)
+    float obs_coeff;     // observation weight (e.g. 0.1 = 1 - mem_coeff)
+    float leaky_slope;   // leaky ReLU negative slope: 1.0=linear, 0.01=near-hard
+    float peak_thresh;   // accumulation buffer peak detection threshold
+} RtParams;
+/**
+ * Allocate GPU memory and initialize CUDA context for the RT detector.
+ *
+ * width, height: image dimensions (must match fpixels layout in cuasrt_layer1).
+ *                These arguments replace p->width / p->height in the library's
+ *                internal copy of the parameters.
+ * p: initial parameters (copied; can be changed per-call in cuasrt_update_accum).
+ *
+ * Returns opaque context handle; pass to all subsequent calls and release it
+ * with cuasrt_destroy.
+ * Returns NULL if p is NULL (a message is printed to stderr) or on CUDA
+ * initialization failure.
+ */
+void* cuasrt_create(int width, int height, const RtParams* p);
+/**
+ * Release all GPU memory and destroy the CUDA context.
+ * ctx becomes invalid after this call.
+ * Passing NULL is allowed and does nothing.
+ */
+void  cuasrt_destroy(void* ctx);
+/**
+ * Layer 1: spatio-temporal matched filter over one input window.
+ *
+ * ctx:        handle returned by cuasrt_create.
+ *
+ * fpixels:    host float array, layout [nframes][height][width], row-major.
+ *             Must be the already temporally averaged/decimated frames for
+ *             this pyramid level.  nframes <= RT_NFRAMES_MAX (rt_defines.h).
+ *
+ * nframes:    number of frames in this window.  Center frame is nframes/2.
+ *             For 50% overlap mode: stride = nframes/2 between consecutive calls.
+ *
+ * layer1_out: host float array, size n_chan * width * height.
+ *             n_chan = (2*max_vel+1)^2.
+ *             Channel order: ivy major, ivx minor, both offset by max_vel.
+ *             Same indexing as RtParams.chanIdx(ivx,ivy) in Java.
+ *
+ * If ctx, fpixels or layer1_out is NULL the call returns immediately without
+ * touching layer1_out; no error is reported to the caller.
+ *
+ * Kernel: k_layer1_channel launched n_chan times (one per velocity channel).
+ * All launches are synchronous; function returns when GPU work is complete.
+ */
+void  cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out);
+/**
+ * Layer 2: update accumulation buffers and detect Stage 2 trigger candidates.
+ *
+ * ctx:        handle returned by cuasrt_create.
+ * layer1_out: [n_chan * width * height] from cuasrt_layer1 (host memory, read-only).
+ * accum_buf:  [n_chan * width * height] persistent accumulation state (host memory,
+ *             in/out — caller owns and preserves between calls).
+ *             Initialize to zeros before first call (e.g. calloc).
+ * p:          current RtParams (may differ from cuasrt_create params for tuning).
+ * peaks_xy:   output buffer for detected peak coordinates, host memory.
+ *             Layout: [x0, y0, x1, y1, ...].  Must be >= max_peaks * 2 floats.
+ * max_peaks:  capacity of peaks_xy in (x, y) pairs, i.e. its length in floats / 2.
+ *             NOTE(review): the device-side peak buffer holds RT_MAX_PEAKS pairs
+ *             (rt_defines.h); presumably max_peaks should not exceed it — confirm.
+ *
+ * Returns number of peaks found (0 .. max_peaks).
+ * Also returns 0, without doing any work, if ctx, layer1_out, accum_buf, p or
+ * peaks_xy is NULL; this is indistinguishable from "no peaks found".
+ *
+ * Sequence per channel:
+ *   1. Upload accum_buf channel to GPU
+ *   2. k_update_accum_channel: shift + mem*accum + obs*layer1 + leakyReLU
+ *   3. Download updated channel back to accum_buf
+ * Then: max-project across channels, run k_find_peaks, download results.
+ */
+int   cuasrt_update_accum(void*          ctx,
+                           const float*   layer1_out,
+                           float*         accum_buf,
+                           const RtParams* p,
+                           float*         peaks_xy,
+                           int            max_peaks);
+#ifdef __cplusplus
+}
+#endif
+#endif // RT_LIB_H