Commit b70571d6 authored by Andrey Filippov

CLAUDE: Initial cuas_rt_gpu skeleton (libcuasrt.so)

parents
/**
** rt_defines.h
**
** Copyright (C) 2026 Elphel, Inc.
**
** This file is part of cuas_rt_gpu (libcuasrt.so).
** Analogous to tp_defines.h in tile_processor_gpu.
**
** Unlike tp_defines.h, there is no JCUDA guard here: libcuasrt.so is
** always pre-compiled with nvcc (not NVRTC). Parameters that vary at
** runtime are passed via RtParams (rt_lib.h), not via #define.
**
** Only structural constants that affect kernel shared-memory layout
** or loop bounds that MUST be compile-time constants live here.
**
** -----------------------------------------------------------------------------
**
** rt_defines.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_DEFINES_H
#define RT_DEFINES_H
// Compile-time structural constants for libcuasrt.so. Values that can change
// per call travel in RtParams (rt_lib.h); only sizes that fix buffer capacity
// or kernel layout are defined here.

// Spatial kernel half-size for Layer 1 matched filter (3x3 neighborhood → half=1),
// i.e. the neighborhood side is 2*RT_SPATIAL_HALF+1.
#define RT_SPATIAL_HALF 1
// Maximum runtime max_vel supported by pre-compiled kernels.
// Must be >= any value passed in RtParams.max_vel at runtime.
// Determines shared-memory allocation upper bound.
// With n_chan = (2*max_vel+1)^2 this caps the channel count at 49.
// Rebuild the library if you need a larger value.
#define RT_MAX_VEL_COMPILE 3
// Maximum number of input frames per Layer 1 call (NFRAMES_MAX in RtDetector).
// RtDetector sizes its device frame buffer as [RT_NFRAMES_MAX * n_pixels],
// so nframes passed to cuasrt_layer1 must not exceed this.
// Increase if pyramid levels need longer windows.
#define RT_NFRAMES_MAX 16
// Maximum peaks returned per updateAccum call (d_peaks_xy size).
// The device buffer holds RT_MAX_PEAKS (x, y) pairs, i.e. RT_MAX_PEAKS * 2 floats.
#define RT_MAX_PEAKS 1024
// CUDA thread-block tile size for 2D kernels (Layer 1, accumulation update).
// NOTE(review): presumably launched as RT_BLOCK_X x RT_BLOCK_Y threads per block —
// confirm against the launch configuration in rt_detector.cu.
#define RT_BLOCK_X 16
#define RT_BLOCK_Y 16
// stdio.h is pulled in only when the build defines RT_HAS_PRINTF.
#ifdef RT_HAS_PRINTF
#include <stdio.h>
#endif
#endif // RT_DEFINES_H
This diff is collapsed.
/**
** rt_detector.h — internal C++ class managing GPU state for one detector instance.
**
** Copyright (C) 2026 Elphel, Inc.
**
** Not part of the extern "C" API; used only within libcuasrt.so.
** Analogous to TpHostGpu in tile_processor_gpu.
**
** Each method corresponds to a static method in CuasRTDetector.java.
** Validate numerical agreement between Java prototype and C++ implementation
** before tuning parameters.
**
** -----------------------------------------------------------------------------
**
** rt_detector.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_DETECTOR_H
#define RT_DETECTOR_H
#include "rt_defines.h"
#include "rt_lib.h"
#include <cuda_runtime.h>
/**
 * Internal detector state: the current parameters plus the device buffers
 * used by the Layer 1 filter and the accumulation/peak-detection step.
 *
 * The object owns raw device pointers that are released through freeGpu(),
 * so it is deliberately non-copyable: an implicit member-wise copy would
 * leave two objects holding the same pointers and free them twice.
 */
class RtDetector {
public:
    /**
     * Create a detector for the given parameters.
     * p: parameters; cuasrt_create fills in width/height before calling this.
     */
    explicit RtDetector(const RtParams& p);
    ~RtDetector();

    // Sole owner of the device buffers below — copying is forbidden.
    RtDetector(const RtDetector&) = delete;
    RtDetector& operator=(const RtDetector&) = delete;

    /**
     * Layer 1 matched filter.
     * h_fpixels: host [nframes * width * height]
     * nframes: number of input frames (<= RT_NFRAMES_MAX)
     * h_layer1_out: host [n_chan * width * height] output
     */
    void runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out);

    /**
     * Update accumulation buffers; detect and return peaks.
     * h_layer1_out: host [n_chan * width * height] from runLayer1
     * h_accum_buf: host [n_chan * width * height] persistent state (in/out, caller owns)
     * p: current parameters (may differ from construction-time params)
     * h_peaks_xy: output [max_peaks * 2]
     * max_peaks: output capacity
     * Returns: number of peaks found
     */
    int updateAccum(const float* h_layer1_out,
                    float* h_accum_buf,
                    const RtParams& p,
                    float* h_peaks_xy,
                    int max_peaks);

private:
    RtParams m_p;
    int m_n_chan = 0;   // (2*max_vel+1)^2
    int m_n_pixels = 0; // width * height

    // GPU memory (owned by this object)
    float* d_fpixels = nullptr;    // [RT_NFRAMES_MAX * n_pixels]
    float* d_layer1_out = nullptr; // [n_chan * n_pixels]
    float* d_accum_chan = nullptr; // [n_pixels] one channel at a time (upload/update/download)
    float* d_proj = nullptr;       // [n_pixels] cross-channel max projection
    float* d_peaks_xy = nullptr;   // [RT_MAX_PEAKS * 2]
    int* d_num_peaks = nullptr;    // single int (atomic counter in k_find_peaks)

    void allocGpu();
    void freeGpu();
};
#endif // RT_DETECTOR_H
/**
** rt_lib.cu — extern "C" wrappers around RtDetector.
**
** Copyright (C) 2026 Elphel, Inc.
**
** This is the JNA / C++ API surface. All CUDA work is in rt_detector.cu.
** Keep this file thin: just parameter validation and delegation to RtDetector.
**
** -----------------------------------------------------------------------------
**
** rt_lib.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#include "rt_lib.h"
#include "rt_detector.h"
#include <cstdio>
extern "C" {
void* cuasrt_create(int width, int height, const RtParams* p) {
if (!p) { fprintf(stderr, "cuasrt_create: null params\n"); return nullptr; }
RtParams rp = *p;
rp.width = width;
rp.height = height;
return new RtDetector(rp);
}
void cuasrt_destroy(void* ctx) {
delete static_cast<RtDetector*>(ctx);
}
void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out) {
if (!ctx || !fpixels || !layer1_out) return;
static_cast<RtDetector*>(ctx)->runLayer1(fpixels, nframes, layer1_out);
}
int cuasrt_update_accum(void* ctx,
const float* layer1_out, float* accum_buf,
const RtParams* p, float* peaks_xy, int max_peaks) {
if (!ctx || !layer1_out || !accum_buf || !p || !peaks_xy) return 0;
return static_cast<RtDetector*>(ctx)->updateAccum(layer1_out, accum_buf,
*p, peaks_xy, max_peaks);
}
} // extern "C"
/**
** rt_lib.h — extern "C" public API for libcuasrt.so
**
** Copyright (C) 2026 Elphel, Inc.
**
** Called from:
** Java — via JNA (com.elphel.imagej.cuas.rt.CuasRTLib)
** C++ — directly from standalone test/debug programs
** Jetson — C++ production binary (same API, ARM build of the .so)
**
** Mirrors Java class RtParams (com.elphel.imagej.cuas.rt.RtParams).
** Keep field names and order in sync with the JNA Structure mapping in
** CuasRTLib.java (RtParamsJna extends com.sun.jna.Structure).
**
** -----------------------------------------------------------------------------
**
** rt_lib.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_LIB_H
#define RT_LIB_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* Runtime parameters for the RT detector.
* Passed by pointer; the library copies the struct internally.
*
* IMPORTANT: field order must match RtParamsJna.getFieldOrder() in CuasRTLib.java.
*/
// Plain C struct shared with the Java side through JNA: do not reorder, insert
// or retype fields without updating RtParamsJna.getFieldOrder() in CuasRTLib.java.
typedef struct {
// Number of velocity channels is n_chan = (2*max_vel+1)^2.
// Must not exceed RT_MAX_VEL_COMPILE (rt_defines.h).
int max_vel; // velocity range [-max_vel .. +max_vel] px/sample; try 1, 2, 3
// cuasrt_create overwrites width/height with its own arguments.
// NOTE(review): cuasrt_update_accum forwards the caller's struct unchanged —
// keep these equal to the create-time size unless updateAccum is confirmed to ignore them.
int width; // image width in pixels
int height; // image height in pixels
// mem_coeff and obs_coeff are independent fields; the caller supplies both.
float mem_coeff; // accumulation memory coefficient (e.g. 0.9)
float obs_coeff; // observation weight (e.g. 0.1 = 1 - mem_coeff)
float leaky_slope; // leaky ReLU negative slope: 1.0=linear, 0.01=near-hard
float peak_thresh; // accumulation buffer peak detection threshold
} RtParams;
/**
* Allocate GPU memory and initialize CUDA context for the RT detector.
*
* width, height: image dimensions (must match fpixels layout in cuasrt_layer1).
* p: initial parameters (copied; can be changed per-call in cuasrt_update_accum).
*
* Returns opaque context handle; pass to all subsequent calls.
* Returns NULL on CUDA initialization failure.
*/
void* cuasrt_create(int width, int height, const RtParams* p);
/**
* Release all GPU memory and destroy the CUDA context.
* ctx becomes invalid after this call.
*/
void cuasrt_destroy(void* ctx);
/**
* Layer 1: spatio-temporal matched filter over one input window.
*
* fpixels: host float array, layout [nframes][height][width], row-major.
* Must be the already temporally averaged/decimated frames for
* this pyramid level. nframes <= RT_NFRAMES_MAX (rt_defines.h).
*
* nframes: number of frames in this window. Center frame is nframes/2.
* For 50% overlap mode: stride = nframes/2 between consecutive calls.
*
* layer1_out: host float array, size n_chan * width * height.
* n_chan = (2*max_vel+1)^2.
* Channel order: ivy major, ivx minor, both offset by max_vel.
* Same indexing as RtParams.chanIdx(ivx,ivy) in Java.
*
* Kernel: k_layer1_channel launched n_chan times (one per velocity channel).
* All launches are synchronous; function returns when GPU work is complete.
*/
void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out);
/**
* Layer 2: update accumulation buffers and detect Stage 2 trigger candidates.
*
* layer1_out: [n_chan * width * height] from cuasrt_layer1 (host memory, read-only).
* accum_buf: [n_chan * width * height] persistent accumulation state (host memory,
* in/out — caller owns and preserves between calls).
* Initialize to zeros before first call (e.g. calloc).
* p: current RtParams (may differ from cuasrt_create params for tuning).
* peaks_xy: output buffer for detected peak coordinates, host memory.
* Layout: [x0, y0, x1, y1, ...]. Must be >= max_peaks * 2 floats.
* max_peaks: capacity of peaks_xy in (x, y) pairs, i.e. its length in floats divided by 2.
*
* Returns number of peaks found (0 .. max_peaks).
*
* Sequence per channel:
* 1. Upload accum_buf channel to GPU
* 2. k_update_accum_channel: shift + mem*accum + obs*layer1 + leakyReLU
* 3. Download updated channel back to accum_buf
* Then: max-project across channels, run k_find_peaks, download results.
*/
int cuasrt_update_accum(void* ctx,
const float* layer1_out,
float* accum_buf,
const RtParams* p,
float* peaks_xy,
int max_peaks);
#ifdef __cplusplus
}
#endif
#endif // RT_LIB_H
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment