Commit b70571d6 authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Initial cuas_rt_gpu skeleton (libcuasrt.so)

parents
/**
** rt_defines.h
**
** Copyright (C) 2026 Elphel, Inc.
**
** This file is part of cuas_rt_gpu (libcuasrt.so).
** Analogous to tp_defines.h in tile_processor_gpu.
**
** Unlike tp_defines.h, there is no JCUDA guard here: libcuasrt.so is
** always pre-compiled with nvcc (not NVRTC). Parameters that vary at
** runtime are passed via RtParams (rt_lib.h), not via #define.
**
** Only structural constants that affect kernel shared-memory layout
** or loop bounds that MUST be compile-time constants live here.
**
** -----------------------------------------------------------------------------
**
** rt_defines.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_DEFINES_H
#define RT_DEFINES_H
// Spatial kernel half-size for Layer 1 matched filter (3x3 neighborhood → half=1)
#define RT_SPATIAL_HALF 1
// Maximum runtime max_vel supported by pre-compiled kernels.
// Must be >= any value passed in RtParams.max_vel at runtime.
// Determines shared-memory allocation upper bound.
// Rebuild the library if you need a larger value.
#define RT_MAX_VEL_COMPILE 3
// Maximum number of input frames per Layer 1 call (NFRAMES_MAX in RtDetector).
// Increase if pyramid levels need longer windows.
#define RT_NFRAMES_MAX 16
// Maximum peaks returned per updateAccum call (d_peaks_xy size).
#define RT_MAX_PEAKS 1024
// CUDA thread-block tile size for 2D kernels (Layer 1, accumulation update).
#define RT_BLOCK_X 16
#define RT_BLOCK_Y 16
#ifdef RT_HAS_PRINTF
#include <stdio.h>
#endif
#endif // RT_DEFINES_H
/**
** rt_detector.cu — CUDA kernels and RtDetector class implementation.
**
** Copyright (C) 2026 Elphel, Inc.
**
** Kernel ↔ Java method correspondence (validate numerical agreement):
** k_layer1_channel ↔ CuasRTDetector.layer1Channel()
** k_update_accum_channel ↔ CuasRTDetector.updateAccumChannel()
** k_max_projection ↔ CuasRTDetector.maxProjection()
** k_find_peaks ↔ CuasRTDetector.findPeaks()
**
** Build: see Debug/subdir.mk (Eclipse CDT), target libcuasrt.so.
** nvcc -arch=sm_75 -fPIC --cudart=shared ...
**
** -----------------------------------------------------------------------------
**
** rt_detector.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#include "rt_detector.h"
#include <helper_cuda.h> // checkCudaErrors — from CUDA samples
#include <cstdio>
#include <cstring>
// ============================================================================
// CUDA kernels
// ============================================================================
/**
* Layer 1 matched filter for one velocity channel (ivx, ivy).
* Thread: one output pixel (x, y).
* Sums a (2*RT_SPATIAL_HALF+1)^2 spatial neighborhood across nframes,
* with each frame t spatially offset by (ivx*dt, ivy*dt), dt = t - nframes/2.
* Result is added into out[] (not cleared here; caller clears before looping channels).
*/
__global__ void k_layer1_channel(
const float* __restrict__ fpixels, // [nframes * width * height]
float* out, // [width * height]
int width, int height, int nframes,
int ivx, int ivy)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) return;
int tMid = nframes / 2;
float sum = 0.0f;
for (int t = 0; t < nframes; t++) {
int dt = t - tMid;
int sx = x - ivx * dt;
int sy = y - ivy * dt;
for (int dy = -RT_SPATIAL_HALF; dy <= RT_SPATIAL_HALF; dy++) {
for (int dx = -RT_SPATIAL_HALF; dx <= RT_SPATIAL_HALF; dx++) {
int px = sx + dx, py = sy + dy;
// unsigned comparison handles both < 0 and >= dim in one branch
if ((unsigned)px < (unsigned)width && (unsigned)py < (unsigned)height)
sum += fpixels[t * width * height + py * width + px];
}
}
}
out[y * width + x] += sum;
}
/**
* Per-channel accumulation update.
* Combines shift + integrate + leakyReLU in one kernel to avoid extra passes.
* Thread: one output pixel (x, y).
*
* accum_in: shifted accumulation (shift is done by reading from (x-ivx, y-ivy))
* layer1: new observation for this channel
* accum_out: mem_coeff * accum_shifted + obs_coeff * layer1, then leakyReLU
*
* TODO: for sub-pixel (half-pixel) shift, replace integer offset with
* bilinear interpolation: accum_in read from 4 neighbors with lerp weights.
*/
__global__ void k_update_accum_channel(
const float* __restrict__ accum_in, // [width * height] previous state
const float* __restrict__ layer1, // [width * height] new observation
float* accum_out, // [width * height] updated state
int width, int height,
int ivx, int ivy,
float mem_coeff, float obs_coeff, float leaky_slope)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= width || y >= height) return;
// Shifted read: pixel that "was" at (x-ivx, y-ivy) now maps to (x,y)
int sx = x - ivx, sy = y - ivy;
float prev = 0.0f;
if ((unsigned)sx < (unsigned)width && (unsigned)sy < (unsigned)height)
prev = accum_in[sy * width + sx];
float v = mem_coeff * prev + obs_coeff * layer1[y * width + x];
if (v < 0.0f) v *= leaky_slope;
accum_out[y * width + x] = v;
}
/**
* Cross-channel max projection: for each pixel, take max across all n_chan channels.
* Thread: one pixel. Reads accum_flat[n_chan * n_pixels] laid out channel-major.
*/
__global__ void k_max_projection(
const float* __restrict__ accum_flat, // [n_chan * width * height]
float* proj, // [width * height]
int n_pixels, int n_chan)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n_pixels) return;
float mx = 0.0f;
for (int c = 0; c < n_chan; c++) {
float v = accum_flat[c * n_pixels + i];
if (v > mx) mx = v;
}
proj[i] = mx;
}
/**
* 3x3 non-maximum suppression peak detection.
* Thread: one pixel. Border pixels (1-pixel margin) are skipped.
* A pixel is a peak if it is >= thresh AND strictly greater than all 8 neighbors.
* Detected peaks are appended atomically to peaks_xy; num_peaks is an atomic counter.
*/
__global__ void k_find_peaks(
const float* __restrict__ proj, // [width * height]
float* peaks_xy, // [max_peaks * 2]
int* num_peaks, // atomic counter, init to 0
int width, int height,
float thresh, int max_peaks)
{
int x = blockIdx.x * blockDim.x + threadIdx.x + 1; // skip left border
int y = blockIdx.y * blockDim.y + threadIdx.y + 1; // skip top border
if (x >= width - 1 || y >= height - 1) return;
float v = proj[y * width + x];
if (v < thresh) return;
// 3x3 NMS
for (int dy = -1; dy <= 1; dy++) {
for (int dx = -1; dx <= 1; dx++) {
if (dx == 0 && dy == 0) continue;
if (proj[(y + dy) * width + (x + dx)] >= v) return;
}
}
// v is a local maximum: claim a slot atomically
int slot = atomicAdd(num_peaks, 1);
if (slot < max_peaks) {
peaks_xy[slot * 2 ] = (float)x;
peaks_xy[slot * 2 + 1] = (float)y;
}
}
// ============================================================================
// RtDetector class implementation
// ============================================================================
RtDetector::RtDetector(const RtParams& p) : m_p(p) {
int nv = 2 * p.max_vel + 1;
m_n_chan = nv * nv;
m_n_pixels = p.width * p.height;
allocGpu();
}
RtDetector::~RtDetector() {
freeGpu();
}
void RtDetector::allocGpu() {
checkCudaErrors(cudaMalloc(&d_fpixels, RT_NFRAMES_MAX * m_n_pixels * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_layer1_out, m_n_chan * m_n_pixels * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_accum_chan, m_n_pixels * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_proj, m_n_pixels * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_peaks_xy, RT_MAX_PEAKS * 2 * sizeof(float)));
checkCudaErrors(cudaMalloc(&d_num_peaks, sizeof(int)));
}
void RtDetector::freeGpu() {
cudaFree(d_fpixels); d_fpixels = nullptr;
cudaFree(d_layer1_out); d_layer1_out = nullptr;
cudaFree(d_accum_chan); d_accum_chan = nullptr;
cudaFree(d_proj); d_proj = nullptr;
cudaFree(d_peaks_xy); d_peaks_xy = nullptr;
cudaFree(d_num_peaks); d_num_peaks = nullptr;
}
void RtDetector::runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out) {
if (nframes > RT_NFRAMES_MAX) {
fprintf(stderr, "runLayer1: nframes %d > RT_NFRAMES_MAX %d\n",
nframes, RT_NFRAMES_MAX);
return;
}
checkCudaErrors(cudaMemcpy(d_fpixels, h_fpixels,
(size_t)nframes * m_n_pixels * sizeof(float),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemset(d_layer1_out, 0,
(size_t)m_n_chan * m_n_pixels * sizeof(float)));
dim3 block(RT_BLOCK_X, RT_BLOCK_Y);
dim3 grid((m_p.width + RT_BLOCK_X - 1) / RT_BLOCK_X,
(m_p.height + RT_BLOCK_Y - 1) / RT_BLOCK_Y);
for (int ivy = -m_p.max_vel; ivy <= m_p.max_vel; ivy++) {
for (int ivx = -m_p.max_vel; ivx <= m_p.max_vel; ivx++) {
int ich = (ivy + m_p.max_vel) * (2 * m_p.max_vel + 1) + (ivx + m_p.max_vel);
k_layer1_channel<<<grid, block>>>(
d_fpixels,
d_layer1_out + (size_t)ich * m_n_pixels,
m_p.width, m_p.height, nframes, ivx, ivy);
}
}
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaMemcpy(h_layer1_out, d_layer1_out,
(size_t)m_n_chan * m_n_pixels * sizeof(float),
cudaMemcpyDeviceToHost));
}
int RtDetector::updateAccum(const float* h_layer1_out, float* h_accum_buf,
const RtParams& p, float* h_peaks_xy, int max_peaks) {
dim3 block(RT_BLOCK_X, RT_BLOCK_Y);
dim3 grid((p.width + RT_BLOCK_X - 1) / RT_BLOCK_X,
(p.height + RT_BLOCK_Y - 1) / RT_BLOCK_Y);
// We need a temporary GPU buffer for the incoming accum channel and the updated one.
// d_accum_chan is used as in-place scratch: upload old, compute new into d_layer1_out
// (reusing its per-channel slice as temp storage), then download new back.
// TODO: refactor with a second scratch buffer for clarity.
float* d_tmp = d_layer1_out; // reuse layer1 buffer as temporary after layer1 is done
for (int ivy = -p.max_vel; ivy <= p.max_vel; ivy++) {
for (int ivx = -p.max_vel; ivx <= p.max_vel; ivx++) {
int ich = (ivy + p.max_vel) * (2 * p.max_vel + 1) + (ivx + p.max_vel);
size_t off = (size_t)ich * m_n_pixels;
// Upload old accum channel
checkCudaErrors(cudaMemcpy(d_accum_chan,
h_accum_buf + off,
m_n_pixels * sizeof(float),
cudaMemcpyHostToDevice));
// Upload layer1 observation for this channel
float* d_l1_chan = d_tmp + off;
checkCudaErrors(cudaMemcpy(d_l1_chan,
h_layer1_out + off,
m_n_pixels * sizeof(float),
cudaMemcpyHostToDevice));
// Update in-place (write result back to d_accum_chan via swap)
k_update_accum_channel<<<grid, block>>>(
d_accum_chan, d_l1_chan, d_accum_chan,
p.width, p.height, ivx, ivy,
p.mem_coeff, p.obs_coeff, p.leaky_slope);
checkCudaErrors(cudaDeviceSynchronize());
// Download updated channel
checkCudaErrors(cudaMemcpy(h_accum_buf + off,
d_accum_chan,
m_n_pixels * sizeof(float),
cudaMemcpyDeviceToHost));
}
}
// Upload full updated accum_buf for max projection
// (already updated in h_accum_buf; upload all channels at once)
checkCudaErrors(cudaMemcpy(d_tmp, h_accum_buf,
(size_t)m_n_chan * m_n_pixels * sizeof(float),
cudaMemcpyHostToDevice));
// Max projection across channels
int flat_block = 256;
int flat_grid = (m_n_pixels + flat_block - 1) / flat_block;
k_max_projection<<<flat_grid, flat_block>>>(d_tmp, d_proj, m_n_pixels, m_n_chan);
checkCudaErrors(cudaDeviceSynchronize());
// Peak detection
checkCudaErrors(cudaMemset(d_num_peaks, 0, sizeof(int)));
int mp = (max_peaks < RT_MAX_PEAKS) ? max_peaks : RT_MAX_PEAKS;
k_find_peaks<<<grid, block>>>(d_proj, d_peaks_xy, d_num_peaks,
p.width, p.height, p.peak_thresh, mp);
checkCudaErrors(cudaDeviceSynchronize());
int num_found = 0;
checkCudaErrors(cudaMemcpy(&num_found, d_num_peaks, sizeof(int), cudaMemcpyDeviceToHost));
if (num_found > mp) num_found = mp;
checkCudaErrors(cudaMemcpy(h_peaks_xy, d_peaks_xy,
(size_t)num_found * 2 * sizeof(float),
cudaMemcpyDeviceToHost));
return num_found;
}
/**
** rt_detector.h — internal C++ class managing GPU state for one detector instance.
**
** Copyright (C) 2026 Elphel, Inc.
**
** Not part of the extern "C" API; used only within libcuasrt.so.
** Analogous to TpHostGpu in tile_processor_gpu.
**
** Each method corresponds to a static method in CuasRTDetector.java.
** Validate numerical agreement between Java prototype and C++ implementation
** before tuning parameters.
**
** -----------------------------------------------------------------------------
**
** rt_detector.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_DETECTOR_H
#define RT_DETECTOR_H
#include "rt_defines.h"
#include "rt_lib.h"
#include <cuda_runtime.h>
class RtDetector {
public:
explicit RtDetector(const RtParams& p);
~RtDetector();
/**
* Layer 1 matched filter.
* h_fpixels: host [nframes * width * height]
* nframes: number of input frames (<= RT_NFRAMES_MAX)
* h_layer1_out: host [n_chan * width * height] output
*/
void runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out);
/**
* Update accumulation buffers; detect and return peaks.
* h_layer1_out: host [n_chan * width * height] from runLayer1
* h_accum_buf: host [n_chan * width * height] persistent state (in/out, caller owns)
* p: current parameters (may differ from construction-time params)
* h_peaks_xy: output [max_peaks * 2]
* max_peaks: output capacity
* Returns: number of peaks found
*/
int updateAccum(const float* h_layer1_out,
float* h_accum_buf,
const RtParams& p,
float* h_peaks_xy,
int max_peaks);
private:
RtParams m_p;
int m_n_chan; // (2*max_vel+1)^2
int m_n_pixels; // width * height
// GPU memory (owned by this object)
float* d_fpixels = nullptr; // [RT_NFRAMES_MAX * n_pixels]
float* d_layer1_out = nullptr; // [n_chan * n_pixels]
float* d_accum_chan = nullptr; // [n_pixels] one channel at a time (upload/update/download)
float* d_proj = nullptr; // [n_pixels] cross-channel max projection
float* d_peaks_xy = nullptr; // [RT_MAX_PEAKS * 2]
int* d_num_peaks = nullptr; // single int (atomic counter in k_find_peaks)
void allocGpu();
void freeGpu();
};
#endif // RT_DETECTOR_H
/**
** rt_lib.cu — extern "C" wrappers around RtDetector.
**
** Copyright (C) 2026 Elphel, Inc.
**
** This is the JNA / C++ API surface. All CUDA work is in rt_detector.cu.
** Keep this file thin: just parameter validation and delegation to RtDetector.
**
** -----------------------------------------------------------------------------
**
** rt_lib.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#include "rt_lib.h"
#include "rt_detector.h"
#include <cstdio>
extern "C" {
void* cuasrt_create(int width, int height, const RtParams* p) {
if (!p) { fprintf(stderr, "cuasrt_create: null params\n"); return nullptr; }
RtParams rp = *p;
rp.width = width;
rp.height = height;
return new RtDetector(rp);
}
void cuasrt_destroy(void* ctx) {
delete static_cast<RtDetector*>(ctx);
}
void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out) {
if (!ctx || !fpixels || !layer1_out) return;
static_cast<RtDetector*>(ctx)->runLayer1(fpixels, nframes, layer1_out);
}
int cuasrt_update_accum(void* ctx,
const float* layer1_out, float* accum_buf,
const RtParams* p, float* peaks_xy, int max_peaks) {
if (!ctx || !layer1_out || !accum_buf || !p || !peaks_xy) return 0;
return static_cast<RtDetector*>(ctx)->updateAccum(layer1_out, accum_buf,
*p, peaks_xy, max_peaks);
}
} // extern "C"
/**
** rt_lib.h — extern "C" public API for libcuasrt.so
**
** Copyright (C) 2026 Elphel, Inc.
**
** Called from:
** Java — via JNA (com.elphel.imagej.cuas.rt.CuasRTLib)
** C++ — directly from standalone test/debug programs
** Jetson — C++ production binary (same API, ARM build of the .so)
**
** Mirrors Java class RtParams (com.elphel.imagej.cuas.rt.RtParams).
** Keep field names and order in sync with the JNA Structure mapping in
** CuasRTLib.java (RtParamsJna extends com.sun.jna.Structure).
**
** -----------------------------------------------------------------------------
**
** rt_lib.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** -----------------------------------------------------------------------------
*/
#pragma once
#ifndef RT_LIB_H
#define RT_LIB_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* Runtime parameters for the RT detector.
* Passed by pointer; the library copies the struct internally.
*
* IMPORTANT: field order must match RtParamsJna.getFieldOrder() in CuasRTLib.java.
*/
typedef struct {
int max_vel; // velocity range [-max_vel .. +max_vel] px/sample; try 1, 2, 3
int width; // image width in pixels
int height; // image height in pixels
float mem_coeff; // accumulation memory coefficient (e.g. 0.9)
float obs_coeff; // observation weight (e.g. 0.1 = 1 - mem_coeff)
float leaky_slope; // leaky ReLU negative slope: 1.0=linear, 0.01=near-hard
float peak_thresh; // accumulation buffer peak detection threshold
} RtParams;
/**
* Allocate GPU memory and initialize CUDA context for the RT detector.
*
* width, height: image dimensions (must match fpixels layout in cuasrt_layer1).
* p: initial parameters (copied; can be changed per-call in cuasrt_update_accum).
*
* Returns opaque context handle; pass to all subsequent calls.
* Returns NULL on CUDA initialization failure.
*/
void* cuasrt_create(int width, int height, const RtParams* p);
/**
* Release all GPU memory and destroy the CUDA context.
* ctx becomes invalid after this call.
*/
void cuasrt_destroy(void* ctx);
/**
* Layer 1: spatio-temporal matched filter over one input window.
*
* fpixels: host float array, layout [nframes][height][width], row-major.
* Must be the already temporally averaged/decimated frames for
* this pyramid level. nframes <= RT_NFRAMES_MAX (rt_defines.h).
*
* nframes: number of frames in this window. Center frame is nframes/2.
* For 50% overlap mode: stride = nframes/2 between consecutive calls.
*
* layer1_out: host float array, size n_chan * width * height.
* n_chan = (2*max_vel+1)^2.
* Channel order: ivy major, ivx minor, both offset by max_vel.
* Same indexing as RtParams.chanIdx(ivx,ivy) in Java.
*
* Kernel: k_layer1_channel launched n_chan times (one per velocity channel).
* All launches are synchronous; function returns when GPU work is complete.
*/
void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out);
/**
* Layer 2: update accumulation buffers and detect Stage 2 trigger candidates.
*
* layer1_out: [n_chan * width * height] from cuasrt_layer1 (host memory, read-only).
* accum_buf: [n_chan * width * height] persistent accumulation state (host memory,
* in/out — caller owns and preserves between calls).
* Initialize to zeros before first call (e.g. calloc).
* p: current RtParams (may differ from cuasrt_create params for tuning).
* peaks_xy: output buffer for detected peak coordinates, host memory.
* Layout: [x0, y0, x1, y1, ...]. Must be >= max_peaks * 2 floats.
* max_peaks: capacity of peaks_xy / 2.
*
* Returns number of peaks found (0 .. max_peaks).
*
* Sequence per channel:
* 1. Upload accum_buf channel to GPU
* 2. k_update_accum_channel: shift + mem*accum + obs*layer1 + leakyReLU
* 3. Download updated channel back to accum_buf
* Then: max-project across channels, run k_find_peaks, download results.
*/
int cuasrt_update_accum(void* ctx,
const float* layer1_out,
float* accum_buf,
const RtParams* p,
float* peaks_xy,
int max_peaks);
#ifdef __cplusplus
}
#endif
#endif // RT_LIB_H
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment