CLAUDE: Initial cuas_rt_gpu skeleton (libcuasrt.so) (b70571d6) · Commits · Elphel / cuas_rt_gpu

src/rt_defines.h

0 → 100644

+61 −0

Original line number	Diff line number	Diff line
		/**
		** rt_defines.h
		**
		** Copyright (C) 2026 Elphel, Inc.
		**
		** This file is part of cuas_rt_gpu (libcuasrt.so).
		** Analogous to tp_defines.h in tile_processor_gpu.
		**
		** Unlike tp_defines.h, there is no JCUDA guard here: libcuasrt.so is
		** always pre-compiled with nvcc (not NVRTC). Parameters that vary at
		** runtime are passed via RtParams (rt_lib.h), not via #define.
		**
		** Only structural constants that affect kernel shared-memory layout
		** or loop bounds that MUST be compile-time constants live here.
		**
		** -----------------------------------------------------------------------------
		**
		** rt_defines.h is free software: you can redistribute it and/or modify
		** it under the terms of the GNU General Public License as published by
		** the Free Software Foundation, either version 3 of the License, or
		** (at your option) any later version.
		**
		** This program is distributed in the hope that it will be useful,
		** but WITHOUT ANY WARRANTY; without even the implied warranty of
		** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
		** GNU General Public License for more details.
		**
		** You should have received a copy of the GNU General Public License
		** along with this program. If not, see <http://www.gnu.org/licenses/>.
		** -----------------------------------------------------------------------------
		*/

		#pragma once
		#ifndef RT_DEFINES_H
		#define RT_DEFINES_H

		// Spatial kernel half-size for Layer 1 matched filter (3x3 neighborhood → half=1)
		#define RT_SPATIAL_HALF 1

		// Maximum runtime max_vel supported by pre-compiled kernels.
		// Must be >= any value passed in RtParams.max_vel at runtime.
		// Determines shared-memory allocation upper bound.
		// Rebuild the library if you need a larger value.
		#define RT_MAX_VEL_COMPILE 3

		// Maximum number of input frames per Layer 1 call (NFRAMES_MAX in RtDetector).
		// Increase if pyramid levels need longer windows.
		#define RT_NFRAMES_MAX 16

		// Maximum peaks returned per updateAccum call (d_peaks_xy size).
		#define RT_MAX_PEAKS 1024

		// CUDA thread-block tile size for 2D kernels (Layer 1, accumulation update).
		#define RT_BLOCK_X 16
		#define RT_BLOCK_Y 16

		#ifdef RT_HAS_PRINTF
		#include <stdio.h>
		#endif

		#endif // RT_DEFINES_H

src/rt_detector.cu

0 → 100644

+293 −0

Original line number	Diff line number	Diff line
		/**
		** rt_detector.cu — CUDA kernels and RtDetector class implementation.
		**
		** Copyright (C) 2026 Elphel, Inc.
		**
		** Kernel ↔ Java method correspondence (validate numerical agreement):
		** k_layer1_channel ↔ CuasRTDetector.layer1Channel()
		** k_update_accum_channel ↔ CuasRTDetector.updateAccumChannel()
		** k_max_projection ↔ CuasRTDetector.maxProjection()
		** k_find_peaks ↔ CuasRTDetector.findPeaks()
		**
		** Build: see Debug/subdir.mk (Eclipse CDT), target libcuasrt.so.
		** nvcc -arch=sm_75 -fPIC --cudart=shared ...
		**
		** -----------------------------------------------------------------------------
		**
		** rt_detector.cu is free software: you can redistribute it and/or modify
		** it under the terms of the GNU General Public License as published by
		** the Free Software Foundation, either version 3 of the License, or
		** (at your option) any later version.
		**
		** -----------------------------------------------------------------------------
		*/

		#include "rt_detector.h"
		#include <helper_cuda.h> // checkCudaErrors — from CUDA samples
		#include <cstdio>
		#include <cstring>

		// ============================================================================
		// CUDA kernels
		// ============================================================================

		/**
		* Layer 1 matched filter for one velocity channel (ivx, ivy).
		* Thread: one output pixel (x, y).
		* Sums a (2*RT_SPATIAL_HALF+1)^2 spatial neighborhood across nframes,
		* with each frame t spatially offset by (ivxdt, ivydt), dt = t - nframes/2.
		* Result is added into out[] (not cleared here; caller clears before looping channels).
		*/
		__global__ void k_layer1_channel(
		const float* __restrict__ fpixels, // [nframes * width * height]
		float* out, // [width * height]
		int width, int height, int nframes,
		int ivx, int ivy)
		{
		int x = blockIdx.x * blockDim.x + threadIdx.x;
		int y = blockIdx.y * blockDim.y + threadIdx.y;
		if (x >= width \|\| y >= height) return;

		int tMid = nframes / 2;
		float sum = 0.0f;
		for (int t = 0; t < nframes; t++) {
		int dt = t - tMid;
		int sx = x - ivx * dt;
		int sy = y - ivy * dt;
		for (int dy = -RT_SPATIAL_HALF; dy <= RT_SPATIAL_HALF; dy++) {
		for (int dx = -RT_SPATIAL_HALF; dx <= RT_SPATIAL_HALF; dx++) {
		int px = sx + dx, py = sy + dy;
		// unsigned comparison handles both < 0 and >= dim in one branch
		if ((unsigned)px < (unsigned)width && (unsigned)py < (unsigned)height)
		sum += fpixels[t * width * height + py * width + px];
		}
		}
		}
		out[y * width + x] += sum;
		}

		/**
		* Per-channel accumulation update.
		* Combines shift + integrate + leakyReLU in one kernel to avoid extra passes.
		* Thread: one output pixel (x, y).
		*
		* accum_in: shifted accumulation (shift is done by reading from (x-ivx, y-ivy))
		* layer1: new observation for this channel
		* accum_out: mem_coeff * accum_shifted + obs_coeff * layer1, then leakyReLU
		*
		* TODO: for sub-pixel (half-pixel) shift, replace integer offset with
		* bilinear interpolation: accum_in read from 4 neighbors with lerp weights.
		*/
		__global__ void k_update_accum_channel(
		const float* __restrict__ accum_in, // [width * height] previous state
		const float* __restrict__ layer1, // [width * height] new observation
		float* accum_out, // [width * height] updated state
		int width, int height,
		int ivx, int ivy,
		float mem_coeff, float obs_coeff, float leaky_slope)
		{
		int x = blockIdx.x * blockDim.x + threadIdx.x;
		int y = blockIdx.y * blockDim.y + threadIdx.y;
		if (x >= width \|\| y >= height) return;

		// Shifted read: pixel that "was" at (x-ivx, y-ivy) now maps to (x,y)
		int sx = x - ivx, sy = y - ivy;
		float prev = 0.0f;
		if ((unsigned)sx < (unsigned)width && (unsigned)sy < (unsigned)height)
		prev = accum_in[sy * width + sx];

		float v = mem_coeff * prev + obs_coeff * layer1[y * width + x];
		if (v < 0.0f) v *= leaky_slope;
		accum_out[y * width + x] = v;
		}

		/**
		* Cross-channel max projection: for each pixel, take max across all n_chan channels.
		* Thread: one pixel. Reads accum_flat[n_chan * n_pixels] laid out channel-major.
		*/
		__global__ void k_max_projection(
		const float* __restrict__ accum_flat, // [n_chan * width * height]
		float* proj, // [width * height]
		int n_pixels, int n_chan)
		{
		int i = blockIdx.x * blockDim.x + threadIdx.x;
		if (i >= n_pixels) return;
		float mx = 0.0f;
		for (int c = 0; c < n_chan; c++) {
		float v = accum_flat[c * n_pixels + i];
		if (v > mx) mx = v;
		}
		proj[i] = mx;
		}

		/**
		* 3x3 non-maximum suppression peak detection.
		* Thread: one pixel. Border pixels (1-pixel margin) are skipped.
		* A pixel is a peak if it is >= thresh AND strictly greater than all 8 neighbors.
		* Detected peaks are appended atomically to peaks_xy; num_peaks is an atomic counter.
		*/
		__global__ void k_find_peaks(
		const float* __restrict__ proj, // [width * height]
		float* peaks_xy, // [max_peaks * 2]
		int* num_peaks, // atomic counter, init to 0
		int width, int height,
		float thresh, int max_peaks)
		{
		int x = blockIdx.x * blockDim.x + threadIdx.x + 1; // skip left border
		int y = blockIdx.y * blockDim.y + threadIdx.y + 1; // skip top border
		if (x >= width - 1 \|\| y >= height - 1) return;

		float v = proj[y * width + x];
		if (v < thresh) return;

		// 3x3 NMS
		for (int dy = -1; dy <= 1; dy++) {
		for (int dx = -1; dx <= 1; dx++) {
		if (dx == 0 && dy == 0) continue;
		if (proj[(y + dy) * width + (x + dx)] >= v) return;
		}
		}
		// v is a local maximum: claim a slot atomically
		int slot = atomicAdd(num_peaks, 1);
		if (slot < max_peaks) {
		peaks_xy[slot * 2 ] = (float)x;
		peaks_xy[slot * 2 + 1] = (float)y;
		}
		}

		// ============================================================================
		// RtDetector class implementation
		// ============================================================================

		RtDetector::RtDetector(const RtParams& p) : m_p(p) {
		int nv = 2 * p.max_vel + 1;
		m_n_chan = nv * nv;
		m_n_pixels = p.width * p.height;
		allocGpu();
		}

		RtDetector::~RtDetector() {
		freeGpu();
		}

		void RtDetector::allocGpu() {
		checkCudaErrors(cudaMalloc(&d_fpixels, RT_NFRAMES_MAX * m_n_pixels * sizeof(float)));
		checkCudaErrors(cudaMalloc(&d_layer1_out, m_n_chan * m_n_pixels * sizeof(float)));
		checkCudaErrors(cudaMalloc(&d_accum_chan, m_n_pixels * sizeof(float)));
		checkCudaErrors(cudaMalloc(&d_proj, m_n_pixels * sizeof(float)));
		checkCudaErrors(cudaMalloc(&d_peaks_xy, RT_MAX_PEAKS * 2 * sizeof(float)));
		checkCudaErrors(cudaMalloc(&d_num_peaks, sizeof(int)));
		}

		void RtDetector::freeGpu() {
		cudaFree(d_fpixels); d_fpixels = nullptr;
		cudaFree(d_layer1_out); d_layer1_out = nullptr;
		cudaFree(d_accum_chan); d_accum_chan = nullptr;
		cudaFree(d_proj); d_proj = nullptr;
		cudaFree(d_peaks_xy); d_peaks_xy = nullptr;
		cudaFree(d_num_peaks); d_num_peaks = nullptr;
		}

		void RtDetector::runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out) {
		if (nframes > RT_NFRAMES_MAX) {
		fprintf(stderr, "runLayer1: nframes %d > RT_NFRAMES_MAX %d\n",
		nframes, RT_NFRAMES_MAX);
		return;
		}
		checkCudaErrors(cudaMemcpy(d_fpixels, h_fpixels,
		(size_t)nframes * m_n_pixels * sizeof(float),
		cudaMemcpyHostToDevice));
		checkCudaErrors(cudaMemset(d_layer1_out, 0,
		(size_t)m_n_chan * m_n_pixels * sizeof(float)));

		dim3 block(RT_BLOCK_X, RT_BLOCK_Y);
		dim3 grid((m_p.width + RT_BLOCK_X - 1) / RT_BLOCK_X,
		(m_p.height + RT_BLOCK_Y - 1) / RT_BLOCK_Y);

		for (int ivy = -m_p.max_vel; ivy <= m_p.max_vel; ivy++) {
		for (int ivx = -m_p.max_vel; ivx <= m_p.max_vel; ivx++) {
		int ich = (ivy + m_p.max_vel) * (2 * m_p.max_vel + 1) + (ivx + m_p.max_vel);
		k_layer1_channel<<<grid, block>>>(
		d_fpixels,
		d_layer1_out + (size_t)ich * m_n_pixels,
		m_p.width, m_p.height, nframes, ivx, ivy);
		}
		}
		checkCudaErrors(cudaDeviceSynchronize());
		checkCudaErrors(cudaMemcpy(h_layer1_out, d_layer1_out,
		(size_t)m_n_chan * m_n_pixels * sizeof(float),
		cudaMemcpyDeviceToHost));
		}

		int RtDetector::updateAccum(const float* h_layer1_out, float* h_accum_buf,
		const RtParams& p, float* h_peaks_xy, int max_peaks) {
		dim3 block(RT_BLOCK_X, RT_BLOCK_Y);
		dim3 grid((p.width + RT_BLOCK_X - 1) / RT_BLOCK_X,
		(p.height + RT_BLOCK_Y - 1) / RT_BLOCK_Y);

		// We need a temporary GPU buffer for the incoming accum channel and the updated one.
		// d_accum_chan is used as in-place scratch: upload old, compute new into d_layer1_out
		// (reusing its per-channel slice as temp storage), then download new back.
		// TODO: refactor with a second scratch buffer for clarity.

		float* d_tmp = d_layer1_out; // reuse layer1 buffer as temporary after layer1 is done

		for (int ivy = -p.max_vel; ivy <= p.max_vel; ivy++) {
		for (int ivx = -p.max_vel; ivx <= p.max_vel; ivx++) {
		int ich = (ivy + p.max_vel) * (2 * p.max_vel + 1) + (ivx + p.max_vel);
		size_t off = (size_t)ich * m_n_pixels;

		// Upload old accum channel
		checkCudaErrors(cudaMemcpy(d_accum_chan,
		h_accum_buf + off,
		m_n_pixels * sizeof(float),
		cudaMemcpyHostToDevice));
		// Upload layer1 observation for this channel
		float* d_l1_chan = d_tmp + off;
		checkCudaErrors(cudaMemcpy(d_l1_chan,
		h_layer1_out + off,
		m_n_pixels * sizeof(float),
		cudaMemcpyHostToDevice));
		// Update in-place (write result back to d_accum_chan via swap)
		k_update_accum_channel<<<grid, block>>>(
		d_accum_chan, d_l1_chan, d_accum_chan,
		p.width, p.height, ivx, ivy,
		p.mem_coeff, p.obs_coeff, p.leaky_slope);
		checkCudaErrors(cudaDeviceSynchronize());

		// Download updated channel
		checkCudaErrors(cudaMemcpy(h_accum_buf + off,
		d_accum_chan,
		m_n_pixels * sizeof(float),
		cudaMemcpyDeviceToHost));
		}
		}

		// Upload full updated accum_buf for max projection
		// (already updated in h_accum_buf; upload all channels at once)
		checkCudaErrors(cudaMemcpy(d_tmp, h_accum_buf,
		(size_t)m_n_chan * m_n_pixels * sizeof(float),
		cudaMemcpyHostToDevice));

		// Max projection across channels
		int flat_block = 256;
		int flat_grid = (m_n_pixels + flat_block - 1) / flat_block;
		k_max_projection<<<flat_grid, flat_block>>>(d_tmp, d_proj, m_n_pixels, m_n_chan);
		checkCudaErrors(cudaDeviceSynchronize());

		// Peak detection
		checkCudaErrors(cudaMemset(d_num_peaks, 0, sizeof(int)));
		int mp = (max_peaks < RT_MAX_PEAKS) ? max_peaks : RT_MAX_PEAKS;
		k_find_peaks<<<grid, block>>>(d_proj, d_peaks_xy, d_num_peaks,
		p.width, p.height, p.peak_thresh, mp);
		checkCudaErrors(cudaDeviceSynchronize());

		int num_found = 0;
		checkCudaErrors(cudaMemcpy(&num_found, d_num_peaks, sizeof(int), cudaMemcpyDeviceToHost));
		if (num_found > mp) num_found = mp;

		checkCudaErrors(cudaMemcpy(h_peaks_xy, d_peaks_xy,
		(size_t)num_found * 2 * sizeof(float),
		cudaMemcpyDeviceToHost));
		return num_found;
		}

src/rt_detector.h

0 → 100644

+76 −0

Original line number	Diff line number	Diff line
		/**
		** rt_detector.h — internal C++ class managing GPU state for one detector instance.
		**
		** Copyright (C) 2026 Elphel, Inc.
		**
		** Not part of the extern "C" API; used only within libcuasrt.so.
		** Analogous to TpHostGpu in tile_processor_gpu.
		**
		** Each method corresponds to a static method in CuasRTDetector.java.
		** Validate numerical agreement between Java prototype and C++ implementation
		** before tuning parameters.
		**
		** -----------------------------------------------------------------------------
		**
		** rt_detector.h is free software: you can redistribute it and/or modify
		** it under the terms of the GNU General Public License as published by
		** the Free Software Foundation, either version 3 of the License, or
		** (at your option) any later version.
		**
		** -----------------------------------------------------------------------------
		*/

		#pragma once
		#ifndef RT_DETECTOR_H
		#define RT_DETECTOR_H

		#include "rt_defines.h"
		#include "rt_lib.h"
		#include <cuda_runtime.h>

		class RtDetector {
		public:
		explicit RtDetector(const RtParams& p);
		~RtDetector();

		/**
		* Layer 1 matched filter.
		* h_fpixels: host [nframes * width * height]
		* nframes: number of input frames (<= RT_NFRAMES_MAX)
		* h_layer1_out: host [n_chan * width * height] output
		*/
		void runLayer1(const float* h_fpixels, int nframes, float* h_layer1_out);

		/**
		* Update accumulation buffers; detect and return peaks.
		* h_layer1_out: host [n_chan * width * height] from runLayer1
		* h_accum_buf: host [n_chan * width * height] persistent state (in/out, caller owns)
		* p: current parameters (may differ from construction-time params)
		* h_peaks_xy: output [max_peaks * 2]
		* max_peaks: output capacity
		* Returns: number of peaks found
		*/
		int updateAccum(const float* h_layer1_out,
		float* h_accum_buf,
		const RtParams& p,
		float* h_peaks_xy,
		int max_peaks);

		private:
		RtParams m_p;
		int m_n_chan; // (2*max_vel+1)^2
		int m_n_pixels; // width * height

		// GPU memory (owned by this object)
		float* d_fpixels = nullptr; // [RT_NFRAMES_MAX * n_pixels]
		float* d_layer1_out = nullptr; // [n_chan * n_pixels]
		float* d_accum_chan = nullptr; // [n_pixels] one channel at a time (upload/update/download)
		float* d_proj = nullptr; // [n_pixels] cross-channel max projection
		float* d_peaks_xy = nullptr; // [RT_MAX_PEAKS * 2]
		int* d_num_peaks = nullptr; // single int (atomic counter in k_find_peaks)

		void allocGpu();
		void freeGpu();
		};

		#endif // RT_DETECTOR_H

src/rt_lib.cu

0 → 100644

+50 −0

Original line number	Diff line number	Diff line
		/**
		** rt_lib.cu — extern "C" wrappers around RtDetector.
		**
		** Copyright (C) 2026 Elphel, Inc.
		**
		** This is the JNA / C++ API surface. All CUDA work is in rt_detector.cu.
		** Keep this file thin: just parameter validation and delegation to RtDetector.
		**
		** -----------------------------------------------------------------------------
		**
		** rt_lib.cu is free software: you can redistribute it and/or modify
		** it under the terms of the GNU General Public License as published by
		** the Free Software Foundation, either version 3 of the License, or
		** (at your option) any later version.
		**
		** -----------------------------------------------------------------------------
		*/

		#include "rt_lib.h"
		#include "rt_detector.h"
		#include <cstdio>

		extern "C" {

		void* cuasrt_create(int width, int height, const RtParams* p) {
		if (!p) { fprintf(stderr, "cuasrt_create: null params\n"); return nullptr; }
		RtParams rp = *p;
		rp.width = width;
		rp.height = height;
		return new RtDetector(rp);
		}

		void cuasrt_destroy(void* ctx) {
		delete static_cast<RtDetector*>(ctx);
		}

		void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out) {
		if (!ctx \|\| !fpixels \|\| !layer1_out) return;
		static_cast<RtDetector*>(ctx)->runLayer1(fpixels, nframes, layer1_out);
		}

		int cuasrt_update_accum(void* ctx,
		const float* layer1_out, float* accum_buf,
		const RtParams* p, float* peaks_xy, int max_peaks) {
		if (!ctx \|\| !layer1_out \|\| !accum_buf \|\| !p \|\| !peaks_xy) return 0;
		return static_cast<RtDetector*>(ctx)->updateAccum(layer1_out, accum_buf,
		*p, peaks_xy, max_peaks);
		}

		} // extern "C"

src/rt_lib.h

0 → 100644

+117 −0

Original line number	Diff line number	Diff line
		/**
		** rt_lib.h — extern "C" public API for libcuasrt.so
		**
		** Copyright (C) 2026 Elphel, Inc.
		**
		** Called from:
		** Java — via JNA (com.elphel.imagej.cuas.rt.CuasRTLib)
		** C++ — directly from standalone test/debug programs
		** Jetson — C++ production binary (same API, ARM build of the .so)
		**
		** Mirrors Java class RtParams (com.elphel.imagej.cuas.rt.RtParams).
		** Keep field names and order in sync with the JNA Structure mapping in
		** CuasRTLib.java (RtParamsJna extends com.sun.jna.Structure).
		**
		** -----------------------------------------------------------------------------
		**
		** rt_lib.h is free software: you can redistribute it and/or modify
		** it under the terms of the GNU General Public License as published by
		** the Free Software Foundation, either version 3 of the License, or
		** (at your option) any later version.
		**
		** -----------------------------------------------------------------------------
		*/

		#pragma once
		#ifndef RT_LIB_H
		#define RT_LIB_H

		#ifdef __cplusplus
		extern "C" {
		#endif

		/**
		* Runtime parameters for the RT detector.
		* Passed by pointer; the library copies the struct internally.
		*
		* IMPORTANT: field order must match RtParamsJna.getFieldOrder() in CuasRTLib.java.
		*/
		typedef struct {
		int max_vel; // velocity range [-max_vel .. +max_vel] px/sample; try 1, 2, 3
		int width; // image width in pixels
		int height; // image height in pixels
		float mem_coeff; // accumulation memory coefficient (e.g. 0.9)
		float obs_coeff; // observation weight (e.g. 0.1 = 1 - mem_coeff)
		float leaky_slope; // leaky ReLU negative slope: 1.0=linear, 0.01=near-hard
		float peak_thresh; // accumulation buffer peak detection threshold
		} RtParams;

		/**
		* Allocate GPU memory and initialize CUDA context for the RT detector.
		*
		* width, height: image dimensions (must match fpixels layout in cuasrt_layer1).
		* p: initial parameters (copied; can be changed per-call in cuasrt_update_accum).
		*
		* Returns opaque context handle; pass to all subsequent calls.
		* Returns NULL on CUDA initialization failure.
		*/
		void* cuasrt_create(int width, int height, const RtParams* p);

		/**
		* Release all GPU memory and destroy the CUDA context.
		* ctx becomes invalid after this call.
		*/
		void cuasrt_destroy(void* ctx);

		/**
		* Layer 1: spatio-temporal matched filter over one input window.
		*
		* fpixels: host float array, layout [nframes][height][width], row-major.
		* Must be the already temporally averaged/decimated frames for
		* this pyramid level. nframes <= RT_NFRAMES_MAX (rt_defines.h).
		*
		* nframes: number of frames in this window. Center frame is nframes/2.
		* For 50% overlap mode: stride = nframes/2 between consecutive calls.
		*
		* layer1_out: host float array, size n_chan * width * height.
		* n_chan = (2*max_vel+1)^2.
		* Channel order: ivy major, ivx minor, both offset by max_vel.
		* Same indexing as RtParams.chanIdx(ivx,ivy) in Java.
		*
		* Kernel: k_layer1_channel launched n_chan times (one per velocity channel).
		* All launches are synchronous; function returns when GPU work is complete.
		*/
		void cuasrt_layer1(void* ctx, const float* fpixels, int nframes, float* layer1_out);

		/**
		* Layer 2: update accumulation buffers and detect Stage 2 trigger candidates.
		*
		* layer1_out: [n_chan * width * height] from cuasrt_layer1 (host memory, read-only).
		* accum_buf: [n_chan * width * height] persistent accumulation state (host memory,
		* in/out — caller owns and preserves between calls).
		* Initialize to zeros before first call (e.g. calloc).
		* p: current RtParams (may differ from cuasrt_create params for tuning).
		* peaks_xy: output buffer for detected peak coordinates, host memory.
		* Layout: [x0, y0, x1, y1, ...]. Must be >= max_peaks * 2 floats.
		* max_peaks: capacity of peaks_xy / 2.
		*
		* Returns number of peaks found (0 .. max_peaks).
		*
		* Sequence per channel:
		* 1. Upload accum_buf channel to GPU
		* 2. k_update_accum_channel: shift + memaccum + obslayer1 + leakyReLU
		* 3. Download updated channel back to accum_buf
		* Then: max-project across channels, run k_find_peaks, download results.
		*/
		int cuasrt_update_accum(void* ctx,
		const float* layer1_out,
		float* accum_buf,
		const RtParams* p,
		float* peaks_xy,
		int max_peaks);

		#ifdef __cplusplus
		}
		#endif

		#endif // RT_LIB_H

Admin message