CLAUDE: Step 2 (B) foundation — GpuQuad no-alloc ctor + GpuQuadJna subclass skeleton (compiles)

Architecture B (chosen after finding GpuQuad's surface is ~70 methods, too large for a clean interface): - GpuQuad: add a protected no-alloc constructor (QuadCLT, debug_level, native_backend marker) that sets only the final config fields (gpuTileProcessor=null) and allocates NO JCuda memory / context. The working JCuda constructors are untouched. - New GpuQuadJna extends GpuQuad: uses the no-alloc ctor, then stands up the native libtileproc.so via TpJna (tp_create_module + tp_proc_create + tp_proc_setup). Inherits all methods (so it compiles); GPU-touching methods will be overridden incrementally to delegate to TpProc, the rest throw to fail loudly off the validated path. close() frees native memory deterministically. mvn -DskipTests compile: clean. JCUDA remains the default/working path. Next: per-method override marshalling (kernels/bayer/geometry/tasks + convert/imclt/getRBG/corr), then the backend selector (QuadCLT ctor) and the live JCUDA-vs-JNA file comparison. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Step 2 (B) foundation — GpuQuad no-alloc ctor + GpuQuadJna subclass skeleton (compiles)
Architecture B (chosen after finding GpuQuad's surface is ~70 methods, too large for a clean interface):
- GpuQuad: add a protected no-alloc constructor (QuadCLT, debug_level, native_backend marker) that sets only the final config fields (gpuTileProcessor=null) and allocates NO JCuda memory / context. The working JCuda constructors are untouched.
- New GpuQuadJna extends GpuQuad: uses the no-alloc ctor, then stands up the native libtileproc.so via TpJna (tp_create_module + tp_proc_create + tp_proc_setup). Inherits all methods (so it compiles); GPU-touching methods will be overridden incrementally to delegate to TpProc, and the rest are intended to throw so that anything off the validated path fails loudly (these throwing stubs are not part of this commit). close() frees native memory deterministically.

mvn -DskipTests compile: clean. JCUDA remains the default/working path.

Next: per-method override marshalling (kernels/bayer/geometry/tasks + convert/imclt/getRBG/corr), then the backend selector (QuadCLT ctor) and the live JCUDA-vs-JNA file comparison.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
a138f826 · Andrey Filippov · 9234a307 · a138f826 · a138f826
Commit a138f826 authored Jun 25, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 96 additions and 0 deletions

GpuQuad.java src/main/java/com/elphel/imagej/gpu/GpuQuad.java +25 -0

GpuQuadJna.java src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java +71 -0

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
+++ b/src/main/java/com/elphel/imagej/gpu/GpuQuad.java
@@ -490,6 +490,31 @@ public class GpuQuad{ // quad camera description
 		texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
 	}
 	
+	// Constructor for the native (JNA) backend subclass (GpuQuadJna): fills in the configuration
+	// fields from quadCLT, leaves gpuTileProcessor null, and performs no cuMemAlloc, so the
+	// JCuda-backed buffers of this class are never set up. A subclass built this way has to
+	// provide its own GPU memory. native_backend is not read - it only makes the signature unique.
+	protected GpuQuad(
+			final QuadCLT quadCLT,
+			int           debug_level,
+			boolean       native_backend) {
+		this.rectilinear =      false;
+		setGpu_debug_level(debug_level);
+		this.gpuTileProcessor = null; // no JCuda tile processor behind this instance
+		this.quadCLT =          quadCLT;
+		final int [] sensor_wh = quadCLT.getGeometryCorrection().getSensorWH(); // {width, height}
+		this.img_width =        sensor_wh[0];
+		this.img_height =       sensor_wh[1];
+		this.num_cams =         quadCLT.getNumSensors();
+		this.num_all_pairs =    Correlation2d.getNumPairs(this.num_cams);
+		this.num_colors =       quadCLT.isMonochrome() ? 1 : 3;
+		// kernel grid dimensions default to 0 when quadCLT has no CLT kernels
+		this.kernels_hor =      (quadCLT.getCLTKernels() != null) ? quadCLT.getCLTKernels()[0][0][0].length : 0;
+		this.kernels_vert =     (quadCLT.getCLTKernels() != null) ? quadCLT.getCLTKernels()[0][0].length : 0;
+		this.kern_tiles =       this.kernels_hor * this.kernels_vert * this.num_colors;
+		this.kern_size =        this.kern_tiles * 4 * 64;
+	}
+
 	// Constructor for rectilinear synthetic images
 	public GpuQuad(
 			GPUTileProcessor gpuTileProcessor,

--- a/src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java
+++ b/src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java
+package com.elphel.imagej.gpu.jna;
+
+import com.elphel.imagej.gpu.GpuQuad;
+import com.elphel.imagej.tileprocessor.QuadCLT;
+import com.sun.jna.Native;
+import com.sun.jna.Pointer;
+
+/**
+ * Native (JNA) GPU backend — architecture B: a {@link GpuQuad} subclass that is built through GpuQuad's
+ * protected no-alloc constructor (config fields only) and therefore allocates NO JCuda memory and
+ * creates NO JCuda context. Instead the constructor stands up the native tile-processor library
+ * ({@code libtileproc.so}) via {@link TpJna}: a {@code TpModule} (NVRTC-compiled kernels) and a persistent
+ * {@code TpProc} instance that owns all GPU memory.
+ *
+ * Status: skeleton. The GPU-touching methods of GpuQuad are meant to be overridden one by one to delegate
+ * to {@code TpProc}, but no override exists in this class yet, so every inherited method still runs the
+ * GpuQuad implementation against the null JCuda state. Until the overrides (or throwing stubs) are
+ * added, callers must only route the validated path here.
+ *
+ * Selected once at startup (default backend stays JCUDA); JNA mode never initializes JCuda.
+ * Validated kernel path (Stages 1–5 + StageProc): geometry, convert_direct, imclt, correlations.
+ * Instances hold native resources: call {@link #close()} or use try-with-resources.
+ * By Claude on 2026-06-25.
+ */
+public class GpuQuadJna extends GpuQuad implements AutoCloseable {
+    private final TpJna   lib;
+    private final Pointer module;   // TpModule* (NVRTC kernels)
+    private final Pointer proc;      // TpProc*   (persistent device buffers)
+    private boolean closed = false;  // guarded by the instance monitor (see close())
+
+    /**
+     * Loads the native library, creates the kernel module and the processor instance, and runs the
+     * basic setup. If any step fails, everything created so far is destroyed before the exception
+     * propagates, because no instance exists on which the caller could invoke {@link #close()}.
+     *
+     * @param quadCLT   the per-scene QuadCLT (geometry, kernels, sensor config)
+     * @param srcdir    directory with the kernel .cu/.h sources for NVRTC (e.g. tile_processor_gpu/src)
+     * @param devrt     path to libcudadevrt.a (for CDP linking)
+     * @param debugLevel gpu debug level
+     * @throws IllegalStateException if tp_create_module, tp_proc_create or tp_proc_setup reports failure
+     */
+    public GpuQuadJna(QuadCLT quadCLT, String srcdir, String devrt, int debugLevel) {
+        super(quadCLT, debugLevel, true); // no-alloc: config fields only, no JCuda context
+        this.lib = Native.load("tileproc", TpJna.class);
+        Pointer newModule = null;
+        Pointer newProc = null;
+        boolean ready = false;
+        try {
+            // Each message reads tp_last_error() before the cleanup below makes further native calls.
+            newModule = lib.tp_create_module(srcdir, devrt);
+            if (newModule == null) {
+                throw new IllegalStateException("GpuQuadJna: tp_create_module failed: " + lib.tp_last_error());
+            }
+            newProc = lib.tp_proc_create(newModule);
+            if (newProc == null) {
+                throw new IllegalStateException("GpuQuadJna: tp_proc_create failed: " + lib.tp_last_error());
+            }
+            int rc = lib.tp_proc_setup(newProc, num_cams, num_colors, img_width, img_height,
+                                       kernels_hor, kern_tiles);
+            if (rc != 0) {
+                throw new IllegalStateException("GpuQuadJna: tp_proc_setup rc=" + rc + ": " + lib.tp_last_error());
+            }
+            ready = true;
+        } finally {
+            if (!ready) {
+                // Partial construction: release in reverse order of creation (same order as close()).
+                if (newProc != null)   lib.tp_proc_destroy(newProc);
+                if (newModule != null) lib.tp_destroy_module(newModule);
+            }
+        }
+        this.module = newModule;
+        this.proc = newProc;
+        // tp_proc_setup_rbg_corr (imclt + correlation buffers) is deferred until the correlation config
+        // (num_pairs / sel_pairs / color_weights / corr_out_rad) is known at first use.
+    }
+
+    /**
+     * Native handles for the override implementations (added incrementally).
+     * The pointers are not cleared by {@link #close()}; they must not be used after it.
+     */
+    protected TpJna   lib()    { return lib; }
+    protected Pointer module() { return module; }
+    protected Pointer proc()   { return proc; }
+
+    /**
+     * Release native GPU memory + module (deterministic — does not rely on GC/finalize).
+     * Only the first call destroys anything; later calls return immediately.
+     */
+    @Override
+    public synchronized void close() {
+        if (closed) return;
+        closed = true;
+        // Both handles are non-null for any fully constructed instance (the constructor throws otherwise).
+        lib.tp_proc_destroy(proc);
+        lib.tp_destroy_module(module);
+    }
+
+    // NOTE: GPU-touching methods (setGeometryCorrection / setConvolutionKernels / setBayerImages /
+    // setTasks / execSetTilesOffsets / execConvertDirect / execImcltRbgAll / getRBG / execCorr2D_* /
+    // getCorr2D / getCorr2DCombo / handleWH) are overridden incrementally to delegate to TpProc.
+    // Until overridden, the inherited base methods would touch null JCuda buffers — the selector must
+    // only route the validated CUAS path here.
+}