CLAUDE: Step 2 (B) — GpuQuadJna convert->imclt->getRBG overrides delegate to TpProc

Override GpuQuad's GPU-touching methods for the image path, delegating to the native TpProc (with own caching, since base uses the null gpuTileProcessor):
- setGeometryCorrection / setExtrinsicsVector -> tp_proc_set_geometry / set_correction_vector (gc.expandSensors(16).toFloatArray, cv.toFullRollArray).
- setConvolutionKernels -> per-cam transpose-flatten (i=((i0&7)<<3)+((i0>>3)&7), CltExtra offsets) -> tp_proc_set_kernels / set_kernel_offsets.
- setBayerImages -> channel-combine -> tp_proc_set_image (center -> set_center_image broadcast).
- setTasks -> TpTask.asFloatArray -> tp_proc_set_tasks.
- execSetTilesOffsets -> set gc+cv -> tp_proc_exec_geometry.
- execConvertDirect(ref_scene,wh,erase_clt,no_kernels,use_center_image) -> tp_proc_exec_convert_direct (honors no_kernels skip-deconvolution + use_center_image, the fragile paths).
- execImcltRbgAll -> tp_proc_exec_imclt; getRBG -> tp_proc_get_rbg + same inner-region extraction.
mvn -DskipTests compile clean; all @Override signatures match base. Correlations (execCorr2D_*) and the backend selector are next. JCUDA remains the untouched default.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Step 2 (B) — GpuQuadJna convert->imclt->getRBG overrides delegate to TpProc
Override GpuQuad's GPU-touching methods for the image path, delegating to the native TpProc (with own caching, since base uses the null gpuTileProcessor):
- setGeometryCorrection / setExtrinsicsVector -> tp_proc_set_geometry / set_correction_vector (gc.expandSensors(16).toFloatArray, cv.toFullRollArray).
- setConvolutionKernels -> per-cam transpose-flatten (i=((i0&7)<<3)+((i0>>3)&7), CltExtra offsets) -> tp_proc_set_kernels / set_kernel_offsets.
- setBayerImages -> channel-combine -> tp_proc_set_image (center -> set_center_image broadcast).
- setTasks -> TpTask.asFloatArray -> tp_proc_set_tasks.
- execSetTilesOffsets -> set gc+cv -> tp_proc_exec_geometry.
- execConvertDirect(ref_scene,wh,erase_clt,no_kernels,use_center_image) -> tp_proc_exec_convert_direct (honors no_kernels skip-deconvolution + use_center_image, the fragile paths).
- execImcltRbgAll -> tp_proc_exec_imclt; getRBG -> tp_proc_get_rbg + same inner-region extraction.
mvn -DskipTests compile clean; all @Override signatures match base. Correlations (execCorr2D_*) and the backend selector are next. JCUDA remains the untouched default.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
34acc8ba · Andrey Filippov · a138f826 · 34acc8ba
Commit 34acc8ba authored Jun 25, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 142 additions and 5 deletions

GpuQuadJna.java src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java +142 -5

No files found.
--- a/src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java
+++ b/src/main/java/com/elphel/imagej/gpu/jna/GpuQuadJna.java
 package com.elphel.imagej.gpu.jna;
 import com.elphel.imagej.gpu.GpuQuad;
+import com.elphel.imagej.gpu.GPUTileProcessor;
+import com.elphel.imagej.gpu.TpTask;
+import com.elphel.imagej.tileprocessor.Correlation2d;
+import com.elphel.imagej.tileprocessor.CorrVector;
+import com.elphel.imagej.tileprocessor.GeometryCorrection;
 import com.elphel.imagej.tileprocessor.QuadCLT;
 import com.sun.jna.Native;
 import com.sun.jna.Pointer;
@@ -63,9 +68,141 @@ public class GpuQuadJna extends GpuQuad {
        if (module != null) lib.tp_destroy_module(module);
    }
-    // NOTE: GPU-touching methods (setGeometryCorrection / setConvolutionKernels / setBayerImages /
+    // ---- own caching flags (base uses gpuTileProcessor.* which is null here) ----
-    // setTasks / execSetTilesOffsets / execConvertDirect / execImcltRbgAll / getRBG / execCorr2D_* /
+    private boolean jna_kernels_set = false;
-    // getCorr2D / getCorr2DCombo / handleWH) are overridden incrementally to delegate to TpProc.
+    private boolean jna_rbg_corr_ready = false;
-    // Until overridden, the inherited base methods would touch null JCuda buffers — the selector must
-    // only route the validated CUAS path here.
+    // Allocate imclt(RBG)+correlation buffers once. RBG sizing needs only img/num_colors; correlation
+    // buffers are sized by num_all_pairs + corr_out_rad. sel_pairs/color_weights here are placeholders
+    // (the correlation overrides — added next — will pass the real per-call values).
+    private void ensureRbgCorr() {
+        if (jna_rbg_corr_ready) return;
+        int corr_out_rad = 7; // CORR_OUT_RAD -> 15x15
+        float[] cw = (num_colors == 1) ? new float[]{1f,1f,1f} : new float[]{0.25f,0.25f,0.5f};
+        lib.tp_proc_setup_rbg_corr(proc, Correlation2d.getNumPairs(num_cams), 0,0,0,0, cw[0],cw[1],cw[2], corr_out_rad);
+        jna_rbg_corr_ready = true;
+    }
+    // ---- geometry ----
+    @Override public void setGeometryCorrection() { setGeometryCorrection(quadCLT.getGeometryCorrection(), false); }
+    /**
+     * Expands the geometry correction to {@code GPUTileProcessor.MAX_NUM_CAMS} sensors, converts
+     * it to a float array and hands it to the native processor.
+     *
+     * @param gc                geometry correction to upload
+     * @param use_java_rByRDist not read by this override
+     */
+    @Override
+    public void setGeometryCorrection(GeometryCorrection gc, boolean use_java_rByRDist) {
+        final float[] geometry = gc.expandSensors(GPUTileProcessor.MAX_NUM_CAMS).toFloatArray();
+        final int geometryLength = geometry.length;
+        lib.tp_proc_set_geometry(proc, geometry, geometryLength);
+    }
+    /**
+     * Narrows the full-roll array of the correction vector from double to float and passes it to
+     * the native processor.
+     *
+     * @param cv correction vector; its {@code toFullRollArray()} result is uploaded
+     */
+    @Override
+    public void setExtrinsicsVector(CorrVector cv) {
+        final double[] fullRoll = cv.toFullRollArray();
+        final int count = fullRoll.length;
+        final float[] narrowed = new float[count];
+        int k = 0;
+        for (double value : fullRoll) {
+            narrowed[k++] = (float) value;
+        }
+        lib.tp_proc_set_correction_vector(proc, narrowed, count);
+    }
+    // ---- aberration kernels (same transpose-flatten as GpuQuad, then native upload) ----
+    /**
+     * Uploads the kernels obtained from {@code quadCLT.getCLTKernels()} unless they were already
+     * uploaded by this instance and {@code force} is false.
+     *
+     * @param force upload even when {@code jna_kernels_set} is already true
+     */
+    @Override
+    public void setConvolutionKernels(boolean force) {
+        if (force || !jna_kernels_set) {
+            // The caching decision is made here, so the delegate is always forced.
+            setConvolutionKernels(quadCLT.getCLTKernels(), true);
+        }
+    }
+    /**
+     * Flattens the kernels of every camera into float arrays and uploads them, one camera at a
+     * time, as kernels and kernel offsets. Does nothing when kernels were already uploaded and
+     * {@code force} is false; sets {@code jna_kernels_set} after the last camera.
+     * Array sizes are derived from camera 0 and the same buffers are reused for each camera.
+     *
+     * @param clt_kernels indexed as [cam][color][tileY][tileX][part]; parts 0..3 are read as
+     *                    64-element tiles, part 4 is uploaded as the kernel offsets
+     * @param force       upload even when {@code jna_kernels_set} is already true
+     */
+    @Override
+    public void setConvolutionKernels(double[][][][][][] clt_kernels, boolean force) {
+        if (!force && jna_kernels_set) {
+            return;
+        }
+        final int numKernels = clt_kernels[0][0].length * clt_kernels[0][0][0].length * clt_kernels[0].length;
+        final int kernelLength = numKernels * 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE;
+        final float[] kernelData = new float[kernelLength];
+        final float[] offsetData = new float[numKernels * 8];
+        for (int ncam = 0; ncam < clt_kernels.length; ncam++) {
+            flattenCamKernelsJna(clt_kernels[ncam], kernelData);
+            flattenCamOffsetsJna(clt_kernels[ncam], offsetData);
+            lib.tp_proc_set_kernels(proc, ncam, kernelData, kernelData.length);
+            lib.tp_proc_set_kernel_offsets(proc, ncam, offsetData, offsetData.length);
+        }
+        jna_kernels_set = true;
+    }
+
+    // Writes one camera's parts 0..3 into dst in [tileY][tileX][color][part] order, transposing each 64-element tile as 8x8.
+    private static void flattenCamKernelsJna(double[][][][][] cam, float[] dst) {
+        int pos = 0;
+        for (int ty = 0; ty < cam[0].length; ty++) {
+            for (int tx = 0; tx < cam[0][ty].length; tx++) {
+                for (int col = 0; col < cam.length; col++) {
+                    for (int part = 0; part < 4; part++) {
+                        final double[] src = cam[col][ty][tx][part];
+                        for (int i0 = 0; i0 < 64; i0++) {
+                            // swap the two 3-bit fields of the index: (row, col) -> (col, row)
+                            dst[pos++] = (float) src[((i0 & 7) << 3) + ((i0 >> 3) & 7)];
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    // Writes one camera's part 4 (the offsets) into dst in [tileY][tileX][color] order.
+    private static void flattenCamOffsetsJna(double[][][][][] cam, float[] dst) {
+        int pos = 0;
+        for (int ty = 0; ty < cam[0].length; ty++) {
+            for (int tx = 0; tx < cam[0][ty].length; tx++) {
+                for (int col = 0; col < cam.length; col++) {
+                    for (double value : cam[col][ty][tx][4]) {
+                        dst[pos++] = (float) value;
+                    }
+                }
+            }
+        }
+    }
+    // ---- bayer / source images (combine channels -> one float[] per cam) ----
+    /**
+     * Uploads source images taken from {@code quadCLT}.
+     * With {@code center} set, the center image is read, {@code getResetImageCenter()} is called,
+     * and the channel-combined image (if any) is sent as the center image. Otherwise the
+     * per-camera data from {@code getResetImageData()} is uploaded.
+     *
+     * @param force  not read by this override
+     * @param center selects the center-image path
+     */
+    @Override
+    public void setBayerImages(boolean force, boolean center) {
+        if (!center) {
+            setBayerImages(quadCLT.getResetImageData(), true);
+            return;
+        }
+        final double[][] centerChannels = quadCLT.getImageCenter();
+        quadCLT.getResetImageCenter();
+        final float[] combined = combineChannels(centerChannels);
+        if (combined != null) {
+            lib.tp_proc_set_center_image(proc, combined); // broadcast to all sensors
+        }
+    }
+    /**
+     * Combines the channels of each camera into one float image and uploads it; cameras for
+     * which {@code combineChannels} yields null are skipped.
+     *
+     * @param bayer_data indexed as [cam][channel][pixel]
+     * @param force      not read by this override
+     */
+    @Override
+    public void setBayerImages(double[][][] bayer_data, boolean force) {
+        int ncam = 0;
+        for (double[][] camChannels : bayer_data) {
+            final float[] combined = combineChannels(camChannels);
+            if (combined != null) {
+                lib.tp_proc_set_image(proc, ncam, combined);
+            }
+            ncam++;
+        }
+    }
+    // sum the (1 or 3) split-color channels into one image, as GpuQuad.setBayerImages does
+    private static float[] combineChannels(double[][] chans) {
+        if (chans == null || chans[0] == null) return null;
+        float[] f = new float[chans[0].length];
+        for (int i = 0; i < f.length; i++) { double s = chans[0][i]; for (int j = 1; j < chans.length; j++) s += chans[j][i]; f[i] = (float) s; }
+        return f;
+    }
+    // ---- tasks ----
+    /**
+     * Serializes the tile tasks into one float array and uploads it to the native processor.
+     * Records the task count in {@code num_task_tiles} and ORs 511 into every task's
+     * {@code task} field (the caller's objects are modified) before serializing it.
+     *
+     * @param tile_tasks tasks to upload
+     * @param use_aux    forwarded to {@code TpTask.asFloatArray}
+     * @param verify     not read by this override
+     */
+    @Override
+    public void setTasks(TpTask[] tile_tasks, boolean use_aux, boolean verify) {
+        num_task_tiles = tile_tasks.length;
+        final int floatsPerTask = getTaskSize();
+        final int totalFloats = floatsPerTask * num_task_tiles;
+        final float[] packed = new float[totalFloats];
+        int index = 0;
+        for (TpTask tileTask : tile_tasks) {
+            tileTask.task |= 511;
+            tileTask.asFloatArray(packed, index, use_aux);
+            index++;
+        }
+        lib.tp_proc_set_tasks(proc, packed, num_task_tiles, totalFloats);
+    }
+    // ---- geometry execution (calc_reverse_distortions + rot_derivs + calculate_tiles_offsets) ----
+    /**
+     * Re-uploads the geometry correction and the correction vector of the expanded geometry,
+     * then runs the native geometry step.
+     *
+     * @param uniform_grid passed to the native call as 1 (true) or 0 (false)
+     */
+    @Override
+    public void execSetTilesOffsets(boolean uniform_grid) {
+        setGeometryCorrection(); // gc
+        final CorrVector expandedVector =
+                quadCLT.getGeometryCorrection().expandSensors(GPUTileProcessor.MAX_NUM_CAMS).getCorrVector();
+        setExtrinsicsVector(expandedVector); // cv
+        final int uniformFlag = uniform_grid ? 1 : 0;
+        lib.tp_proc_exec_geometry(proc, uniformFlag);
+    }
+    // ---- direct CLT conversion (+ the fragile no_kernels / use_center_image / erase_clt / ref_scene paths) ----
+    /**
+     * Runs the native direct CLT conversion, first uploading kernels and source images as needed.
+     * Kernels are uploaded only when neither {@code rectilinear} nor {@code no_kernels} is set;
+     * source images are uploaded only when {@code rectilinear} is not set.
+     *
+     * @param ref_scene        passed to the native call as 1 (true) or 0 (false)
+     * @param wh               not read by this override
+     * @param erase_clt        passed through to the native call
+     * @param no_kernels       caller's request to skip the kernels
+     * @param use_center_image selects the center-image path of {@code setBayerImages}
+     */
+    @Override
+    public void execConvertDirect(boolean ref_scene, int[] wh, int erase_clt, boolean no_kernels, boolean use_center_image) {
+        final boolean skip_kernels = rectilinear || no_kernels;
+        if (!skip_kernels) {
+            setConvolutionKernels(false);
+        }
+        if (!rectilinear) {
+            setBayerImages(false, use_center_image);
+        }
+        // Pass the same decision that gated the upload above: with rectilinear set and no_kernels
+        // clear, forwarding no_kernels alone would request kernels this path never uploaded.
+        // NOTE(review): assumes the last native argument means "do not use kernels" - confirm.
+        lib.tp_proc_exec_convert_direct(proc, ref_scene ? 1 : 0, erase_clt, skip_kernels ? 1 : 0);
+    }
+    // ---- inverse CLT -> RBG ----
+    /**
+     * Makes sure the RBG/correlation buffers are set up, then runs the native imclt step with
+     * its second argument set to 1 (apply_lpf).
+     * None of the three parameters is read by this override.
+     */
+    @Override
+    public void execImcltRbgAll(boolean is_mono, boolean ref_scene, int[] wh) {
+        ensureRbgCorr();
+        final int applyLpf = 1;
+        lib.tp_proc_exec_imclt(proc, applyLpf);
+    }
+    /**
+     * Fetches the native RBG buffer of one camera and extracts the inner image region.
+     * The fetched buffer holds {@code num_colors} planes, each {@code DTT_SIZE} wider and taller
+     * than the output image; the copied window starts {@code DTT_SIZE / 2} rows and columns in.
+     *
+     * @param ncam camera index passed to the native call
+     * @return array indexed as [color][pixel], each plane getImageWidth() * getImageHeight() long
+     */
+    @Override
+    public float[][] getRBG(int ncam) {
+        final int dtt = GPUTileProcessor.DTT_SIZE;
+        final int outWidth = getImageWidth();
+        final int outHeight = getImageHeight();
+        final int gpuWidth = outWidth + dtt;
+        final int gpuPlane = gpuWidth * (outHeight + dtt);
+        final float[] packed = new float[num_colors * gpuPlane]; // == TpProc rbg_w x rbg_h (de-pitched)
+        lib.tp_proc_get_rbg(proc, ncam, packed);
+        final float[][] result = new float[num_colors][outWidth * outHeight];
+        final int inset = (dtt / 2) * (gpuWidth + 1); // half a tile down and half a tile right
+        for (int color = 0; color < num_colors; color++) {
+            int src = inset + color * gpuPlane;
+            int dst = 0;
+            for (int row = 0; row < outHeight; row++) {
+                System.arraycopy(packed, src, result[color], dst, outWidth);
+                src += gpuWidth;
+                dst += outWidth;
+            }
+        }
+        return result;
+    }
+    // ---- correlations (execCorr2D_TD / _inter_TD / _combine / _normalize, getCorr2D[Combo]):
+    //      NOT yet overridden — added in the next increment. Inherited base versions touch null JCuda
+    //      buffers, so the selector/test must drive only the convert->imclt->getRBG path until then.
 }