CLAUDE: tp_dnn GPU_CHUNK sub-batching - fix full-res OOM

tpdnn_infer processed the whole `count`-scene request in one shot; at full res (512x640) a 64-scene field tensor is ~10GB (64x124x512x640) + decode p ~10GB -> CUDA OOM on a 16GB card. The 64x80 synthetic smoke didn't expose it. Now loops in GPU_CHUNK-sized sub-batches (== infer_server's GPU_CHUNK), writing each chunk straight to the host output buffers so GPU tensors stay bounded (~3-4GB/chunk at CHUNK=8). L2 hidden/age/sprev carry across chunks; reset only at the first chunk when l2_reset (matches the server). Env TPDNN_GPU_CHUNK (default 8). Validated: parity vs server oracle still EXACT (0.0) at CHUNK=8 (8+4) and CHUNK=4 (4+4+4) - L2 carry across chunk boundaries is correct. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: tp_dnn GPU_CHUNK sub-batching - fix full-res OOM
tpdnn_infer processed the whole `count`-scene request in one shot; at full res (512x640) a 64-scene field tensor is ~10GB (64x124x512x640) + decode p ~10GB -> CUDA OOM on a 16GB card. The 64x80 synthetic smoke didn't expose it. Now loops in GPU_CHUNK-sized sub-batches (== infer_server's GPU_CHUNK), writing each chunk straight to the host output buffers so GPU tensors stay bounded (~3-4GB/chunk at CHUNK=8). L2 hidden/age/sprev carry across chunks; reset only at the first chunk when l2_reset (matches the server). Env TPDNN_GPU_CHUNK (default 8). Validated: parity vs server oracle still EXACT (0.0) at CHUNK=8 (8+4) and CHUNK=4 (4+4+4) - L2 carry across chunk boundaries is correct. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
6c9c2486 · Andrey Filippov · ea9c117a · 6c9c2486
Commit 6c9c2486 authored Jun 27, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 67 additions and 56 deletions

tp_dnn.cpp jna/tp_dnn.cpp +67 -56

No files found.
--- a/jna/tp_dnn.cpp
+++ b/jna/tp_dnn.cpp
@@ -15,6 +15,8 @@
 #include <vector>
 #include <string>
 #include <cstring>
+#include <cstdlib>
+#include <algorithm>
 #include <cmath>
 using torch::Tensor;
@@ -145,27 +147,36 @@ int tpdnn_infer(void* ctx, int level, int start, int count, int stride,
    int N = c->N;
    int vdim = 2 * c->vr + 1, nvel = vdim * vdim;
-    // newest-first window stack [count,N,H,W] (channel 0 = newest), matching the Java order.
+    // GPU_CHUNK sub-batching (== server's GPU_CHUNK loop): process the `count` scenes in chunks so the
+    // big GPU tensors (field [b,C,H,W], decode p [b,nvel,H,W]) stay bounded -- at full res a 64-scene
+    // single shot is ~10GB and OOMs a 16GB card. L2 hidden/age/sprev carry across chunks; reset only at
+    // the FIRST chunk when l2_reset (matches infer_server). Env TPDNN_GPU_CHUNK (default 8). By Claude 06/27/2026.
+    static int CHUNK = -1;
+    if (CHUNK < 0) { const char* e = std::getenv("TPDNN_GPU_CHUNK"); CHUNK = e ? std::atoi(e) : 8; if (CHUNK < 1) CHUNK = 8; }
+    int nch = use_l2 ? 6 : 5;
+    for (int c0 = 0; c0 < count; c0 += CHUNK) {
+        int b = std::min(CHUNK, count - c0);
+        // newest-first window stack [b,N,H,W] (channel 0 = newest), matching the Java order.
        std::vector<Tensor> wl;
-    wl.reserve(count);
+        wl.reserve(b);
-    for (int j = 0; j < count; j++) {
+        for (int j = 0; j < b; j++) {
-        int newest = start + j * stride;
+            int newest = start + (c0 + j) * stride;
            wl.push_back(lev.slice(0, newest - N + 1, newest + 1).flip(0));  // [N,H,W]
        }
-    Tensor wins  = torch::stack(wl, 0);                                  // [count,N,H,W]
+        Tensor wins  = torch::stack(wl, 0);                                  // [b,N,H,W]
-    Tensor field = shift_stitch(c->l1, wins, c->P, c->out_ch);          // [count,C,H,W]
+        Tensor field = shift_stitch(c->l1, wins, c->P, c->out_ch);          // [b,C,H,W]
        auto dec = decode(field, c->vr, rx, ry, rw, rh, rmax);              // ghostbusted
-    Tensor o5 = dec.first, rf = dec.second;
+        Tensor o5 = dec.first, rf = dec.second;                             // [b,5,H,W], [b,rh,rw,nvel]
-    int nch = 5;
        if (use_l2) {
            Tensor ong = decode(field, c->vr, rx, ry, rw, rh, 0.0).first;   // no ghostbuster (L2 gets full field)
            Tensor l2in = torch::stack({ong.select(1, 2),
                                        ong.select(1, 3) / VEL_DECIMATE,
-                                    ong.select(1, 4) / VEL_DECIMATE}, 1); // [count,3,H,W]
+                                        ong.select(1, 4) / VEL_DECIMATE}, 1); // [b,3,H,W]
            l2in = torch::nan_to_num(l2in, 0.0, 0.0, 0.0);
            int Hf = l2in.size(2), Wf = l2in.size(3);
-        if (!c->l2_state_init || c->h_l2.size(2) != Hf || c->h_l2.size(3) != Wf || l2_reset) {
+            if (!c->l2_state_init || c->h_l2.size(2) != Hf || c->h_l2.size(3) != Wf || (l2_reset && c0 == 0)) {
                auto o = field.options();
                c->h_l2     = torch::zeros({1, c->l2_ch, Hf, Wf}, o);
                c->age_l2   = torch::zeros({1, 1, Hf, Wf}, o);
@@ -173,8 +184,8 @@ int tpdnn_infer(void* ctx, int level, int start, int count, int stride,
                c->l2_state_init = true;
            }
            std::vector<Tensor> dets, vxs, vys, ages;
-        dets.reserve(count); vxs.reserve(count); vys.reserve(count); ages.reserve(count);
+            dets.reserve(b); vxs.reserve(b); vys.reserve(b); ages.reserve(b);
-        for (int j = 0; j < count; j++) {
+            for (int j = 0; j < b; j++) {
                auto out = c->l2.forward({l2in.slice(0, j, j + 1), c->h_l2}).toTuple();
                c->h_l2     = out->elements()[0].toTensor();                 // h_new
                Tensor dlog = out->elements()[1].toTensor();                 // [1,1,H,W] raw det logit
@@ -193,17 +204,17 @@ int tpdnn_infer(void* ctx, int level, int start, int count, int stride,
                vxs.push_back(vel.select(1, 0));
                vys.push_back(vel.select(1, 1));
            }
-        Tensor l2vx = torch::cat(vxs, 0) * (float)VEL_DECIMATE;          // [count,H,W]
+            Tensor l2vx = torch::cat(vxs, 0) * (float)VEL_DECIMATE;          // [b,H,W]
            Tensor l2vy = torch::cat(vys, 0) * (float)VEL_DECIMATE;
            o5 = torch::stack({o5.select(1, 0), o5.select(1, 1),            // keep L1 dx,dy
-                           torch::cat(dets, 0), l2vx, l2vy, torch::cat(ages, 0)}, 1); // [count,6,H,W]
+                               torch::cat(dets, 0), l2vx, l2vy, torch::cat(ages, 0)}, 1); // [b,6,H,W]
-        nch = 6;
        }
-    Tensor o5c = o5.contiguous().to(torch::kCPU);                        // [count,nch,H,W]
+        Tensor o5c = o5.contiguous().to(torch::kCPU);                        // [b,nch,H,W]
-    Tensor rfc = rf.contiguous().to(torch::kCPU);                        // [count,rh,rw,nvel]
+        Tensor rfc = rf.contiguous().to(torch::kCPU);                        // [b,rh,rw,nvel]
-    std::memcpy(out_o5,  o5c.data_ptr<float>(), (size_t)count * nch * H * W * sizeof(float));
+        std::memcpy(out_o5  + (size_t)c0 * nch * H * W,    o5c.data_ptr<float>(), (size_t)b * nch * H * W * sizeof(float));
-    std::memcpy(out_roi, rfc.data_ptr<float>(), (size_t)count * rh * rw * nvel * sizeof(float));
+        std::memcpy(out_roi + (size_t)c0 * rh * rw * nvel, rfc.data_ptr<float>(), (size_t)b * rh * rw * nvel * sizeof(float));
+    }
    return nch;
 }