CLAUDE: tp_proc_get_corr_indices/combo_indices/corr_td + erase_corrs (oracle TD-corr readback)

Native readback of gpu_corr_indices / gpu_corrs_combo_indices / gpu_corrs_td (DtoH) and cudaMemset2D erase of gpu_corrs, for GpuQuadJna's getCorrIndices/getCorrComboIndices/getCorrTdData/eraseGpuCorrs (oracle TD-correlation path). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: tp_proc_get_corr_indices/combo_indices/corr_td + erase_corrs (oracle TD-corr readback)
Native readback of gpu_corr_indices / gpu_corrs_combo_indices / gpu_corrs_td (DtoH) and cudaMemset2D erase of gpu_corrs, for GpuQuadJna's getCorrIndices/getCorrComboIndices/getCorrTdData/eraseGpuCorrs (oracle TD-correlation path). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
33827862 · Andrey Filippov · 0399a26d · 33827862
Commit 33827862 authored Jun 26, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 15 additions and 0 deletions

tp_jna.cpp jna/tp_jna.cpp +15 -0

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -859,6 +859,21 @@ int tp_proc_get_corr2d(TpProc* p, float* out, int corr_rad){ if(!p||!p->have_cor
 int tp_proc_num_corr_tiles(TpProc* p){ return p?p->last_num_corr_tiles:-1; } // returns p->last_num_corr_tiles, or -1 when p is NULL
 int tp_proc_num_corr_combo(TpProc* p){ return p?p->last_num_corr_combo:-1; } // returns p->last_num_corr_combo, or -1 when p is NULL
+// ---- oracle TD-correlation readback / erase (getCorrIndices/getCorrTdData/getCorrComboIndices/eraseGpuCorrs) ----
+int tp_proc_get_corr_indices(TpProc* p, int* out, int n){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx); // -1 if p is NULL or p->have_corr is false; the cuCtxSetCurrent result is ignored
+  return cudaMemcpy(out, p->gpu_corr_indices, (size_t)n*sizeof(int), cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; } // copies n ints from p->gpu_corr_indices to out (device to host); 0 on cudaSuccess, else -2. NOTE(review): out and n are not validated here - caller must pass a buffer of at least n ints and n >= 0
+int tp_proc_get_corr_combo_indices(TpProc* p, int* out, int n){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx); // -1 if p is NULL or p->have_corr is false; the cuCtxSetCurrent result is ignored
+  return cudaMemcpy(out, p->gpu_corrs_combo_indices, (size_t)n*sizeof(int), cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; } // copies n ints from p->gpu_corrs_combo_indices to out (device to host); 0 on cudaSuccess, else -2. NOTE(review): out and n are not validated here - caller must pass a buffer of at least n ints and n >= 0
+// De-pitch gpu_corrs_td into a packed host buffer: last_num_corr_tiles rows of 4*dtt^2=256 floats (TD per-pair, one row per tile). Returns -1 if p is NULL or !p->have_corr, 0 on cudaSuccess, -2 otherwise. NOTE(review): out is not checked; presumably it must hold 256*last_num_corr_tiles floats - confirm at the caller.
+int tp_proc_get_corr_td(TpProc* p, float* out){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx); // the cuCtxSetCurrent result is ignored
+  int w=4*8*8; // floats per row (256)
+  return cudaMemcpy2D(out,(size_t)w*sizeof(float), p->gpu_corrs_td, p->dstride_corr_td, // destination pitch = one packed row; source pitch = p->dstride_corr_td (assumed to be in bytes, as cudaMemcpy2D expects - TODO confirm)
+                      (size_t)w*sizeof(float), p->last_num_corr_tiles, cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; } // row width in bytes, then row count = p->last_num_corr_tiles
+// Zero the per-pair pixel-domain correlation buffer gpu_corrs (before accumulation) with cudaMemset2D. Returns -1 if p is NULL or !p->have_corr, 0 on cudaSuccess, -2 otherwise.
+int tp_proc_erase_corrs(TpProc* p){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx); // the cuCtxSetCurrent result is ignored
+  return cudaMemset2D(p->gpu_corrs, p->dstride_corr, 0, (size_t)p->corr_length*sizeof(float), // pitch = p->dstride_corr (assumed to be in bytes - TODO confirm); fill value 0; row width = corr_length floats in bytes
+                      (size_t)p->num_pairs*p->tilesx*p->tilesy)==cudaSuccess?0:-2; } // row count = num_pairs*tilesx*tilesy, multiplied in size_t because of the leading cast
 void tp_proc_destroy(TpProc* p); // fwd
 // Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to