CLAUDE: Stage 1 — native TpInstance geometry path (calc_reverse_distortions + rot_derivs)

Add TpInstance to the JNA shim: device buffers (gpu_geometry_correction, gpu_rByRDist, gpu_rot_deriv, gpu_correction_vector) + setters (HtoD), the two pure-geometry launches (calcReverseDistortionTable {16,1,1}/{3,3,3}, calc_rot_deriv {num_cams,1,1}/{3,3,3}), and readback getters. Driver-API cuLaunchKernel against the NVRTC module (mirrors GpuQuad.execCalcReverseDistortions / execRotDerivs, no JCuda). build_lib.sh builds libtileproc.so. Validated via Java->JNA against tile_processor_gpu/clt reference data on the RTX 5060 Ti: rByRDist == clt/*.rbyrdist to ~1e-7 (aux 16-cam and main), rot_deriv rows orthogonal to ~1e-10 (scaled-rotation structure, det~zoom^3). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Stage 1 — native TpInstance geometry path (calc_reverse_distortions + rot_derivs)
Add TpInstance to the JNA shim: device buffers (gpu_geometry_correction, gpu_rByRDist, gpu_rot_deriv, gpu_correction_vector) + setters (HtoD), the two pure-geometry launches (calcReverseDistortionTable {16,1,1}/{3,3,3}, calc_rot_deriv {num_cams,1,1}/{3,3,3}), and readback getters. Driver-API cuLaunchKernel against the NVRTC module (mirrors GpuQuad.execCalcReverseDistortions / execRotDerivs, no JCuda). build_lib.sh builds libtileproc.so. Validated via Java->JNA against tile_processor_gpu/clt reference data on the RTX 5060 Ti: rByRDist == clt/*.rbyrdist to ~1e-7 (aux 16-cam and main), rot_deriv rows orthogonal to ~1e-10 (scaled-rotation structure, det~zoom^3). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
a5b7c269 · Andrey Filippov · eec885a0 · a5b7c269 · a5b7c269
Commit a5b7c269 authored Jun 25, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 134 additions and 0 deletions

build_lib.sh jna/build_lib.sh +9 -0

tp_jna.cpp jna/tp_jna.cpp +125 -0

No files found.
--- a/jna/build_lib.sh
+++ b/jna/build_lib.sh
+#!/usr/bin/env bash
+# Build libtileproc.so (JNA-callable native tile-processor shim). By Claude on 2026-06-25.
+set -e
+cd "$(dirname "$0")"
+CUDA="${CUDA_HOME:-/usr/local/cuda}"
+g++ -std=c++14 -shared -fPIC tp_jna.cpp -o libtileproc.so \
+    -I"$CUDA/include" -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \
+    -lnvrtc -lcuda
+echo "built ./libtileproc.so"
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -36,8 +36,32 @@ static const int N_KERNELS = sizeof(KERNELS)/sizeof(KERNELS[0]);

 struct TpModule { CUcontext ctx; CUmodule mod; int nfun; };

+// Stage-1 geometry buffer sizes (floats). Must match GpuQuad.java alloc:
+//   gpu_geometry_correction = GeometryCorrection.arrayLength(16) (== sizeof(struct gc)/4 == 165),
+//   gpu_rByRDist            = RBYRDIST_LEN (5001),
+//   gpu_rot_deriv           = 5*MAX_NUM_CAMS*3*3 (720),
+//   gpu_correction_vector   = CORR_VECTOR_MAX_LENGTH (cv.toFullRollArray()).
+// gc/cv allocated generously (kernel reads only the struct-sized prefix); over-alloc is harmless.
+#define RBYRDIST_LEN_C   5001
+#define ROT_DERIV_FLOATS 720
+#define GC_FLOATS_MAX    512
+#define CV_FLOATS_MAX    512
+
+// One per-camera-config GPU instance: device buffers + the module it launches against.
+// Stage 1 covers the geometry buffers only; ftasks/clt/images follow in later stages.
+struct TpInstance {
+  TpModule*   mod;
+  CUdeviceptr gpu_geometry_correction;
+  CUdeviceptr gpu_rByRDist;
+  CUdeviceptr gpu_rot_deriv;
+  CUdeviceptr gpu_correction_vector;
+};
+
 static std::string readFile(const std::string& p){ std::ifstream f(p); if(!f) return std::string(); std::stringstream ss; ss<<f.rdbuf(); return ss.str(); }

+// Resolve a kernel by name from the module (cheap; not a hot path in Stage 1).
+static CUfunction getfun(TpModule* m, const char* name){ CUfunction f=nullptr; if(m && m->mod) cuModuleGetFunction(&f, m->mod, name); return f; }
+
 extern "C" {

 TpModule* tp_create_module(const char* srcdir, const char* devrt){
@@ -93,4 +117,105 @@ int  tp_module_num_functions(TpModule* m){ return m ? m->nfun : -1; }
 const char* tp_last_error(){ return g_err; }
 void tp_destroy_module(TpModule* m){ if(m){ if(m->mod) cuModuleUnload(m->mod); if(m->ctx) cuCtxDestroy(m->ctx); delete m; } }

+// ---- Stage 1: instance + geometry path (calcReverseDistortionTable, calc_rot_deriv) ----
+// Mirrors GpuQuad.execCalcReverseDistortions / execRotDerivs (driver-API cuLaunchKernel),
+// but native. Validated by reading rByRDist back and comparing to clt/*.rbyrdist reference.
+
+void tp_destroy_instance(TpInstance* inst); // fwd (cleanup on partial-alloc failure)
+
+TpInstance* tp_create_instance(TpModule* m){
+  g_err[0]=0;
+  if(!m){ seterr("tp_create_instance: null module"); return nullptr; }
+  CUresult cr; const char* es;
+  cuCtxSetCurrent(m->ctx);
+  TpInstance* inst = new TpInstance{m,0,0,0,0};
+  #define ALLOCF(p,nf) do{ cr=cuMemAlloc(&inst->p,(size_t)(nf)*sizeof(float)); \
+      if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("cuMemAlloc(%s,%d) -> %d (%s)",#p,(int)(nf),cr,es); \
+        tp_destroy_instance(inst); return nullptr; } }while(0)
+  ALLOCF(gpu_geometry_correction, GC_FLOATS_MAX);
+  ALLOCF(gpu_rByRDist,            RBYRDIST_LEN_C);
+  ALLOCF(gpu_rot_deriv,           ROT_DERIV_FLOATS);
+  ALLOCF(gpu_correction_vector,   CV_FLOATS_MAX);
+  #undef ALLOCF
+  return inst;
+}
+
+// Upload host gc floats (gc.expandSensors(16).toFloatArray()) to gpu_geometry_correction.
+int tp_set_geometry_correction(TpInstance* inst, const float* fgc, int n){
+  if(!inst){ seterr("tp_set_geometry_correction: null instance"); return -1; }
+  if(n<0 || n>GC_FLOATS_MAX){ seterr("tp_set_geometry_correction: n=%d out of [0,%d]",n,GC_FLOATS_MAX); return -2; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUresult cr=cuMemcpyHtoD(inst->gpu_geometry_correction, fgc, (size_t)n*sizeof(float));
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("HtoD gc -> %d (%s)",cr,es); return -3; }
+  return 0;
+}
+
+// Upload host correction-vector floats (cv.toFullRollArray()) to gpu_correction_vector.
+int tp_set_correction_vector(TpInstance* inst, const float* fcv, int n){
+  if(!inst){ seterr("tp_set_correction_vector: null instance"); return -1; }
+  if(n<0 || n>CV_FLOATS_MAX){ seterr("tp_set_correction_vector: n=%d out of [0,%d]",n,CV_FLOATS_MAX); return -2; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUresult cr=cuMemcpyHtoD(inst->gpu_correction_vector, fcv, (size_t)n*sizeof(float));
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("HtoD cv -> %d (%s)",cr,es); return -3; }
+  return 0;
+}
+
+// calcReverseDistortionTable<<<{16,1,1},{3,3,3}>>>(gpu_geometry_correction, gpu_rByRDist)
+int tp_exec_calc_reverse_distortions(TpInstance* inst){
+  if(!inst){ seterr("tp_exec_calc_reverse_distortions: null instance"); return -1; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUfunction f=getfun(inst->mod,"calcReverseDistortionTable");
+  if(!f){ seterr("no kernel calcReverseDistortionTable"); return -2; }
+  void* args[] = { &inst->gpu_geometry_correction, &inst->gpu_rByRDist };
+  cuCtxSynchronize();
+  CUresult cr=cuLaunchKernel(f, 16,1,1, 3,3,3, 0,nullptr, args,nullptr);
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("launch calcReverseDistortionTable -> %d (%s)",cr,es); return -3; }
+  cr=cuCtxSynchronize();
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("sync calcReverseDistortionTable -> %d (%s)",cr,es); return -4; }
+  return 0;
+}
+
+// calc_rot_deriv<<<{num_cams,1,1},{3,3,3}>>>(num_cams, gpu_correction_vector, gpu_rot_deriv)
+int tp_exec_rot_derivs(TpInstance* inst, int num_cams){
+  if(!inst){ seterr("tp_exec_rot_derivs: null instance"); return -1; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUfunction f=getfun(inst->mod,"calc_rot_deriv");
+  if(!f){ seterr("no kernel calc_rot_deriv"); return -2; }
+  void* args[] = { &num_cams, &inst->gpu_correction_vector, &inst->gpu_rot_deriv };
+  cuCtxSynchronize();
+  CUresult cr=cuLaunchKernel(f, num_cams,1,1, 3,3,3, 0,nullptr, args,nullptr);
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("launch calc_rot_deriv -> %d (%s)",cr,es); return -3; }
+  cr=cuCtxSynchronize();
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("sync calc_rot_deriv -> %d (%s)",cr,es); return -4; }
+  return 0;
+}
+
+// Read rByRDist back (RBYRDIST_LEN floats) for validation against clt/*.rbyrdist.
+int tp_get_rbyrdist(TpInstance* inst, float* out){
+  if(!inst){ seterr("tp_get_rbyrdist: null instance"); return -1; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUresult cr=cuMemcpyDtoH(out, inst->gpu_rByRDist, (size_t)RBYRDIST_LEN_C*sizeof(float));
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("DtoH rByRDist -> %d (%s)",cr,es); return -2; }
+  return 0;
+}
+
+// Read rot_deriv back (5*16*3*3 floats) for sanity/validation.
+int tp_get_rot_deriv(TpInstance* inst, float* out){
+  if(!inst){ seterr("tp_get_rot_deriv: null instance"); return -1; }
+  cuCtxSetCurrent(inst->mod->ctx);
+  CUresult cr=cuMemcpyDtoH(out, inst->gpu_rot_deriv, (size_t)ROT_DERIV_FLOATS*sizeof(float));
+  if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("DtoH rot_deriv -> %d (%s)",cr,es); return -2; }
+  return 0;
+}
+
+void tp_destroy_instance(TpInstance* inst){
+  if(!inst) return;
+  if(inst->mod) cuCtxSetCurrent(inst->mod->ctx);
+  if(inst->gpu_geometry_correction) cuMemFree(inst->gpu_geometry_correction);
+  if(inst->gpu_rByRDist)            cuMemFree(inst->gpu_rByRDist);
+  if(inst->gpu_rot_deriv)           cuMemFree(inst->gpu_rot_deriv);
+  if(inst->gpu_correction_vector)   cuMemFree(inst->gpu_correction_vector);
+  delete inst;
+}
+
 } // extern "C"