CLAUDE: Step 1 — persistent granular native API (TpProc) for the convert_direct core

Add TpProc: the production-facing persistent instance (buffers allocated once in tp_proc_setup, reused across set/exec/get, freed in tp_proc_destroy) — the surface GpuQuadJna will delegate to. API: tp_proc_create/setup, set_geometry/correction_vector/kernels/kernel_offsets/image/center_image/ tasks, exec_geometry (calc_reverse_distortions+rot_derivs+calculate_tiles_offsets), exec_convert_direct (ref_scene/erase_clt/no_kernels), get_clt, destroy. Includes the fragile convert_direct paths the migration must preserve: no_kernels (skip deconvolution -> kernels_hor/vert=0), use_center_image (broadcast one center image to all sensors), erase_clt (erase_clt_tiles), ref_scene (clt_ref buffer). tp_proc_convert_selftest validates end-to-end on RTX 5060 Ti: standard convert CLT == clt/aux_chnN.clt golden (max|CLT-golden|=0.1085, == Stage 2, num_active=5120); no_kernels path runs with finite output. update_image_gpu pitch is in BYTES (the "in floats" comment is misleading). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Step 1 — persistent granular native API (TpProc) for the convert_direct core
Add TpProc: the production-facing persistent instance (buffers allocated once in tp_proc_setup, reused across set/exec/get, freed in tp_proc_destroy) — the surface GpuQuadJna will delegate to. API: tp_proc_create/setup, set_geometry/correction_vector/kernels/kernel_offsets/image/center_image/ tasks, exec_geometry (calc_reverse_distortions+rot_derivs+calculate_tiles_offsets), exec_convert_direct (ref_scene/erase_clt/no_kernels), get_clt, destroy. Includes the fragile convert_direct paths the migration must preserve: no_kernels (skip deconvolution -> kernels_hor/vert=0), use_center_image (broadcast one center image to all sensors), erase_clt (erase_clt_tiles), ref_scene (clt_ref buffer). tp_proc_convert_selftest validates end-to-end on RTX 5060 Ti: standard convert CLT == clt/aux_chnN.clt golden (max|CLT-golden|=0.1085, == Stage 2, num_active=5120); no_kernels path runs with finite output. update_image_gpu pitch is in BYTES (the "in floats" comment is misleading). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
06c12c4a · Andrey Filippov · 341538c7 · 06c12c4a
Commit 06c12c4a authored Jun 25, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 201 additions and 1 deletion

tp_jna.cpp jna/tp_jna.cpp +201 -1

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -553,4 +553,204 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
  return run_convert_pipeline(m, lwir, data_root, 0,0,1, out_clt_err, nullptr, nullptr, out_tex_err, out_num_active);
 }
-} // extern "C"
+} // extern "C" (module + Stage 0-5 selftests)
+// ============================================================================
+// Step 1: PERSISTENT granular API (TpProc) — the production-facing surface.
+// Buffers allocated once in tp_proc_setup, reused across set/exec/get, freed in
+// tp_proc_destroy. Reuses the proven launch logic; adds the fragile paths the
+// integration must preserve: no_kernels (skip deconvolution, rectilinear/FPN),
+// use_center_image (broadcast 1 center image to all sensors, FPN back-prop),
+// erase_clt, ref_scene (separate CLT buffer for inter-scene corr).
+// See handoffs/2026-06-25_convert_direct-fragile-features.md.
+// ============================================================================
+struct TpProc {
+  TpModule* mod;
+  int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kern_tiles, kern_size, slice;
+  size_t dstride_img;        // image pitch (bytes)
+  float *gc, *cv, *rbr, *rot;
+  std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h;
+  float **gpu_kernels, **gpu_kernel_offsets, **gpu_images, **gpu_clt, **gpu_clt_ref;
+  float *ftasks; int ntiles; int *active, *num_active;
+};
+extern "C" {
+TpProc* tp_proc_create(TpModule* m){
+  g_err[0]=0;
+  if(!m){ seterr("tp_proc_create: null module"); return nullptr; }
+  TpProc* p = new TpProc();
+  p->mod=m; p->ftasks=nullptr; p->ntiles=0; p->active=nullptr; p->num_active=nullptr;
+  p->gpu_kernels=p->gpu_kernel_offsets=p->gpu_images=p->gpu_clt=p->gpu_clt_ref=nullptr;
+  p->gc=p->cv=p->rbr=p->rot=nullptr;
+  return p;
+}
+// Allocate all per-config buffers. kern_tiles = KERNELS_HOR*KERNELS_VERT*num_colors.
+int tp_proc_setup(TpProc* p, int num_cams, int num_colors, int img_w, int img_h, int kernels_hor, int kern_tiles){
+  if(!p){ seterr("tp_proc_setup: null"); return -1; }
+  cuCtxSetCurrent(p->mod->ctx);
+  p->num_cams=num_cams; p->num_colors=num_colors; p->img_w=img_w; p->img_h=img_h;
+  p->kernels_hor=kernels_hor; p->kern_tiles=kern_tiles; p->kern_size=kern_tiles*4*64;
+  p->tilesx=img_w/8; p->tilesy=img_h/8;
+  p->slice = p->tilesy*p->tilesx*num_colors*4*64;
+  const int CE = (int)(sizeof(CltExtra)/sizeof(float));
+  p->kernels_h.assign(num_cams,nullptr); p->offs_h.assign(num_cams,nullptr);
+  p->images_h.assign(num_cams,nullptr);  p->clt_h.assign(num_cams,nullptr); p->clt_ref_h.assign(num_cams,nullptr);
+  for(int c=0;c<num_cams;c++){
+    p->kernels_h[c]=alloc_kernel_gpu(p->kern_size);
+    p->offs_h[c]=alloc_kernel_gpu(kern_tiles*CE);
+    p->images_h[c]=alloc_image_gpu(&p->dstride_img, img_w, img_h);
+    p->clt_h[c]=alloc_kernel_gpu(p->slice);
+    p->clt_ref_h[c]=alloc_kernel_gpu(p->slice);
+  }
+  p->gpu_kernels=copyalloc_pointers_gpu(p->kernels_h.data(),num_cams);
+  p->gpu_kernel_offsets=copyalloc_pointers_gpu(p->offs_h.data(),num_cams);
+  p->gpu_images=copyalloc_pointers_gpu(p->images_h.data(),num_cams);
+  p->gpu_clt=copyalloc_pointers_gpu(p->clt_h.data(),num_cams);
+  p->gpu_clt_ref=copyalloc_pointers_gpu(p->clt_ref_h.data(),num_cams);
+  cudaMalloc((void**)&p->active,(size_t)p->tilesx*p->tilesy*sizeof(int));
+  cudaMalloc((void**)&p->num_active,sizeof(int));
+  p->gc=alloc_kernel_gpu(512); p->cv=alloc_kernel_gpu(512);
+  p->rbr=alloc_kernel_gpu(5001); p->rot=alloc_kernel_gpu(5*16*3*3);
+  return 0;
+}
+int tp_proc_set_geometry(TpProc* p, const float* fgc, int n){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
+  return cudaMemcpy(p->gc,fgc,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
+int tp_proc_set_correction_vector(TpProc* p, const float* fcv, int n){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
+  return cudaMemcpy(p->cv,fcv,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
+int tp_proc_set_kernels(TpProc* p, int cam, const float* d, int n){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
+  return cudaMemcpy(p->kernels_h[cam],d,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
+int tp_proc_set_kernel_offsets(TpProc* p, int cam, const float* d, int n){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
+  return cudaMemcpy(p->offs_h[cam],d,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
+// pitched image upload (reuses harness update_image_gpu: dstride in floats)
+int tp_proc_set_image(TpProc* p, int cam, const float* d){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
+  update_image_gpu((float*)d, p->images_h[cam], p->dstride_img, p->img_w, p->img_h); return 0; } // dstride in BYTES (cudaMemcpy2D dpitch)
+// use_center_image: broadcast one center image to all sensors (FPN back-prop mode)
+int tp_proc_set_center_image(TpProc* p, const float* d){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
+  for(int c=0;c<p->num_cams;c++) update_image_gpu((float*)d, p->images_h[c], p->dstride_img, p->img_w, p->img_h); return 0; }
+int tp_proc_set_tasks(TpProc* p, const float* ftasks, int ntiles, int total_floats){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
+  if(p->ftasks) cudaFree(p->ftasks); cudaMalloc((void**)&p->ftasks,(size_t)total_floats*sizeof(float));
+  p->ntiles=ntiles; return cudaMemcpy(p->ftasks,ftasks,(size_t)total_floats*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
+// geometry: calc_reverse_distortions + rot_derivs + (uniform_grid) calculate_tiles_offsets
+int tp_proc_exec_geometry(TpProc* p, int uniform_grid){
+  if(!p)return -1; cuCtxSetCurrent(p->mod->ctx); CUresult cr; const char* es;
+  CUfunction f_rbr=getfun(p->mod,"calcReverseDistortionTable"), f_rot=getfun(p->mod,"calc_rot_deriv"), f_off=getfun(p->mod,"calculate_tiles_offsets");
+  if(!f_rbr||!f_rot||!f_off){ seterr("geom kernels missing"); return -2; }
+  #define LA(fn,gx,gy,gz,bx,by,bz,...) do{ void* a[]={__VA_ARGS__}; cuCtxSynchronize(); \
+     cr=cuLaunchKernel(fn,gx,gy,gz,bx,by,bz,0,nullptr,a,nullptr); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch %s ->%d(%s)",#fn,cr,es);return -3;} \
+     cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("sync %s ->%d(%s)",#fn,cr,es);return -4;} }while(0)
+  int mc16=16, nc=p->num_cams, ug=uniform_grid, nt=p->ntiles;
+  LA(f_rbr, mc16,1,1, 3,3,3, &p->gc,&p->rbr);
+  LA(f_rot, nc,1,1, 3,3,3, &nc,&p->cv,&p->rot);
+  LA(f_off, 1,1,1, 1,1,1, &ug,&nc,&p->ftasks,&nt,&p->gc,&p->cv,&p->rbr,&p->rot);
+  #undef LA
+  return 0;
+}
+// convert_direct with the fragile paths. no_kernels: skip deconvolution; use_center_image handled at
+// set time; ref_scene: write to clt_ref; erase_clt: -1 none / 0 zero / 1 NaN (erase_clt_tiles first).
+int tp_proc_exec_convert_direct(TpProc* p, int ref_scene, int erase_clt, int no_kernels){
+  if(!p)return -1; cuCtxSetCurrent(p->mod->ctx); CUresult cr; const char* es;
+  CUfunction f_cd=getfun(p->mod,"convert_direct"), f_er=getfun(p->mod,"erase_clt_tiles");
+  if(!f_cd){ seterr("convert_direct missing"); return -2; }
+  float** clt_sel = ref_scene ? p->gpu_clt_ref : p->gpu_clt;
+  if(erase_clt>=0 && f_er){
+    float fill = (erase_clt>0)? NAN : 0.0f; int nc=p->num_cams, ncol=p->num_colors, tx=p->tilesx, ty=p->tilesy;
+    void* a[]={ &nc,&ncol,&tx,&ty,&clt_sel,&fill }; cuCtxSynchronize();
+    cr=cuLaunchKernel(f_er,1,1,1,1,1,1,0,nullptr,a,nullptr);
+    if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch erase_clt_tiles ->%d(%s)",cr,es);return -3;}
+    cuCtxSynchronize();
+  }
+  int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, lpf=0, ww=p->img_w, wh=p->img_h,
+      kh=0, kv=no_kernels?0:p->kernels_hor, tx=p->tilesx;     // standard: (0, kernels_hor); no_kernels: (0,0)
+  size_t dstride=p->dstride_img/sizeof(float);
+  void* a[]={ &nc,&ncol,&p->gpu_kernel_offsets,&p->gpu_kernels,&p->gpu_images,&p->ftasks,&clt_sel,
+              &dstride,&ntsk,&lpf,&ww,&wh,&kh,&kv,&p->active,&p->num_active,&tx };
+  cuCtxSynchronize();
+  cr=cuLaunchKernel(f_cd,1,1,1,1,1,1,0,nullptr,a,nullptr);
+  if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch convert_direct ->%d(%s)",cr,es);return -4;}
+  cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("sync convert_direct ->%d(%s)",cr,es);return -5;}
+  return 0;
+}
+int tp_proc_get_clt(TpProc* p, int cam, int ref_scene, float* out){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
+  float* src = (ref_scene? p->clt_ref_h:p->clt_h)[cam];
+  return cudaMemcpy(out,src,(size_t)p->slice*sizeof(float),cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; }
+void tp_proc_destroy(TpProc* p); // fwd
+// Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
+// clt/aux_chnN.clt golden (== Stage 2). Also smoke-tests no_kernels (runs, finite output).
+int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
+                             double* out_clt_err, double* out_nokern_max, int* out_num_active){
+  g_err[0]=0;
+  if(!m){ seterr("tp_proc_convert_selftest: null module"); return -1; }
+  if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed",data_root); return -2; }
+  TpParams P((int)lwir); TpPaths PA((int)lwir);
+  TpProc* p = tp_proc_create(m);
+  if(!p) return -3;
+  if(tp_proc_setup(p, P.num_cams, P.num_colors, P.img_width, P.img_height, P.kernels_hor, P.kern_tiles)) { tp_proc_destroy(p); return -4; }
+  const int NC=P.num_cams;
+  float* hbuf=(float*)malloc((size_t)P.kern_size*sizeof(float));
+  // geometry
+  int n_gc=0,n_cv=0; float* fgc=readAllFloatsFromFile(PA.geometry_correction_file,&n_gc);
+  float* fcv=readAllFloatsFromFile(PA.correction_vector_file,&n_cv);
+  tp_proc_set_geometry(p,fgc,n_gc); tp_proc_set_correction_vector(p,fcv,n_cv);
+  // kernels + offsets + images
+  const int CE=(int)(sizeof(CltExtra)/sizeof(float));
+  for(int c=0;c<NC;c++){
+    readFloatsFromFile(hbuf, PA.kernel_file[c]);      tp_proc_set_kernels(p,c,hbuf,P.kern_size);
+    readFloatsFromFile(hbuf, PA.kernel_offs_file[c]); tp_proc_set_kernel_offsets(p,c,hbuf,P.kern_tiles*CE);
+    readFloatsFromFile(hbuf, PA.image_files[c]);      tp_proc_set_image(p,c,hbuf);
+  }
+  // tasks (TpHostGpu::setTasks, target_disparity=0, scale=0)
+  std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
+  for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
+  std::vector<float> ftask((size_t)P.tp_tasks_size*P.task_size, 0.0f);
+  for(int ty=0;ty<P.tilesy;ty++) for(int tx=0;tx<P.tilesx;tx++){
+    int nt=ty*P.tilesx+tx, task=(1<<P.task_inter_en)|(1<<P.task_corr_en)|(1<<P.task_text_en), txy=tx+(ty<<16);
+    float* tp=ftask.data()+(size_t)P.task_size*nt;
+    tp[P.tp_task_task_offset]=*(float*)&task; tp[P.tp_task_txy_offset]=*(float*)&txy;
+    tp[P.tp_task_disparity_offset]=0.0f; tp[P.tp_task_scale_offset]=0.0f;
+    float* q=tp+P.tp_task_xy_offset;
+    for(int c=0;c<NC;c++){ *q++=coords[(size_t)c*P.tp_tasks_size*2+nt*2+0]; *q++=coords[(size_t)c*P.tp_tasks_size*2+nt*2+1]; }
+  }
+  tp_proc_set_tasks(p, ftask.data(), P.tp_tasks_size, P.tp_tasks_size*P.task_size);
+  // geometry + convert (standard, with kernels)
+  tp_proc_exec_geometry(p, 1);
+  if(tp_proc_exec_convert_direct(p, 0, -1, 0)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
+  if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost);
+  double clt_err=0; std::vector<float> got(p->slice), gold(p->slice);
+  for(int c=0;c<NC;c++){ readFloatsFromFile(gold.data(), PA.ports_clt_file[c]); tp_proc_get_clt(p,c,0,got.data());
+    for(int i=0;i<p->slice;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>clt_err)clt_err=e; } }
+  if(out_clt_err) *out_clt_err=clt_err;
+  // no_kernels smoke test: re-run convert into clt (no_kernels=1), check finite + how much it differs
+  double nokern_max=0;
+  if(tp_proc_exec_convert_direct(p, 0, 0, 1)==0){
+    bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data());
+      for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } }
+    if(!finite) nokern_max=-1; // signal non-finite
+  } else nokern_max=-2;
+  if(out_nokern_max) *out_nokern_max=nokern_max;
+  free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p);
+  return 0;
+}
+void tp_proc_destroy(TpProc* p){
+  if(!p)return; if(p->mod) cuCtxSetCurrent(p->mod->ctx);
+  for(int c=0;c<p->num_cams;c++){ if(c<(int)p->kernels_h.size())cudaFree(p->kernels_h[c]); if(c<(int)p->offs_h.size())cudaFree(p->offs_h[c]);
+    if(c<(int)p->images_h.size())cudaFree(p->images_h[c]); if(c<(int)p->clt_h.size())cudaFree(p->clt_h[c]); if(c<(int)p->clt_ref_h.size())cudaFree(p->clt_ref_h[c]); }
+  cudaFree(p->gpu_kernels); cudaFree(p->gpu_kernel_offsets); cudaFree(p->gpu_images); cudaFree(p->gpu_clt); cudaFree(p->gpu_clt_ref);
+  cudaFree(p->active); cudaFree(p->num_active); if(p->ftasks)cudaFree(p->ftasks);
+  cudaFree(p->gc); cudaFree(p->cv); cudaFree(p->rbr); cudaFree(p->rot);
+  delete p;
+}
+} // extern "C" (TpProc)