CLAUDE: FIX convert_direct deconvolution in JNA — pass (kernels_hor, kernels_vert), not (0, *)

THE production-mismatch bug (RMSE ~1.7 vs JCUDA, invariant to FPN/row-col/MB). convert_direct gates the deconvolution kernels on `kernels_hor>0` (TileProcessor.cu:2782-2783): with kernels_hor=0 it passes NULL kernels -> NO deconvolution. tp_proc_exec_convert_direct hardcoded kh=0 (copied from the harness, whose golden was itself made with no deconvolution), so JNA skipped aberration deconvolution while production GpuQuad passes (kernels_hor, kernels_vert)=(82,66) and applies it. Fix: add kernels_vert to TpProc (= kern_tiles/(kernels_hor*num_colors)); exec passes (no_kernels?0:kernels_hor, no_kernels?0:kernels_vert). tp_proc_convert_selftest now uses no_kernels=1 to keep matching the NO-deconv harness golden (StageProc still PASS: CLT 0.1085 / RBG 0.0201 / corr 2e-5). Production (GpuQuadJna no_kernels=false) now applies deconvolution = matches JCUDA. .so-only change. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: FIX convert_direct deconvolution in JNA — pass (kernels_hor, kernels_vert), not (0, *)
THE production-mismatch bug (RMSE ~1.7 vs JCUDA, invariant to FPN/row-col/MB). convert_direct gates the deconvolution kernels on `kernels_hor>0` (TileProcessor.cu:2782-2783): with kernels_hor=0 it passes NULL kernels -> NO deconvolution. tp_proc_exec_convert_direct hardcoded kh=0 (copied from the harness, whose golden was itself made with no deconvolution), so JNA skipped aberration deconvolution while production GpuQuad passes (kernels_hor, kernels_vert)=(82,66) and applies it. Fix: add kernels_vert to TpProc (= kern_tiles/(kernels_hor*num_colors)); exec passes (no_kernels?0:kernels_hor, no_kernels?0:kernels_vert). tp_proc_convert_selftest now uses no_kernels=1 to keep matching the NO-deconv harness golden (StageProc still PASS: CLT 0.1085 / RBG 0.0201 / corr 2e-5). Production (GpuQuadJna no_kernels=false) now applies deconvolution = matches JCUDA. .so-only change. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
0399a26d · Andrey Filippov · 53599fd3 · 0399a26d
Commit 0399a26d authored Jun 26, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 8 additions and 6 deletions

tp_jna.cpp jna/tp_jna.cpp +8 -6

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -576,7 +576,7 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
 // ============================================================================
 struct TpProc {
  TpModule* mod;
-  int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kern_tiles, kern_size, slice;
+  int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kernels_vert, kern_tiles, kern_size, slice;
  size_t dstride_img;        // image pitch (bytes)
  float *gc, *cv, *rbr, *rot;
  std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h;
@@ -615,6 +615,7 @@ int tp_proc_setup(TpProc* p, int num_cams, int num_colors, int img_w, int img_h,
  cuCtxSetCurrent(p->mod->ctx);
  p->num_cams=num_cams; p->num_colors=num_colors; p->img_w=img_w; p->img_h=img_h;
  p->kernels_hor=kernels_hor; p->kern_tiles=kern_tiles; p->kern_size=kern_tiles*4*64;
+  p->kernels_vert = (kernels_hor*num_colors>0) ? kern_tiles/(kernels_hor*num_colors) : 0; // = KERNELS_VERT
  p->tilesx=img_w/8; p->tilesy=img_h/8;
  p->slice = p->tilesy*p->tilesx*num_colors*4*64;
  const int CE = (int)(sizeof(CltExtra)/sizeof(float));
@@ -705,7 +706,7 @@ int tp_proc_exec_convert_direct(TpProc* p, int ref_scene, int erase_clt, int no_
    cuCtxSynchronize();
  }
  int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, lpf=0, ww=p->img_w, wh=p->img_h,
-      kh=0, kv=no_kernels?0:p->kernels_hor, tx=p->tilesx;     // standard: (0, kernels_hor); no_kernels: (0,0)
+      kh=no_kernels?0:p->kernels_hor, kv=no_kernels?0:p->kernels_vert, tx=p->tilesx; // (kernels_hor,vert) applies deconvolution; (0,0) skips. convert_direct gates kernels on kernels_hor>0.
  size_t dstride=p->dstride_img/sizeof(float);
  void* a[]={ &nc,&ncol,&p->gpu_kernel_offsets,&p->gpu_kernels,&p->gpu_images,&p->ftasks,&clt_sel,
              &dstride,&ntsk,&lpf,&ww,&wh,&kh,&kv,&p->active,&p->num_active,&tx };
@@ -901,9 +902,10 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
  }
  tp_proc_set_tasks(p, ftask.data(), P.tp_tasks_size, P.tp_tasks_size*P.task_size);
-  // geometry + convert (standard, with kernels)
+  // geometry + convert. NOTE: the harness golden (aux_chnN.clt) was generated with kernels_hor=0
+  // (NO deconvolution), so match it with no_kernels=1. Production (GpuQuadJna) uses no_kernels=0 -> deconv.
  tp_proc_exec_geometry(p, 1);
-  if(tp_proc_exec_convert_direct(p, 0, -1, 0)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
+  if(tp_proc_exec_convert_direct(p, 0, -1, 1)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
  if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost);
  double clt_err=0; { std::vector<float> got(p->slice), gold(p->slice);
@@ -931,9 +933,9 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
  }
  if(out_corr_err) *out_corr_err=corr_err;
-  // no_kernels smoke test: re-run convert into clt (no_kernels=1), check finite + how much it differs
+  // deconvolution (no_kernels=0) smoke test: applies kernels (production path), check finite: re-run convert into clt (no_kernels=1), check finite + how much it differs
  double nokern_max=0;
-  if(tp_proc_exec_convert_direct(p, 0, 0, 1)==0){
+  if(tp_proc_exec_convert_direct(p, 0, 0, 0)==0){ // no_kernels=0 -> deconvolution applied
    std::vector<float> got(p->slice);
    bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data());
      for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } }