CLAUDE: Stage 3 — native imclt_rbg_all via JNA + .rbg golden validation

Refactor the Stage-2 selftest into run_convert_pipeline(do_imclt) shared helper; tp_convert_direct_selftest is now a thin wrapper (do_imclt=0). Add tp_imclt_selftest (do_imclt=1): after convert_direct, allocate pitched RBG output buffers (alloc_image_gpu, 648x520/cam, mono), launch imclt_rbg_all <<<1,1>>> (gpu_clt -> gpu_corr_images), de-pitch via cudaMemcpy2D, compare to clt/aux_chnN.rbg golden. Validated on RTX 5060 Ti via Java->JNA: max|RBG-golden|=0.0201 over peaks 1535 -> relative ~1.31e-5. convert_direct CLT error unchanged (0.108505) => no regression. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Stage 3 — native imclt_rbg_all via JNA + .rbg golden validation
Refactor the Stage-2 selftest into run_convert_pipeline(do_imclt) shared helper; tp_convert_direct_selftest is now a thin wrapper (do_imclt=0). Add tp_imclt_selftest (do_imclt=1): after convert_direct, allocate pitched RBG output buffers (alloc_image_gpu, 648x520/cam, mono), launch imclt_rbg_all <<<1,1>>> (gpu_clt -> gpu_corr_images), de-pitch via cudaMemcpy2D, compare to clt/aux_chnN.rbg golden. Validated on RTX 5060 Ti via Java->JNA: max|RBG-golden|=0.0201 over peaks 1535 -> relative ~1.31e-5. convert_direct CLT error unchanged (0.108505) => no regression. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
edfc7bae · Andrey Filippov · 05ee47d0 · edfc7bae
Commit edfc7bae authored Jun 25, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 59 additions and 12 deletions

tp_jna.cpp jna/tp_jna.cpp +59 -12

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -234,8 +234,10 @@ void tp_destroy_instance(TpInstance* inst){
 // cuLaunchKernel against the NVRTC module. Reads CLT back and compares to clt/aux_chnN.clt golden.
 // data_root must contain clt/ (TpPaths uses relative "clt/..." names). lwir=1 -> 16-cam LWIR set.
 // Runtime (cudaMalloc) and driver (cuLaunchKernel) interoperate because the module's context is current.
-int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
-                               double* out_max_err, int* out_num_active){
+// Shared convert(+optional imclt) pipeline. do_imclt!=0 also runs imclt_rbg_all and
+// compares the de-pitched RBG to clt/aux_chnN.rbg, writing out_rbg_err.
+static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt,
+                                double* out_clt_err, double* out_rbg_err, int* out_num_active){
  g_err[0]=0;
  if(!m){ seterr("selftest: null module"); return -1; }
  if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
@@ -248,7 +250,8 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
  CUfunction f_rot = getfun(m,"calc_rot_deriv");
  CUfunction f_off = getfun(m,"calculate_tiles_offsets");
  CUfunction f_cd  = getfun(m,"convert_direct");
-  if(!f_rbr||!f_rot||!f_off||!f_cd){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd); return -3; }
+  CUfunction f_im  = getfun(m,"imclt_rbg_all");
+  if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im); return -3; }

  int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
  float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
@@ -276,6 +279,16 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
  cudaMalloc((void**)&gpu_active, (size_t)P.tilesx*P.tilesy*sizeof(int));
  cudaMalloc((void**)&gpu_num_active, sizeof(int));

+  // --- RBG output images (pitched, 3x-height-for-colors, +dtt_size margins) for imclt ---
+  const int rbg_w = P.img_width  + P.dtt_size;                         // 648
+  const int rbg_h = P.num_colors*(P.img_height + P.dtt_size);          // 520 (mono)
+  const int rbg_size = rbg_w * rbg_h;                                  // 336960 floats/cam (== aux_chnN.rbg)
+  std::vector<float*> corr_h(NC, nullptr); float** gpu_corr_images=nullptr; size_t dstride_rslt=0;
+  if(do_imclt){
+    for(int c=0;c<NC;c++) corr_h[c]=alloc_image_gpu(&dstride_rslt, rbg_w, rbg_h);
+    gpu_corr_images = copyalloc_pointers_gpu(corr_h.data(), NC);
+  }
+
  // --- tasks (TpHostGpu::setTasks, target_disparity=0, scale=0) ---
  std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
  for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
@@ -329,23 +342,57 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
  int num_active=0; cudaMemcpy(&num_active, gpu_num_active, sizeof(int), cudaMemcpyDeviceToHost);

  // --- compare CLT to golden clt/aux_chnN.clt ---
-  double maxerr=0;
-  std::vector<float> golden(slice), got(slice);
+  double clt_err=0;
+  { std::vector<float> golden(slice), got(slice);
    for(int c=0;c<NC;c++){
      readFloatsFromFile(golden.data(), PA.ports_clt_file[c]);
      cudaMemcpy(got.data(), clt_h[c], (size_t)slice*sizeof(float), cudaMemcpyDeviceToHost);
-    for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>maxerr) maxerr=e; }
-  }
-  if(out_max_err) *out_max_err=maxerr;
+      for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>clt_err) clt_err=e; }
+    } }
+  if(out_clt_err) *out_clt_err=clt_err;
  if(out_num_active) *out_num_active=num_active;

+  // --- imclt_rbg_all <<<1,1>>> (gpu_clt -> gpu_corr_images), compare de-pitched RBG to aux_chnN.rbg ---
+  double rbg_err=-1;
+  if(do_imclt){
+    int apply_lpf=1, num_colors2=P.num_colors, twidth=P.tilesx, theight=P.tilesy;
+    size_t rslt_floats = dstride_rslt/sizeof(float);
+    void* a[]={ &NC,&gpu_clt,&gpu_corr_images,&apply_lpf,&num_colors2,&twidth,&theight,&rslt_floats };
+    cuCtxSynchronize();
+    cr=cuLaunchKernel(f_im,1,1,1,1,1,1,0,nullptr,a,nullptr);
+    if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch imclt_rbg_all -> %d (%s)",cr,es); return -12; }
+    cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync imclt_rbg_all -> %d (%s)",cr,es); return -13; }
+    rbg_err=0;
+    std::vector<float> gold(rbg_size), got(rbg_size);
+    for(int c=0;c<NC;c++){
+      readFloatsFromFile(gold.data(), PA.result_rbg_file[c]);
+      cudaMemcpy2D(got.data(), (size_t)rbg_w*sizeof(float), corr_h[c], dstride_rslt,
+                   (size_t)rbg_w*sizeof(float), rbg_h, cudaMemcpyDeviceToHost); // de-pitch
+      for(int i=0;i<rbg_size;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>rbg_err) rbg_err=e; }
+    }
+  }
+  if(out_rbg_err) *out_rbg_err=rbg_err;
+
  // --- free ---
-  for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); }
+  for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); if(do_imclt)cudaFree(corr_h[c]); }
  cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
+  if(gpu_corr_images) cudaFree(gpu_corr_images);
  cudaFree(gpu_active); cudaFree(gpu_num_active); cudaFree(gpu_ftasks);
  cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
  free(fgc); free(fcv); free(hbuf);
  return 0;
 }

+// Stage-2 wrapper: convert_direct only.
+int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
+                               double* out_max_err, int* out_num_active){
+  return run_convert_pipeline(m, lwir, data_root, 0, out_max_err, nullptr, out_num_active);
+}
+
+// Stage-3 wrapper: convert_direct + imclt_rbg_all (reports both CLT and RBG max error).
+int tp_imclt_selftest(TpModule* m, int lwir, const char* data_root,
+                      double* out_clt_err, double* out_rbg_err, int* out_num_active){
+  return run_convert_pipeline(m, lwir, data_root, 1, out_clt_err, out_rbg_err, out_num_active);
+}
+
 } // extern "C"