CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell;...

CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell; golden mismatch documented) Extend run_convert_pipeline with do_tex: setTextures/setRGBA-equivalent buffers (texture_indices from tasks, gpu_textures, diff_rgb_combo, color_weights, generate_RBGA_params), cuFuncSetAttribute(textures_accumulate, MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size) for the CDP child, launch textures_nonoverlap <<<1,1>>> (18 args), de-interleave diff_rgb_combo. tp_tex_selftest. RESULT on RTX 5060 Ti: textures_nonoverlap + its CDP child textures_accumulate EXECUTE correctly (no errors, shared 58880, 5120 tiles, output rms within ~1% of golden) => Blackwell compatibility confirmed. BUT diff_rgb_combo does NOT match the Jul-2025 golden numerically (value layers off by constant ~268, diff layers diff_sigma-sensitive). Ruled out input-CLT sensitivity (same error with golden CLT), diff_sigma (10.0 closest), arg/param order. NOT used by the LWIR16 CUAS workflow (cuas/ uses only convert_direct/corr2D_normalize/imclt_rbg_all). Documented known issue (golden staleness / unverified RGB-path drift), to track later via git bisect + the 107 kernel branch switch. See imagej-elphel-internal handoffs/2026-06-25_texture-diff-rgb-combo-mismatch.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell;...
CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell; golden mismatch documented) Extend run_convert_pipeline with do_tex: setTextures/setRGBA-equivalent buffers (texture_indices from tasks, gpu_textures, diff_rgb_combo, color_weights, generate_RBGA_params), cuFuncSetAttribute(textures_accumulate, MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size) for the CDP child, launch textures_nonoverlap <<<1,1>>> (18 args), de-interleave diff_rgb_combo. tp_tex_selftest. RESULT on RTX 5060 Ti: textures_nonoverlap + its CDP child textures_accumulate EXECUTE correctly (no errors, shared 58880, 5120 tiles, output rms within ~1% of golden) => Blackwell compatibility confirmed. BUT diff_rgb_combo does NOT match the Jul-2025 golden numerically (value layers off by constant ~268, diff layers diff_sigma-sensitive). Ruled out input-CLT sensitivity (same error with golden CLT), diff_sigma (10.0 closest), arg/param order. NOT used by the LWIR16 CUAS workflow (cuas/ uses only convert_direct/corr2D_normalize/imclt_rbg_all). Documented known issue (golden staleness / unverified RGB-path drift), to track later via git bisect + the 107 kernel branch switch. See imagej-elphel-internal handoffs/2026-06-25_texture-diff-rgb-combo-mismatch.md. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
341538c7 · Andrey Filippov · a0984dca · 341538c7
Commit 341538c7 authored Jun 25, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 85 additions and 8 deletions

tp_jna.cpp jna/tp_jna.cpp +85 -8

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -238,8 +238,8 @@ void tp_destroy_instance(TpInstance* inst){
 //  do_imclt!=0 → imclt_rbg_all, compare de-pitched RBG to clt/aux_chnN.rbg (out_rbg_err).
 //  do_corr!=0  → correlate2D(TD) + corr2D_combine + corr2D_normalize, compare the quad-combined
 //                pixel-domain correlation to clt/aux_corr-quad.corr (out_corr_err).
-static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt, int do_corr,
-                                double* out_clt_err, double* out_rbg_err, double* out_corr_err, int* out_num_active){
+static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt, int do_corr, int do_tex,
+                                double* out_clt_err, double* out_rbg_err, double* out_corr_err, double* out_tex_err, int* out_num_active){
  g_err[0]=0;
  if(!m){ seterr("selftest: null module"); return -1; }
  if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
@@ -256,9 +256,11 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
  CUfunction f_cor = getfun(m,"correlate2D");
  CUfunction f_cmb = getfun(m,"corr2D_combine");
  CUfunction f_nrm = getfun(m,"corr2D_normalize");
-  if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)||(do_corr&&(!f_cor||!f_cmb||!f_nrm))){
-    seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p cor=%p cmb=%p nrm=%p",
-           (void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im,(void*)f_cor,(void*)f_cmb,(void*)f_nrm); return -3; }
+  CUfunction f_tex = getfun(m,"textures_nonoverlap");
+  CUfunction f_acc = getfun(m,"textures_accumulate"); // CDP child; needs max-dyn-shared attribute
+  if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)||(do_corr&&(!f_cor||!f_cmb||!f_nrm))||(do_tex&&(!f_tex||!f_acc))){
+    seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p cor=%p cmb=%p nrm=%p tex=%p acc=%p",
+           (void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im,(void*)f_cor,(void*)f_cmb,(void*)f_nrm,(void*)f_tex,(void*)f_acc); return -3; }

  int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
  float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
@@ -446,6 +448,73 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
  }
  if(out_corr_err) *out_corr_err=corr_err;

+  // --- textures: textures_nonoverlap (DP; CDP-launches textures_accumulate) vs aux_diff_rgb_combo_dp.drbg ---
+  double tex_err=-1;
+  float *gpu_textures=nullptr,*gpu_diff_rgb_combo=nullptr,*gpu_color_weights=nullptr,*gpu_rbga_params=nullptr;
+  int *gpu_texture_indices=nullptr,*gpu_pnum_texture_tiles=nullptr;
+  if(do_tex){
+    // NOTE: textures_nonoverlap EXECUTES correctly on Blackwell (CDP child textures_accumulate launches),
+    // output magnitude matches golden (rms within ~1%), BUT diff_rgb_combo does NOT match the Jul-2025
+    // golden numerically (value layers off by a constant ~268, diff layers diff_sigma-sensitive). Verified
+    // NOT input sensitivity (same error when gpu_clt is overwritten with the golden CLT), NOT diff_sigma
+    // (10.0 current is closest; 1.5/1.0 worse), NOT arg/param order (matches kernel signature). Texture
+    // path is NOT used by the LWIR16 CUAS workflow (cuas/ uses only convert_direct/corr2D_normalize/imclt).
+    // Treated as a documented known issue (likely golden staleness / unverified RGB-path drift); to be
+    // tracked later via git bisect of TileProcessor.cu textures + the 107 kernel branch switch.
+    // See handoffs/2026-06-25_texture-diff-rgb-combo-mismatch.md.
+    int tex_colors=P.texture_colors, keep_w=P.keep_texture_weights;
+    int tile_layers = tex_colors+1+(keep_w?(NC+tex_colors+1):0);
+    int tile_tex_size = tile_layers*256;
+    const int ntiles = P.tilesx*P.tilesy;
+    // texture index list from tasks (all tiles have task_text_en set in setTasks)
+    std::vector<int> tex_idx(ntiles); int num_tex=0;
+    for(int ty=0;ty<P.tilesy;ty++) for(int tx=0;tx<P.tilesx;tx++){
+      int nt=ty*P.tilesx+tx; int task=*(int*)&ftask[(size_t)P.task_size*nt];
+      if(task & (P.task_texture_bits | (1<<P.task_text_en))) tex_idx[num_tex++]=(nt<<P.text_ntile_shift)|(1<<P.list_texture_bit);
+    }
+    gpu_texture_indices = (int*)copyalloc_kernel_gpu((float*)tex_idx.data(), num_tex, P.tilesx*P.tilesya);
+    size_t dstride_textures=0;
+    gpu_textures = alloc_image_gpu(&dstride_textures, tile_tex_size, ntiles);
+    cudaMalloc((void**)&gpu_diff_rgb_combo, (size_t)ntiles*NC*(P.num_colors+1)*sizeof(float));
+    cudaMalloc((void**)&gpu_pnum_texture_tiles, sizeof(int));
+    int zero=0; cudaMemcpy(gpu_pnum_texture_tiles,&zero,sizeof(int),cudaMemcpyHostToDevice);
+    gpu_color_weights = copyalloc_kernel_gpu((float*)P.color_weights, (int)(sizeof(P.color_weights)/sizeof(float)));
+    gpu_rbga_params   = copyalloc_kernel_gpu((float*)P.generate_RBGA_params, 5);
+
+    int shared_size = host_get_textures_shared_size(NC, tex_colors, 0);
+    cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+    cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, 100);
+
+    int num_cams=NC, ntsk=P.tp_tasks_size, is_lwir=(tex_colors==1), dust=1, linescan=1, tilesx=P.tilesx;
+    size_t tex_stride=dstride_textures/sizeof(float);
+    void* a[]={ &num_cams,&gpu_ftasks,&ntsk,&gpu_texture_indices,&gpu_pnum_texture_tiles,&gpu_clt,&gpu_gc,
+                &tex_colors,&is_lwir,&gpu_rbga_params,&gpu_color_weights,&dust,&keep_w,
+                &tex_stride,&gpu_textures,&linescan,&gpu_diff_rgb_combo,&tilesx };
+    cuCtxSynchronize();
+    cr=cuLaunchKernel(f_tex,1,1,1,1,1,1,0,nullptr,a,nullptr);
+    if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch textures_nonoverlap -> %d (%s)",cr,es); return -30; }
+    cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync textures_nonoverlap -> %d (%s)",cr,es); return -31; }
+
+    // diff_rgb_combo: device order [ntile*nlayers+nl] -> golden order [nl*ntiles+ntile]; raster (order-stable)
+    int nlayers=NC*(P.num_colors+1), sz=ntiles*nlayers;
+    std::vector<float> dev(sz), got(sz), gold(sz);
+    cudaMemcpy(dev.data(), gpu_diff_rgb_combo, (size_t)sz*sizeof(float), cudaMemcpyDeviceToHost);
+    for(int nl=0;nl<nlayers;nl++) for(int nt=0;nt<ntiles;nt++) got[nl*ntiles+nt]=dev[nt*nlayers+nl];
+    readFloatsFromFile(gold.data(), PA.result_diff_rgb_combo_file_dp);
+    tex_err=0; double gotmax=0,gmax=0,sgot=0,sg=0; int nnan_got=0,nnan_gold=0;
+    for(int i=0;i<sz;i++){ if(std::isnan(got[i]))nnan_got++; if(std::isnan(gold[i]))nnan_gold++;
+      double e=std::fabs((double)got[i]-(double)gold[i]); if(e>tex_err) tex_err=e;
+      if(std::fabs(got[i])>gotmax)gotmax=std::fabs(got[i]); if(std::fabs(gold[i])>gmax)gmax=std::fabs(gold[i]);
+      sgot+=(double)got[i]*got[i]; sg+=(double)gold[i]*gold[i]; }
+    std::vector<float> sa(got),sb(gold); std::sort(sa.begin(),sa.end()); std::sort(sb.begin(),sb.end());
+    double serr=0; for(int i=0;i<sz;i++){ double e=std::fabs((double)sa[i]-(double)sb[i]); if(e>serr)serr=e; }
+    // per-layer-group: are the first ntiles (layer 0) matching? compare layer 0 only (diff[cam0])
+    double l0err=0; for(int nt=0;nt<ntiles;nt++){ double e=std::fabs((double)got[nt]-(double)gold[nt]); if(e>l0err)l0err=e; }
+    printf("TEX DIAG: num_tex=%d nlayers=%d shared=%d | pointwise=%.4g sorted=%.6g layer0=%.4g | got(max=%.2f rms=%.3f) gold(max=%.2f rms=%.3f)\n",
+           num_tex, nlayers, shared_size, tex_err, serr, l0err, gotmax, std::sqrt(sgot/sz), gmax, std::sqrt(sg/sz));
+  }
+  if(out_tex_err) *out_tex_err=tex_err;
+
  // --- free ---
  for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); if(do_imclt)cudaFree(corr_h[c]); }
  cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
@@ -454,6 +523,8 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
  cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
  if(do_corr){ cudaFree(gpu_corrs_td); cudaFree(gpu_corrs_combo_td); cudaFree(gpu_corrs_combo);
               cudaFree(gpu_corr_indices); cudaFree(gpu_corrs_combo_indices); cudaFree(gpu_num_corr_tiles); }
+  if(do_tex){ cudaFree(gpu_textures); cudaFree(gpu_diff_rgb_combo); cudaFree(gpu_texture_indices);
+              cudaFree(gpu_pnum_texture_tiles); cudaFree(gpu_color_weights); cudaFree(gpu_rbga_params); }
  free(fgc); free(fcv); free(hbuf);
  return 0;
 }
@@ -461,19 +532,25 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
 // Stage-2 wrapper: convert_direct only.
 int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
                               double* out_max_err, int* out_num_active){
-  return run_convert_pipeline(m, lwir, data_root, 0,0, out_max_err, nullptr, nullptr, out_num_active);
+  return run_convert_pipeline(m, lwir, data_root, 0,0,0, out_max_err, nullptr, nullptr, nullptr, out_num_active);
 }

 // Stage-3 wrapper: convert_direct + imclt_rbg_all (reports both CLT and RBG max error).
 int tp_imclt_selftest(TpModule* m, int lwir, const char* data_root,
                      double* out_clt_err, double* out_rbg_err, int* out_num_active){
-  return run_convert_pipeline(m, lwir, data_root, 1,0, out_clt_err, out_rbg_err, nullptr, out_num_active);
+  return run_convert_pipeline(m, lwir, data_root, 1,0,0, out_clt_err, out_rbg_err, nullptr, nullptr, out_num_active);
 }

 // Stage-4 wrapper: convert_direct + correlate2D/combine/normalize (reports CLT and quad-corr max error).
 int tp_corr_selftest(TpModule* m, int lwir, const char* data_root,
                     double* out_clt_err, double* out_corr_err, int* out_num_active){
-  return run_convert_pipeline(m, lwir, data_root, 0,1, out_clt_err, nullptr, out_corr_err, out_num_active);
+  return run_convert_pipeline(m, lwir, data_root, 0,1,0, out_clt_err, nullptr, out_corr_err, nullptr, out_num_active);
+}
+
+// Stage-5 wrapper: convert_direct + textures_nonoverlap (reports CLT and diff_rgb_combo max error).
+int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
+                    double* out_clt_err, double* out_tex_err, int* out_num_active){
+  return run_convert_pipeline(m, lwir, data_root, 0,0,1, out_clt_err, nullptr, nullptr, out_tex_err, out_num_active);
 }

 } // extern "C"