CLAUDE: Add TpProc texture path (oracle): textures_nonoverlap + readback

Oracle (clt_aberrations_quad_corr_GPU) needs the texture path that RT did not: execTextures (calc_extra -> diff_rgb_combo, calc_textures -> textures), getTextureIndices, getExtra, getFlatTextures. Adds to TpProc: - texture buffers (lazy-allocated in tp_proc_ensure_textures; sizes match GpuQuad: max_texture_size=(num_colors+1+(num_cams+num_colors+1))*256) - tp_proc_exec_textures: textures_nonoverlap<<<1,1>>> mirroring GpuQuad.execTextures_DP. The kernel CDP-builds the index list internally (create_nonoverlap_list) so we do NOT host pre-fill it, and CDP-launches textures_accumulate (dyn-shared attr set on the accumulate fn). linescan_order is taken from the caller (0 in production -> diff_rgb_combo in texture_indices order). This matches PRODUCTION, not the Stage-5 harness convention (linescan=1 + host-prefilled indices + stale golden) that produced the documented diff_rgb_combo mismatch. - tp_proc_get_texture_indices / get_diff_rgb_combo / get_textures readback. Native compiles clean (4 new symbols exported). Not yet deployed: live .so left untouched (run was active); rebuild via jna/build_lib.sh when free. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Add TpProc texture path (oracle): textures_nonoverlap + readback
Oracle (clt_aberrations_quad_corr_GPU) needs the texture path that RT did not: execTextures (calc_extra -> diff_rgb_combo, calc_textures -> textures), getTextureIndices, getExtra, getFlatTextures. Adds to TpProc: - texture buffers (lazy-allocated in tp_proc_ensure_textures; sizes match GpuQuad: max_texture_size=(num_colors+1+(num_cams+num_colors+1))*256) - tp_proc_exec_textures: textures_nonoverlap<<<1,1>>> mirroring GpuQuad.execTextures_DP. The kernel CDP-builds the index list internally (create_nonoverlap_list) so we do NOT host pre-fill it, and CDP-launches textures_accumulate (dyn-shared attr set on the accumulate fn). linescan_order is taken from the caller (0 in production -> diff_rgb_combo in texture_indices order). This matches PRODUCTION, not the Stage-5 harness convention (linescan=1 + host-prefilled indices + stale golden) that produced the documented diff_rgb_combo mismatch. - tp_proc_get_texture_indices / get_diff_rgb_combo / get_textures readback. Native compiles clean (4 new symbols exported). Not yet deployed: live .so left untouched (run was active); rebuild via jna/build_lib.sh when free. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2aa4e6e0 · Andrey Filippov · 33827862 · 2aa4e6e0
Commit 2aa4e6e0 authored Jun 26, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 78 additions and 0 deletions

tp_jna.cpp jna/tp_jna.cpp +78 -0

No files found.
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -591,6 +591,13 @@ struct TpProc {
  float *gpu_corrs_td, *gpu_corrs_combo_td, *gpu_corrs_combo, *gpu_corrs; // gpu_corrs = per-pair pixel-domain
  int *gpu_corr_indices, *gpu_corrs_combo_indices, *gpu_num_corr_tiles;
  int last_num_corr_combo, last_num_corr_tiles;
+  // textures (oracle path). textures_nonoverlap<<<1,1>>> CDP-builds the index list (create_nonoverlap_list)
+  // then CDP-launches textures_accumulate -> gpu_tex_textures + gpu_tex_diff_rgb_combo. Matches production
+  // GpuQuad.execTextures_DP (linescan_order=0, kernel-built list), NOT the Stage-5 harness convention.
+  bool have_tex; size_t dstride_tex; int tex_max_size;
+  float *gpu_tex_textures, *gpu_tex_diff_rgb_combo, *gpu_tex_color_weights, *gpu_tex_rbga_params;
+  int *gpu_tex_indices, *gpu_tex_pnum;
+  int last_num_tex_tiles;
 };

 extern "C" {
@@ -606,6 +613,8 @@ TpProc* tp_proc_create(TpModule* m){
  p->have_corr=false; p->gpu_corrs_td=p->gpu_corrs_combo_td=p->gpu_corrs_combo=p->gpu_corrs=nullptr;
  p->gpu_corr_indices=p->gpu_corrs_combo_indices=p->gpu_num_corr_tiles=nullptr;
  p->last_num_corr_combo=0; p->last_num_corr_tiles=0;
+  p->have_tex=false; p->gpu_tex_textures=p->gpu_tex_diff_rgb_combo=p->gpu_tex_color_weights=p->gpu_tex_rbga_params=nullptr;
+  p->gpu_tex_indices=p->gpu_tex_pnum=nullptr; p->last_num_tex_tiles=0;
  return p;
 }

@@ -874,6 +883,73 @@ int tp_proc_erase_corrs(TpProc* p){ if(!p||!p->have_corr)return -1; cuCtxSetCurr
  return cudaMemset2D(p->gpu_corrs, p->dstride_corr, 0, (size_t)p->corr_length*sizeof(float),
                      (size_t)p->num_pairs*p->tilesx*p->tilesy)==cudaSuccess?0:-2; }

+// ---- textures (oracle path): execTextures / getTextureIndices / getExtra / getFlatTextures ----
+// Lazy-allocate the texture buffers on first use (production GpuQuad allocates them in its ctor;
+// here we defer so a corr-only run never pays for them). Sizes match GpuQuad exactly.
+static int tp_proc_ensure_textures(TpProc* p){
+  if(p->have_tex) return 0;
+  cuCtxSetCurrent(p->mod->ctx);
+  int ntiles=p->tilesx*p->tilesy;
+  // GpuQuad: max_texture_size = (num_colors+1 + (num_cams+num_colors+1)) * (2*DTT_SIZE)^2; DTT_SIZE=8 -> 256
+  p->tex_max_size = (p->num_colors+1 + (p->num_cams+p->num_colors+1)) * 256;
+  p->gpu_tex_textures = alloc_image_gpu(&p->dstride_tex, p->tex_max_size, ntiles); // pitched -> texture_stride
+  cudaMalloc((void**)&p->gpu_tex_diff_rgb_combo, (size_t)ntiles*p->num_cams*(p->num_colors+1)*sizeof(float));
+  cudaMalloc((void**)&p->gpu_tex_indices, (size_t)ntiles*sizeof(int)); // kernel fills (create_nonoverlap_list)
+  cudaMalloc((void**)&p->gpu_tex_pnum,    sizeof(int));                // = gpu_texture_indices_len
+  cudaMalloc((void**)&p->gpu_tex_color_weights, 3*sizeof(float));
+  cudaMalloc((void**)&p->gpu_tex_rbga_params,   5*sizeof(float));
+  p->have_tex=true;
+  return 0;
+}
+// textures_nonoverlap<<<1,1>>> (mirrors GpuQuad.execTextures_DP). params5 = {min_shot,scale_shot,diff_sigma,
+// diff_threshold,min_agree}; weights3 = R,B,G. calc_textures -> gpu_tex_textures (else stride 0/skip),
+// calc_extra -> gpu_tex_diff_rgb_combo (else null/skip). Returns the kernel-built texture-tile count.
+int tp_proc_exec_textures(TpProc* p, float* params5, float* weights3, int is_lwir,
+                          int dust_remove, int keep_weights, int calc_textures, int calc_extra, int linescan_order){
+  if(!p){ seterr("exec_textures: null"); return -1; }
+  if(tp_proc_ensure_textures(p)) return -2;
+  cuCtxSetCurrent(p->mod->ctx);
+  CUfunction f_tex=getfun(p->mod,"textures_nonoverlap"), f_acc=getfun(p->mod,"textures_accumulate");
+  if(!f_tex||!f_acc){ seterr("texture kernels missing"); return -3; }
+  cudaMemcpy(p->gpu_tex_color_weights, weights3, 3*sizeof(float), cudaMemcpyHostToDevice);
+  cudaMemcpy(p->gpu_tex_rbga_params,   params5,  5*sizeof(float), cudaMemcpyHostToDevice);
+  int shared_size = host_get_textures_shared_size(p->num_cams, p->num_colors, 0); // CDP child needs dyn shared
+  cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
+  cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, 100); // CU_SHAREDMEM_CARVEOUT_MAX_SHARED
+  int zero=0; cudaMemcpy(p->gpu_tex_pnum,&zero,sizeof(int),cudaMemcpyHostToDevice);
+  int nc=p->num_cams, ntsk=p->ntiles, ncol=p->num_colors, tilesx=p->tilesx;
+  size_t tex_stride = calc_textures ? (p->dstride_tex/sizeof(float)) : 0;
+  float* p_diff = calc_extra ? p->gpu_tex_diff_rgb_combo : nullptr;
+  void* a[]={ &nc, &p->ftasks, &ntsk, &p->gpu_tex_indices, &p->gpu_tex_pnum, &p->gpu_clt, &p->gc,
+              &ncol, &is_lwir, &p->gpu_tex_rbga_params, &p->gpu_tex_color_weights, &dust_remove, &keep_weights,
+              &tex_stride, &p->gpu_tex_textures, &linescan_order, &p_diff, &tilesx };
+  if(launch1(f_tex,1,1,1,1,1,1,a,"textures_nonoverlap")) return -4;
+  cudaMemcpy(&p->last_num_tex_tiles, p->gpu_tex_pnum, sizeof(int), cudaMemcpyDeviceToHost);
+  return p->last_num_tex_tiles;
+}
+// read pnum (count) then min(count,n) packed indices; returns the true count (= gpu_texture_indices_len).
+int tp_proc_get_texture_indices(TpProc* p, int* out, int n){
+  if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
+  int cnt=0; cudaMemcpy(&cnt,p->gpu_tex_pnum,sizeof(int),cudaMemcpyDeviceToHost);
+  int c=(cnt<n)?cnt:n;
+  if(c>0 && cudaMemcpy(out,p->gpu_tex_indices,(size_t)c*sizeof(int),cudaMemcpyDeviceToHost)!=cudaSuccess) return -2;
+  return cnt;
+}
+// diff_rgb_combo in gpu_texture_indices order (linescan_order=0): first n = num_tex*num_cams*(num_colors+1) floats.
+int tp_proc_get_diff_rgb_combo(TpProc* p, float* out, int n){
+  if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
+  return cudaMemcpy(out,p->gpu_tex_diff_rgb_combo,(size_t)n*sizeof(float),cudaMemcpyDeviceToHost)==cudaSuccess?0:-2;
+}
+// de-pitch gpu_tex_textures (mirrors GpuQuad.getFlatTextures). tile_size = slices*256,
+// slices = num_colors+1 + (keep_weights ? (num_cams+num_colors+1) : 0).
+int tp_proc_get_textures(TpProc* p, float* out, int num_tiles, int num_colors, int keep_weights){
+  if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
+  int slices = num_colors+1+(keep_weights?(p->num_cams+num_colors+1):0);
+  int tile_size = slices*256;
+  return cudaMemcpy2D(out,(size_t)tile_size*sizeof(float), p->gpu_tex_textures, p->dstride_tex,
+                      (size_t)tile_size*sizeof(float), num_tiles, cudaMemcpyDeviceToHost)==cudaSuccess?0:-2;
+}
+
 void tp_proc_destroy(TpProc* p); // fwd

 // Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
@@ -972,6 +1048,8 @@ void tp_proc_destroy(TpProc* p){
  if(p->have_rbg){ for(size_t c=0;c<p->corr_images_h.size();c++) cudaFree(p->corr_images_h[c]); cudaFree(p->gpu_corr_images); }
  if(p->have_corr){ cudaFree(p->gpu_corrs_td); cudaFree(p->gpu_corrs_combo_td); cudaFree(p->gpu_corrs_combo); cudaFree(p->gpu_corrs);
    cudaFree(p->gpu_corr_indices); cudaFree(p->gpu_corrs_combo_indices); cudaFree(p->gpu_num_corr_tiles); }
+  if(p->have_tex){ cudaFree(p->gpu_tex_textures); cudaFree(p->gpu_tex_diff_rgb_combo); cudaFree(p->gpu_tex_indices);
+    cudaFree(p->gpu_tex_pnum); cudaFree(p->gpu_tex_color_weights); cudaFree(p->gpu_tex_rbga_params); }
  delete p;
 }