Commit 2aa4e6e0 authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Add TpProc texture path (oracle): textures_nonoverlap + readback

Oracle (clt_aberrations_quad_corr_GPU) needs the texture path that RT did not:
execTextures (calc_extra -> diff_rgb_combo, calc_textures -> textures),
getTextureIndices, getExtra, getFlatTextures.

Adds to TpProc:
- texture buffers (lazy-allocated in tp_proc_ensure_textures; sizes match
  GpuQuad: max_texture_size=(num_colors+1+(num_cams+num_colors+1))*256)
- tp_proc_exec_textures: textures_nonoverlap<<<1,1>>> mirroring
  GpuQuad.execTextures_DP. The kernel CDP-builds the index list internally
  (create_nonoverlap_list) so we do NOT host pre-fill it, and CDP-launches
  textures_accumulate (dyn-shared attr set on the accumulate fn). linescan_order
  is taken from the caller (0 in production -> diff_rgb_combo in texture_indices
  order). This matches PRODUCTION, not the Stage-5 harness convention (linescan=1
  + host-prefilled indices + stale golden) that produced the documented
  diff_rgb_combo mismatch.
- tp_proc_get_texture_indices / get_diff_rgb_combo / get_textures readback.

Native compiles clean (4 new symbols exported). Not yet deployed: live .so
left untouched (run was active); rebuild via jna/build_lib.sh when free.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 33827862
......@@ -591,6 +591,13 @@ struct TpProc {
float *gpu_corrs_td, *gpu_corrs_combo_td, *gpu_corrs_combo, *gpu_corrs; // gpu_corrs = per-pair pixel-domain
int *gpu_corr_indices, *gpu_corrs_combo_indices, *gpu_num_corr_tiles;
int last_num_corr_combo, last_num_corr_tiles;
// textures (oracle path). textures_nonoverlap<<<1,1>>> CDP-builds the index list (create_nonoverlap_list)
// then CDP-launches textures_accumulate -> gpu_tex_textures + gpu_tex_diff_rgb_combo. Matches production
// GpuQuad.execTextures_DP (linescan_order=0, kernel-built list), NOT the Stage-5 harness convention.
bool have_tex; size_t dstride_tex; int tex_max_size;
float *gpu_tex_textures, *gpu_tex_diff_rgb_combo, *gpu_tex_color_weights, *gpu_tex_rbga_params;
int *gpu_tex_indices, *gpu_tex_pnum;
int last_num_tex_tiles;
};
extern "C" {
......@@ -606,6 +613,8 @@ TpProc* tp_proc_create(TpModule* m){
p->have_corr=false; p->gpu_corrs_td=p->gpu_corrs_combo_td=p->gpu_corrs_combo=p->gpu_corrs=nullptr;
p->gpu_corr_indices=p->gpu_corrs_combo_indices=p->gpu_num_corr_tiles=nullptr;
p->last_num_corr_combo=0; p->last_num_corr_tiles=0;
p->have_tex=false; p->gpu_tex_textures=p->gpu_tex_diff_rgb_combo=p->gpu_tex_color_weights=p->gpu_tex_rbga_params=nullptr;
p->gpu_tex_indices=p->gpu_tex_pnum=nullptr; p->last_num_tex_tiles=0;
return p;
}
......@@ -874,6 +883,73 @@ int tp_proc_erase_corrs(TpProc* p){ if(!p||!p->have_corr)return -1; cuCtxSetCurr
return cudaMemset2D(p->gpu_corrs, p->dstride_corr, 0, (size_t)p->corr_length*sizeof(float),
(size_t)p->num_pairs*p->tilesx*p->tilesy)==cudaSuccess?0:-2; }
// ---- textures (oracle path): execTextures / getTextureIndices / getExtra / getFlatTextures ----
// Lazy-allocate the texture buffers on first use (production GpuQuad allocates them in its ctor;
// here we defer so a corr-only run never pays for them). Sizes match GpuQuad exactly.
static int tp_proc_ensure_textures(TpProc* p){
if(p->have_tex) return 0;
cuCtxSetCurrent(p->mod->ctx);
int ntiles=p->tilesx*p->tilesy;
// GpuQuad: max_texture_size = (num_colors+1 + (num_cams+num_colors+1)) * (2*DTT_SIZE)^2; DTT_SIZE=8 -> 256
p->tex_max_size = (p->num_colors+1 + (p->num_cams+p->num_colors+1)) * 256;
p->gpu_tex_textures = alloc_image_gpu(&p->dstride_tex, p->tex_max_size, ntiles); // pitched -> texture_stride
cudaMalloc((void**)&p->gpu_tex_diff_rgb_combo, (size_t)ntiles*p->num_cams*(p->num_colors+1)*sizeof(float));
cudaMalloc((void**)&p->gpu_tex_indices, (size_t)ntiles*sizeof(int)); // kernel fills (create_nonoverlap_list)
cudaMalloc((void**)&p->gpu_tex_pnum, sizeof(int)); // = gpu_texture_indices_len
cudaMalloc((void**)&p->gpu_tex_color_weights, 3*sizeof(float));
cudaMalloc((void**)&p->gpu_tex_rbga_params, 5*sizeof(float));
p->have_tex=true;
return 0;
}
// textures_nonoverlap<<<1,1>>> (mirrors GpuQuad.execTextures_DP). params5 = {min_shot,scale_shot,diff_sigma,
// diff_threshold,min_agree}; weights3 = R,B,G. calc_textures -> gpu_tex_textures (else stride 0/skip),
// calc_extra -> gpu_tex_diff_rgb_combo (else null/skip). Returns the kernel-built texture-tile count.
int tp_proc_exec_textures(TpProc* p, float* params5, float* weights3, int is_lwir,
int dust_remove, int keep_weights, int calc_textures, int calc_extra, int linescan_order){
if(!p){ seterr("exec_textures: null"); return -1; }
if(tp_proc_ensure_textures(p)) return -2;
cuCtxSetCurrent(p->mod->ctx);
CUfunction f_tex=getfun(p->mod,"textures_nonoverlap"), f_acc=getfun(p->mod,"textures_accumulate");
if(!f_tex||!f_acc){ seterr("texture kernels missing"); return -3; }
cudaMemcpy(p->gpu_tex_color_weights, weights3, 3*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(p->gpu_tex_rbga_params, params5, 5*sizeof(float), cudaMemcpyHostToDevice);
int shared_size = host_get_textures_shared_size(p->num_cams, p->num_colors, 0); // CDP child needs dyn shared
cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, 100); // CU_SHAREDMEM_CARVEOUT_MAX_SHARED
int zero=0; cudaMemcpy(p->gpu_tex_pnum,&zero,sizeof(int),cudaMemcpyHostToDevice);
int nc=p->num_cams, ntsk=p->ntiles, ncol=p->num_colors, tilesx=p->tilesx;
size_t tex_stride = calc_textures ? (p->dstride_tex/sizeof(float)) : 0;
float* p_diff = calc_extra ? p->gpu_tex_diff_rgb_combo : nullptr;
void* a[]={ &nc, &p->ftasks, &ntsk, &p->gpu_tex_indices, &p->gpu_tex_pnum, &p->gpu_clt, &p->gc,
&ncol, &is_lwir, &p->gpu_tex_rbga_params, &p->gpu_tex_color_weights, &dust_remove, &keep_weights,
&tex_stride, &p->gpu_tex_textures, &linescan_order, &p_diff, &tilesx };
if(launch1(f_tex,1,1,1,1,1,1,a,"textures_nonoverlap")) return -4;
cudaMemcpy(&p->last_num_tex_tiles, p->gpu_tex_pnum, sizeof(int), cudaMemcpyDeviceToHost);
return p->last_num_tex_tiles;
}
// read pnum (count) then min(count,n) packed indices; returns the true count (= gpu_texture_indices_len).
int tp_proc_get_texture_indices(TpProc* p, int* out, int n){
if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
int cnt=0; cudaMemcpy(&cnt,p->gpu_tex_pnum,sizeof(int),cudaMemcpyDeviceToHost);
int c=(cnt<n)?cnt:n;
if(c>0 && cudaMemcpy(out,p->gpu_tex_indices,(size_t)c*sizeof(int),cudaMemcpyDeviceToHost)!=cudaSuccess) return -2;
return cnt;
}
// diff_rgb_combo in gpu_texture_indices order (linescan_order=0): first n = num_tex*num_cams*(num_colors+1) floats.
int tp_proc_get_diff_rgb_combo(TpProc* p, float* out, int n){
if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy(out,p->gpu_tex_diff_rgb_combo,(size_t)n*sizeof(float),cudaMemcpyDeviceToHost)==cudaSuccess?0:-2;
}
// de-pitch gpu_tex_textures (mirrors GpuQuad.getFlatTextures). tile_size = slices*256,
// slices = num_colors+1 + (keep_weights ? (num_cams+num_colors+1) : 0).
int tp_proc_get_textures(TpProc* p, float* out, int num_tiles, int num_colors, int keep_weights){
if(!p||!p->have_tex)return -1; cuCtxSetCurrent(p->mod->ctx);
int slices = num_colors+1+(keep_weights?(p->num_cams+num_colors+1):0);
int tile_size = slices*256;
return cudaMemcpy2D(out,(size_t)tile_size*sizeof(float), p->gpu_tex_textures, p->dstride_tex,
(size_t)tile_size*sizeof(float), num_tiles, cudaMemcpyDeviceToHost)==cudaSuccess?0:-2;
}
void tp_proc_destroy(TpProc* p); // fwd
// Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
......@@ -972,6 +1048,8 @@ void tp_proc_destroy(TpProc* p){
if(p->have_rbg){ for(size_t c=0;c<p->corr_images_h.size();c++) cudaFree(p->corr_images_h[c]); cudaFree(p->gpu_corr_images); }
if(p->have_corr){ cudaFree(p->gpu_corrs_td); cudaFree(p->gpu_corrs_combo_td); cudaFree(p->gpu_corrs_combo); cudaFree(p->gpu_corrs);
cudaFree(p->gpu_corr_indices); cudaFree(p->gpu_corrs_combo_indices); cudaFree(p->gpu_num_corr_tiles); }
if(p->have_tex){ cudaFree(p->gpu_tex_textures); cudaFree(p->gpu_tex_diff_rgb_combo); cudaFree(p->gpu_tex_indices);
cudaFree(p->gpu_tex_pnum); cudaFree(p->gpu_tex_color_weights); cudaFree(p->gpu_tex_rbga_params); }
delete p;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment