Commit 06c12c4a authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Step 1 — persistent granular native API (TpProc) for the convert_direct core

Add TpProc: the production-facing persistent instance (buffers allocated once in tp_proc_setup,
reused across set/exec/get, freed in tp_proc_destroy) — the surface GpuQuadJna will delegate to.
API: tp_proc_create/setup, set_geometry/correction_vector/kernels/kernel_offsets/image/center_image/
tasks, exec_geometry (calc_reverse_distortions+rot_derivs+calculate_tiles_offsets), exec_convert_direct
(ref_scene/erase_clt/no_kernels), get_clt, destroy. Includes the fragile convert_direct paths the
migration must preserve: no_kernels (skip deconvolution -> kernels_hor/vert=0), use_center_image
(broadcast one center image to all sensors), erase_clt (erase_clt_tiles), ref_scene (clt_ref buffer).

tp_proc_convert_selftest validates end-to-end on RTX 5060 Ti: standard convert CLT == clt/aux_chnN.clt
golden (max|CLT-golden|=0.1085, == Stage 2, num_active=5120); no_kernels path runs with finite output.
update_image_gpu pitch is in BYTES (the "in floats" comment is misleading).
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 341538c7
...@@ -553,4 +553,204 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root, ...@@ -553,4 +553,204 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
return run_convert_pipeline(m, lwir, data_root, 0,0,1, out_clt_err, nullptr, nullptr, out_tex_err, out_num_active); return run_convert_pipeline(m, lwir, data_root, 0,0,1, out_clt_err, nullptr, nullptr, out_tex_err, out_num_active);
} }
} // extern "C" } // extern "C" (module + Stage 0-5 selftests)
// ============================================================================
// Step 1: PERSISTENT granular API (TpProc) — the production-facing surface.
// Buffers allocated once in tp_proc_setup, reused across set/exec/get, freed in
// tp_proc_destroy. Reuses the proven launch logic; adds the fragile paths the
// integration must preserve: no_kernels (skip deconvolution, rectilinear/FPN),
// use_center_image (broadcast 1 center image to all sensors, FPN back-prop),
// erase_clt, ref_scene (separate CLT buffer for inter-scene corr).
// See handoffs/2026-06-25_convert_direct-fragile-features.md.
// ============================================================================
struct TpProc {
TpModule* mod;
int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kern_tiles, kern_size, slice;
size_t dstride_img; // image pitch (bytes)
float *gc, *cv, *rbr, *rot;
std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h;
float **gpu_kernels, **gpu_kernel_offsets, **gpu_images, **gpu_clt, **gpu_clt_ref;
float *ftasks; int ntiles; int *active, *num_active;
};
extern "C" {
TpProc* tp_proc_create(TpModule* m){
g_err[0]=0;
if(!m){ seterr("tp_proc_create: null module"); return nullptr; }
TpProc* p = new TpProc();
p->mod=m; p->ftasks=nullptr; p->ntiles=0; p->active=nullptr; p->num_active=nullptr;
p->gpu_kernels=p->gpu_kernel_offsets=p->gpu_images=p->gpu_clt=p->gpu_clt_ref=nullptr;
p->gc=p->cv=p->rbr=p->rot=nullptr;
return p;
}
// Allocate all per-config buffers. kern_tiles = KERNELS_HOR*KERNELS_VERT*num_colors.
int tp_proc_setup(TpProc* p, int num_cams, int num_colors, int img_w, int img_h, int kernels_hor, int kern_tiles){
if(!p){ seterr("tp_proc_setup: null"); return -1; }
cuCtxSetCurrent(p->mod->ctx);
p->num_cams=num_cams; p->num_colors=num_colors; p->img_w=img_w; p->img_h=img_h;
p->kernels_hor=kernels_hor; p->kern_tiles=kern_tiles; p->kern_size=kern_tiles*4*64;
p->tilesx=img_w/8; p->tilesy=img_h/8;
p->slice = p->tilesy*p->tilesx*num_colors*4*64;
const int CE = (int)(sizeof(CltExtra)/sizeof(float));
p->kernels_h.assign(num_cams,nullptr); p->offs_h.assign(num_cams,nullptr);
p->images_h.assign(num_cams,nullptr); p->clt_h.assign(num_cams,nullptr); p->clt_ref_h.assign(num_cams,nullptr);
for(int c=0;c<num_cams;c++){
p->kernels_h[c]=alloc_kernel_gpu(p->kern_size);
p->offs_h[c]=alloc_kernel_gpu(kern_tiles*CE);
p->images_h[c]=alloc_image_gpu(&p->dstride_img, img_w, img_h);
p->clt_h[c]=alloc_kernel_gpu(p->slice);
p->clt_ref_h[c]=alloc_kernel_gpu(p->slice);
}
p->gpu_kernels=copyalloc_pointers_gpu(p->kernels_h.data(),num_cams);
p->gpu_kernel_offsets=copyalloc_pointers_gpu(p->offs_h.data(),num_cams);
p->gpu_images=copyalloc_pointers_gpu(p->images_h.data(),num_cams);
p->gpu_clt=copyalloc_pointers_gpu(p->clt_h.data(),num_cams);
p->gpu_clt_ref=copyalloc_pointers_gpu(p->clt_ref_h.data(),num_cams);
cudaMalloc((void**)&p->active,(size_t)p->tilesx*p->tilesy*sizeof(int));
cudaMalloc((void**)&p->num_active,sizeof(int));
p->gc=alloc_kernel_gpu(512); p->cv=alloc_kernel_gpu(512);
p->rbr=alloc_kernel_gpu(5001); p->rot=alloc_kernel_gpu(5*16*3*3);
return 0;
}
int tp_proc_set_geometry(TpProc* p, const float* fgc, int n){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy(p->gc,fgc,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
int tp_proc_set_correction_vector(TpProc* p, const float* fcv, int n){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy(p->cv,fcv,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
int tp_proc_set_kernels(TpProc* p, int cam, const float* d, int n){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy(p->kernels_h[cam],d,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
int tp_proc_set_kernel_offsets(TpProc* p, int cam, const float* d, int n){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy(p->offs_h[cam],d,(size_t)n*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
// pitched image upload (reuses harness update_image_gpu: dstride in floats)
int tp_proc_set_image(TpProc* p, int cam, const float* d){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
update_image_gpu((float*)d, p->images_h[cam], p->dstride_img, p->img_w, p->img_h); return 0; } // dstride in BYTES (cudaMemcpy2D dpitch)
// use_center_image: broadcast one center image to all sensors (FPN back-prop mode)
int tp_proc_set_center_image(TpProc* p, const float* d){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
for(int c=0;c<p->num_cams;c++) update_image_gpu((float*)d, p->images_h[c], p->dstride_img, p->img_w, p->img_h); return 0; }
int tp_proc_set_tasks(TpProc* p, const float* ftasks, int ntiles, int total_floats){ if(!p)return -1; cuCtxSetCurrent(p->mod->ctx);
if(p->ftasks) cudaFree(p->ftasks); cudaMalloc((void**)&p->ftasks,(size_t)total_floats*sizeof(float));
p->ntiles=ntiles; return cudaMemcpy(p->ftasks,ftasks,(size_t)total_floats*sizeof(float),cudaMemcpyHostToDevice)==cudaSuccess?0:-2; }
// geometry: calc_reverse_distortions + rot_derivs + (uniform_grid) calculate_tiles_offsets
int tp_proc_exec_geometry(TpProc* p, int uniform_grid){
if(!p)return -1; cuCtxSetCurrent(p->mod->ctx); CUresult cr; const char* es;
CUfunction f_rbr=getfun(p->mod,"calcReverseDistortionTable"), f_rot=getfun(p->mod,"calc_rot_deriv"), f_off=getfun(p->mod,"calculate_tiles_offsets");
if(!f_rbr||!f_rot||!f_off){ seterr("geom kernels missing"); return -2; }
#define LA(fn,gx,gy,gz,bx,by,bz,...) do{ void* a[]={__VA_ARGS__}; cuCtxSynchronize(); \
cr=cuLaunchKernel(fn,gx,gy,gz,bx,by,bz,0,nullptr,a,nullptr); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch %s ->%d(%s)",#fn,cr,es);return -3;} \
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("sync %s ->%d(%s)",#fn,cr,es);return -4;} }while(0)
int mc16=16, nc=p->num_cams, ug=uniform_grid, nt=p->ntiles;
LA(f_rbr, mc16,1,1, 3,3,3, &p->gc,&p->rbr);
LA(f_rot, nc,1,1, 3,3,3, &nc,&p->cv,&p->rot);
LA(f_off, 1,1,1, 1,1,1, &ug,&nc,&p->ftasks,&nt,&p->gc,&p->cv,&p->rbr,&p->rot);
#undef LA
return 0;
}
// convert_direct with the fragile paths. no_kernels: skip deconvolution; use_center_image handled at
// set time; ref_scene: write to clt_ref; erase_clt: -1 none / 0 zero / 1 NaN (erase_clt_tiles first).
int tp_proc_exec_convert_direct(TpProc* p, int ref_scene, int erase_clt, int no_kernels){
if(!p)return -1; cuCtxSetCurrent(p->mod->ctx); CUresult cr; const char* es;
CUfunction f_cd=getfun(p->mod,"convert_direct"), f_er=getfun(p->mod,"erase_clt_tiles");
if(!f_cd){ seterr("convert_direct missing"); return -2; }
float** clt_sel = ref_scene ? p->gpu_clt_ref : p->gpu_clt;
if(erase_clt>=0 && f_er){
float fill = (erase_clt>0)? NAN : 0.0f; int nc=p->num_cams, ncol=p->num_colors, tx=p->tilesx, ty=p->tilesy;
void* a[]={ &nc,&ncol,&tx,&ty,&clt_sel,&fill }; cuCtxSynchronize();
cr=cuLaunchKernel(f_er,1,1,1,1,1,1,0,nullptr,a,nullptr);
if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch erase_clt_tiles ->%d(%s)",cr,es);return -3;}
cuCtxSynchronize();
}
int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, lpf=0, ww=p->img_w, wh=p->img_h,
kh=0, kv=no_kernels?0:p->kernels_hor, tx=p->tilesx; // standard: (0, kernels_hor); no_kernels: (0,0)
size_t dstride=p->dstride_img/sizeof(float);
void* a[]={ &nc,&ncol,&p->gpu_kernel_offsets,&p->gpu_kernels,&p->gpu_images,&p->ftasks,&clt_sel,
&dstride,&ntsk,&lpf,&ww,&wh,&kh,&kv,&p->active,&p->num_active,&tx };
cuCtxSynchronize();
cr=cuLaunchKernel(f_cd,1,1,1,1,1,1,0,nullptr,a,nullptr);
if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("launch convert_direct ->%d(%s)",cr,es);return -4;}
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){cuGetErrorString(cr,&es);seterr("sync convert_direct ->%d(%s)",cr,es);return -5;}
return 0;
}
int tp_proc_get_clt(TpProc* p, int cam, int ref_scene, float* out){ if(!p||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
float* src = (ref_scene? p->clt_ref_h:p->clt_h)[cam];
return cudaMemcpy(out,src,(size_t)p->slice*sizeof(float),cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; }
void tp_proc_destroy(TpProc* p); // fwd
// Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
// clt/aux_chnN.clt golden (== Stage 2). Also smoke-tests no_kernels (runs, finite output).
int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_nokern_max, int* out_num_active){
g_err[0]=0;
if(!m){ seterr("tp_proc_convert_selftest: null module"); return -1; }
if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed",data_root); return -2; }
TpParams P((int)lwir); TpPaths PA((int)lwir);
TpProc* p = tp_proc_create(m);
if(!p) return -3;
if(tp_proc_setup(p, P.num_cams, P.num_colors, P.img_width, P.img_height, P.kernels_hor, P.kern_tiles)) { tp_proc_destroy(p); return -4; }
const int NC=P.num_cams;
float* hbuf=(float*)malloc((size_t)P.kern_size*sizeof(float));
// geometry
int n_gc=0,n_cv=0; float* fgc=readAllFloatsFromFile(PA.geometry_correction_file,&n_gc);
float* fcv=readAllFloatsFromFile(PA.correction_vector_file,&n_cv);
tp_proc_set_geometry(p,fgc,n_gc); tp_proc_set_correction_vector(p,fcv,n_cv);
// kernels + offsets + images
const int CE=(int)(sizeof(CltExtra)/sizeof(float));
for(int c=0;c<NC;c++){
readFloatsFromFile(hbuf, PA.kernel_file[c]); tp_proc_set_kernels(p,c,hbuf,P.kern_size);
readFloatsFromFile(hbuf, PA.kernel_offs_file[c]); tp_proc_set_kernel_offsets(p,c,hbuf,P.kern_tiles*CE);
readFloatsFromFile(hbuf, PA.image_files[c]); tp_proc_set_image(p,c,hbuf);
}
// tasks (TpHostGpu::setTasks, target_disparity=0, scale=0)
std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
std::vector<float> ftask((size_t)P.tp_tasks_size*P.task_size, 0.0f);
for(int ty=0;ty<P.tilesy;ty++) for(int tx=0;tx<P.tilesx;tx++){
int nt=ty*P.tilesx+tx, task=(1<<P.task_inter_en)|(1<<P.task_corr_en)|(1<<P.task_text_en), txy=tx+(ty<<16);
float* tp=ftask.data()+(size_t)P.task_size*nt;
tp[P.tp_task_task_offset]=*(float*)&task; tp[P.tp_task_txy_offset]=*(float*)&txy;
tp[P.tp_task_disparity_offset]=0.0f; tp[P.tp_task_scale_offset]=0.0f;
float* q=tp+P.tp_task_xy_offset;
for(int c=0;c<NC;c++){ *q++=coords[(size_t)c*P.tp_tasks_size*2+nt*2+0]; *q++=coords[(size_t)c*P.tp_tasks_size*2+nt*2+1]; }
}
tp_proc_set_tasks(p, ftask.data(), P.tp_tasks_size, P.tp_tasks_size*P.task_size);
// geometry + convert (standard, with kernels)
tp_proc_exec_geometry(p, 1);
if(tp_proc_exec_convert_direct(p, 0, -1, 0)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost);
double clt_err=0; std::vector<float> got(p->slice), gold(p->slice);
for(int c=0;c<NC;c++){ readFloatsFromFile(gold.data(), PA.ports_clt_file[c]); tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>clt_err)clt_err=e; } }
if(out_clt_err) *out_clt_err=clt_err;
// no_kernels smoke test: re-run convert into clt (no_kernels=1), check finite + how much it differs
double nokern_max=0;
if(tp_proc_exec_convert_direct(p, 0, 0, 1)==0){
bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } }
if(!finite) nokern_max=-1; // signal non-finite
} else nokern_max=-2;
if(out_nokern_max) *out_nokern_max=nokern_max;
free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p);
return 0;
}
void tp_proc_destroy(TpProc* p){
if(!p)return; if(p->mod) cuCtxSetCurrent(p->mod->ctx);
for(int c=0;c<p->num_cams;c++){ if(c<(int)p->kernels_h.size())cudaFree(p->kernels_h[c]); if(c<(int)p->offs_h.size())cudaFree(p->offs_h[c]);
if(c<(int)p->images_h.size())cudaFree(p->images_h[c]); if(c<(int)p->clt_h.size())cudaFree(p->clt_h[c]); if(c<(int)p->clt_ref_h.size())cudaFree(p->clt_ref_h[c]); }
cudaFree(p->gpu_kernels); cudaFree(p->gpu_kernel_offsets); cudaFree(p->gpu_images); cudaFree(p->gpu_clt); cudaFree(p->gpu_clt_ref);
cudaFree(p->active); cudaFree(p->num_active); if(p->ftasks)cudaFree(p->ftasks);
cudaFree(p->gc); cudaFree(p->cv); cudaFree(p->rbr); cudaFree(p->rot);
delete p;
}
} // extern "C" (TpProc)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment