Commit d1c277ca authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Step 1 complete — TpProc imclt + corr2d granular ops (full LWIR16-CUAS surface)

Add to the persistent TpProc API: tp_proc_setup_rbg_corr (imclt RBG + correlation buffers + corr
config), tp_proc_exec_imclt / tp_proc_get_rbg, tp_proc_exec_corr2d (correlate2D TD -> corr2D_combine
-> corr2D_normalize) / tp_proc_get_corr2d_combo. launch1() helper. tp_proc_convert_selftest extended
to validate imclt vs aux_chnN.rbg and quad corr vs aux_corr-quad.corr (order-independent, stale golden).

Validated on RTX 5060 Ti via the persistent API: CLT==golden (0.1085), RBG==golden (0.0201),
quad-corr value-err 2.06e-5, no_kernels runs finite. The persistent granular API now covers the
full set of GPU ops the LWIR16 CUAS workflow uses (geometry/convert_direct[+no_kernels/use_center_image/
erase_clt/ref_scene]/imclt/correlations) — the surface GpuQuadJna (integration step 2) delegates to.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 06c12c4a
......@@ -71,6 +71,16 @@ static std::string readFile(const std::string& p){ std::ifstream f(p); if(!f) re
// Resolve a kernel by name from the module (cheap; not a hot path in Stage 1).
static CUfunction getfun(TpModule* m, const char* name){ CUfunction f=nullptr; if(m && m->mod) cuModuleGetFunction(&f, m->mod, name); return f; }
// Synchronous driver-API launch with error capture (0 ok, -1 fail -> g_err).
static int launch1(CUfunction f, unsigned gx,unsigned gy,unsigned gz, unsigned bx,unsigned by,unsigned bz,
void** args, const char* name){
CUresult cr; const char* es; cuCtxSynchronize();
cr=cuLaunchKernel(f,gx,gy,gz,bx,by,bz,0,nullptr,args,nullptr);
if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch %s -> %d (%s)",name,cr,es); return -1; }
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync %s -> %d (%s)",name,cr,es); return -1; }
return 0;
}
extern "C" {
TpModule* tp_create_module(const char* srcdir, const char* devrt){
......@@ -572,6 +582,15 @@ struct TpProc {
std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h;
float **gpu_kernels, **gpu_kernel_offsets, **gpu_images, **gpu_clt, **gpu_clt_ref;
float *ftasks; int ntiles; int *active, *num_active;
// imclt (RBG) — pitched per-cam corr_images
bool have_rbg; int rbg_w, rbg_h; size_t dstride_rslt;
std::vector<float*> corr_images_h; float **gpu_corr_images;
// correlations
bool have_corr; int num_pairs, corr_out_rad, corr_length; int sel_pairs[4]; float color_weights[3];
size_t dstride_corr_td, dstride_corr_combo_td, dstride_corr_combo;
float *gpu_corrs_td, *gpu_corrs_combo_td, *gpu_corrs_combo;
int *gpu_corr_indices, *gpu_corrs_combo_indices, *gpu_num_corr_tiles;
int last_num_corr_combo;
};
extern "C" {
......@@ -583,6 +602,9 @@ TpProc* tp_proc_create(TpModule* m){
p->mod=m; p->ftasks=nullptr; p->ntiles=0; p->active=nullptr; p->num_active=nullptr;
p->gpu_kernels=p->gpu_kernel_offsets=p->gpu_images=p->gpu_clt=p->gpu_clt_ref=nullptr;
p->gc=p->cv=p->rbr=p->rot=nullptr;
p->have_rbg=false; p->gpu_corr_images=nullptr;
p->have_corr=false; p->gpu_corrs_td=p->gpu_corrs_combo_td=p->gpu_corrs_combo=nullptr;
p->gpu_corr_indices=p->gpu_corrs_combo_indices=p->gpu_num_corr_tiles=nullptr; p->last_num_corr_combo=0;
return p;
}
......@@ -680,12 +702,85 @@ int tp_proc_get_clt(TpProc* p, int cam, int ref_scene, float* out){ if(!p||cam<0
float* src = (ref_scene? p->clt_ref_h:p->clt_h)[cam];
return cudaMemcpy(out,src,(size_t)p->slice*sizeof(float),cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; }
// Allocate the imclt (RBG) + correlation buffers and store the corr config params.
int tp_proc_setup_rbg_corr(TpProc* p, int num_pairs, int s0,int s1,int s2,int s3,
float cw0,float cw1,float cw2, int corr_out_rad){
if(!p){ seterr("tp_proc_setup_rbg_corr: null"); return -1; }
cuCtxSetCurrent(p->mod->ctx);
const int ntiles = p->tilesx*p->tilesy;
// RBG (imclt) pitched per-cam
p->rbg_w = p->img_w + 8; p->rbg_h = p->num_colors*(p->img_h + 8);
p->corr_images_h.assign(p->num_cams,nullptr);
for(int c=0;c<p->num_cams;c++) p->corr_images_h[c]=alloc_image_gpu(&p->dstride_rslt, p->rbg_w, p->rbg_h);
p->gpu_corr_images=copyalloc_pointers_gpu(p->corr_images_h.data(),p->num_cams);
p->have_rbg=true;
// correlations
p->num_pairs=num_pairs; p->sel_pairs[0]=s0; p->sel_pairs[1]=s1; p->sel_pairs[2]=s2; p->sel_pairs[3]=s3;
p->color_weights[0]=cw0; p->color_weights[1]=cw1; p->color_weights[2]=cw2;
p->corr_out_rad=corr_out_rad; p->corr_length=(2*corr_out_rad+1)*(2*corr_out_rad+1);
const int td_w=4*8*8;
p->gpu_corrs_td = alloc_image_gpu(&p->dstride_corr_td, td_w, num_pairs*ntiles);
p->gpu_corrs_combo_td = alloc_image_gpu(&p->dstride_corr_combo_td, td_w, ntiles);
p->gpu_corrs_combo = alloc_image_gpu(&p->dstride_corr_combo, p->corr_length, ntiles);
cudaMalloc((void**)&p->gpu_corr_indices, (size_t)num_pairs*ntiles*sizeof(int));
cudaMalloc((void**)&p->gpu_corrs_combo_indices, (size_t)ntiles*sizeof(int));
cudaMalloc((void**)&p->gpu_num_corr_tiles, sizeof(int));
p->have_corr=true;
return 0;
}
// imclt_rbg_all: gpu_clt -> gpu_corr_images (RBG reconstruction)
int tp_proc_exec_imclt(TpProc* p, int apply_lpf){
if(!p||!p->have_rbg){ seterr("exec_imclt: no rbg buffers (call setup_rbg_corr)"); return -1; }
cuCtxSetCurrent(p->mod->ctx);
CUfunction f=getfun(p->mod,"imclt_rbg_all"); if(!f){ seterr("imclt_rbg_all missing"); return -2; }
int nc=p->num_cams, lpf=apply_lpf, ncol=p->num_colors, tw=p->tilesx, th=p->tilesy;
size_t rslt=p->dstride_rslt/sizeof(float);
void* a[]={ &nc,&p->gpu_clt,&p->gpu_corr_images,&lpf,&ncol,&tw,&th,&rslt };
return launch1(f,1,1,1,1,1,1,a,"imclt_rbg_all")?-3:0;
}
int tp_proc_get_rbg(TpProc* p, int cam, float* out){ if(!p||!p->have_rbg||cam<0||cam>=p->num_cams)return -1; cuCtxSetCurrent(p->mod->ctx);
return cudaMemcpy2D(out,(size_t)p->rbg_w*sizeof(float), p->corr_images_h[cam], p->dstride_rslt,
(size_t)p->rbg_w*sizeof(float), p->rbg_h, cudaMemcpyDeviceToHost)==cudaSuccess?0:-2; }
// correlate2D(TD) -> corr2D_combine(quad) -> corr2D_normalize. Result in gpu_corrs_combo.
int tp_proc_exec_corr2d(TpProc* p, double fat_zero){
if(!p||!p->have_corr){ seterr("exec_corr2d: no corr buffers (call setup_rbg_corr)"); return -1; }
cuCtxSetCurrent(p->mod->ctx);
CUfunction f_cor=getfun(p->mod,"correlate2D"), f_cmb=getfun(p->mod,"corr2D_combine"), f_nrm=getfun(p->mod,"corr2D_normalize");
if(!f_cor||!f_cmb||!f_nrm){ seterr("corr kernels missing"); return -2; }
float fz2=(float)(fat_zero*fat_zero);
int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, tx=p->tilesx, crad0=0;
int s0=p->sel_pairs[0],s1=p->sel_pairs[1],s2=p->sel_pairs[2],s3=p->sel_pairs[3];
float c0=p->color_weights[0],c1=p->color_weights[1],c2=p->color_weights[2];
size_t ctd=p->dstride_corr_td/sizeof(float);
void* ac[]={ &nc,&s0,&s1,&s2,&s3,&p->gpu_clt,&ncol,&c0,&c1,&c2,&fz2,&p->ftasks,&ntsk,&tx,
&p->gpu_corr_indices,&p->gpu_num_corr_tiles,&ctd,&crad0,&p->gpu_corrs_td };
if(launch1(f_cor,1,1,1,1,1,1,ac,"correlate2D")) return -3;
int num_corrs=0; cudaMemcpy(&num_corrs,p->gpu_num_corr_tiles,sizeof(int),cudaMemcpyDeviceToHost);
int ncc=num_corrs/p->num_pairs; p->last_num_corr_combo=ncc;
int init=1, pm=0x0f, np=p->num_pairs; size_t cctd=p->dstride_corr_combo_td/sizeof(float);
void* ab[]={ &ncc,&np,&init,&pm,&p->gpu_corr_indices,&p->gpu_corrs_combo_indices,&ctd,&p->gpu_corrs_td,&cctd,&p->gpu_corrs_combo_td };
if(launch1(f_cmb,1,1,1,1,1,1,ab,"corr2D_combine")) return -4;
float* cwn=nullptr; int crad=p->corr_out_rad; size_t ccp=p->dstride_corr_combo/sizeof(float);
void* an[]={ &ncc,&cctd,&p->gpu_corrs_combo_td,&cwn,&ccp,&p->gpu_corrs_combo,&fz2,&crad };
if(launch1(f_nrm,1,1,1,1,1,1,an,"corr2D_normalize")) return -5;
return 0;
}
// de-pitch gpu_corrs_combo (last_num_corr_combo tiles x corr_length); returns num_corr_combo.
int tp_proc_get_corr2d_combo(TpProc* p, float* out){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx);
int per=p->corr_length;
if(cudaMemcpy2D(out,(size_t)per*sizeof(float), p->gpu_corrs_combo, p->dstride_corr_combo,
(size_t)per*sizeof(float), p->last_num_corr_combo, cudaMemcpyDeviceToHost)!=cudaSuccess) return -2;
return p->last_num_corr_combo; }
void tp_proc_destroy(TpProc* p); // fwd
// Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
// clt/aux_chnN.clt golden (== Stage 2). Also smoke-tests no_kernels (runs, finite output).
int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_nokern_max, int* out_num_active){
double* out_clt_err, double* out_rbg_err, double* out_corr_err,
double* out_nokern_max, int* out_num_active){
g_err[0]=0;
if(!m){ seterr("tp_proc_convert_selftest: null module"); return -1; }
if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed",data_root); return -2; }
......@@ -693,6 +788,8 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
TpProc* p = tp_proc_create(m);
if(!p) return -3;
if(tp_proc_setup(p, P.num_cams, P.num_colors, P.img_width, P.img_height, P.kernels_hor, P.kern_tiles)) { tp_proc_destroy(p); return -4; }
tp_proc_setup_rbg_corr(p, P.num_pairs, P.sel_pairs[0],P.sel_pairs[1],P.sel_pairs[2],P.sel_pairs[3],
P.color_weights[0],P.color_weights[1],P.color_weights[2], P.corr_out_rad);
const int NC=P.num_cams;
float* hbuf=(float*)malloc((size_t)P.kern_size*sizeof(float));
// geometry
......@@ -725,14 +822,35 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
if(tp_proc_exec_convert_direct(p, 0, -1, 0)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost);
double clt_err=0; std::vector<float> got(p->slice), gold(p->slice);
for(int c=0;c<NC;c++){ readFloatsFromFile(gold.data(), PA.ports_clt_file[c]); tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>clt_err)clt_err=e; } }
double clt_err=0; { std::vector<float> got(p->slice), gold(p->slice);
for(int c=0;c<NC;c++){ readFloatsFromFile(gold.data(), PA.ports_clt_file[c]); tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>clt_err)clt_err=e; } } }
if(out_clt_err) *out_clt_err=clt_err;
// imclt -> RBG, compare to aux_chnN.rbg golden
double rbg_err=-1;
if(tp_proc_exec_imclt(p, 1)==0){
rbg_err=0; int rbg_size=p->rbg_w*p->rbg_h; std::vector<float> got(rbg_size), gold(rbg_size);
for(int c=0;c<NC;c++){ readFloatsFromFile(gold.data(), PA.result_rbg_file[c]); tp_proc_get_rbg(p,c,got.data());
for(int i=0;i<rbg_size;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>rbg_err)rbg_err=e; } }
}
if(out_rbg_err) *out_rbg_err=rbg_err;
// corr2d (quad) -> compare to aux_corr-quad.corr golden ORDER-INDEPENDENTLY (stale golden tile order)
double corr_err=-1;
if(tp_proc_exec_corr2d(p, 1000.0)==0){
int per=p->corr_length, ncc=p->last_num_corr_combo, sz=ncc*per;
std::vector<float> got(sz), gold(sz); tp_proc_get_corr2d_combo(p, got.data());
readFloatsFromFile(gold.data(), PA.result_corr_quad_file);
std::sort(got.begin(),got.end()); std::sort(gold.begin(),gold.end());
corr_err=0; for(int i=0;i<sz;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>corr_err)corr_err=e; }
}
if(out_corr_err) *out_corr_err=corr_err;
// no_kernels smoke test: re-run convert into clt (no_kernels=1), check finite + how much it differs
double nokern_max=0;
if(tp_proc_exec_convert_direct(p, 0, 0, 1)==0){
std::vector<float> got(p->slice);
bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } }
if(!finite) nokern_max=-1; // signal non-finite
......@@ -750,6 +868,9 @@ void tp_proc_destroy(TpProc* p){
cudaFree(p->gpu_kernels); cudaFree(p->gpu_kernel_offsets); cudaFree(p->gpu_images); cudaFree(p->gpu_clt); cudaFree(p->gpu_clt_ref);
cudaFree(p->active); cudaFree(p->num_active); if(p->ftasks)cudaFree(p->ftasks);
cudaFree(p->gc); cudaFree(p->cv); cudaFree(p->rbr); cudaFree(p->rot);
if(p->have_rbg){ for(size_t c=0;c<p->corr_images_h.size();c++) cudaFree(p->corr_images_h[c]); cudaFree(p->gpu_corr_images); }
if(p->have_corr){ cudaFree(p->gpu_corrs_td); cudaFree(p->gpu_corrs_combo_td); cudaFree(p->gpu_corrs_combo);
cudaFree(p->gpu_corr_indices); cudaFree(p->gpu_corrs_combo_indices); cudaFree(p->gpu_num_corr_tiles); }
delete p;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment