Commit 2f926aee authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Step 2 corr — split TpProc into granular...

CLAUDE: Step 2 corr — split TpProc into granular correlate2D_td/_inter_td/combine/normalize + get_corr2d

Match GpuQuad's separate correlation calls: tp_proc_exec_corr2d_td (correlate2D TD, fat_zero=0,
corr_radius=0 -> gpu_corrs_td, returns num_corr_tiles), tp_proc_exec_corr2d_inter_td (correlate2D_inter
clt vs clt_ref, sel_sensors), tp_proc_exec_corr2d_combine (init|no_transpose<<1, num_pairs, pairs_mask
-> num_corr_combo), tp_proc_exec_corr2d_normalize (combo -> corrs_combo / per-pair -> gpu_corrs),
tp_proc_get_corr2d (per-pair de-pitch) + tp_proc_num_corr_tiles/combo. Add per-pair pixel buffer
gpu_corrs (+dstride_corr). The bundled tp_proc_exec_corr2d/StageProc path is retained.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent d1c277ca
...@@ -587,10 +587,10 @@ struct TpProc { ...@@ -587,10 +587,10 @@ struct TpProc {
std::vector<float*> corr_images_h; float **gpu_corr_images; std::vector<float*> corr_images_h; float **gpu_corr_images;
// correlations // correlations
bool have_corr; int num_pairs, corr_out_rad, corr_length; int sel_pairs[4]; float color_weights[3]; bool have_corr; int num_pairs, corr_out_rad, corr_length; int sel_pairs[4]; float color_weights[3];
size_t dstride_corr_td, dstride_corr_combo_td, dstride_corr_combo; size_t dstride_corr_td, dstride_corr_combo_td, dstride_corr_combo, dstride_corr;
float *gpu_corrs_td, *gpu_corrs_combo_td, *gpu_corrs_combo; float *gpu_corrs_td, *gpu_corrs_combo_td, *gpu_corrs_combo, *gpu_corrs; // gpu_corrs = per-pair pixel-domain
int *gpu_corr_indices, *gpu_corrs_combo_indices, *gpu_num_corr_tiles; int *gpu_corr_indices, *gpu_corrs_combo_indices, *gpu_num_corr_tiles;
int last_num_corr_combo; int last_num_corr_combo, last_num_corr_tiles;
}; };
extern "C" { extern "C" {
...@@ -603,8 +603,9 @@ TpProc* tp_proc_create(TpModule* m){ ...@@ -603,8 +603,9 @@ TpProc* tp_proc_create(TpModule* m){
p->gpu_kernels=p->gpu_kernel_offsets=p->gpu_images=p->gpu_clt=p->gpu_clt_ref=nullptr; p->gpu_kernels=p->gpu_kernel_offsets=p->gpu_images=p->gpu_clt=p->gpu_clt_ref=nullptr;
p->gc=p->cv=p->rbr=p->rot=nullptr; p->gc=p->cv=p->rbr=p->rot=nullptr;
p->have_rbg=false; p->gpu_corr_images=nullptr; p->have_rbg=false; p->gpu_corr_images=nullptr;
p->have_corr=false; p->gpu_corrs_td=p->gpu_corrs_combo_td=p->gpu_corrs_combo=nullptr; p->have_corr=false; p->gpu_corrs_td=p->gpu_corrs_combo_td=p->gpu_corrs_combo=p->gpu_corrs=nullptr;
p->gpu_corr_indices=p->gpu_corrs_combo_indices=p->gpu_num_corr_tiles=nullptr; p->last_num_corr_combo=0; p->gpu_corr_indices=p->gpu_corrs_combo_indices=p->gpu_num_corr_tiles=nullptr;
p->last_num_corr_combo=0; p->last_num_corr_tiles=0;
return p; return p;
} }
...@@ -722,6 +723,7 @@ int tp_proc_setup_rbg_corr(TpProc* p, int num_pairs, int s0,int s1,int s2,int s3 ...@@ -722,6 +723,7 @@ int tp_proc_setup_rbg_corr(TpProc* p, int num_pairs, int s0,int s1,int s2,int s3
p->gpu_corrs_td = alloc_image_gpu(&p->dstride_corr_td, td_w, num_pairs*ntiles); p->gpu_corrs_td = alloc_image_gpu(&p->dstride_corr_td, td_w, num_pairs*ntiles);
p->gpu_corrs_combo_td = alloc_image_gpu(&p->dstride_corr_combo_td, td_w, ntiles); p->gpu_corrs_combo_td = alloc_image_gpu(&p->dstride_corr_combo_td, td_w, ntiles);
p->gpu_corrs_combo = alloc_image_gpu(&p->dstride_corr_combo, p->corr_length, ntiles); p->gpu_corrs_combo = alloc_image_gpu(&p->dstride_corr_combo, p->corr_length, ntiles);
p->gpu_corrs = alloc_image_gpu(&p->dstride_corr, p->corr_length, num_pairs*ntiles); // per-pair pixel-domain
cudaMalloc((void**)&p->gpu_corr_indices, (size_t)num_pairs*ntiles*sizeof(int)); cudaMalloc((void**)&p->gpu_corr_indices, (size_t)num_pairs*ntiles*sizeof(int));
cudaMalloc((void**)&p->gpu_corrs_combo_indices, (size_t)ntiles*sizeof(int)); cudaMalloc((void**)&p->gpu_corrs_combo_indices, (size_t)ntiles*sizeof(int));
cudaMalloc((void**)&p->gpu_num_corr_tiles, sizeof(int)); cudaMalloc((void**)&p->gpu_num_corr_tiles, sizeof(int));
...@@ -774,6 +776,67 @@ int tp_proc_get_corr2d_combo(TpProc* p, float* out){ if(!p||!p->have_corr)return ...@@ -774,6 +776,67 @@ int tp_proc_get_corr2d_combo(TpProc* p, float* out){ if(!p||!p->have_corr)return
(size_t)per*sizeof(float), p->last_num_corr_combo, cudaMemcpyDeviceToHost)!=cudaSuccess) return -2; (size_t)per*sizeof(float), p->last_num_corr_combo, cudaMemcpyDeviceToHost)!=cudaSuccess) return -2;
return p->last_num_corr_combo; } return p->last_num_corr_combo; }
// ---- granular correlations (match GpuQuad's separate execCorr2D_TD/_inter_TD/_combine/_normalize) ----
// correlate2D (TD, fat_zero=0, corr_radius=0) -> gpu_corrs_td; sets corr_indices + num_corr_tiles. returns count.
int tp_proc_exec_corr2d_td(TpProc* p, int s0,int s1,int s2,int s3, float sc0,float sc1,float sc2){
if(!p||!p->have_corr){ seterr("exec_corr2d_td: no corr buffers"); return -1; } cuCtxSetCurrent(p->mod->ctx);
CUfunction f=getfun(p->mod,"correlate2D"); if(!f){ seterr("correlate2D missing"); return -2; }
int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, tx=p->tilesx, crad0=0; float fz0=0.0f;
size_t ctd=p->dstride_corr_td/sizeof(float);
void* a[]={ &nc,&s0,&s1,&s2,&s3,&p->gpu_clt,&ncol,&sc0,&sc1,&sc2,&fz0,&p->ftasks,&ntsk,&tx,
&p->gpu_corr_indices,&p->gpu_num_corr_tiles,&ctd,&crad0,&p->gpu_corrs_td };
if(launch1(f,1,1,1,1,1,1,a,"correlate2D")) return -3;
cudaMemcpy(&p->last_num_corr_tiles,p->gpu_num_corr_tiles,sizeof(int),cudaMemcpyDeviceToHost);
return p->last_num_corr_tiles;
}
// correlate2D_inter (clt vs clt_ref, TD) -> gpu_corrs_td. returns count. (needs clt_ref via exec_convert_direct ref_scene=1)
int tp_proc_exec_corr2d_inter_td(TpProc* p, int sel_sensors, float sc0,float sc1,float sc2){
if(!p||!p->have_corr){ seterr("exec_corr2d_inter_td: no corr buffers"); return -1; } cuCtxSetCurrent(p->mod->ctx);
CUfunction f=getfun(p->mod,"correlate2D_inter"); if(!f){ seterr("correlate2D_inter missing"); return -2; }
int nc=p->num_cams, ss=sel_sensors, ncol=p->num_colors, ntsk=p->ntiles, tx=p->tilesx;
size_t ctd=p->dstride_corr_td/sizeof(float);
void* a[]={ &nc,&ss,&p->gpu_clt,&p->gpu_clt_ref,&ncol,&sc0,&sc1,&sc2,&p->ftasks,&ntsk,&tx,
&p->gpu_corr_indices,&p->gpu_num_corr_tiles,&ctd,&p->gpu_corrs_td };
if(launch1(f,1,1,1,1,1,1,a,"correlate2D_inter")) return -3;
cudaMemcpy(&p->last_num_corr_tiles,p->gpu_num_corr_tiles,sizeof(int),cudaMemcpyDeviceToHost);
return p->last_num_corr_tiles;
}
// corr2D_combine (init_corr_combo = init|(no_transpose<<1)) -> gpu_corrs_combo_td. returns num_corr_combo.
int tp_proc_exec_corr2d_combine(TpProc* p, int init_corr_combo, int num_pairs, int pairs_mask){
if(!p||!p->have_corr){ seterr("exec_corr2d_combine: no corr buffers"); return -1; } cuCtxSetCurrent(p->mod->ctx);
CUfunction f=getfun(p->mod,"corr2D_combine"); if(!f){ seterr("corr2D_combine missing"); return -2; }
int num_corrs=0; cudaMemcpy(&num_corrs,p->gpu_num_corr_tiles,sizeof(int),cudaMemcpyDeviceToHost);
int ncc=num_corrs/num_pairs; p->last_num_corr_combo=ncc;
size_t ctd=p->dstride_corr_td/sizeof(float), cctd=p->dstride_corr_combo_td/sizeof(float);
void* a[]={ &ncc,&num_pairs,&init_corr_combo,&pairs_mask,&p->gpu_corr_indices,&p->gpu_corrs_combo_indices,
&ctd,&p->gpu_corrs_td,&cctd,&p->gpu_corrs_combo_td };
if(launch1(f,1,1,1,1,1,1,a,"corr2D_combine")) return -3;
return ncc;
}
// corr2D_normalize: combo!=0 -> gpu_corrs_combo_td->gpu_corrs_combo; else gpu_corrs_td->gpu_corrs (per-pair).
int tp_proc_exec_corr2d_normalize(TpProc* p, int combo, double fat_zero, int corr_radius){
if(!p||!p->have_corr){ seterr("exec_corr2d_normalize: no corr buffers"); return -1; } cuCtxSetCurrent(p->mod->ctx);
CUfunction f=getfun(p->mod,"corr2D_normalize"); if(!f){ seterr("corr2D_normalize missing"); return -2; }
float fz2=(float)(fat_zero*fat_zero); float* wn=nullptr; int crad=corr_radius;
if(combo){
int nct=p->last_num_corr_combo; size_t std=p->dstride_corr_combo_td/sizeof(float), sp=p->dstride_corr_combo/sizeof(float);
void* a[]={ &nct,&std,&p->gpu_corrs_combo_td,&wn,&sp,&p->gpu_corrs_combo,&fz2,&crad };
return launch1(f,1,1,1,1,1,1,a,"corr2D_normalize_combo")?-3:0;
} else {
int nct=p->last_num_corr_tiles; size_t std=p->dstride_corr_td/sizeof(float), sp=p->dstride_corr/sizeof(float);
void* a[]={ &nct,&std,&p->gpu_corrs_td,&wn,&sp,&p->gpu_corrs,&fz2,&crad };
return launch1(f,1,1,1,1,1,1,a,"corr2D_normalize_pair")?-3:0;
}
}
// de-pitch per-pair gpu_corrs (last_num_corr_tiles x (2*corr_rad+1)^2). returns count.
int tp_proc_get_corr2d(TpProc* p, float* out, int corr_rad){ if(!p||!p->have_corr)return -1; cuCtxSetCurrent(p->mod->ctx);
int per=(2*corr_rad+1)*(2*corr_rad+1);
if(cudaMemcpy2D(out,(size_t)per*sizeof(float), p->gpu_corrs, p->dstride_corr,
(size_t)per*sizeof(float), p->last_num_corr_tiles, cudaMemcpyDeviceToHost)!=cudaSuccess) return -2;
return p->last_num_corr_tiles; }
int tp_proc_num_corr_tiles(TpProc* p){ return p?p->last_num_corr_tiles:-1; }
int tp_proc_num_corr_combo(TpProc* p){ return p?p->last_num_corr_combo:-1; }
void tp_proc_destroy(TpProc* p); // fwd void tp_proc_destroy(TpProc* p); // fwd
// Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to // Validate the persistent TpProc convert path end-to-end (file-driven), comparing CLT to
...@@ -869,7 +932,7 @@ void tp_proc_destroy(TpProc* p){ ...@@ -869,7 +932,7 @@ void tp_proc_destroy(TpProc* p){
cudaFree(p->active); cudaFree(p->num_active); if(p->ftasks)cudaFree(p->ftasks); cudaFree(p->active); cudaFree(p->num_active); if(p->ftasks)cudaFree(p->ftasks);
cudaFree(p->gc); cudaFree(p->cv); cudaFree(p->rbr); cudaFree(p->rot); cudaFree(p->gc); cudaFree(p->cv); cudaFree(p->rbr); cudaFree(p->rot);
if(p->have_rbg){ for(size_t c=0;c<p->corr_images_h.size();c++) cudaFree(p->corr_images_h[c]); cudaFree(p->gpu_corr_images); } if(p->have_rbg){ for(size_t c=0;c<p->corr_images_h.size();c++) cudaFree(p->corr_images_h[c]); cudaFree(p->gpu_corr_images); }
if(p->have_corr){ cudaFree(p->gpu_corrs_td); cudaFree(p->gpu_corrs_combo_td); cudaFree(p->gpu_corrs_combo); if(p->have_corr){ cudaFree(p->gpu_corrs_td); cudaFree(p->gpu_corrs_combo_td); cudaFree(p->gpu_corrs_combo); cudaFree(p->gpu_corrs);
cudaFree(p->gpu_corr_indices); cudaFree(p->gpu_corrs_combo_indices); cudaFree(p->gpu_num_corr_tiles); } cudaFree(p->gpu_corr_indices); cudaFree(p->gpu_corrs_combo_indices); cudaFree(p->gpu_num_corr_tiles); }
delete p; delete p;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment