Commit 0399a26d authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: FIX convert_direct deconvolution in JNA — pass (kernels_hor, kernels_vert), not (0, *)

THE production-mismatch bug (RMSE ~1.7 vs JCUDA, invariant to FPN/row-col/MB). convert_direct gates
the deconvolution kernels on `kernels_hor>0` (TileProcessor.cu:2782-2783): with kernels_hor=0 it passes
NULL kernels -> NO deconvolution. tp_proc_exec_convert_direct hardcoded kh=0 (copied from the harness,
whose golden was itself made with no deconvolution), so JNA skipped aberration deconvolution while
production GpuQuad passes (kernels_hor, kernels_vert)=(82,66) and applies it.

Fix: add kernels_vert to TpProc (= kern_tiles/(kernels_hor*num_colors)); exec passes
(no_kernels?0:kernels_hor, no_kernels?0:kernels_vert). tp_proc_convert_selftest now uses no_kernels=1
to keep matching the NO-deconv harness golden (StageProc still PASS: CLT 0.1085 / RBG 0.0201 / corr 2e-5).
Production (GpuQuadJna no_kernels=false) now applies deconvolution = matches JCUDA. .so-only change.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 53599fd3
...@@ -576,7 +576,7 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root, ...@@ -576,7 +576,7 @@ int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
// ============================================================================ // ============================================================================
struct TpProc { struct TpProc {
TpModule* mod; TpModule* mod;
int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kern_tiles, kern_size, slice; int num_cams, num_colors, img_w, img_h, tilesx, tilesy, kernels_hor, kernels_vert, kern_tiles, kern_size, slice;
size_t dstride_img; // image pitch (bytes) size_t dstride_img; // image pitch (bytes)
float *gc, *cv, *rbr, *rot; float *gc, *cv, *rbr, *rot;
std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h; std::vector<float*> kernels_h, offs_h, images_h, clt_h, clt_ref_h;
...@@ -615,6 +615,7 @@ int tp_proc_setup(TpProc* p, int num_cams, int num_colors, int img_w, int img_h, ...@@ -615,6 +615,7 @@ int tp_proc_setup(TpProc* p, int num_cams, int num_colors, int img_w, int img_h,
cuCtxSetCurrent(p->mod->ctx); cuCtxSetCurrent(p->mod->ctx);
p->num_cams=num_cams; p->num_colors=num_colors; p->img_w=img_w; p->img_h=img_h; p->num_cams=num_cams; p->num_colors=num_colors; p->img_w=img_w; p->img_h=img_h;
p->kernels_hor=kernels_hor; p->kern_tiles=kern_tiles; p->kern_size=kern_tiles*4*64; p->kernels_hor=kernels_hor; p->kern_tiles=kern_tiles; p->kern_size=kern_tiles*4*64;
p->kernels_vert = (kernels_hor*num_colors>0) ? kern_tiles/(kernels_hor*num_colors) : 0; // = KERNELS_VERT
p->tilesx=img_w/8; p->tilesy=img_h/8; p->tilesx=img_w/8; p->tilesy=img_h/8;
p->slice = p->tilesy*p->tilesx*num_colors*4*64; p->slice = p->tilesy*p->tilesx*num_colors*4*64;
const int CE = (int)(sizeof(CltExtra)/sizeof(float)); const int CE = (int)(sizeof(CltExtra)/sizeof(float));
...@@ -705,7 +706,7 @@ int tp_proc_exec_convert_direct(TpProc* p, int ref_scene, int erase_clt, int no_ ...@@ -705,7 +706,7 @@ int tp_proc_exec_convert_direct(TpProc* p, int ref_scene, int erase_clt, int no_
cuCtxSynchronize(); cuCtxSynchronize();
} }
int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, lpf=0, ww=p->img_w, wh=p->img_h, int nc=p->num_cams, ncol=p->num_colors, ntsk=p->ntiles, lpf=0, ww=p->img_w, wh=p->img_h,
kh=0, kv=no_kernels?0:p->kernels_hor, tx=p->tilesx; // standard: (0, kernels_hor); no_kernels: (0,0) kh=no_kernels?0:p->kernels_hor, kv=no_kernels?0:p->kernels_vert, tx=p->tilesx; // (kernels_hor,vert) applies deconvolution; (0,0) skips. convert_direct gates kernels on kernels_hor>0.
size_t dstride=p->dstride_img/sizeof(float); size_t dstride=p->dstride_img/sizeof(float);
void* a[]={ &nc,&ncol,&p->gpu_kernel_offsets,&p->gpu_kernels,&p->gpu_images,&p->ftasks,&clt_sel, void* a[]={ &nc,&ncol,&p->gpu_kernel_offsets,&p->gpu_kernels,&p->gpu_images,&p->ftasks,&clt_sel,
&dstride,&ntsk,&lpf,&ww,&wh,&kh,&kv,&p->active,&p->num_active,&tx }; &dstride,&ntsk,&lpf,&ww,&wh,&kh,&kv,&p->active,&p->num_active,&tx };
...@@ -901,9 +902,10 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root, ...@@ -901,9 +902,10 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
} }
tp_proc_set_tasks(p, ftask.data(), P.tp_tasks_size, P.tp_tasks_size*P.task_size); tp_proc_set_tasks(p, ftask.data(), P.tp_tasks_size, P.tp_tasks_size*P.task_size);
// geometry + convert (standard, with kernels) // geometry + convert. NOTE: the harness golden (aux_chnN.clt) was generated with kernels_hor=0
// (NO deconvolution), so match it with no_kernels=1. Production (GpuQuadJna) uses no_kernels=0 -> deconv.
tp_proc_exec_geometry(p, 1); tp_proc_exec_geometry(p, 1);
if(tp_proc_exec_convert_direct(p, 0, -1, 0)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; } if(tp_proc_exec_convert_direct(p, 0, -1, 1)) { free(hbuf); free(fgc); free(fcv); tp_proc_destroy(p); return -5; }
if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost); if(out_num_active) cudaMemcpy(out_num_active, p->num_active, sizeof(int), cudaMemcpyDeviceToHost);
double clt_err=0; { std::vector<float> got(p->slice), gold(p->slice); double clt_err=0; { std::vector<float> got(p->slice), gold(p->slice);
...@@ -931,9 +933,9 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root, ...@@ -931,9 +933,9 @@ int tp_proc_convert_selftest(TpModule* m, int lwir, const char* data_root,
} }
if(out_corr_err) *out_corr_err=corr_err; if(out_corr_err) *out_corr_err=corr_err;
// no_kernels smoke test: re-run convert into clt (no_kernels=1), check finite + how much it differs // deconvolution (no_kernels=0) smoke test: applies kernels (production path), check finite: re-run convert into clt (no_kernels=1), check finite + how much it differs
double nokern_max=0; double nokern_max=0;
if(tp_proc_exec_convert_direct(p, 0, 0, 1)==0){ if(tp_proc_exec_convert_direct(p, 0, 0, 0)==0){ // no_kernels=0 -> deconvolution applied
std::vector<float> got(p->slice); std::vector<float> got(p->slice);
bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data()); bool finite=true; for(int c=0;c<NC && finite;c++){ tp_proc_get_clt(p,c,0,got.data());
for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } } for(int i=0;i<p->slice;i++){ if(std::isnan(got[i])||std::isinf(got[i])){finite=false;break;} double a=std::fabs(got[i]); if(a>nokern_max)nokern_max=a; } }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment