Commit 341538c7 authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell;...

CLAUDE: Stage 5 — native textures_nonoverlap via JNA (executes on Blackwell; golden mismatch documented)

Extend run_convert_pipeline with do_tex: setTextures/setRGBA-equivalent buffers
(texture_indices from tasks, gpu_textures, diff_rgb_combo, color_weights, generate_RBGA_params),
cuFuncSetAttribute(textures_accumulate, MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size) for the CDP
child, launch textures_nonoverlap <<<1,1>>> (18 args), de-interleave diff_rgb_combo. tp_tex_selftest.

RESULT on RTX 5060 Ti: textures_nonoverlap + its CDP child textures_accumulate EXECUTE correctly
(no errors, shared 58880, 5120 tiles, output rms within ~1% of golden) => Blackwell compatibility
confirmed. BUT diff_rgb_combo does NOT match the Jul-2025 golden numerically (value layers off by
constant ~268, diff layers diff_sigma-sensitive). Ruled out input-CLT sensitivity (same error with
golden CLT), diff_sigma (10.0 closest), arg/param order. NOT used by the LWIR16 CUAS workflow
(cuas/ uses only convert_direct/corr2D_normalize/imclt_rbg_all). Documented known issue (golden
staleness / unverified RGB-path drift), to track later via git bisect + the 107 kernel branch switch.
See imagej-elphel-internal handoffs/2026-06-25_texture-diff-rgb-combo-mismatch.md.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent a0984dca
......@@ -238,8 +238,8 @@ void tp_destroy_instance(TpInstance* inst){
// do_imclt!=0 → imclt_rbg_all, compare de-pitched RBG to clt/aux_chnN.rbg (out_rbg_err).
// do_corr!=0 → correlate2D(TD) + corr2D_combine + corr2D_normalize, compare the quad-combined
// pixel-domain correlation to clt/aux_corr-quad.corr (out_corr_err).
static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt, int do_corr,
double* out_clt_err, double* out_rbg_err, double* out_corr_err, int* out_num_active){
static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt, int do_corr, int do_tex,
double* out_clt_err, double* out_rbg_err, double* out_corr_err, double* out_tex_err, int* out_num_active){
g_err[0]=0;
if(!m){ seterr("selftest: null module"); return -1; }
if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
......@@ -256,9 +256,11 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
CUfunction f_cor = getfun(m,"correlate2D");
CUfunction f_cmb = getfun(m,"corr2D_combine");
CUfunction f_nrm = getfun(m,"corr2D_normalize");
if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)||(do_corr&&(!f_cor||!f_cmb||!f_nrm))){
seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p cor=%p cmb=%p nrm=%p",
(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im,(void*)f_cor,(void*)f_cmb,(void*)f_nrm); return -3; }
CUfunction f_tex = getfun(m,"textures_nonoverlap");
CUfunction f_acc = getfun(m,"textures_accumulate"); // CDP child; needs max-dyn-shared attribute
if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)||(do_corr&&(!f_cor||!f_cmb||!f_nrm))||(do_tex&&(!f_tex||!f_acc))){
seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p cor=%p cmb=%p nrm=%p tex=%p acc=%p",
(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im,(void*)f_cor,(void*)f_cmb,(void*)f_nrm,(void*)f_tex,(void*)f_acc); return -3; }
int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
......@@ -446,6 +448,73 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
}
if(out_corr_err) *out_corr_err=corr_err;
// --- textures: textures_nonoverlap (DP; CDP-launches textures_accumulate) vs aux_diff_rgb_combo_dp.drbg ---
double tex_err=-1;
float *gpu_textures=nullptr,*gpu_diff_rgb_combo=nullptr,*gpu_color_weights=nullptr,*gpu_rbga_params=nullptr;
int *gpu_texture_indices=nullptr,*gpu_pnum_texture_tiles=nullptr;
if(do_tex){
// NOTE: textures_nonoverlap EXECUTES correctly on Blackwell (CDP child textures_accumulate launches),
// output magnitude matches golden (rms within ~1%), BUT diff_rgb_combo does NOT match the Jul-2025
// golden numerically (value layers off by a constant ~268, diff layers diff_sigma-sensitive). Verified
// NOT input sensitivity (same error when gpu_clt is overwritten with the golden CLT), NOT diff_sigma
// (10.0 current is closest; 1.5/1.0 worse), NOT arg/param order (matches kernel signature). Texture
// path is NOT used by the LWIR16 CUAS workflow (cuas/ uses only convert_direct/corr2D_normalize/imclt).
// Treated as a documented known issue (likely golden staleness / unverified RGB-path drift); to be
// tracked later via git bisect of TileProcessor.cu textures + the 107 kernel branch switch.
// See handoffs/2026-06-25_texture-diff-rgb-combo-mismatch.md.
int tex_colors=P.texture_colors, keep_w=P.keep_texture_weights;
int tile_layers = tex_colors+1+(keep_w?(NC+tex_colors+1):0);
int tile_tex_size = tile_layers*256;
const int ntiles = P.tilesx*P.tilesy;
// texture index list from tasks (all tiles have task_text_en set in setTasks)
std::vector<int> tex_idx(ntiles); int num_tex=0;
for(int ty=0;ty<P.tilesy;ty++) for(int tx=0;tx<P.tilesx;tx++){
int nt=ty*P.tilesx+tx; int task=*(int*)&ftask[(size_t)P.task_size*nt];
if(task & (P.task_texture_bits | (1<<P.task_text_en))) tex_idx[num_tex++]=(nt<<P.text_ntile_shift)|(1<<P.list_texture_bit);
}
gpu_texture_indices = (int*)copyalloc_kernel_gpu((float*)tex_idx.data(), num_tex, P.tilesx*P.tilesya);
size_t dstride_textures=0;
gpu_textures = alloc_image_gpu(&dstride_textures, tile_tex_size, ntiles);
cudaMalloc((void**)&gpu_diff_rgb_combo, (size_t)ntiles*NC*(P.num_colors+1)*sizeof(float));
cudaMalloc((void**)&gpu_pnum_texture_tiles, sizeof(int));
int zero=0; cudaMemcpy(gpu_pnum_texture_tiles,&zero,sizeof(int),cudaMemcpyHostToDevice);
gpu_color_weights = copyalloc_kernel_gpu((float*)P.color_weights, (int)(sizeof(P.color_weights)/sizeof(float)));
gpu_rbga_params = copyalloc_kernel_gpu((float*)P.generate_RBGA_params, 5);
int shared_size = host_get_textures_shared_size(NC, tex_colors, 0);
cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_size);
cuFuncSetAttribute(f_acc, CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT, 100);
int num_cams=NC, ntsk=P.tp_tasks_size, is_lwir=(tex_colors==1), dust=1, linescan=1, tilesx=P.tilesx;
size_t tex_stride=dstride_textures/sizeof(float);
void* a[]={ &num_cams,&gpu_ftasks,&ntsk,&gpu_texture_indices,&gpu_pnum_texture_tiles,&gpu_clt,&gpu_gc,
&tex_colors,&is_lwir,&gpu_rbga_params,&gpu_color_weights,&dust,&keep_w,
&tex_stride,&gpu_textures,&linescan,&gpu_diff_rgb_combo,&tilesx };
cuCtxSynchronize();
cr=cuLaunchKernel(f_tex,1,1,1,1,1,1,0,nullptr,a,nullptr);
if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch textures_nonoverlap -> %d (%s)",cr,es); return -30; }
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync textures_nonoverlap -> %d (%s)",cr,es); return -31; }
// diff_rgb_combo: device order [ntile*nlayers+nl] -> golden order [nl*ntiles+ntile]; raster (order-stable)
int nlayers=NC*(P.num_colors+1), sz=ntiles*nlayers;
std::vector<float> dev(sz), got(sz), gold(sz);
cudaMemcpy(dev.data(), gpu_diff_rgb_combo, (size_t)sz*sizeof(float), cudaMemcpyDeviceToHost);
for(int nl=0;nl<nlayers;nl++) for(int nt=0;nt<ntiles;nt++) got[nl*ntiles+nt]=dev[nt*nlayers+nl];
readFloatsFromFile(gold.data(), PA.result_diff_rgb_combo_file_dp);
tex_err=0; double gotmax=0,gmax=0,sgot=0,sg=0; int nnan_got=0,nnan_gold=0;
for(int i=0;i<sz;i++){ if(std::isnan(got[i]))nnan_got++; if(std::isnan(gold[i]))nnan_gold++;
double e=std::fabs((double)got[i]-(double)gold[i]); if(e>tex_err) tex_err=e;
if(std::fabs(got[i])>gotmax)gotmax=std::fabs(got[i]); if(std::fabs(gold[i])>gmax)gmax=std::fabs(gold[i]);
sgot+=(double)got[i]*got[i]; sg+=(double)gold[i]*gold[i]; }
std::vector<float> sa(got),sb(gold); std::sort(sa.begin(),sa.end()); std::sort(sb.begin(),sb.end());
double serr=0; for(int i=0;i<sz;i++){ double e=std::fabs((double)sa[i]-(double)sb[i]); if(e>serr)serr=e; }
// per-layer-group: are the first ntiles (layer 0) matching? compare layer 0 only (diff[cam0])
double l0err=0; for(int nt=0;nt<ntiles;nt++){ double e=std::fabs((double)got[nt]-(double)gold[nt]); if(e>l0err)l0err=e; }
printf("TEX DIAG: num_tex=%d nlayers=%d shared=%d | pointwise=%.4g sorted=%.6g layer0=%.4g | got(max=%.2f rms=%.3f) gold(max=%.2f rms=%.3f)\n",
num_tex, nlayers, shared_size, tex_err, serr, l0err, gotmax, std::sqrt(sgot/sz), gmax, std::sqrt(sg/sz));
}
if(out_tex_err) *out_tex_err=tex_err;
// --- free ---
for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); if(do_imclt)cudaFree(corr_h[c]); }
cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
......@@ -454,6 +523,8 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
if(do_corr){ cudaFree(gpu_corrs_td); cudaFree(gpu_corrs_combo_td); cudaFree(gpu_corrs_combo);
cudaFree(gpu_corr_indices); cudaFree(gpu_corrs_combo_indices); cudaFree(gpu_num_corr_tiles); }
if(do_tex){ cudaFree(gpu_textures); cudaFree(gpu_diff_rgb_combo); cudaFree(gpu_texture_indices);
cudaFree(gpu_pnum_texture_tiles); cudaFree(gpu_color_weights); cudaFree(gpu_rbga_params); }
free(fgc); free(fcv); free(hbuf);
return 0;
}
......@@ -461,19 +532,25 @@ static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, in
// Stage-2 wrapper: convert_direct only.
int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
double* out_max_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 0,0, out_max_err, nullptr, nullptr, out_num_active);
return run_convert_pipeline(m, lwir, data_root, 0,0,0, out_max_err, nullptr, nullptr, nullptr, out_num_active);
}
// Stage-3 wrapper: convert_direct + imclt_rbg_all (reports both CLT and RBG max error).
int tp_imclt_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_rbg_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 1,0, out_clt_err, out_rbg_err, nullptr, out_num_active);
return run_convert_pipeline(m, lwir, data_root, 1,0,0, out_clt_err, out_rbg_err, nullptr, nullptr, out_num_active);
}
// Stage-4 wrapper: convert_direct + correlate2D/combine/normalize (reports CLT and quad-corr max error).
int tp_corr_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_corr_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 0,1, out_clt_err, nullptr, out_corr_err, out_num_active);
return run_convert_pipeline(m, lwir, data_root, 0,1,0, out_clt_err, nullptr, out_corr_err, nullptr, out_num_active);
}
// Stage-5 wrapper: convert_direct + textures_nonoverlap (reports CLT and diff_rgb_combo max error).
int tp_tex_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_tex_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 0,0,1, out_clt_err, nullptr, nullptr, out_tex_err, out_num_active);
}
} // extern "C"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment