Commit edfc7bae authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Stage 3 — native imclt_rbg_all via JNA + .rbg golden validation

Refactor the Stage-2 selftest into run_convert_pipeline(do_imclt) shared helper;
tp_convert_direct_selftest is now a thin wrapper (do_imclt=0). Add tp_imclt_selftest
(do_imclt=1): after convert_direct, allocate pitched RBG output buffers (alloc_image_gpu,
648x520/cam, mono), launch imclt_rbg_all <<<1,1>>> (gpu_clt -> gpu_corr_images),
de-pitch via cudaMemcpy2D, compare to clt/aux_chnN.rbg golden.

Validated on RTX 5060 Ti via Java->JNA: max|RBG-golden|=0.0201 over peaks 1535 ->
relative ~1.31e-5. convert_direct CLT error unchanged (0.108505) => no regression.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 05ee47d0
......@@ -234,8 +234,10 @@ void tp_destroy_instance(TpInstance* inst){
// cuLaunchKernel against the NVRTC module. Reads CLT back and compares to clt/aux_chnN.clt golden.
// data_root must contain clt/ (TpPaths uses relative "clt/..." names). lwir=1 -> 16-cam LWIR set.
// Runtime (cudaMalloc) and driver (cuLaunchKernel) interoperate because the module's context is current.
int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
double* out_max_err, int* out_num_active){
// Shared convert(+optional imclt) pipeline. do_imclt!=0 also runs imclt_rbg_all and
// compares the de-pitched RBG to clt/aux_chnN.rbg, writing out_rbg_err.
static int run_convert_pipeline(TpModule* m, int lwir, const char* data_root, int do_imclt,
double* out_clt_err, double* out_rbg_err, int* out_num_active){
g_err[0]=0;
if(!m){ seterr("selftest: null module"); return -1; }
if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
......@@ -248,7 +250,8 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
CUfunction f_rot = getfun(m,"calc_rot_deriv");
CUfunction f_off = getfun(m,"calculate_tiles_offsets");
CUfunction f_cd = getfun(m,"convert_direct");
if(!f_rbr||!f_rot||!f_off||!f_cd){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd); return -3; }
CUfunction f_im = getfun(m,"imclt_rbg_all");
if(!f_rbr||!f_rot||!f_off||!f_cd||(do_imclt&&!f_im)){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p im=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd,(void*)f_im); return -3; }
int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
......@@ -276,6 +279,16 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
cudaMalloc((void**)&gpu_active, (size_t)P.tilesx*P.tilesy*sizeof(int));
cudaMalloc((void**)&gpu_num_active, sizeof(int));
// --- RBG output images (pitched, 3x-height-for-colors, +dtt_size margins) for imclt ---
const int rbg_w = P.img_width + P.dtt_size; // 648
const int rbg_h = P.num_colors*(P.img_height + P.dtt_size); // 520 (mono)
const int rbg_size = rbg_w * rbg_h; // 336960 floats/cam (== aux_chnN.rbg)
std::vector<float*> corr_h(NC, nullptr); float** gpu_corr_images=nullptr; size_t dstride_rslt=0;
if(do_imclt){
for(int c=0;c<NC;c++) corr_h[c]=alloc_image_gpu(&dstride_rslt, rbg_w, rbg_h);
gpu_corr_images = copyalloc_pointers_gpu(corr_h.data(), NC);
}
// --- tasks (TpHostGpu::setTasks, target_disparity=0, scale=0) ---
std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
......@@ -329,23 +342,57 @@ int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
int num_active=0; cudaMemcpy(&num_active, gpu_num_active, sizeof(int), cudaMemcpyDeviceToHost);
// --- compare CLT to golden clt/aux_chnN.clt ---
double maxerr=0;
std::vector<float> golden(slice), got(slice);
double clt_err=0;
{ std::vector<float> golden(slice), got(slice);
for(int c=0;c<NC;c++){
readFloatsFromFile(golden.data(), PA.ports_clt_file[c]);
cudaMemcpy(got.data(), clt_h[c], (size_t)slice*sizeof(float), cudaMemcpyDeviceToHost);
for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>maxerr) maxerr=e; }
}
if(out_max_err) *out_max_err=maxerr;
for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>clt_err) clt_err=e; }
} }
if(out_clt_err) *out_clt_err=clt_err;
if(out_num_active) *out_num_active=num_active;
// --- imclt_rbg_all <<<1,1>>> (gpu_clt -> gpu_corr_images), compare de-pitched RBG to aux_chnN.rbg ---
double rbg_err=-1;
if(do_imclt){
int apply_lpf=1, num_colors2=P.num_colors, twidth=P.tilesx, theight=P.tilesy;
size_t rslt_floats = dstride_rslt/sizeof(float);
void* a[]={ &NC,&gpu_clt,&gpu_corr_images,&apply_lpf,&num_colors2,&twidth,&theight,&rslt_floats };
cuCtxSynchronize();
cr=cuLaunchKernel(f_im,1,1,1,1,1,1,0,nullptr,a,nullptr);
if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch imclt_rbg_all -> %d (%s)",cr,es); return -12; }
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync imclt_rbg_all -> %d (%s)",cr,es); return -13; }
rbg_err=0;
std::vector<float> gold(rbg_size), got(rbg_size);
for(int c=0;c<NC;c++){
readFloatsFromFile(gold.data(), PA.result_rbg_file[c]);
cudaMemcpy2D(got.data(), (size_t)rbg_w*sizeof(float), corr_h[c], dstride_rslt,
(size_t)rbg_w*sizeof(float), rbg_h, cudaMemcpyDeviceToHost); // de-pitch
for(int i=0;i<rbg_size;i++){ double e=std::fabs((double)got[i]-(double)gold[i]); if(e>rbg_err) rbg_err=e; }
}
}
if(out_rbg_err) *out_rbg_err=rbg_err;
// --- free ---
for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); }
for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); if(do_imclt)cudaFree(corr_h[c]); }
cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
if(gpu_corr_images) cudaFree(gpu_corr_images);
cudaFree(gpu_active); cudaFree(gpu_num_active); cudaFree(gpu_ftasks);
cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
free(fgc); free(fcv); free(hbuf);
return 0;
}
// Stage-2 wrapper: convert_direct only.
int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
double* out_max_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 0, out_max_err, nullptr, out_num_active);
}
// Stage-3 wrapper: convert_direct + imclt_rbg_all (reports both CLT and RBG max error).
int tp_imclt_selftest(TpModule* m, int lwir, const char* data_root,
double* out_clt_err, double* out_rbg_err, int* out_num_active){
return run_convert_pipeline(m, lwir, data_root, 1, out_clt_err, out_rbg_err, out_num_active);
}
} // extern "C"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment