Commit 05ee47d0 authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Stage 2 — native convert_direct selftest (first real execution + CDP on Blackwell)

Add tp_convert_direct_selftest to the JNA shim: mirrors TpHostGpu allTests' convert
path (setImageKernels/setImgBuffers/setCltBuffers/setTasks + calc_reverse_distortions
-> rot_derivs -> calculate_tiles_offsets [CDP] -> convert_direct), reusing the harness
runtime-API host helpers (tp_utils/tp_files/TpParams/tp_paths) for ALL allocation and
porting only the launches to driver-API cuLaunchKernel against the NVRTC module. Reads
CLT back, compares to clt/aux_chnN.clt golden.

build_lib.sh: nvcc + -std=c++17 (static constexpr TpParams members become inline),
-Isrc + cuda-samples Common (helper_cuda.h), --pre-include algorithm.

Validated on RTX 5060 Ti via Java->JNA: num_active_tiles=5120 (all), max|CLT-golden|
=0.1085 over peaks of 12260 -> relative ~8.85e-6 (float32 NVRTC-vs-nvcc variation).
First CDP (calculate_tiles_offsets) and 17-arg pointer-of-pointers convert_direct
launch executing natively on Blackwell.
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent a5b7c269
#!/usr/bin/env bash #!/usr/bin/env bash
# Build libtileproc.so (JNA-callable native tile-processor shim). By Claude on 2026-06-25. # Build libtileproc.so (JNA-callable native tile-processor shim). By Claude on 2026-06-25.
# Stage 2 adds the convert_direct selftest, which reuses the harness host helpers
# (tp_utils/tp_files/TpParams/tp_paths -- no __global__ kernels), so build with nvcc + -Isrc.
set -e set -e
cd "$(dirname "$0")" cd "$(dirname "$0")"
CUDA="${CUDA_HOME:-/usr/local/cuda}" CUDA="${CUDA_HOME:-/usr/local/cuda}"
g++ -std=c++14 -shared -fPIC tp_jna.cpp -o libtileproc.so \ SRC=../src
-I"$CUDA/include" -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \ SAMPLES="${CUDA_SAMPLES:-/home/elphel/git/cuda-samples/Common}" # helper_cuda.h etc.
-lnvrtc -lcuda nvcc -std=c++17 -O2 --shared -Xcompiler -fPIC \
--pre-include algorithm \
-I"$SRC" -I"$CUDA/include" -I"$SAMPLES" \
tp_jna.cpp "$SRC/tp_utils.cu" "$SRC/tp_files.cu" "$SRC/TpParams.cu" "$SRC/tp_paths.cu" \
-o libtileproc.so \
-L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \
-lnvrtc -lcuda -lcudart
echo "built ./libtileproc.so" echo "built ./libtileproc.so"
...@@ -3,13 +3,22 @@ ...@@ -3,13 +3,22 @@
// tp_module_num_functions, tp_last_error, tp_destroy_module. By Claude on 2026-06-25. // tp_module_num_functions, tp_last_error, tp_destroy_module. By Claude on 2026-06-25.
#include <cuda.h> #include <cuda.h>
#include <nvrtc.h> #include <nvrtc.h>
#include <cuda_runtime.h>
#include <cstdio> #include <cstdio>
#include <cstdarg> #include <cstdarg>
#include <cstring> #include <cstring>
#include <cstdlib>
#include <cmath>
#include <string> #include <string>
#include <vector> #include <vector>
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include <unistd.h> // chdir
// Reuse the harness's host-side params/paths/helpers verbatim (no __global__ kernels in these TUs):
#include "TpParams.h" // pulls geometry_correction.h (struct gc, CltExtra, TP_TASK_* offsets), tp_defines.h
#include "tp_paths.h"
#include "tp_utils.h" // copyalloc_kernel_gpu / copyalloc_pointers_gpu / copyalloc_image_gpu / alloc_kernel_gpu
#include "tp_files.h" // readFloatsFromFile / readAllFloatsFromFile
static char g_err[8192] = ""; static char g_err[8192] = "";
static void seterr(const char* fmt, ...){ va_list ap; va_start(ap,fmt); vsnprintf(g_err,sizeof(g_err),fmt,ap); va_end(ap); } static void seterr(const char* fmt, ...){ va_list ap; va_start(ap,fmt); vsnprintf(g_err,sizeof(g_err),fmt,ap); va_end(ap); }
...@@ -218,4 +227,125 @@ void tp_destroy_instance(TpInstance* inst){ ...@@ -218,4 +227,125 @@ void tp_destroy_instance(TpInstance* inst){
delete inst; delete inst;
} }
// ---- Stage 2: convert_direct selftest (the first real-execution milestone) ----
// Mirrors TpHostGpu allTests' convert path (setImageKernels/setImgBuffers/setCltBuffers/setTasks +
// calc_reverse_distortions -> rot_derivs -> calculate_tiles_offsets [CDP] -> convert_direct), reusing
// the harness's runtime-API host helpers for ALL allocation and porting only the launches to driver-API
// cuLaunchKernel against the NVRTC module. Reads CLT back and compares to clt/aux_chnN.clt golden.
// data_root must contain clt/ (TpPaths uses relative "clt/..." names). lwir=1 -> 16-cam LWIR set.
// Runtime (cudaMalloc) and driver (cuLaunchKernel) interoperate because the module's context is current.
int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
double* out_max_err, int* out_num_active){
g_err[0]=0;
if(!m){ seterr("selftest: null module"); return -1; }
if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
cuCtxSetCurrent(m->ctx);
CUresult cr; const char* es;
TpParams P((int)lwir);
TpPaths PA((int)lwir);
CUfunction f_rbr = getfun(m,"calcReverseDistortionTable");
CUfunction f_rot = getfun(m,"calc_rot_deriv");
CUfunction f_off = getfun(m,"calculate_tiles_offsets");
CUfunction f_cd = getfun(m,"convert_direct");
if(!f_rbr||!f_rot||!f_off||!f_cd){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd); return -3; }
int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
// --- kernels + kernel offsets (pointer-of-pointers) ---
std::vector<float*> kernels_h(NC), offs_h(NC);
for(int c=0;c<NC;c++){
readFloatsFromFile(hbuf, PA.kernel_file[c]); kernels_h[c]=copyalloc_kernel_gpu(hbuf, P.kern_size);
readFloatsFromFile(hbuf, PA.kernel_offs_file[c]); offs_h[c]=copyalloc_kernel_gpu(hbuf, P.kern_tiles*(int)(sizeof(CltExtra)/sizeof(float)));
}
float** gpu_kernels = copyalloc_pointers_gpu(kernels_h.data(), NC);
float** gpu_kernel_offsets = copyalloc_pointers_gpu(offs_h.data(), NC);
// --- images (pitched) ---
std::vector<float*> images_h(NC); size_t dstride=0;
for(int c=0;c<NC;c++){ readFloatsFromFile(hbuf, PA.image_files[c]); images_h[c]=copyalloc_image_gpu(hbuf,&dstride,P.img_width,P.img_height); }
float** gpu_images = copyalloc_pointers_gpu(images_h.data(), NC);
// --- CLT output buffers ---
const int slice = P.tilesy*P.tilesx*P.num_colors*4*P.dtt_size*P.dtt_size;
std::vector<float*> clt_h(NC);
for(int c=0;c<NC;c++) clt_h[c]=alloc_kernel_gpu(slice);
float** gpu_clt = copyalloc_pointers_gpu(clt_h.data(), NC);
int *gpu_active=nullptr,*gpu_num_active=nullptr;
cudaMalloc((void**)&gpu_active, (size_t)P.tilesx*P.tilesy*sizeof(int));
cudaMalloc((void**)&gpu_num_active, sizeof(int));
// --- tasks (TpHostGpu::setTasks, target_disparity=0, scale=0) ---
std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
std::vector<float> ftask((size_t)P.tp_tasks_size*P.task_size, 0.0f);
for(int ty=0; ty<P.tilesy; ty++) for(int tx=0; tx<P.tilesx; tx++){
int nt = ty*P.tilesx + tx;
int task_task = (1<<P.task_inter_en)|(1<<P.task_corr_en)|(1<<P.task_text_en);
int task_txy = tx + (ty<<16);
float* tp = ftask.data() + (size_t)P.task_size*nt;
tp[P.tp_task_task_offset] = *(float*)&task_task;
tp[P.tp_task_txy_offset] = *(float*)&task_txy;
tp[P.tp_task_disparity_offset] = 0.0f;
tp[P.tp_task_scale_offset] = 0.0f;
float* q = tp + P.tp_task_xy_offset;
for(int c=0;c<NC;c++){ *q++ = coords[(size_t)c*P.tp_tasks_size*2 + nt*2 + 0]; *q++ = coords[(size_t)c*P.tp_tasks_size*2 + nt*2 + 1]; }
}
float* gpu_ftasks = copyalloc_kernel_gpu(ftask.data(), P.tp_tasks_size*P.task_size);
// --- geometry (load gc/cv from files; compute rByRDist + rot_deriv via the kernels, as allTests does) ---
int n_gc=0,n_cv=0;
float* fgc = readAllFloatsFromFile(PA.geometry_correction_file,&n_gc);
float* fcv = readAllFloatsFromFile(PA.correction_vector_file,&n_cv);
float* gpu_gc = copyalloc_kernel_gpu(fgc, n_gc);
float* gpu_cv = copyalloc_kernel_gpu(fcv, n_cv);
float* gpu_rbr = alloc_kernel_gpu(5001);
float* gpu_rot = alloc_kernel_gpu(5*16*3*3);
#define LAUNCH(fn,gx,gy,gz,bx,by,bz,...) do{ void* a[]={__VA_ARGS__}; \
cuCtxSynchronize(); cr=cuLaunchKernel(fn,gx,gy,gz,bx,by,bz,0,nullptr,a,nullptr); \
if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch %s -> %d (%s)",#fn,cr,es); return -10; } \
cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync %s -> %d (%s)",#fn,cr,es); return -11; } }while(0)
int mc16 = 16; // calcReverseDistortionTable grid = MAX_NUM_CAMS
LAUNCH(f_rbr, mc16,1,1, 3,3,3, &gpu_gc, &gpu_rbr);
LAUNCH(f_rot, NC,1,1, 3,3,3, &NC, &gpu_cv, &gpu_rot);
// calculate_tiles_offsets <<<1,1>>> (uniform_grid=1) — first CDP execution
int uniform_grid = 1; int num_tiles = P.tp_tasks_size;
LAUNCH(f_off, 1,1,1, 1,1,1, &uniform_grid,&NC,&gpu_ftasks,&num_tiles,&gpu_gc,&gpu_cv,&gpu_rbr,&gpu_rot);
// convert_direct <<<1,1>>> (17 args; arg13=0, arg14=kernels_hor — verbatim from TpHostGpu)
int num_colors=P.num_colors, lpf_mask=0, woi_w=P.img_width, woi_h=P.img_height,
kh_arg=0, kv_arg=P.kernels_hor, tilesx=P.tilesx;
size_t dstride_floats = dstride/sizeof(float);
LAUNCH(f_cd, 1,1,1, 1,1,1,
&NC,&num_colors,&gpu_kernel_offsets,&gpu_kernels,&gpu_images,&gpu_ftasks,&gpu_clt,
&dstride_floats,&num_tiles,&lpf_mask,&woi_w,&woi_h,&kh_arg,&kv_arg,
&gpu_active,&gpu_num_active,&tilesx);
#undef LAUNCH
int num_active=0; cudaMemcpy(&num_active, gpu_num_active, sizeof(int), cudaMemcpyDeviceToHost);
// --- compare CLT to golden clt/aux_chnN.clt ---
double maxerr=0;
std::vector<float> golden(slice), got(slice);
for(int c=0;c<NC;c++){
readFloatsFromFile(golden.data(), PA.ports_clt_file[c]);
cudaMemcpy(got.data(), clt_h[c], (size_t)slice*sizeof(float), cudaMemcpyDeviceToHost);
for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>maxerr) maxerr=e; }
}
if(out_max_err) *out_max_err=maxerr;
if(out_num_active) *out_num_active=num_active;
// --- free ---
for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); }
cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
cudaFree(gpu_active); cudaFree(gpu_num_active); cudaFree(gpu_ftasks);
cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
free(fgc); free(fcv); free(hbuf);
return 0;
}
} // extern "C" } // extern "C"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment