Commit a5b7c269 authored by Andrey Filippov's avatar Andrey Filippov

CLAUDE: Stage 1 — native TpInstance geometry path (calc_reverse_distortions + rot_derivs)

Add TpInstance to the JNA shim: device buffers (gpu_geometry_correction,
gpu_rByRDist, gpu_rot_deriv, gpu_correction_vector) + setters (HtoD),
the two pure-geometry launches (calcReverseDistortionTable {16,1,1}/{3,3,3},
calc_rot_deriv {num_cams,1,1}/{3,3,3}), and readback getters. Driver-API
cuLaunchKernel against the NVRTC module (mirrors GpuQuad.execCalcReverseDistortions
/ execRotDerivs, no JCuda). build_lib.sh builds libtileproc.so.

Validated via Java->JNA against tile_processor_gpu/clt reference data on the
RTX 5060 Ti: rByRDist == clt/*.rbyrdist to ~1e-7 (aux 16-cam and main),
rot_deriv rows orthogonal to ~1e-10 (scaled-rotation structure, det~zoom^3).
Co-Authored-By: 's avatarClaude Opus 4.8 (1M context) <noreply@anthropic.com>
parent eec885a0
#!/usr/bin/env bash
# Build libtileproc.so (JNA-callable native tile-processor shim). By Claude on 2026-06-25.
set -e
cd "$(dirname "$0")"
CUDA="${CUDA_HOME:-/usr/local/cuda}"
g++ -std=c++14 -shared -fPIC tp_jna.cpp -o libtileproc.so \
-I"$CUDA/include" -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \
-lnvrtc -lcuda
echo "built ./libtileproc.so"
......@@ -36,8 +36,32 @@ static const int N_KERNELS = sizeof(KERNELS)/sizeof(KERNELS[0]);
struct TpModule { CUcontext ctx; CUmodule mod; int nfun; };
// Stage-1 geometry buffer sizes (floats). Must match GpuQuad.java alloc:
// gpu_geometry_correction = GeometryCorrection.arrayLength(16) (== sizeof(struct gc)/4 == 165),
// gpu_rByRDist = RBYRDIST_LEN (5001),
// gpu_rot_deriv = 5*MAX_NUM_CAMS*3*3 (720),
// gpu_correction_vector = CORR_VECTOR_MAX_LENGTH (cv.toFullRollArray()).
// gc/cv allocated generously (kernel reads only the struct-sized prefix); over-alloc is harmless.
#define RBYRDIST_LEN_C 5001
#define ROT_DERIV_FLOATS 720
#define GC_FLOATS_MAX 512
#define CV_FLOATS_MAX 512
// One per-camera-config GPU instance: device buffers + the module it launches against.
// Stage 1 covers the geometry buffers only; ftasks/clt/images follow in later stages.
struct TpInstance {
TpModule* mod;
CUdeviceptr gpu_geometry_correction;
CUdeviceptr gpu_rByRDist;
CUdeviceptr gpu_rot_deriv;
CUdeviceptr gpu_correction_vector;
};
static std::string readFile(const std::string& p){ std::ifstream f(p); if(!f) return std::string(); std::stringstream ss; ss<<f.rdbuf(); return ss.str(); }
// Resolve a kernel by name from the module (cheap; not a hot path in Stage 1).
static CUfunction getfun(TpModule* m, const char* name){ CUfunction f=nullptr; if(m && m->mod) cuModuleGetFunction(&f, m->mod, name); return f; }
extern "C" {
TpModule* tp_create_module(const char* srcdir, const char* devrt){
......@@ -93,4 +117,105 @@ int tp_module_num_functions(TpModule* m){ return m ? m->nfun : -1; }
const char* tp_last_error(){ return g_err; }
void tp_destroy_module(TpModule* m){ if(m){ if(m->mod) cuModuleUnload(m->mod); if(m->ctx) cuCtxDestroy(m->ctx); delete m; } }
// ---- Stage 1: instance + geometry path (calcReverseDistortionTable, calc_rot_deriv) ----
// Mirrors GpuQuad.execCalcReverseDistortions / execRotDerivs (driver-API cuLaunchKernel),
// but native. Validated by reading rByRDist back and comparing to clt/*.rbyrdist reference.
void tp_destroy_instance(TpInstance* inst); // fwd (cleanup on partial-alloc failure)
TpInstance* tp_create_instance(TpModule* m){
g_err[0]=0;
if(!m){ seterr("tp_create_instance: null module"); return nullptr; }
CUresult cr; const char* es;
cuCtxSetCurrent(m->ctx);
TpInstance* inst = new TpInstance{m,0,0,0,0};
#define ALLOCF(p,nf) do{ cr=cuMemAlloc(&inst->p,(size_t)(nf)*sizeof(float)); \
if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("cuMemAlloc(%s,%d) -> %d (%s)",#p,(int)(nf),cr,es); \
tp_destroy_instance(inst); return nullptr; } }while(0)
ALLOCF(gpu_geometry_correction, GC_FLOATS_MAX);
ALLOCF(gpu_rByRDist, RBYRDIST_LEN_C);
ALLOCF(gpu_rot_deriv, ROT_DERIV_FLOATS);
ALLOCF(gpu_correction_vector, CV_FLOATS_MAX);
#undef ALLOCF
return inst;
}
// Upload host gc floats (gc.expandSensors(16).toFloatArray()) to gpu_geometry_correction.
int tp_set_geometry_correction(TpInstance* inst, const float* fgc, int n){
if(!inst){ seterr("tp_set_geometry_correction: null instance"); return -1; }
if(n<0 || n>GC_FLOATS_MAX){ seterr("tp_set_geometry_correction: n=%d out of [0,%d]",n,GC_FLOATS_MAX); return -2; }
cuCtxSetCurrent(inst->mod->ctx);
CUresult cr=cuMemcpyHtoD(inst->gpu_geometry_correction, fgc, (size_t)n*sizeof(float));
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("HtoD gc -> %d (%s)",cr,es); return -3; }
return 0;
}
// Upload host correction-vector floats (cv.toFullRollArray()) to gpu_correction_vector.
int tp_set_correction_vector(TpInstance* inst, const float* fcv, int n){
if(!inst){ seterr("tp_set_correction_vector: null instance"); return -1; }
if(n<0 || n>CV_FLOATS_MAX){ seterr("tp_set_correction_vector: n=%d out of [0,%d]",n,CV_FLOATS_MAX); return -2; }
cuCtxSetCurrent(inst->mod->ctx);
CUresult cr=cuMemcpyHtoD(inst->gpu_correction_vector, fcv, (size_t)n*sizeof(float));
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("HtoD cv -> %d (%s)",cr,es); return -3; }
return 0;
}
// calcReverseDistortionTable<<<{16,1,1},{3,3,3}>>>(gpu_geometry_correction, gpu_rByRDist)
int tp_exec_calc_reverse_distortions(TpInstance* inst){
if(!inst){ seterr("tp_exec_calc_reverse_distortions: null instance"); return -1; }
cuCtxSetCurrent(inst->mod->ctx);
CUfunction f=getfun(inst->mod,"calcReverseDistortionTable");
if(!f){ seterr("no kernel calcReverseDistortionTable"); return -2; }
void* args[] = { &inst->gpu_geometry_correction, &inst->gpu_rByRDist };
cuCtxSynchronize();
CUresult cr=cuLaunchKernel(f, 16,1,1, 3,3,3, 0,nullptr, args,nullptr);
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("launch calcReverseDistortionTable -> %d (%s)",cr,es); return -3; }
cr=cuCtxSynchronize();
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("sync calcReverseDistortionTable -> %d (%s)",cr,es); return -4; }
return 0;
}
// calc_rot_deriv<<<{num_cams,1,1},{3,3,3}>>>(num_cams, gpu_correction_vector, gpu_rot_deriv)
int tp_exec_rot_derivs(TpInstance* inst, int num_cams){
if(!inst){ seterr("tp_exec_rot_derivs: null instance"); return -1; }
cuCtxSetCurrent(inst->mod->ctx);
CUfunction f=getfun(inst->mod,"calc_rot_deriv");
if(!f){ seterr("no kernel calc_rot_deriv"); return -2; }
void* args[] = { &num_cams, &inst->gpu_correction_vector, &inst->gpu_rot_deriv };
cuCtxSynchronize();
CUresult cr=cuLaunchKernel(f, num_cams,1,1, 3,3,3, 0,nullptr, args,nullptr);
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("launch calc_rot_deriv -> %d (%s)",cr,es); return -3; }
cr=cuCtxSynchronize();
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("sync calc_rot_deriv -> %d (%s)",cr,es); return -4; }
return 0;
}
// Read rByRDist back (RBYRDIST_LEN floats) for validation against clt/*.rbyrdist.
int tp_get_rbyrdist(TpInstance* inst, float* out){
if(!inst){ seterr("tp_get_rbyrdist: null instance"); return -1; }
cuCtxSetCurrent(inst->mod->ctx);
CUresult cr=cuMemcpyDtoH(out, inst->gpu_rByRDist, (size_t)RBYRDIST_LEN_C*sizeof(float));
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("DtoH rByRDist -> %d (%s)",cr,es); return -2; }
return 0;
}
// Read rot_deriv back (5*16*3*3 floats) for sanity/validation.
int tp_get_rot_deriv(TpInstance* inst, float* out){
if(!inst){ seterr("tp_get_rot_deriv: null instance"); return -1; }
cuCtxSetCurrent(inst->mod->ctx);
CUresult cr=cuMemcpyDtoH(out, inst->gpu_rot_deriv, (size_t)ROT_DERIV_FLOATS*sizeof(float));
if(cr!=CUDA_SUCCESS){ const char* es; cuGetErrorString(cr,&es); seterr("DtoH rot_deriv -> %d (%s)",cr,es); return -2; }
return 0;
}
void tp_destroy_instance(TpInstance* inst){
if(!inst) return;
if(inst->mod) cuCtxSetCurrent(inst->mod->ctx);
if(inst->gpu_geometry_correction) cuMemFree(inst->gpu_geometry_correction);
if(inst->gpu_rByRDist) cuMemFree(inst->gpu_rByRDist);
if(inst->gpu_rot_deriv) cuMemFree(inst->gpu_rot_deriv);
if(inst->gpu_correction_vector) cuMemFree(inst->gpu_correction_vector);
delete inst;
}
} // extern "C"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment