Commit eec885a0 authored by Andrey Filippov

CLAUDE: JNA shim for the GPU migration (Stage 0/0b)

libtileproc shim (tp_jna.cpp: extern "C" tp_create_module/num_functions/last_error/destroy)
+ standalone tp_nvrtc_probe.cpp + build_probe.sh. NVRTC-compiles the kernels (+ JCUDA defines)
-> cuLink(libcudadevrt, CDP) -> module -> 19 functions, validated on the RTX 5060 Ti (sm_120 via
compute_90 PTX + driver JIT). Build artifacts gitignored. By the JCuda->JNA migration.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 75acb784
libtileproc.so
tp_nvrtc_probe
*.o
#!/usr/bin/env bash
# Builds the Stage-0 NVRTC probe (./tp_nvrtc_probe) in the directory of this script.
# The CUDA toolkit root is taken from CUDA_HOME; /usr/local/cuda is used when it is unset or empty.
# By Claude on 2026-06-25.
set -e
cd "$(dirname "$0")"

CUDA="${CUDA_HOME:-/usr/local/cuda}"

# Header search path, library search paths and libraries, kept in the order g++ receives them.
include_flags=( -I"$CUDA/include" )
libdir_flags=( -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" )
link_libs=( -lnvrtc -lcuda )

g++ -std=c++14 tp_nvrtc_probe.cpp -o tp_nvrtc_probe \
    "${include_flags[@]}" "${libdir_flags[@]}" \
    "${link_libs[@]}"

echo "built ./tp_nvrtc_probe"
// Stage-0b: the probe refactored into a JNA-callable shared lib (libtileproc.so).
// extern "C": tp_create_module (NVRTC-compile kernels + cuLink(libcudadevrt) + load 19 funcs),
// tp_module_num_functions, tp_last_error, tp_destroy_module. By Claude on 2026-06-25.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstdarg>
#include <cstring>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
// Text of the most recent error recorded by the shim; an empty string means none is recorded.
static char g_err[8192] = "";

// printf-style helper: formats the message into g_err, replacing its previous content.
// vsnprintf bounds the write to the buffer size, so over-long messages are truncated.
static void seterr(const char* fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    vsnprintf(g_err, sizeof g_err, fmt, args);
    va_end(args);
}
// Preprocessor prologue that starts the source text handed to NVRTC.
// getTpDefines() values from GPUTileProcessor.java (verbatim; TILES_PER_BLOCK_GEOM = 32/16 = 2).
// One define per literal; adjacent literals are concatenated by the compiler.
static const char* TP_DEFINES =
    "#define JCUDA\n"
    "#define DTT_SIZE_LOG2 3\n"
    "#define THREADSX 8\n"
    "#define NUM_CAMS 16\n"
    "#define THREADS_PER_TILE 8\n"
    "#define TILES_PER_BLOCK 4\n"
    "#define CORR_THREADS_PER_TILE 8\n"
    "#define CORR_TILES_PER_BLOCK 4\n"
    "#define CORR_TILES_PER_BLOCK_NORMALIZE 4\n"
    "#define CORR_TILES_PER_BLOCK_COMBINE 4\n"
    "#define NUM_THREADS 32\n"
    "#define TEXTURE_THREADS_PER_TILE 8\n"
    "#define TEXTURE_TILES_PER_BLOCK 1\n"
    "#define IMCLT_THREADS_PER_TILE 16\n"
    "#define IMCLT_TILES_PER_BLOCK 4\n"
    "#define CORR_NTILE_SHIFT 8\n"
    "#define TASK_INTER_EN 10\n"
    "#define TASK_CORR_EN 9\n"
    "#define TASK_TEXT_EN 8\n"
    "#define TASK_TEXT_N_BIT 0\n"
    "#define TASK_TEXT_NE_BIT 1\n"
    "#define TASK_TEXT_E_BIT 2\n"
    "#define TASK_TEXT_SE_BIT 3\n"
    "#define TASK_TEXT_S_BIT 4\n"
    "#define TASK_TEXT_SW_BIT 5\n"
    "#define TASK_TEXT_W_BIT 6\n"
    "#define TASK_TEXT_NW_BIT 7\n"
    "#define LIST_TEXTURE_BIT 8\n"
    "#define TEXT_NTILE_SHIFT 9\n"
    "#define FAT_ZERO_WEIGHT 0.0001\n"
    "#define THREADS_DYNAMIC_BITS 5\n"
    "#define RBYRDIST_LEN 5001\n"
    "#define RBYRDIST_STEP 0.0004\n"
    "#define TILES_PER_BLOCK_GEOM 2\n";
// Kernel source files, listed in the order they are appended to the compilation unit.
static const char* SRC_FILES[] = {
    "dtt8x8.h",
    "dtt8x8.cu",
    "geometry_correction.h",
    "geometry_correction.cu",
    "TileProcessor.h",
    "TileProcessor.cu",
};

// Names of the kernel entry points looked up in the loaded module.
static const char* KERNELS[] = {
    "convert_direct",
    "imclt_rbg_all",
    "correlate2D",
    "correlate2D_inter",
    "corr2D_combine",
    "corr2D_normalize",
    "textures_nonoverlap",
    "generate_RBGA",
    "clear_texture_list",
    "mark_texture_tiles",
    "mark_texture_neighbor_tiles",
    "gen_texture_list",
    "clear_texture_rbga",
    "textures_accumulate",
    "create_nonoverlap_list",
    "erase_clt_tiles",
    "calculate_tiles_offsets",
    "calc_rot_deriv",
    "calcReverseDistortionTable",
};

// Element count of KERNELS.
static const int N_KERNELS = sizeof(KERNELS) / sizeof(*KERNELS);
// Handle returned by tp_create_module(); released by tp_destroy_module().
// Field order matters: the struct is brace-initialised as {ctx, mod, nfun}.
struct TpModule {
    CUcontext ctx;   // CUDA context created for the module
    CUmodule  mod;   // module loaded from the linked kernels
    int       nfun;  // how many of the expected kernels were found in mod
};
// Returns the complete content of the file at path p.
// An empty string is returned when the file cannot be opened (and also for an empty file).
static std::string readFile(const std::string& p)
{
    std::ifstream input(p);
    if (!input) {
        return std::string();
    }
    std::stringstream content;
    content << input.rdbuf();
    return content.str();
}
extern "C" {
/**
 * Compile the tile-processor kernels at run time and load them as a CUDA module.
 *
 * Steps: initialise the driver API, create a context on device 0, concatenate TP_DEFINES
 * with every file of SRC_FILES read from srcdir, compile that single unit with NVRTC
 * (--gpu-architecture=compute_<cap>, --extensible-whole-program), link the PTX together
 * with the library devrt, load the linked image and count how many of KERNELS resolve.
 *
 * @param srcdir directory containing the files listed in SRC_FILES; must not be null
 * @param devrt  path of the library handed to cuLinkAddFile (CU_JIT_INPUT_LIBRARY); must not be null
 * @return a heap-allocated TpModule (release it with tp_destroy_module()), or nullptr on
 *         failure, in which case tp_last_error() holds the reason. A module in which some
 *         kernels are missing is still returned: compare tp_module_num_functions() with the
 *         expected count and read tp_last_error() for the first missing name.
 */
TpModule* tp_create_module(const char* srcdir, const char* devrt){
    g_err[0]=0;
    // Both strings are dereferenced below (std::string construction, %s formatting).
    if(!srcdir || !devrt){
        seterr("tp_create_module: %s must not be null", srcdir ? "devrt" : "srcdir");
        return nullptr;
    }

    CUcontext   ctx = nullptr;
    CUlinkState ls  = nullptr;
    bool linkOpen   = false;

    // Error text for a driver result; never null, even if cuGetErrorString itself fails.
    auto cuText = [](CUresult r) -> const char* {
        const char* s = nullptr;
        if(cuGetErrorString(r,&s)!=CUDA_SUCCESS || !s) s = "unknown CUDA error";
        return s;
    };
    // Single failure exit: releases whatever has been created so far.
    auto fail = [&]() -> TpModule* {
        if(linkOpen) cuLinkDestroy(ls);
        if(ctx) cuCtxDestroy(ctx);
        return nullptr;
    };
#define FAILCU(call) do{ const CUresult cr_=(call); if(cr_!=CUDA_SUCCESS){ \
        seterr("%s -> CUDA %d (%s)", #call, static_cast<int>(cr_), cuText(cr_)); return fail(); } }while(0)

    FAILCU(cuInit(0));
    CUdevice dev; FAILCU(cuDeviceGet(&dev,0));
    int major=0, minor=0;
    FAILCU(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
    FAILCU(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
    int cap = major*10+minor;
    FAILCU(cuCtxCreate(&ctx,0,dev));

    // Clamp the target architecture to the newest one NVRTC supports. If the list cannot be
    // obtained the device capability is kept rather than being clamped to 0.
    int narch=0;
    if(nvrtcGetNumSupportedArchs(&narch)==NVRTC_SUCCESS && narch>0){
        std::vector<int> archs(narch);
        if(nvrtcGetSupportedArchs(archs.data())==NVRTC_SUCCESS){
            int maxarch=0;
            for(int a : archs) if(a>maxarch) maxarch=a;
            if(maxarch>0 && cap>maxarch) cap=maxarch;
        }
    }

    // Assemble one compilation unit: defines first, then every source file in order.
    std::string src = TP_DEFINES;
    for(const char* fn : SRC_FILES){
        const std::string c = readFile(std::string(srcdir)+"/"+fn);
        if(c.empty()){ seterr("missing/empty kernel source: %s/%s", srcdir, fn); return fail(); }
        src += "\n// ==== "; src += fn; src += " ====\n"; src += c;
    }

    nvrtcProgram prog = nullptr;
    nvrtcResult nr = nvrtcCreateProgram(&prog, src.c_str(), "tileproc.cu", 0, nullptr, nullptr);
    if(nr!=NVRTC_SUCCESS){ seterr("nvrtcCreateProgram -> %s", nvrtcGetErrorString(nr)); return fail(); }

    const std::string archopt = "--gpu-architecture=compute_" + std::to_string(cap);
    const char* opts[] = { archopt.c_str(), "--extensible-whole-program" };
    nr = nvrtcCompileProgram(prog, 2, opts);
    if(nr!=NVRTC_SUCCESS){
        size_t logsz=0;
        if(nvrtcGetProgramLogSize(prog,&logsz)!=NVRTC_SUCCESS) logsz=0;
        std::vector<char> lg(logsz+1, '\0');   // extra byte keeps the log terminated in any case
        if(logsz) nvrtcGetProgramLog(prog,lg.data());
        seterr("nvrtcCompileProgram -> %s\n%s", nvrtcGetErrorString(nr), lg.data());
        nvrtcDestroyProgram(&prog);
        return fail();
    }

    // Fetch the PTX; the program object is no longer needed afterwards, whatever the outcome.
    size_t ptxsz=0;
    std::vector<char> ptx;
    nr = nvrtcGetPTXSize(prog,&ptxsz);
    if(nr==NVRTC_SUCCESS && ptxsz>0){ ptx.resize(ptxsz); nr = nvrtcGetPTX(prog,ptx.data()); }
    nvrtcDestroyProgram(&prog);
    if(nr!=NVRTC_SUCCESS || ptx.empty()){
        seterr("nvrtcGetPTX -> %s (PTX size %zu)", nvrtcGetErrorString(nr), ptxsz);
        return fail();
    }

    // Link the PTX with the device runtime library.
    FAILCU(cuLinkCreate(0,nullptr,nullptr,&ls));
    linkOpen = true;
    CUresult cr = cuLinkAddFile(ls,CU_JIT_INPUT_LIBRARY,devrt,0,nullptr,nullptr);
    if(cr!=CUDA_SUCCESS){ seterr("cuLinkAddFile(%s) -> %d (%s)",devrt,static_cast<int>(cr),cuText(cr)); return fail(); }
    cr = cuLinkAddData(ls,CU_JIT_INPUT_PTX,ptx.data(),ptxsz,"tileproc.ptx",0,nullptr,nullptr);
    if(cr!=CUDA_SUCCESS){ seterr("cuLinkAddData -> %d (%s)",static_cast<int>(cr),cuText(cr)); return fail(); }
    void* cubin=nullptr; size_t cubinsz=0;
    cr = cuLinkComplete(ls,&cubin,&cubinsz);
    if(cr!=CUDA_SUCCESS){ seterr("cuLinkComplete -> %d (%s)",static_cast<int>(cr),cuText(cr)); return fail(); }

    // The linked image is loaded before the link state is destroyed, as in the original order.
    CUmodule mod = nullptr;
    cr = cuModuleLoadDataEx(&mod,cubin,0,nullptr,nullptr);
    if(cr!=CUDA_SUCCESS){ seterr("cuModuleLoadDataEx -> %d (%s)",static_cast<int>(cr),cuText(cr)); return fail(); }
    cuLinkDestroy(ls);
    linkOpen = false;

    // Count the kernels that resolve; only the first missing name is recorded in the error text.
    int found=0;
    for(int i=0;i<N_KERNELS;i++){
        CUfunction f;
        if(cuModuleGetFunction(&f,mod,KERNELS[i])==CUDA_SUCCESS) found++;
        else if(g_err[0]==0) seterr("missing kernel %s (and maybe others)", KERNELS[i]);
    }
    return new TpModule{ctx,mod,found};
#undef FAILCU
}
// Number of kernels that were found when the module was created; -1 for a null handle.
int tp_module_num_functions(TpModule* m){
    if(m == nullptr) return -1;
    return m->nfun;
}
// Returns the shim's error buffer (g_err); the pointer refers to static storage, so the
// text changes whenever a later call records or clears an error.
const char* tp_last_error(){
    return &g_err[0];
}
// Releases a handle obtained from tp_create_module(): unloads the module, destroys the
// context, then frees the handle itself. A null handle is ignored.
void tp_destroy_module(TpModule* m){
    if(m == nullptr) return;
    if(m->mod != nullptr) cuModuleUnload(m->mod);
    if(m->ctx != nullptr) cuCtxDestroy(m->ctx);
    delete m;
}
} // extern "C"
// Stage-0 proof for the JCuda->JNA migration: does the RUNTIME NVRTC path the native shim will use
// (compile the kernels with the JCUDA defines -> PTX -> cuLink(libcudadevrt) -> module -> get 19 functions)
// work on this GPU (RTX 5060 Ti / sm_120, CUDA 12.6)? No Java/JNA yet. Replicates GPUTileProcessor.createFunctions.
// Build: ./build_probe.sh Run: ./tp_nvrtc_probe [kernel_src_dir] [libcudadevrt.a]
// By Claude on 2026-06-25.
#include <cuda.h>
#include <nvrtc.h>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
// CU(call): evaluate a CUDA driver-API call once; on failure print the error code, its text,
// the source location and the call expression, then return 2 from the enclosing function
// (which therefore must return int). The error text falls back to "unknown" if the driver
// does not provide one. Macro-local names carry a suffix so they cannot capture caller names.
#define CU(call) do { const CUresult cu_r_=(call); if(cu_r_!=CUDA_SUCCESS){ const char* cu_s_=nullptr; cuGetErrorString(cu_r_,&cu_s_); \
    printf("CUDA ERR %d (%s) at %s:%d -> %s\n", static_cast<int>(cu_r_), cu_s_?cu_s_:"unknown", __FILE__, __LINE__, #call); return 2; } } while(0)
// NVR(call): the same for NVRTC calls. A failed NVRTC call now also aborts with exit code 2
// instead of only being logged, so later steps never run on an invalid program or empty PTX.
#define NVR(call) do { const nvrtcResult nv_r_=(call); if(nv_r_!=NVRTC_SUCCESS){ \
    printf("NVRTC ERR %d (%s) at %s:%d -> %s\n", static_cast<int>(nv_r_), nvrtcGetErrorString(nv_r_), __FILE__, __LINE__, #call); return 2; } } while(0)
// Preprocessor prologue that starts the single NVRTC source unit.
// getTpDefines() values copied verbatim from GPUTileProcessor.java (lines 119-179 / 233-273). TILES_PER_BLOCK_GEOM = 32/16 = 2.
// One define per literal; adjacent literals are concatenated by the compiler.
static const char* TP_DEFINES =
    "#define JCUDA\n"
    "#define DTT_SIZE_LOG2 3\n"
    "#define THREADSX 8\n"
    "#define NUM_CAMS 16\n"
    "#define THREADS_PER_TILE 8\n"
    "#define TILES_PER_BLOCK 4\n"
    "#define CORR_THREADS_PER_TILE 8\n"
    "#define CORR_TILES_PER_BLOCK 4\n"
    "#define CORR_TILES_PER_BLOCK_NORMALIZE 4\n"
    "#define CORR_TILES_PER_BLOCK_COMBINE 4\n"
    "#define NUM_THREADS 32\n"
    "#define TEXTURE_THREADS_PER_TILE 8\n"
    "#define TEXTURE_TILES_PER_BLOCK 1\n"
    "#define IMCLT_THREADS_PER_TILE 16\n"
    "#define IMCLT_TILES_PER_BLOCK 4\n"
    "#define CORR_NTILE_SHIFT 8\n"
    "#define TASK_INTER_EN 10\n"
    "#define TASK_CORR_EN 9\n"
    "#define TASK_TEXT_EN 8\n"
    "#define TASK_TEXT_N_BIT 0\n"
    "#define TASK_TEXT_NE_BIT 1\n"
    "#define TASK_TEXT_E_BIT 2\n"
    "#define TASK_TEXT_SE_BIT 3\n"
    "#define TASK_TEXT_S_BIT 4\n"
    "#define TASK_TEXT_SW_BIT 5\n"
    "#define TASK_TEXT_W_BIT 6\n"
    "#define TASK_TEXT_NW_BIT 7\n"
    "#define LIST_TEXTURE_BIT 8\n"
    "#define TEXT_NTILE_SHIFT 9\n"
    "#define FAT_ZERO_WEIGHT 0.0001\n"
    "#define THREADS_DYNAMIC_BITS 5\n"
    "#define RBYRDIST_LEN 5001\n"
    "#define RBYRDIST_STEP 0.0004\n"
    "#define TILES_PER_BLOCK_GEOM 2\n";
// GPU_SRC_FILES (USE_CUDA12 path): "*"(defines) + these, concatenated into ONE NVRTC unit.
static const char* SRC_FILES[] = {
    "dtt8x8.h",
    "dtt8x8.cu",
    "geometry_correction.h",
    "geometry_correction.cu",
    "TileProcessor.h",
    "TileProcessor.cu",
};

// The 19 kernels Java loads (GPU_*_NAME in GPUTileProcessor.java).
static const char* KERNELS[] = {
    "convert_direct",
    "imclt_rbg_all",
    "correlate2D",
    "correlate2D_inter",
    "corr2D_combine",
    "corr2D_normalize",
    "textures_nonoverlap",
    "generate_RBGA",
    "clear_texture_list",
    "mark_texture_tiles",
    "mark_texture_neighbor_tiles",
    "gen_texture_list",
    "clear_texture_rbga",
    "textures_accumulate",
    "create_nonoverlap_list",
    "erase_clt_tiles",
    "calculate_tiles_offsets",
    "calc_rot_deriv",
    "calcReverseDistortionTable",
};
// Returns the complete content of the file at path p.
// When the file cannot be opened, "MISSING <path>" is printed and an empty string is returned.
static std::string readFile(const std::string& p)
{
    std::ifstream input(p);
    if (!input) {
        printf("MISSING %s\n", p.c_str());
        return std::string();
    }
    std::stringstream content;
    content << input.rdbuf();
    return content.str();
}
int main(int argc, char** argv){
std::string srcdir = (argc>1)? argv[1] : "/home/elphel/git/tile_processor_gpu/src";
std::string devrt = (argc>2)? argv[2] : "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a";
printf("Stage-0 NVRTC probe srcdir=%s\n libcudadevrt=%s\n", srcdir.c_str(), devrt.c_str());
CU(cuInit(0));
CUdevice dev; CU(cuDeviceGet(&dev,0));
char name[256]; CU(cuDeviceGetName(name,sizeof(name),dev));
int major=0,minor=0;
CU(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
CU(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
int cap = major*10 + minor;
printf("GPU: %s compute capability %d.%d (cap=%d)\n", name, major, minor, cap);
CUcontext ctx; CU(cuCtxCreate(&ctx,0,dev));
// clamp cap to NVRTC-supported max (mirrors createFunctions)
int narch=0; NVR(nvrtcGetNumSupportedArchs(&narch));
std::vector<int> archs(narch>0?narch:1); NVR(nvrtcGetSupportedArchs(archs.data()));
int maxarch=0; for(int i=0;i<narch;i++){ if(archs[i]>maxarch) maxarch=archs[i]; }
printf("NVRTC supported archs: %d (max %d)\n", narch, maxarch);
if(cap>maxarch){ printf("WARN cap %d > NVRTC max %d -> clamping to %d\n", cap, maxarch, maxarch); cap=maxarch; }
// assemble single source unit: defines + each file
std::string src = TP_DEFINES;
for(const char* fn : SRC_FILES){ std::string c=readFile(srcdir+"/"+fn); if(c.empty()){printf("FATAL: empty/missing %s\n",fn); return 3;} src += "\n// ==== "; src += fn; src += " ====\n"; src += c; }
printf("assembled source: %zu bytes\n", src.size());
nvrtcProgram prog; NVR(nvrtcCreateProgram(&prog, src.c_str(), "tileproc.cu", 0, nullptr, nullptr));
std::string archopt = "--gpu-architecture=compute_" + std::to_string(cap);
const char* opts[] = { archopt.c_str(), "--extensible-whole-program" }; // USE_CUDA12 -> 2 options
printf("NVRTC options: %s %s\n", opts[0], opts[1]);
nvrtcResult cres = nvrtcCompileProgram(prog, 2, opts);
size_t logsz=0; nvrtcGetProgramLogSize(prog,&logsz);
if(logsz>1){ std::vector<char> log(logsz); nvrtcGetProgramLog(prog,log.data()); printf("--- NVRTC log ---\n%s\n-----------------\n", log.data()); }
if(cres!=NVRTC_SUCCESS){ printf("RESULT: NVRTC COMPILE FAILED (%s)\n", nvrtcGetErrorString(cres)); return 4; }
size_t ptxsz=0; NVR(nvrtcGetPTXSize(prog,&ptxsz)); std::vector<char> ptx(ptxsz); NVR(nvrtcGetPTX(prog,ptx.data()));
nvrtcDestroyProgram(&prog);
printf("NVRTC compiled OK, PTX %zu bytes\n", ptxsz);
// cuLink: PTX + libcudadevrt.a (CDP) -> cubin -> module (mirrors createFunctions)
CUlinkState ls; CU(cuLinkCreate(0,nullptr,nullptr,&ls));
CU(cuLinkAddFile(ls, CU_JIT_INPUT_LIBRARY, devrt.c_str(), 0, nullptr, nullptr));
CU(cuLinkAddData(ls, CU_JIT_INPUT_PTX, ptx.data(), ptxsz, "tileproc.ptx", 0, nullptr, nullptr));
void* cubin=nullptr; size_t cubinsz=0; CU(cuLinkComplete(ls,&cubin,&cubinsz));
printf("cuLinkComplete OK, cubin %zu bytes\n", cubinsz);
CUmodule mod; CU(cuModuleLoadDataEx(&mod, cubin, 0, nullptr, nullptr));
cuLinkDestroy(ls);
int nfun=sizeof(KERNELS)/sizeof(KERNELS[0]), found=0;
for(int i=0;i<nfun;i++){ CUfunction f; CUresult r=cuModuleGetFunction(&f, mod, KERNELS[i]);
if(r==CUDA_SUCCESS){ found++; printf(" [OK] %s\n", KERNELS[i]); }
else { const char*s; cuGetErrorString(r,&s); printf(" [MISS] %s (%s)\n", KERNELS[i], s); } }
printf("RESULT: loaded %d/%d functions on %s (sm_%d)\n", found, nfun, name, cap);
return (found==nfun)? 0 : 5;
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment