CLAUDE: JNA shim for the GPU migration (Stage 0/0b)

libtileproc shim (tp_jna.cpp: extern "C" tp_create_module/num_functions/last_error/destroy) + standalone tp_nvrtc_probe.cpp + build_probe.sh. NVRTC-compiles the kernels (+ JCUDA defines) -> cuLink(libcudadevrt, CDP) -> module -> 19 functions, validated on the RTX 5060 Ti (sm_120 via compute_90 PTX + driver JIT). Build artifacts gitignored. By the JCuda->JNA migration. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: JNA shim for the GPU migration (Stage 0/0b)
libtileproc shim (tp_jna.cpp: extern "C" tp_create_module/num_functions/last_error/destroy) + standalone tp_nvrtc_probe.cpp + build_probe.sh. NVRTC-compiles the kernels (+ JCUDA defines) -> cuLink(libcudadevrt, CDP) -> module -> 19 functions, validated on the RTX 5060 Ti (sm_120 via compute_90 PTX + driver JIT). Build artifacts gitignored. By the JCuda->JNA migration. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
eec885a0 · Andrey Filippov · 75acb784 · eec885a0 · eec885a0 · eec885a0
Commit eec885a0 authored Jun 25, 2026 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 207 additions and 0 deletions

.gitignore jna/.gitignore +3 -0

build_probe.sh jna/build_probe.sh +9 -0

tp_jna.cpp jna/tp_jna.cpp +96 -0

tp_nvrtc_probe.cpp jna/tp_nvrtc_probe.cpp +99 -0

No files found.
--- a/jna/.gitignore
+++ b/jna/.gitignore
+libtileproc.so
+tp_nvrtc_probe
+*.o
--- a/jna/build_probe.sh
+++ b/jna/build_probe.sh
+#!/usr/bin/env bash
+# Compile tp_nvrtc_probe.cpp (the Stage-0 NVRTC probe) in the directory that holds this script.
+# The CUDA toolkit root is taken from $CUDA_HOME, falling back to /usr/local/cuda.
+# By Claude on 2026-06-25.
+set -e
+cd "$(dirname "$0")"
+CUDA="${CUDA_HOME:-/usr/local/cuda}"
+
+# Header/library search paths inside the toolkit, followed by the libraries to link against.
+search_paths=(-I"$CUDA/include" -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib")
+link_libs=(-lnvrtc -lcuda)
+
+g++ -std=c++14 tp_nvrtc_probe.cpp -o tp_nvrtc_probe "${search_paths[@]}" "${link_libs[@]}"
+echo "built ./tp_nvrtc_probe"
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
+// Stage-0b: the probe refactored into a JNA-callable shared lib (libtileproc.so).
+// extern "C": tp_create_module (NVRTC-compile kernels + cuLink(libcudadevrt) + load 19 funcs),
+// tp_module_num_functions, tp_last_error, tp_destroy_module. By Claude on 2026-06-25.
+#include <cuda.h>
+#include <nvrtc.h>
+#include <cstdio>
+#include <cstdarg>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <fstream>
+#include <sstream>
+
+// Text of the most recent failure; tp_create_module() clears it on entry, so an empty string means "no error recorded".
+static char g_err[8192] = "";
+
+// printf-style formatter that overwrites g_err (vsnprintf truncates to the buffer size).
+static void seterr(const char* fmt, ...){
+  va_list args;
+  va_start(args, fmt);
+  vsnprintf(g_err, sizeof(g_err), fmt, args);
+  va_end(args);
+}
+
+// getTpDefines() values from GPUTileProcessor.java (verbatim; TILES_PER_BLOCK_GEOM = 32/16 = 2). tp_create_module() puts this text at the start of the source it hands to NVRTC.
+static const char* TP_DEFINES =
+  "#define JCUDA\n#define DTT_SIZE_LOG2 3\n#define THREADSX 8\n#define NUM_CAMS 16\n"
+  "#define THREADS_PER_TILE 8\n#define TILES_PER_BLOCK 4\n#define CORR_THREADS_PER_TILE 8\n"
+  "#define CORR_TILES_PER_BLOCK 4\n#define CORR_TILES_PER_BLOCK_NORMALIZE 4\n#define CORR_TILES_PER_BLOCK_COMBINE 4\n"
+  "#define NUM_THREADS 32\n#define TEXTURE_THREADS_PER_TILE 8\n#define TEXTURE_TILES_PER_BLOCK 1\n"
+  "#define IMCLT_THREADS_PER_TILE 16\n#define IMCLT_TILES_PER_BLOCK 4\n#define CORR_NTILE_SHIFT 8\n"
+  "#define TASK_INTER_EN 10\n#define TASK_CORR_EN 9\n#define TASK_TEXT_EN 8\n"
+  "#define TASK_TEXT_N_BIT 0\n#define TASK_TEXT_NE_BIT 1\n#define TASK_TEXT_E_BIT 2\n#define TASK_TEXT_SE_BIT 3\n"
+  "#define TASK_TEXT_S_BIT 4\n#define TASK_TEXT_SW_BIT 5\n#define TASK_TEXT_W_BIT 6\n#define TASK_TEXT_NW_BIT 7\n"
+  "#define LIST_TEXTURE_BIT 8\n#define TEXT_NTILE_SHIFT 9\n#define FAT_ZERO_WEIGHT 0.0001\n"
+  "#define THREADS_DYNAMIC_BITS 5\n#define RBYRDIST_LEN 5001\n#define RBYRDIST_STEP 0.0004\n#define TILES_PER_BLOCK_GEOM 2\n";
+static const char* SRC_FILES[] = {"dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"}; // read from srcdir and appended after TP_DEFINES in this order
+static const char* KERNELS[] = { // names looked up with cuModuleGetFunction once the module is loaded
+  "convert_direct","imclt_rbg_all","correlate2D","correlate2D_inter","corr2D_combine","corr2D_normalize",
+  "textures_nonoverlap","generate_RBGA","clear_texture_list","mark_texture_tiles","mark_texture_neighbor_tiles",
+  "gen_texture_list","clear_texture_rbga","textures_accumulate","create_nonoverlap_list","erase_clt_tiles",
+  "calculate_tiles_offsets","calc_rot_deriv","calcReverseDistortionTable"};
+static const int N_KERNELS = sizeof(KERNELS)/sizeof(KERNELS[0]); // number of entries in KERNELS
+
+// Handle returned by tp_create_module(): the CUDA context, the loaded module and the resolved-kernel count.
+// Member order matters: the handle is built with aggregate initialisation {ctx, mod, nfun}.
+struct TpModule {
+  CUcontext ctx;   // context created with cuCtxCreate in tp_create_module
+  CUmodule  mod;   // module loaded from the linked cubin
+  int       nfun;  // how many KERNELS entries cuModuleGetFunction resolved
+};
+
+// Returns the entire contents of the file at `p`; a file that cannot be opened yields an empty string.
+static std::string readFile(const std::string& p){
+  std::ifstream in(p);
+  std::ostringstream contents;
+  if(in) contents << in.rdbuf();
+  return contents.str();
+}
+
+extern "C" {
+
+// Builds the tile-processor GPU module and returns an owning handle, or nullptr on failure.
+//   srcdir - directory containing every file named in SRC_FILES
+//   devrt  - path to libcudadevrt.a, added to the link with CU_JIT_INPUT_LIBRARY
+// Sequence: cuInit -> context on device 0 -> NVRTC compile of TP_DEFINES + sources (target clamped to the
+// newest architecture NVRTC reports) -> cuLink (devrt + PTX) -> cuModuleLoadDataEx -> resolve KERNELS.
+// Failure: nullptr is returned, tp_last_error() holds the reason, and every handle created so far
+// (context, NVRTC program, link state, module) is released.
+// A kernel name that does not resolve is NOT a failure: the handle is still returned, nfun holds the number
+// resolved, and tp_last_error() names the first missing kernel.
+// Free the result with tp_destroy_module().
+TpModule* tp_create_module(const char* srcdir, const char* devrt){
+  g_err[0]=0;
+  // A null argument would otherwise reach std::string(const char*) / cuLinkAddFile, which is undefined behaviour.
+  if(!srcdir || !devrt){ seterr("tp_create_module: %s is NULL", srcdir ? "devrt" : "srcdir"); return nullptr; }
+
+  // Releases whatever has been created so far when the function leaves early (return or exception).
+  struct Cleanup {
+    CUcontext ctx = nullptr; nvrtcProgram prog = nullptr; CUlinkState link = nullptr; CUmodule mod = nullptr;
+    bool keep = false;  // true once ctx/mod belong to the returned TpModule
+    ~Cleanup(){
+      if(prog) nvrtcDestroyProgram(&prog);
+      if(link) cuLinkDestroy(link);
+      if(!keep){ if(mod) cuModuleUnload(mod); if(ctx) cuCtxDestroy(ctx); }
+    }
+  };
+
+  #define FAILCU(call) do{ cr=(call); if(cr!=CUDA_SUCCESS){ seterr("%s -> CUDA %d (%s)", #call, static_cast<int>(cr), cuErr(cr)); return nullptr; } }while(0)
+  #define FAILNV(call) do{ nr=(call); if(nr!=NVRTC_SUCCESS){ seterr("%s -> %s", #call, nvrtcGetErrorString(nr)); return nullptr; } }while(0)
+  try {
+    Cleanup res;
+    CUresult cr; nvrtcResult nr;
+    // cuGetErrorString sets the pointer to NULL for codes it does not recognise; never hand that to %s.
+    auto cuErr = [](CUresult r) -> const char* {
+      const char* s=nullptr;
+      return (cuGetErrorString(r,&s)==CUDA_SUCCESS && s) ? s : "unknown error";
+    };
+
+    // --- device 0 and its context ---
+    FAILCU(cuInit(0));
+    CUdevice dev; FAILCU(cuDeviceGet(&dev,0));
+    int major=0,minor=0;
+    FAILCU(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+    FAILCU(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+    int cap = major*10+minor;
+    CUcontext ctx=nullptr; FAILCU(cuCtxCreate(&ctx,0,dev));
+    res.ctx = ctx;  // from here on every exit path destroys the context unless ownership is handed over
+
+    // --- clamp the PTX target to the newest architecture this NVRTC supports ---
+    int narch=0; FAILNV(nvrtcGetNumSupportedArchs(&narch));
+    if(narch<=0){ seterr("nvrtcGetNumSupportedArchs reported no supported architectures"); return nullptr; }
+    std::vector<int> archs(narch); FAILNV(nvrtcGetSupportedArchs(archs.data()));
+    int maxarch=0; for(int a : archs) if(a>maxarch) maxarch=a;
+    if(cap>maxarch) cap=maxarch;
+
+    // --- assemble one translation unit: defines first, then every source file ---
+    std::string src = TP_DEFINES;
+    for(const char* fn : SRC_FILES){
+      const std::string text = readFile(std::string(srcdir)+"/"+fn);
+      if(text.empty()){ seterr("missing/empty kernel source: %s/%s", srcdir, fn); return nullptr; }
+      src += "\n// ==== "; src += fn; src += " ====\n"; src += text;
+    }
+
+    // --- NVRTC: source -> PTX ---
+    nvrtcProgram prog=nullptr;
+    nr = nvrtcCreateProgram(&prog, src.c_str(), "tileproc.cu", 0, nullptr, nullptr);
+    if(nr!=NVRTC_SUCCESS){ seterr("nvrtcCreateProgram -> %s", nvrtcGetErrorString(nr)); return nullptr; }
+    res.prog = prog;
+    const std::string archopt = "--gpu-architecture=compute_" + std::to_string(cap);
+    const char* opts[] = { archopt.c_str(), "--extensible-whole-program" };
+    nr = nvrtcCompileProgram(res.prog, 2, opts);
+    if(nr!=NVRTC_SUCCESS){
+      // The compiler log goes into the error text (seterr truncates it to the g_err buffer size).
+      size_t logsz=0; nvrtcGetProgramLogSize(res.prog,&logsz);
+      std::vector<char> log(logsz?logsz:1); nvrtcGetProgramLog(res.prog,log.data());
+      seterr("nvrtcCompileProgram -> %s\n%s", nvrtcGetErrorString(nr), log.data());
+      return nullptr;
+    }
+    size_t ptxsz=0; FAILNV(nvrtcGetPTXSize(res.prog,&ptxsz));
+    if(ptxsz==0){ seterr("nvrtcGetPTXSize -> empty PTX"); return nullptr; }
+    std::vector<char> ptx(ptxsz); FAILNV(nvrtcGetPTX(res.prog,ptx.data()));
+    nvrtcDestroyProgram(&res.prog); res.prog=nullptr;
+
+    // --- driver link: libcudadevrt + PTX -> cubin -> module ---
+    CUlinkState link=nullptr; FAILCU(cuLinkCreate(0,nullptr,nullptr,&link));
+    res.link = link;
+    if((cr=cuLinkAddFile(res.link,CU_JIT_INPUT_LIBRARY,devrt,0,nullptr,nullptr))!=CUDA_SUCCESS){ seterr("cuLinkAddFile(%s) -> %d (%s)",devrt,static_cast<int>(cr),cuErr(cr)); return nullptr; }
+    if((cr=cuLinkAddData(res.link,CU_JIT_INPUT_PTX,ptx.data(),ptxsz,"tileproc.ptx",0,nullptr,nullptr))!=CUDA_SUCCESS){ seterr("cuLinkAddData -> %d (%s)",static_cast<int>(cr),cuErr(cr)); return nullptr; }
+    void* cubin=nullptr; size_t cubinsz=0;
+    if((cr=cuLinkComplete(res.link,&cubin,&cubinsz))!=CUDA_SUCCESS){ seterr("cuLinkComplete -> %d (%s)",static_cast<int>(cr),cuErr(cr)); return nullptr; }
+    // The cubin is owned by the link state, so the module must be loaded before cuLinkDestroy.
+    CUmodule mod=nullptr;
+    if((cr=cuModuleLoadDataEx(&mod,cubin,0,nullptr,nullptr))!=CUDA_SUCCESS){ seterr("cuModuleLoadDataEx -> %d (%s)",static_cast<int>(cr),cuErr(cr)); return nullptr; }
+    res.mod = mod;
+    cuLinkDestroy(res.link); res.link=nullptr;
+
+    // --- count the kernels that resolve; remember the first one that does not ---
+    int found=0;
+    for(int i=0;i<N_KERNELS;i++){
+      CUfunction f;
+      if(cuModuleGetFunction(&f,res.mod,KERNELS[i])==CUDA_SUCCESS) found++;
+      else if(g_err[0]==0) seterr("missing kernel %s (and maybe others)", KERNELS[i]);
+    }
+
+    TpModule* m = new TpModule{res.ctx,res.mod,found};
+    res.keep = true;  // context and module now belong to the handle
+    return m;
+  } catch(...) {
+    // Nothing may propagate through the extern "C" boundary; Cleanup has already released the handles.
+    seterr("tp_create_module: unexpected C++ exception (out of memory?)");
+    return nullptr;
+  }
+  #undef FAILNV
+  #undef FAILCU
+}
+
+// Returns the kernel count stored in the handle (m->nfun), or -1 for a null handle.
+int tp_module_num_functions(TpModule* m){
+  if(m == nullptr) return -1;
+  return m->nfun;
+}
+
+// Returns the shared error buffer written by seterr(); the pointer itself never changes.
+const char* tp_last_error(){
+  return g_err;
+}
+
+// Unloads the module, destroys the context and frees the handle. A null handle is ignored.
+void tp_destroy_module(TpModule* m){
+  if(m == nullptr) return;
+  if(m->mod) cuModuleUnload(m->mod);
+  if(m->ctx) cuCtxDestroy(m->ctx);
+  delete m;
+}
+
+} // extern "C"
--- a/jna/tp_nvrtc_probe.cpp
+++ b/jna/tp_nvrtc_probe.cpp
+// Stage-0 proof for the JCuda->JNA migration: does the RUNTIME NVRTC path the native shim will use
+// (compile the kernels with the JCUDA defines -> PTX -> cuLink(libcudadevrt) -> module -> get 19 functions)
+// work on this GPU (RTX 5060 Ti / sm_120, CUDA 12.6)? No Java/JNA yet. Replicates GPUTileProcessor.createFunctions.
+// Build:  ./build_probe.sh    Run:  ./tp_nvrtc_probe [kernel_src_dir] [libcudadevrt.a]
+// By Claude on 2026-06-25.
+#include <cuda.h>
+#include <nvrtc.h>
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <fstream>
+#include <sstream>
+
+// CU(call): run a CUDA driver API call; on failure print the error code, its description, the source
+// location and the call text, then return 2 from the enclosing function.
+// cuGetErrorString sets the pointer to NULL for unknown codes, so a fallback text is substituted.
+#define CU(call)   do { CUresult r=(call); if(r!=CUDA_SUCCESS){ const char*s=nullptr; \
+                        if(cuGetErrorString(r,&s)!=CUDA_SUCCESS || !s) s="unknown error"; \
+                        printf("CUDA ERR %d (%s) at %s:%d -> %s\n", r, s, __FILE__, __LINE__, #call); return 2; } } while(0)
+// NVR(call): the same for NVRTC calls, returning 6. Returning (instead of only printing) keeps the caller
+// from continuing with an uninitialised program handle or an empty PTX buffer after a failed call.
+#define NVR(call)  do { nvrtcResult r=(call); if(r!=NVRTC_SUCCESS){ \
+                        printf("NVRTC ERR %d (%s) at %s:%d -> %s\n", r, nvrtcGetErrorString(r), __FILE__, __LINE__, #call); return 6; } } while(0)
+
+// getTpDefines() values copied verbatim from GPUTileProcessor.java (lines 119-179 / 233-273). TILES_PER_BLOCK_GEOM = 32/16 = 2. main() places this text at the start of the source it assembles for NVRTC.
+static const char* TP_DEFINES =
+  "#define JCUDA\n#define DTT_SIZE_LOG2 3\n#define THREADSX 8\n#define NUM_CAMS 16\n"
+  "#define THREADS_PER_TILE 8\n#define TILES_PER_BLOCK 4\n#define CORR_THREADS_PER_TILE 8\n"
+  "#define CORR_TILES_PER_BLOCK 4\n#define CORR_TILES_PER_BLOCK_NORMALIZE 4\n#define CORR_TILES_PER_BLOCK_COMBINE 4\n"
+  "#define NUM_THREADS 32\n#define TEXTURE_THREADS_PER_TILE 8\n#define TEXTURE_TILES_PER_BLOCK 1\n"
+  "#define IMCLT_THREADS_PER_TILE 16\n#define IMCLT_TILES_PER_BLOCK 4\n#define CORR_NTILE_SHIFT 8\n"
+  "#define TASK_INTER_EN 10\n#define TASK_CORR_EN 9\n#define TASK_TEXT_EN 8\n"
+  "#define TASK_TEXT_N_BIT 0\n#define TASK_TEXT_NE_BIT 1\n#define TASK_TEXT_E_BIT 2\n#define TASK_TEXT_SE_BIT 3\n"
+  "#define TASK_TEXT_S_BIT 4\n#define TASK_TEXT_SW_BIT 5\n#define TASK_TEXT_W_BIT 6\n#define TASK_TEXT_NW_BIT 7\n"
+  "#define LIST_TEXTURE_BIT 8\n#define TEXT_NTILE_SHIFT 9\n#define FAT_ZERO_WEIGHT 0.0001\n"
+  "#define THREADS_DYNAMIC_BITS 5\n#define RBYRDIST_LEN 5001\n#define RBYRDIST_STEP 0.0004\n#define TILES_PER_BLOCK_GEOM 2\n";
+
+// GPU_SRC_FILES (USE_CUDA12 path): "*"(defines) + these, concatenated into ONE NVRTC unit. main() reads them from srcdir in the order listed.
+static const char* SRC_FILES[] = {"dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cu"};
+// The 19 kernels Java loads (GPU_*_NAME in GPUTileProcessor.java). main() looks each name up with cuModuleGetFunction and reports OK/MISS.
+static const char* KERNELS[] = {
+  "convert_direct","imclt_rbg_all","correlate2D","correlate2D_inter","corr2D_combine","corr2D_normalize",
+  "textures_nonoverlap","generate_RBGA","clear_texture_list","mark_texture_tiles","mark_texture_neighbor_tiles",
+  "gen_texture_list","clear_texture_rbga","textures_accumulate","create_nonoverlap_list","erase_clt_tiles",
+  "calculate_tiles_offsets","calc_rot_deriv","calcReverseDistortionTable"};
+
+// Loads the whole file at `p`. When it cannot be opened, prints "MISSING <path>" to stdout and
+// returns an empty string.
+static std::string readFile(const std::string& p){
+  std::ifstream in(p);
+  if(in){
+    std::ostringstream contents;
+    contents << in.rdbuf();
+    return contents.str();
+  }
+  printf("MISSING %s\n",p.c_str());
+  return std::string();
+}
+
+int main(int argc, char** argv){ // probe entry point; this body returns 0 (all kernels found), 3 (source missing/empty), 4 (NVRTC compile failed) or 5 (some kernel missing); the CU macro returns 2
+  std::string srcdir = (argc>1)? argv[1] : "/home/elphel/git/tile_processor_gpu/src"; // argv[1] overrides the default kernel source directory
+  std::string devrt  = (argc>2)? argv[2] : "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a"; // argv[2] overrides the default device-runtime library
+  printf("Stage-0 NVRTC probe  srcdir=%s\n  libcudadevrt=%s\n", srcdir.c_str(), devrt.c_str());
+
+  CU(cuInit(0));
+  CUdevice dev; CU(cuDeviceGet(&dev,0)); // always device ordinal 0
+  char name[256]; CU(cuDeviceGetName(name,sizeof(name),dev));
+  int major=0,minor=0;
+  CU(cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
+  CU(cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));
+  int cap = major*10 + minor; // e.g. major 12, minor 0 -> 120
+  printf("GPU: %s  compute capability %d.%d (cap=%d)\n", name, major, minor, cap);
+  CUcontext ctx; CU(cuCtxCreate(&ctx,0,dev)); // not destroyed explicitly anywhere in main
+
+  // clamp cap to NVRTC-supported max (mirrors createFunctions); maxarch stays 0 when NVRTC reports no architectures
+  int narch=0; NVR(nvrtcGetNumSupportedArchs(&narch));
+  std::vector<int> archs(narch>0?narch:1); NVR(nvrtcGetSupportedArchs(archs.data()));
+  int maxarch=0; for(int i=0;i<narch;i++){ if(archs[i]>maxarch) maxarch=archs[i]; }
+  printf("NVRTC supported archs: %d (max %d)\n", narch, maxarch);
+  if(cap>maxarch){ printf("WARN cap %d > NVRTC max %d -> clamping to %d\n", cap, maxarch, maxarch); cap=maxarch; }
+
+  // assemble single source unit: defines + each file, each preceded by a "// ==== name ====" marker line
+  std::string src = TP_DEFINES;
+  for(const char* fn : SRC_FILES){ std::string c=readFile(srcdir+"/"+fn); if(c.empty()){printf("FATAL: empty/missing %s\n",fn); return 3;} src += "\n// ==== "; src += fn; src += " ====\n"; src += c; }
+  printf("assembled source: %zu bytes\n", src.size());
+
+  nvrtcProgram prog; NVR(nvrtcCreateProgram(&prog, src.c_str(), "tileproc.cu", 0, nullptr, nullptr));
+  std::string archopt = "--gpu-architecture=compute_" + std::to_string(cap);
+  const char* opts[] = { archopt.c_str(), "--extensible-whole-program" };  // USE_CUDA12 -> 2 options
+  printf("NVRTC options: %s  %s\n", opts[0], opts[1]);
+  nvrtcResult cres = nvrtcCompileProgram(prog, 2, opts); // result is checked only after the log has been printed
+  size_t logsz=0; nvrtcGetProgramLogSize(prog,&logsz);
+  if(logsz>1){ std::vector<char> log(logsz); nvrtcGetProgramLog(prog,log.data()); printf("--- NVRTC log ---\n%s\n-----------------\n", log.data()); } // log is shown whenever it is longer than one byte, also on success
+  if(cres!=NVRTC_SUCCESS){ printf("RESULT: NVRTC COMPILE FAILED (%s)\n", nvrtcGetErrorString(cres)); return 4; }
+  size_t ptxsz=0; NVR(nvrtcGetPTXSize(prog,&ptxsz)); std::vector<char> ptx(ptxsz); NVR(nvrtcGetPTX(prog,ptx.data()));
+  nvrtcDestroyProgram(&prog); // the PTX has been copied into ptx
+  printf("NVRTC compiled OK, PTX %zu bytes\n", ptxsz);
+
+  // cuLink: PTX + libcudadevrt.a (CDP) -> cubin -> module  (mirrors createFunctions)
+  CUlinkState ls; CU(cuLinkCreate(0,nullptr,nullptr,&ls));
+  CU(cuLinkAddFile(ls, CU_JIT_INPUT_LIBRARY, devrt.c_str(), 0, nullptr, nullptr));
+  CU(cuLinkAddData(ls, CU_JIT_INPUT_PTX, ptx.data(), ptxsz, "tileproc.ptx", 0, nullptr, nullptr));
+  void* cubin=nullptr; size_t cubinsz=0; CU(cuLinkComplete(ls,&cubin,&cubinsz));
+  printf("cuLinkComplete OK, cubin %zu bytes\n", cubinsz);
+  CUmodule mod; CU(cuModuleLoadDataEx(&mod, cubin, 0, nullptr, nullptr));
+  cuLinkDestroy(ls); // called only after the module has been loaded from cubin
+
+  int nfun=sizeof(KERNELS)/sizeof(KERNELS[0]), found=0; // nfun = number of names to look up, found = number resolved
+  for(int i=0;i<nfun;i++){ CUfunction f; CUresult r=cuModuleGetFunction(&f, mod, KERNELS[i]);
+    if(r==CUDA_SUCCESS){ found++; printf("  [OK]  %s\n", KERNELS[i]); }
+    else { const char*s; cuGetErrorString(r,&s); printf("  [MISS] %s (%s)\n", KERNELS[i], s); } }
+  printf("RESULT: loaded %d/%d functions on %s (sm_%d)\n", found, nfun, name, cap); // cap here is the possibly clamped value
+  return (found==nfun)? 0 : 5;
+}