CLAUDE: Stage 2 — native convert_direct selftest (first real execution + CDP on Blackwell)

Add tp_convert_direct_selftest to the JNA shim: mirrors TpHostGpu allTests' convert path (setImageKernels/setImgBuffers/setCltBuffers/setTasks + calc_reverse_distortions -> rot_derivs -> calculate_tiles_offsets [CDP] -> convert_direct), reusing the harness runtime-API host helpers (tp_utils/tp_files/TpParams/tp_paths) for ALL allocation and porting only the launches to driver-API cuLaunchKernel against the NVRTC module. Reads CLT back, compares to clt/aux_chnN.clt golden. build_lib.sh: nvcc + -std=c++17 (static constexpr TpParams members become inline), -Isrc + cuda-samples Common (helper_cuda.h), --pre-include algorithm. Validated on RTX 5060 Ti via Java->JNA: num_active_tiles=5120 (all), max|CLT-golden| =0.1085 over peaks of 12260 -> relative ~8.85e-6 (float32 NVRTC-vs-nvcc variation). First CDP (calculate_tiles_offsets) and 17-arg pointer-of-pointers convert_direct launch executing natively on Blackwell. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

CLAUDE: Stage 2 — native convert_direct selftest (first real execution + CDP on Blackwell)
Add tp_convert_direct_selftest to the JNA shim: mirrors TpHostGpu allTests' convert path (setImageKernels/setImgBuffers/setCltBuffers/setTasks + calc_reverse_distortions -> rot_derivs -> calculate_tiles_offsets [CDP] -> convert_direct), reusing the harness runtime-API host helpers (tp_utils/tp_files/TpParams/tp_paths) for ALL allocation and porting only the launches to driver-API cuLaunchKernel against the NVRTC module. Reads CLT back, compares to clt/aux_chnN.clt golden. build_lib.sh: nvcc + -std=c++17 (static constexpr TpParams members become inline), -Isrc + cuda-samples Common (helper_cuda.h), --pre-include algorithm. Validated on RTX 5060 Ti via Java->JNA: num_active_tiles=5120 (all), max|CLT-golden| =0.1085 over peaks of 12260 -> relative ~8.85e-6 (float32 NVRTC-vs-nvcc variation). First CDP (calculate_tiles_offsets) and 17-arg pointer-of-pointers convert_direct launch executing natively on Blackwell. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
05ee47d0 · Andrey Filippov · a5b7c269 · 05ee47d0 · 05ee47d0
Commit 05ee47d0 authored Jun 25, 2026 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 141 additions and 3 deletions

build_lib.sh jna/build_lib.sh +11 -3

tp_jna.cpp jna/tp_jna.cpp +130 -0

No files found.
--- a/jna/build_lib.sh
+++ b/jna/build_lib.sh
 #!/usr/bin/env bash
 # Build libtileproc.so (JNA-callable native tile-processor shim). By Claude on 2026-06-25.
+# Stage 2 adds the convert_direct selftest, which reuses the harness host helpers
+# (tp_utils/tp_files/TpParams/tp_paths -- no __global__ kernels), so build with nvcc + -Isrc.
 set -e
 cd "$(dirname "$0")"
 CUDA="${CUDA_HOME:-/usr/local/cuda}"
-g++ -std=c++14 -shared -fPIC tp_jna.cpp -o libtileproc.so \
+SRC=../src
-    -I"$CUDA/include" -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \
+SAMPLES="${CUDA_SAMPLES:-/home/elphel/git/cuda-samples/Common}"  # helper_cuda.h etc.
-    -lnvrtc -lcuda
+nvcc -std=c++17 -O2 --shared -Xcompiler -fPIC \
+    --pre-include algorithm \
+    -I"$SRC" -I"$CUDA/include" -I"$SAMPLES" \
+    tp_jna.cpp "$SRC/tp_utils.cu" "$SRC/tp_files.cu" "$SRC/TpParams.cu" "$SRC/tp_paths.cu" \
+    -o libtileproc.so \
+    -L"$CUDA/lib64" -L"$CUDA/targets/x86_64-linux/lib" \
+    -lnvrtc -lcuda -lcudart
 echo "built ./libtileproc.so"
--- a/jna/tp_jna.cpp
+++ b/jna/tp_jna.cpp
@@ -3,13 +3,22 @@
 // tp_module_num_functions, tp_last_error, tp_destroy_module. By Claude on 2026-06-25.
 #include <cuda.h>
 #include <nvrtc.h>
+#include <cuda_runtime.h>
 #include <cstdio>
 #include <cstdarg>
 #include <cstring>
+#include <cstdlib>
+#include <cmath>
 #include <string>
 #include <vector>
 #include <fstream>
 #include <sstream>
+#include <unistd.h>     // chdir
+// Reuse the harness's host-side params/paths/helpers verbatim (no __global__ kernels in these TUs):
+#include "TpParams.h"   // pulls geometry_correction.h (struct gc, CltExtra, TP_TASK_* offsets), tp_defines.h
+#include "tp_paths.h"
+#include "tp_utils.h"   // copyalloc_kernel_gpu / copyalloc_pointers_gpu / copyalloc_image_gpu / alloc_kernel_gpu
+#include "tp_files.h"   // readFloatsFromFile / readAllFloatsFromFile
 static char g_err[8192] = "";
 static void seterr(const char* fmt, ...){ va_list ap; va_start(ap,fmt); vsnprintf(g_err,sizeof(g_err),fmt,ap); va_end(ap); }
@@ -218,4 +227,125 @@ void tp_destroy_instance(TpInstance* inst){
  delete inst;
 }
+// ---- Stage 2: convert_direct selftest (the first real-execution milestone) ----
+// Mirrors TpHostGpu allTests' convert path (setImageKernels/setImgBuffers/setCltBuffers/setTasks +
+// calc_reverse_distortions -> rot_derivs -> calculate_tiles_offsets [CDP] -> convert_direct), reusing
+// the harness's runtime-API host helpers for ALL allocation and porting only the launches to driver-API
+// cuLaunchKernel against the NVRTC module. Reads CLT back and compares to clt/aux_chnN.clt golden.
+// data_root must contain clt/ (TpPaths uses relative "clt/..." names). lwir=1 -> 16-cam LWIR set.
+// Runtime (cudaMalloc) and driver (cuLaunchKernel) interoperate because the module's context is current.
+int tp_convert_direct_selftest(TpModule* m, int lwir, const char* data_root,
+                               double* out_max_err, int* out_num_active){
+  g_err[0]=0;
+  if(!m){ seterr("selftest: null module"); return -1; }
+  if(data_root && data_root[0] && chdir(data_root)!=0){ seterr("chdir(%s) failed", data_root); return -2; }
+  cuCtxSetCurrent(m->ctx);
+  CUresult cr; const char* es;
+  TpParams P((int)lwir);
+  TpPaths  PA((int)lwir);
+  CUfunction f_rbr = getfun(m,"calcReverseDistortionTable");
+  CUfunction f_rot = getfun(m,"calc_rot_deriv");
+  CUfunction f_off = getfun(m,"calculate_tiles_offsets");
+  CUfunction f_cd  = getfun(m,"convert_direct");
+  if(!f_rbr||!f_rot||!f_off||!f_cd){ seterr("missing kernel(s): rbr=%p rot=%p off=%p cd=%p",(void*)f_rbr,(void*)f_rot,(void*)f_off,(void*)f_cd); return -3; }
+  int NC = P.num_cams; // non-const: passed by &NC as a kernel arg (cuLaunchKernel wants void*)
+  float* hbuf = (float*) malloc((size_t)P.kern_size * sizeof(float)); // largest single file (kernels)
+  // --- kernels + kernel offsets (pointer-of-pointers) ---
+  std::vector<float*> kernels_h(NC), offs_h(NC);
+  for(int c=0;c<NC;c++){
+    readFloatsFromFile(hbuf, PA.kernel_file[c]);      kernels_h[c]=copyalloc_kernel_gpu(hbuf, P.kern_size);
+    readFloatsFromFile(hbuf, PA.kernel_offs_file[c]); offs_h[c]=copyalloc_kernel_gpu(hbuf, P.kern_tiles*(int)(sizeof(CltExtra)/sizeof(float)));
+  }
+  float** gpu_kernels        = copyalloc_pointers_gpu(kernels_h.data(), NC);
+  float** gpu_kernel_offsets = copyalloc_pointers_gpu(offs_h.data(),    NC);
+  // --- images (pitched) ---
+  std::vector<float*> images_h(NC); size_t dstride=0;
+  for(int c=0;c<NC;c++){ readFloatsFromFile(hbuf, PA.image_files[c]); images_h[c]=copyalloc_image_gpu(hbuf,&dstride,P.img_width,P.img_height); }
+  float** gpu_images = copyalloc_pointers_gpu(images_h.data(), NC);
+  // --- CLT output buffers ---
+  const int slice = P.tilesy*P.tilesx*P.num_colors*4*P.dtt_size*P.dtt_size;
+  std::vector<float*> clt_h(NC);
+  for(int c=0;c<NC;c++) clt_h[c]=alloc_kernel_gpu(slice);
+  float** gpu_clt = copyalloc_pointers_gpu(clt_h.data(), NC);
+  int *gpu_active=nullptr,*gpu_num_active=nullptr;
+  cudaMalloc((void**)&gpu_active, (size_t)P.tilesx*P.tilesy*sizeof(int));
+  cudaMalloc((void**)&gpu_num_active, sizeof(int));
+  // --- tasks (TpHostGpu::setTasks, target_disparity=0, scale=0) ---
+  std::vector<float> coords((size_t)NC*P.tp_tasks_size*2);
+  for(int c=0;c<NC;c++) readFloatsFromFile(coords.data()+(size_t)c*P.tp_tasks_size*2, PA.ports_offs_xy_file[c]);
+  std::vector<float> ftask((size_t)P.tp_tasks_size*P.task_size, 0.0f);
+  for(int ty=0; ty<P.tilesy; ty++) for(int tx=0; tx<P.tilesx; tx++){
+    int nt = ty*P.tilesx + tx;
+    int task_task = (1<<P.task_inter_en)|(1<<P.task_corr_en)|(1<<P.task_text_en);
+    int task_txy  = tx + (ty<<16);
+    float* tp = ftask.data() + (size_t)P.task_size*nt;
+    tp[P.tp_task_task_offset]      = *(float*)&task_task;
+    tp[P.tp_task_txy_offset]       = *(float*)&task_txy;
+    tp[P.tp_task_disparity_offset] = 0.0f;
+    tp[P.tp_task_scale_offset]     = 0.0f;
+    float* q = tp + P.tp_task_xy_offset;
+    for(int c=0;c<NC;c++){ *q++ = coords[(size_t)c*P.tp_tasks_size*2 + nt*2 + 0]; *q++ = coords[(size_t)c*P.tp_tasks_size*2 + nt*2 + 1]; }
+  }
+  float* gpu_ftasks = copyalloc_kernel_gpu(ftask.data(), P.tp_tasks_size*P.task_size);
+  // --- geometry (load gc/cv from files; compute rByRDist + rot_deriv via the kernels, as allTests does) ---
+  int n_gc=0,n_cv=0;
+  float* fgc = readAllFloatsFromFile(PA.geometry_correction_file,&n_gc);
+  float* fcv = readAllFloatsFromFile(PA.correction_vector_file,&n_cv);
+  float* gpu_gc = copyalloc_kernel_gpu(fgc, n_gc);
+  float* gpu_cv = copyalloc_kernel_gpu(fcv, n_cv);
+  float* gpu_rbr = alloc_kernel_gpu(5001);
+  float* gpu_rot = alloc_kernel_gpu(5*16*3*3);
+  #define LAUNCH(fn,gx,gy,gz,bx,by,bz,...) do{ void* a[]={__VA_ARGS__}; \
+      cuCtxSynchronize(); cr=cuLaunchKernel(fn,gx,gy,gz,bx,by,bz,0,nullptr,a,nullptr); \
+      if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("launch %s -> %d (%s)",#fn,cr,es); return -10; } \
+      cr=cuCtxSynchronize(); if(cr!=CUDA_SUCCESS){ cuGetErrorString(cr,&es); seterr("sync %s -> %d (%s)",#fn,cr,es); return -11; } }while(0)
+  int mc16 = 16;                              // calcReverseDistortionTable grid = MAX_NUM_CAMS
+  LAUNCH(f_rbr, mc16,1,1, 3,3,3, &gpu_gc, &gpu_rbr);
+  LAUNCH(f_rot, NC,1,1,   3,3,3, &NC, &gpu_cv, &gpu_rot);
+  // calculate_tiles_offsets <<<1,1>>> (uniform_grid=1) — first CDP execution
+  int uniform_grid = 1; int num_tiles = P.tp_tasks_size;
+  LAUNCH(f_off, 1,1,1, 1,1,1, &uniform_grid,&NC,&gpu_ftasks,&num_tiles,&gpu_gc,&gpu_cv,&gpu_rbr,&gpu_rot);
+  // convert_direct <<<1,1>>> (17 args; arg13=0, arg14=kernels_hor — verbatim from TpHostGpu)
+  int num_colors=P.num_colors, lpf_mask=0, woi_w=P.img_width, woi_h=P.img_height,
+      kh_arg=0, kv_arg=P.kernels_hor, tilesx=P.tilesx;
+  size_t dstride_floats = dstride/sizeof(float);
+  LAUNCH(f_cd, 1,1,1, 1,1,1,
+    &NC,&num_colors,&gpu_kernel_offsets,&gpu_kernels,&gpu_images,&gpu_ftasks,&gpu_clt,
+    &dstride_floats,&num_tiles,&lpf_mask,&woi_w,&woi_h,&kh_arg,&kv_arg,
+    &gpu_active,&gpu_num_active,&tilesx);
+  #undef LAUNCH
+  int num_active=0; cudaMemcpy(&num_active, gpu_num_active, sizeof(int), cudaMemcpyDeviceToHost);
+  // --- compare CLT to golden clt/aux_chnN.clt ---
+  double maxerr=0;
+  std::vector<float> golden(slice), got(slice);
+  for(int c=0;c<NC;c++){
+    readFloatsFromFile(golden.data(), PA.ports_clt_file[c]);
+    cudaMemcpy(got.data(), clt_h[c], (size_t)slice*sizeof(float), cudaMemcpyDeviceToHost);
+    for(int i=0;i<slice;i++){ double e=std::fabs((double)got[i]-(double)golden[i]); if(e>maxerr) maxerr=e; }
+  }
+  if(out_max_err) *out_max_err=maxerr;
+  if(out_num_active) *out_num_active=num_active;
+  // --- free ---
+  for(int c=0;c<NC;c++){ cudaFree(kernels_h[c]); cudaFree(offs_h[c]); cudaFree(images_h[c]); cudaFree(clt_h[c]); }
+  cudaFree(gpu_kernels); cudaFree(gpu_kernel_offsets); cudaFree(gpu_images); cudaFree(gpu_clt);
+  cudaFree(gpu_active); cudaFree(gpu_num_active); cudaFree(gpu_ftasks);
+  cudaFree(gpu_gc); cudaFree(gpu_cv); cudaFree(gpu_rbr); cudaFree(gpu_rot);
+  free(fgc); free(fcv); free(hbuf);
+  return 0;
+}
 } // extern "C"