Commit 8b3dd30b authored by Andrey Filippov

CLAUDE: gpuTrace(-Dtp.trace=1) on uncertain GpuQuad methods — JCuda/JNA oracle comparison

Add GpuQuad.gpuTrace(m) printing "[GPUTRACE] "+getClass().getSimpleName()+"."+m (off unless
-Dtp.trace=1). Instrument the un-overridden GPU methods (potential oracle gaps): getCltData,
presentCltData, eraseGpuCorrs, execCorr2D (bundled), readbackTasks, setFullFrameImages, getCorrTdData,
getCorrIndices, getCorrComboIndices, getExtra, getTextureIndices, getRBGA, execRBGA, execTextures.
Since GpuQuadJna extends GpuQuad, the trace prints "GpuQuad.X" under JCuda and "GpuQuadJna.X" if a JNA
run falls through to one (= coverage gap) -> reveals oracle's real GPU usage before any NPE.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
parent 31772785
......@@ -490,6 +490,12 @@ public class GpuQuad{ // quad camera description
texture_stride_rgba = (int)(device_stride[0] / Sizeof.FLOAT);
}
// Diagnostic trace (set -Dtp.trace=1). getClass().getSimpleName() prints "GpuQuad" under JCuda and
// "GpuQuadJna" if a JNA run falls through to an un-overridden base method -> reveals oracle's actual
// GPU-method usage (run JCuda first) and any JNA coverage gap, before an NPE. By Claude 2026-06-25.
public static boolean GPU_TRACE = "1".equals(System.getProperty("tp.trace"));
/**
 * Prints one diagnostic line of the form {@code [GPUTRACE] <SimpleClassName>.<m>} to standard output.
 * Does nothing unless {@code GPU_TRACE} is true. The class name is taken from the runtime class of
 * this instance, so a subclass instance that reaches an inherited method reports the subclass name.
 *
 * @param m label appended after the class name (callers pass the name of the calling method)
 */
protected void gpuTrace(String m) {
    if (!GPU_TRACE) {
        return; // tracing disabled
    }
    final String runtimeClassName = getClass().getSimpleName();
    System.out.println("[GPUTRACE] " + runtimeClassName + "." + m);
}
// Backend selector (architecture B). Default JCUDA; set -Dtp.backend=jna to use the native
// (libtileproc.so via JNA) backend GpuQuadJna instead. JNA mode never initializes JCuda.
// Validate by running the same workflow both ways and diffing saved outputs.
......@@ -995,6 +1001,7 @@ public class GpuQuad{ // quad camera description
int num_sensors,
boolean use_aux // while is it in class member? - just to be able to free
) {
gpuTrace("readbackTasks");
num_task_tiles = num_tasks;
TpTask [] tile_tasks = new TpTask[num_task_tiles];
int task_size = getTaskSize();
......@@ -1060,6 +1067,7 @@ public class GpuQuad{ // quad camera description
public int [] getTextureIndices()
{
gpuTrace("getTextureIndices");
float [] ftexture_indices_len = new float[1];
cuMemcpyDtoH(Pointer.to(ftexture_indices_len), gpu_texture_indices_len, 1 * Sizeof.FLOAT);
int num_tiles = Float.floatToIntBits(ftexture_indices_len[0]);
......@@ -1234,6 +1242,7 @@ public class GpuQuad{ // quad camera description
*/
public float [][] getCltData( // only for color=0
boolean use_ref){
gpuTrace("getCltData");
CUdeviceptr [] gpu_sel_clt_h = use_ref ? gpu_clt_ref_h : gpu_clt_h;
int [] wh = use_ref ? gpu_clt_ref_wh : gpu_clt_wh;
int tilesX = wh[0] / GPUTileProcessor.DTT_SIZE;
......@@ -1257,6 +1266,7 @@ public class GpuQuad{ // quad camera description
}
public float [][] presentCltData(boolean use_ref){
gpuTrace("presentCltData");
float [][] fclt = getCltData(use_ref);
float [][] pfclt = new float [fclt.length][fclt[0].length];
int [] wh = use_ref ? gpu_clt_ref_wh : gpu_clt_wh;
......@@ -1473,6 +1483,7 @@ public class GpuQuad{ // quad camera description
int corr_mask, // which correlation pairs to generate (maybe later - reduce size from 15x15)
// final int threadsMax, // maximal number of threads to launch
final int debugLevel) {
gpuTrace("setFullFrameImages");
int tilesX = img_width / GPUTileProcessor.DTT_SIZE;
int tilesY = img_height / GPUTileProcessor.DTT_SIZE;
float [] target_disparities = new float [tilesX * tilesY];
......@@ -2392,6 +2403,7 @@ public class GpuQuad{ // quad camera description
double [] scales,
double fat_zero,
int corr_radius) {
gpuTrace("execCorr2D");
if (this.gpuTileProcessor.GPU_CORRELATE2D_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CORRELATE2D_kernel");
......@@ -2692,6 +2704,7 @@ public class GpuQuad{ // quad camera description
double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
boolean dust_remove,
int keep_weights) {
gpuTrace("execRBGA");
if (GPUTileProcessor.USE_DS_DP) {
execRBGA_DP(
color_weights, // double [] color_weights,
......@@ -3261,6 +3274,7 @@ public class GpuQuad{ // quad camera description
boolean calc_extra,
boolean linescan_order
) {
gpuTrace("execTextures");
if (GPUTileProcessor.USE_DS_DP) {
execTextures_DP(
color_weights,
......@@ -3983,6 +3997,7 @@ public class GpuQuad{ // quad camera description
}
public int [] getCorrIndices() {
gpuTrace("getCorrIndices");
int [] inum_corrs = new int[1];
cuMemcpyDtoH(Pointer.to(inum_corrs), gpu_num_corr_tiles, 1 * Sizeof.INT);
int num_corrs = inum_corrs[0];
......@@ -3994,6 +4009,7 @@ public class GpuQuad{ // quad camera description
public float [] getCorrTdData(){
gpuTrace("getCorrTdData");
int corr_size_td = 4 * GPUTileProcessor.DTT_SIZE * GPUTileProcessor.DTT_SIZE;
float [] cpu_corrs = new float [ num_corr_tiles * corr_size_td];
CUDA_MEMCPY2D copyD2H = new CUDA_MEMCPY2D();
......@@ -4040,6 +4056,7 @@ public class GpuQuad{ // quad camera description
public int [] getCorrComboIndices() {
gpuTrace("getCorrComboIndices");
// float [] fnum_corrs = new float[1];
// cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
// int num_corrs = Float.floatToIntBits(fnum_corrs[0]);
......@@ -4077,6 +4094,7 @@ public class GpuQuad{ // quad camera description
public float [][] getCorr2D(int corr_rad){
// Trace label must name the enclosing method: it was "eraseGpuCorrs", which made the
// [GPUTRACE] output attribute every getCorr2D() call to a different method.
gpuTrace("getCorr2D");
// Elements per correlation tile: a square with side (2 * corr_rad + 1).
int corr_size = (2 * corr_rad + 1) * (2 * corr_rad + 1);
// Host buffer sized for num_corr_tiles tiles of corr_size floats each.
float [] cpu_corrs = new float [ num_corr_tiles * corr_size];
CUDA_MEMCPY2D copyD2H = new CUDA_MEMCPY2D();
......@@ -4132,6 +4150,7 @@ public class GpuQuad{ // quad camera description
* @return [ num_cams*(num_colors+1)][tilesX*tilesY] array for macro generation
*/
public float [][] getExtra(){
gpuTrace("getExtra");
int [] texture_indices = getTextureIndices();
int num_tile_extra = num_cams*(num_colors+1);
float [] diff_rgb_combo = new float[texture_indices.length * num_tile_extra];
......@@ -4181,6 +4200,7 @@ public class GpuQuad{ // quad camera description
// Trace once per call. It was inside the loop below, which printed the same
// "[GPUTRACE] ...getRBGA" line (num_colors + 1) times for a single invocation.
gpuTrace("getRBGA");
copy_rbga.srcXInBytes = 4 * Sizeof.FLOAT;
// One 2D copy per index ncol in 0..num_colors (inclusive), each into its own rslt[ncol] row.
for (int ncol = 0; ncol<= num_colors; ncol++ ) {
copy_rbga.dstHost = Pointer.to(rslt[ncol]);
copy_rbga.srcY = 4 + (woi.height +GPUTileProcessor.DTT_SIZE) * ncol;
cuMemcpy2D(copy_rbga); // run copy
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment