Pointer.to(gpu_twh));// int * twh); allocate int[2] for width, heightin DP
cuCtxSynchronize();
cuCtxSynchronize();
// Call the kernel function
// Call the kernel function
...
@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
...
@@ -2873,7 +2887,7 @@ public class GpuQuad{ // quad camera description
if(DEBUG8A){
if(DEBUG8A){
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),gpu_texture_indices_ovlp,cpu_texture_indices_ovlp.length*Sizeof.INT);// hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
cuMemcpyDtoH(Pointer.to(cpu_texture_indices_ovlp),gpu_texture_indices_ovlp,cpu_texture_indices_ovlp.length*Sizeof.INT);// hope that Float.floatToIntBits(fcorr_indices[i]) is not needed
}
}
int[]cpu_pnum_texture_tiles={0};//// debugging
// Run 8 times: the first 4 passes use 1-tile offsets for inner tiles (w/o verifying margins), then 4 more passes with verification, ignoring the 4-pixel
// Run 8 times: the first 4 passes use 1-tile offsets for inner tiles (w/o verifying margins), then 4 more passes with verification, ignoring the 4-pixel
// oversize (border 16x16 tiles overhang by 4 pixels)
// oversize (border 16x16 tiles overhang by 4 pixels)
//// (((pass & 3) << 1) + border_tile)*Sizeof.INT), // int * num_texture_tiles,// number of texture tiles to process
Pointer.to(gpu_texture_indices_len),// int * num_texture_tiles,// number of texture tiles to process
Pointer.to(newint[]{ti_offset}),// int gpu_texture_indices_offset, // add to gpu_texture_indices (now complicated: if negative - add *(pnum_texture_tiles) and negate
Pointer.to(gpu_texture_indices_ovlp),// gpu_texture_indices_offset,// add to gpu_texture_indices
Pointer.to(gpu_texture_indices_ovlp),// gpu_texture_indices_offset,// add to gpu_texture_indices
@@ -1074,14 +981,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
...
@@ -1074,14 +981,14 @@ extern "C" __global__ void correlate2D_inter( // only results in TD
scale0,// float scale0, // scale for R
scale0,// float scale0, // scale for R
scale1,// float scale1, // scale for B
scale1,// float scale1, // scale for B
scale2,// float scale2, // scale for G
scale2,// float scale2, // scale for G
num_corr_tiles_with_sum,// int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
pnum_corr_tiles,// num_corr_tiles_with_sum, // int num_corr_tiles, // number of correlation tiles to process (here it includes sum for compatibility with intra format)
// TODO: Calculate the offsets at compile time to avoid the NVRTC (in Java) failure: "error: dynamic initialization is not supported for a __constant__ variable"