Commit 4fb94627 authored by Andrey Filippov's avatar Andrey Filippov

changing corr2D to CDP

parent 20df596a
...@@ -190,6 +190,7 @@ public class GPUTileProcessor { ...@@ -190,6 +190,7 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_clt = new CUdeviceptr(); private CUdeviceptr gpu_clt = new CUdeviceptr();
private CUdeviceptr gpu_4_images = new CUdeviceptr(); private CUdeviceptr gpu_4_images = new CUdeviceptr();
private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER private CUdeviceptr gpu_corr_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_num_corr_tiles = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER private CUdeviceptr gpu_texture_indices = new CUdeviceptr(); // allocate tilesX * tilesY * 6 * Sizeof.POINTER
private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER private CUdeviceptr gpu_port_offsets = new CUdeviceptr(); // allocate Quad * 2 * Sizeof.POINTER
private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles private CUdeviceptr gpu_woi = new CUdeviceptr(); // 4 integers (x, y, width, height) Rectangle - in tiles
...@@ -575,8 +576,8 @@ public class GPUTileProcessor { ...@@ -575,8 +576,8 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT); cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============ //=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array // Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER); cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.FLOAT);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER); cuMemAlloc(gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
//#define TILESYA ((TILESY +3) & (~3)) //#define TILESYA ((TILESY +3) & (~3))
int tilesYa = (tilesY + 3) & ~3; int tilesYa = (tilesY + 3) & ~3;
...@@ -1119,7 +1120,7 @@ public class GPUTileProcessor { ...@@ -1119,7 +1120,7 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later cuCtxSynchronize(); // remove later
} }
public void execConverDirect() { public void execConvertDirect() {
if (GPU_CONVERT_DIRECT_kernel == null) if (GPU_CONVERT_DIRECT_kernel == null)
{ {
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel"); IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
...@@ -1206,20 +1207,24 @@ public class GPUTileProcessor { ...@@ -1206,20 +1207,24 @@ public class GPUTileProcessor {
float fscale0 = (float) scales[0]; float fscale0 = (float) scales[0];
float fscale1 = (num_colors >1)?((float) scales[1]):0.0f; float fscale1 = (num_colors >1)?((float) scales[1]):0.0f;
float fscale2 = (num_colors >2)?((float) scales[2]):0.0f; float fscale2 = (num_colors >2)?((float) scales[2]):0.0f;
int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1}; // int [] GridFullWarps = {(num_corr_tiles + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1};
int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1}; // int [] ThreadsFullWarps = {CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1};
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt), Pointer.to(gpu_clt), // float ** gpu_clt,
Pointer.to(new int[] { num_colors }), Pointer.to(new int[] { num_colors }), // int colors, // number of colors (3/1)
Pointer.to(new float[] {fscale0 }), Pointer.to(new float[] {fscale0 }), // float scale0, // scale for R
Pointer.to(new float[] {fscale1 }), Pointer.to(new float[] {fscale1 }), // float scale1, // scale for B
Pointer.to(new float[] {fscale2 }), Pointer.to(new float[] {fscale2 }), // float scale2, // scale for G
Pointer.to(new float[] {(float) fat_zero }), Pointer.to(new float[] {(float) fat_zero }),// float fat_zero, // here - absolute
Pointer.to(new int[] { num_corr_tiles }), // lpf_mask Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(gpu_corr_indices), Pointer.to(new int[] { num_task_tiles }), // int num_tiles // number of tiles in task
Pointer.to(new int[] { corr_stride }), Pointer.to(gpu_corr_indices), // int * gpu_corr_indices, // packed tile+pair
Pointer.to(new int[] { corr_radius }), Pointer.to(gpu_num_corr_tiles), // int * pnum_corr_tiles, // pointer to a number of tiles to process
Pointer.to(gpu_corrs) // lpf_mask Pointer.to(new int[] { corr_stride }), // const size_t corr_stride, // in floats
Pointer.to(new int[] { corr_radius }), // int corr_radius, // radius of the output correlation (7 for 15x15)
Pointer.to(gpu_corrs) // float * gpu_corrs); // correlation output data
); );
cuCtxSynchronize(); cuCtxSynchronize();
// Call the kernel function // Call the kernel function
...@@ -1395,6 +1400,20 @@ public class GPUTileProcessor { ...@@ -1395,6 +1400,20 @@ public class GPUTileProcessor {
} }
return corrs; return corrs;
} }
public int [] getCorrIndices() {
float [] fnum_corrs = new float[1];
cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
int num_corrs = Float.floatToIntBits(fnum_corrs[0]);
float [] fcorr_indices = new float [num_corrs];
cuMemcpyDtoH(Pointer.to(fcorr_indices), gpu_corr_indices, num_corrs * Sizeof.FLOAT);
int [] corr_indices = new int [num_corrs];
for (int i = 0; i < num_corrs; i++) {
corr_indices[i] = Float.floatToIntBits(fcorr_indices[i]);
}
num_corr_tiles = num_corrs;
return corr_indices;
}
/** /**
* Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices. * Get woi and RBGA image from the GPU after execRBGA call as 2/4 slices.
......
...@@ -2078,10 +2078,10 @@ public class TwoQuadCLT { ...@@ -2078,10 +2078,10 @@ public class TwoQuadCLT {
use_aux); // boolean use_aux) use_aux); // boolean use_aux)
int [] corr_indices = gPUTileProcessor.getCorrTasks( // int [] corr_indices = gPUTileProcessor.getCorrTasks(
tp_tasks); // tp_tasks);
// corr_indices array of integers to be passed to GPU // corr_indices array of integers to be passed to GPU
gPUTileProcessor.setCorrIndices(corr_indices); // gPUTileProcessor.setCorrIndices(corr_indices);
int [] texture_indices = gPUTileProcessor.getTextureTasks( int [] texture_indices = gPUTileProcessor.getTextureTasks(
tp_tasks); tp_tasks);
...@@ -2119,7 +2119,7 @@ public class TwoQuadCLT { ...@@ -2119,7 +2119,7 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime(); long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) { for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverDirect(); gPUTileProcessor.execConvertDirect();
} }
// run imclt; // run imclt;
...@@ -2221,6 +2221,7 @@ public class TwoQuadCLT { ...@@ -2221,6 +2221,7 @@ public class TwoQuadCLT {
int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE; int tilesY = GPUTileProcessor.IMG_HEIGHT / GPUTileProcessor.DTT_SIZE;
int [] wh = new int[2]; int [] wh = new int[2];
if (clt_parameters.show_corr) { if (clt_parameters.show_corr) {
int [] corr_indices = gPUTileProcessor.getCorrIndices();
float [][] corr2D = gPUTileProcessor.getCorr2D( float [][] corr2D = gPUTileProcessor.getCorr2D(
clt_parameters.gpu_corr_rad); // int corr_rad); clt_parameters.gpu_corr_rad); // int corr_rad);
// convert to 6-layer image using tasks // convert to 6-layer image using tasks
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment