Commit 1b16c1e5 authored by Andrey Filippov

gpu intra to batch

parent fa5947b6
...@@ -896,7 +896,7 @@ public class GPUTileProcessor { ...@@ -896,7 +896,7 @@ public class GPUTileProcessor {
} }
cuMemcpyHtoD(gpu_tasks, Pointer.to(ftasks), TPTASK_SIZE * num_task_tiles * Sizeof.FLOAT); cuMemcpyHtoD(gpu_tasks, Pointer.to(ftasks), TPTASK_SIZE * num_task_tiles * Sizeof.FLOAT);
} }
/*
public void setCorrIndices(int [] corr_indices) public void setCorrIndices(int [] corr_indices)
{ {
num_corr_tiles = corr_indices.length; num_corr_tiles = corr_indices.length;
...@@ -906,7 +906,6 @@ public class GPUTileProcessor { ...@@ -906,7 +906,6 @@ public class GPUTileProcessor {
} }
cuMemcpyHtoD(gpu_corr_indices, Pointer.to(fcorr_indices), num_corr_tiles * Sizeof.FLOAT); cuMemcpyHtoD(gpu_corr_indices, Pointer.to(fcorr_indices), num_corr_tiles * Sizeof.FLOAT);
} }
public void setTextureIndices(int [] texture_indices) // never used public void setTextureIndices(int [] texture_indices) // never used
{ {
num_texture_tiles = texture_indices.length; num_texture_tiles = texture_indices.length;
...@@ -916,6 +915,7 @@ public class GPUTileProcessor { ...@@ -916,6 +915,7 @@ public class GPUTileProcessor {
} }
cuMemcpyHtoD(gpu_texture_indices, Pointer.to(ftexture_indices), num_texture_tiles * Sizeof.FLOAT); cuMemcpyHtoD(gpu_texture_indices, Pointer.to(ftexture_indices), num_texture_tiles * Sizeof.FLOAT);
} }
*/
public int [] getTextureIndices() public int [] getTextureIndices()
{ {
...@@ -1733,6 +1733,7 @@ public class GPUTileProcessor { ...@@ -1733,6 +1733,7 @@ public class GPUTileProcessor {
*/ */
public void execCorr2D_normalize( public void execCorr2D_normalize(
boolean combo, // normalize combo correlations (false - per-pair ones)
double fat_zero, double fat_zero,
int corr_radius) { int corr_radius) {
if (GPU_CORR2D_NORMALIZE_kernel == null) if (GPU_CORR2D_NORMALIZE_kernel == null)
...@@ -1740,13 +1741,12 @@ public class GPUTileProcessor { ...@@ -1740,13 +1741,12 @@ public class GPUTileProcessor {
IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_NORMALIZE_kernel"); IJ.showMessage("Error", "No GPU kernel: GPU_CORR2D_NORMALIZE_kernel");
return; return;
} }
// float [] fnum_corrs = new float[1];
// cuMemcpyDtoH(Pointer.to(fnum_corrs), gpu_num_corr_tiles, 1 * Sizeof.FLOAT);
// int num_tiles = Float.floatToIntBits(fnum_corrs[0])/num_pairs; // number of correlation tiles calculated
int [] GridFullWarps = {1, 1, 1}; int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1}; int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to( Pointer kernelParameters;
if (combo) {
kernelParameters = Pointer.to(
Pointer.to(new int[] { num_corr_combo_tiles }), // num_task_tiles }), // int num_corr_tiles, // number of correlation tiles to process Pointer.to(new int[] { num_corr_combo_tiles }), // num_task_tiles }), // int num_corr_tiles, // number of correlation tiles to process
Pointer.to(new int[] { corr_stride_combo_td }),// const size_t corr_stride_td, // in floats Pointer.to(new int[] { corr_stride_combo_td }),// const size_t corr_stride_td, // in floats
Pointer.to(gpu_corrs_combo_td), // float * gpu_corrs_combo); // combined correlation output (one per tile) Pointer.to(gpu_corrs_combo_td), // float * gpu_corrs_combo); // combined correlation output (one per tile)
...@@ -1754,6 +1754,16 @@ public class GPUTileProcessor { ...@@ -1754,6 +1754,16 @@ public class GPUTileProcessor {
Pointer.to(gpu_corrs_combo), // float * gpu_corrs, // correlation output data (pixel domain) Pointer.to(gpu_corrs_combo), // float * gpu_corrs, // correlation output data (pixel domain)
Pointer.to(new float[] {(float) fat_zero }), // float fat_zero, // here - absolute Pointer.to(new float[] {(float) fat_zero }), // float fat_zero, // here - absolute
Pointer.to(new int[] { corr_radius })); // int corr_radius, // radius of the output correlation (7 for 15x15) Pointer.to(new int[] { corr_radius })); // int corr_radius, // radius of the output correlation (7 for 15x15)
} else {
kernelParameters = Pointer.to(
Pointer.to(new int[] { num_corr_tiles }), // num_task_tiles }), // int num_corr_tiles, // number of correlation tiles to process
Pointer.to(new int[] { corr_stride_td }),// const size_t corr_stride_td, // in floats
Pointer.to(gpu_corrs_td), // float * gpu_corrs_combo); // combined correlation output (one per tile)
Pointer.to(new int[] { corr_stride }), // const size_t corr_stride, // in floats
Pointer.to(gpu_corrs), // float * gpu_corrs, // correlation output data (pixel domain)
Pointer.to(new float[] {(float) fat_zero }), // float fat_zero, // here - absolute
Pointer.to(new int[] { corr_radius })); // int corr_radius, // radius of the output correlation (7 for 15x15)
}
cuCtxSynchronize(); cuCtxSynchronize();
// Call the kernel function // Call the kernel function
......
...@@ -258,39 +258,6 @@ public class ImageDtt extends ImageDttCPU { ...@@ -258,39 +258,6 @@ public class ImageDtt extends ImageDttCPU {
" debug_tileX="+debug_tileX+" debug_tileY="+debug_tileY+" globalDebugLevel="+globalDebugLevel); " debug_tileX="+debug_tileX+" debug_tileY="+debug_tileY+" globalDebugLevel="+globalDebugLevel);
} }
// TODO: Remove unused
/**
final int [][] zi =
{{ 0, 1, 2, 3},
{-1, 0, -3, 2},
{-2, -3, 0, 1},
{ 3, -2, -1, 0}};
final int [][] corr_pairs ={ // {first, second, rot} rot: 0 - as is, 1 - swap y,x // not used in lwir
{0,1,0},
{2,3,0},
{0,2,1},
{1,3,1}};
final double[][] port_offsets = { // lwir: used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
final int transform_len = transform_size * transform_size;
final double [] filter = doubleGetCltLpfFd(corr_sigma);
*/
// prepare disparity maps and weights
//// final int max_search_radius = (int) Math.abs(max_corr_radius); // use negative max_corr_radius for squares instead of circles?
//// final int max_search_radius_poly = 1;
/**
if (globalDebugLevel > 0){
System.out.println("max_corr_radius= "+max_corr_radius);
System.out.println("max_search_radius= "+max_search_radius);
System.out.println("max_search_radius_poly="+max_search_radius_poly);
System.out.println("gpu_fat_zero= "+gpu_fat_zero);
System.out.println("disparity_array[0][0]= "+disparity_array[0][0]);
}
*/
// add optional initialization of debug layers here // add optional initialization of debug layers here
boolean need_macro = false; boolean need_macro = false;
...@@ -445,17 +412,52 @@ public class ImageDtt extends ImageDttCPU { ...@@ -445,17 +412,52 @@ public class ImageDtt extends ImageDttCPU {
// does it need correlations? // does it need correlations?
if (fneed_corr) { if (fneed_corr) {
//Generate 2D phase correlations from the CLT representation //Generate 2D phase correlations from the CLT representation
/*
gpuQuad.execCorr2D( gpuQuad.execCorr2D(
col_weights, // scales,// double [] scales, col_weights, // scales,// double [] scales,
gpu_fat_zero, // double fat_zero); gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius gpu_corr_rad); // int corr_radius
//Show 2D correlations
// int [] wh = new int[2];
final int [] corr_indices = gpuQuad.getCorrIndices(); final int [] corr_indices = gpuQuad.getCorrIndices();
final float [][] fcorr2D = gpuQuad.getCorr2D( final float [][] fcorr2D = gpuQuad.getCorr2D(gpu_corr_rad); // int corr_rad);
gpu_corr_rad); // int corr_rad); */
gpuQuad.execCorr2D_TD(col_weights); // Get TD version of correlations (may be read out and saved)
final int [] corr_indices = gpuQuad.getCorrIndices();
gpuQuad.execCorr2D_normalize(
false, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final float [][] fcorr2D = gpuQuad.getCorr2D(gpu_corr_rad); // int corr_rad);
// calculate combine quad correlation
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x0f); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
// normalize and convert to pixel domain
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
final int [] corr_quad_indices = gpuQuad.getCorrComboIndices(); // get quad
final float [][] fcorr2D_quad = gpuQuad.getCorr2DCombo(gpu_corr_rad);
// calculate and get cross here
gpuQuad.execCorr2D_combine( // calculate cross pairs
true, // boolean init_corr, // initialize output tiles (false - add to current)
GPUTileProcessor.NUM_PAIRS, // int num_pairs_in, // typically 6 - number of pairs per tile (tile task should have same number per each tile
0x30); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
gpuQuad.execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
gpu_fat_zero, // double fat_zero);
gpu_corr_rad); // int corr_radius
// final int [] corr_cross_indices = gpuQuad.getCorrComboIndices(); // cross indices are the quad
final float [][] fcorr2D_cross = gpuQuad.getCorr2DCombo(gpu_corr_rad);
if (corr_indices.length > 0) { if (corr_indices.length > 0) {
if (true) { /*
if (true) { // debugging only
int [] wh = new int[2]; int [] wh = new int[2];
double [][] dbg_corr = GPUTileProcessor.getCorr2DView( double [][] dbg_corr = GPUTileProcessor.getCorr2DView(
tilesX, tilesX,
...@@ -471,7 +473,7 @@ public class ImageDtt extends ImageDttCPU { ...@@ -471,7 +473,7 @@ public class ImageDtt extends ImageDttCPU {
"dbg-corr2D", // name+"-CORR2D-D"+clt_parameters.disparity, "dbg-corr2D", // name+"-CORR2D-D"+clt_parameters.disparity,
GPUTileProcessor.getCorrTitles()); GPUTileProcessor.getCorrTitles());
} }
*/
final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size final int corr_length = fcorr2D[0].length;// all correlation tiles have the same size
// assuming that the correlation pairs sets are the same for each tile that has correlations // assuming that the correlation pairs sets are the same for each tile that has correlations
...@@ -481,7 +483,7 @@ public class ImageDtt extends ImageDttCPU { ...@@ -481,7 +483,7 @@ public class ImageDtt extends ImageDttCPU {
for (int i = 1; (i < corr_indices.length) && ((corr_indices[i] >> GPUTileProcessor.CORR_NTILE_SHIFT) == nt0) ; i++) { for (int i = 1; (i < corr_indices.length) && ((corr_indices[i] >> GPUTileProcessor.CORR_NTILE_SHIFT) == nt0) ; i++) {
nc0++; nc0++;
} }
final int num_tile_corr = nc0; final int num_tile_corr = nc0; // normally 6
final int num_tiles = corr_indices.length / num_tile_corr; final int num_tiles = corr_indices.length / num_tile_corr;
...@@ -504,7 +506,9 @@ public class ImageDtt extends ImageDttCPU { ...@@ -504,7 +506,9 @@ public class ImageDtt extends ImageDttCPU {
(imgdtt_params.lma_debug_level > 1)); // boolean debug); (imgdtt_params.lma_debug_level > 1)); // boolean debug);
for (int indx_tile = ai.getAndIncrement(); indx_tile < num_tiles; indx_tile = ai.getAndIncrement()) { for (int indx_tile = ai.getAndIncrement(); indx_tile < num_tiles; indx_tile = ai.getAndIncrement()) {
double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15) // double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS][corr_length]; // 225-long (15x15)
// added quad and cross combos
double [][] corrs = new double [GPUTileProcessor.NUM_PAIRS + 2][corr_length]; // 225-long (15x15)
int indx_corr = indx_tile * num_tile_corr; int indx_corr = indx_tile * num_tile_corr;
int nt = (corr_indices[indx_corr] >> GPUTileProcessor.CORR_NTILE_SHIFT); int nt = (corr_indices[indx_corr] >> GPUTileProcessor.CORR_NTILE_SHIFT);
int tileX = nt % tilesX; int tileX = nt % tilesX;
...@@ -520,7 +524,21 @@ public class ImageDtt extends ImageDttCPU { ...@@ -520,7 +524,21 @@ public class ImageDtt extends ImageDttCPU {
} }
indx_corr++; indx_corr++;
} }
// add 2 combo layers
int pair = GPUTileProcessor.NUM_PAIRS; // 6
nt = (corr_quad_indices[indx_tile] >> GPUTileProcessor.CORR_NTILE_SHIFT); // corr_quad_indices - different sequence
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_quad[indx_tile][i]; // from float to double
}
// indices for cross are the same as for quad
pair++;
for (int i = 0; i < corr_length; i++) {
corrs[pair][i] = gpu_corr_scale * fcorr2D_cross[indx_tile][i]; // from float to double
}
// does not include combo
int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks int used_pairs = pair_mask; // imgdtt_params.dbg_pair_mask; //TODO: use tile tasks
int tile_lma_debug_level = ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2; int tile_lma_debug_level = ((tileX == debug_tileX) && (tileY == debug_tileY))? (imgdtt_params.lma_debug_level-1) : -2;
boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1); boolean debugTile =(tileX == debug_tileX) && (tileY == debug_tileY) && (globalDebugLevel > -1);
...@@ -611,8 +629,10 @@ public class ImageDtt extends ImageDttCPU { ...@@ -611,8 +629,10 @@ public class ImageDtt extends ImageDttCPU {
clt_corr_partial[tileY][tileX][0][3] = corrs[3]; // 4 clt_corr_partial[tileY][tileX][0][3] = corrs[3]; // 4
clt_corr_partial[tileY][tileX][1][0] = corrs[4]; // 5 clt_corr_partial[tileY][tileX][1][0] = corrs[4]; // 5
clt_corr_partial[tileY][tileX][1][1] = corrs[5]; // 6 clt_corr_partial[tileY][tileX][1][1] = corrs[5]; // 6
clt_corr_partial[tileY][tileX][1][2] = corr2d.debugStrip(strip_hor); // 7 clt_corr_partial[tileY][tileX][1][2] = corrs[6]; // 5
clt_corr_partial[tileY][tileX][1][3] = corr2d.debugStrip(strip_vert); // 8 clt_corr_partial[tileY][tileX][1][3] = corrs[7]; // 6
// clt_corr_partial[tileY][tileX][1][2] = corr2d.debugStrip(strip_hor); // 7
// clt_corr_partial[tileY][tileX][1][3] = corr2d.debugStrip(strip_vert); // 8
clt_corr_partial[tileY][tileX][2][0] = corr2d.debugStrip(strips[4]); // 9 clt_corr_partial[tileY][tileX][2][0] = corr2d.debugStrip(strips[4]); // 9
clt_corr_partial[tileY][tileX][2][1] = corr2d.debugStrip(strips[5]); // 10 clt_corr_partial[tileY][tileX][2][1] = corr2d.debugStrip(strips[5]); // 10
clt_corr_partial[tileY][tileX][2][2] = corr2d.debugStrip2(strip_hor); // 11 clt_corr_partial[tileY][tileX][2][2] = corr2d.debugStrip2(strip_hor); // 11
......
...@@ -677,6 +677,7 @@ public class QuadCLT extends QuadCLTCPU { ...@@ -677,6 +677,7 @@ public class QuadCLT extends QuadCLTCPU {
0x0f); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) 0x0f); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
quadCLT_main.getGPU().execCorr2D_normalize( quadCLT_main.getGPU().execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
fat_zero, // double fat_zero); fat_zero, // double fat_zero);
clt_parameters.gpu_corr_rad); // int corr_radius clt_parameters.gpu_corr_rad); // int corr_radius
...@@ -803,6 +804,7 @@ public class QuadCLT extends QuadCLTCPU { ...@@ -803,6 +804,7 @@ public class QuadCLT extends QuadCLTCPU {
0x30); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross) 0x30); // int pairs_mask // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
quadCLT_main.getGPU().execCorr2D_normalize( quadCLT_main.getGPU().execCorr2D_normalize(
true, // boolean combo, // normalize combo correlations (false - per-pair ones)
fat_zero, // double fat_zero); fat_zero, // double fat_zero);
clt_parameters.gpu_corr_rad); // int corr_radius clt_parameters.gpu_corr_rad); // int corr_radius
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment