Commit 20df596a authored by Andrey Filippov's avatar Andrey Filippov

changing direct conversion to CDP, handling sparse tasks

parent 0bb31239
...@@ -99,15 +99,14 @@ public class GPUTileProcessor { ...@@ -99,15 +99,14 @@ public class GPUTileProcessor {
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}}; static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all"; static String GPU_CALC_REVERSE_DISTORTION = "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction
// pass some defines to gpu source code with #ifdef JCUDA // pass some defines to gpu source code with #ifdef JCUDA
...@@ -162,14 +161,14 @@ public class GPUTileProcessor { ...@@ -162,14 +161,14 @@ public class GPUTileProcessor {
public boolean kernels_set = false; public boolean kernels_set = false;
public boolean bayer_set = false; public boolean bayer_set = false;
private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null; private CUfunction GPU_CONVERT_DIRECT_kernel = null;
private CUfunction GPU_IMCLT_RBG_kernel = null; private CUfunction GPU_IMCLT_ALL_kernel = null;
private CUfunction GPU_CORRELATE2D_kernel = null; private CUfunction GPU_CORRELATE2D_kernel = null;
private CUfunction GPU_TEXTURES_kernel = null; private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null; private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null; private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction SET_TILES_OFFSETS_kernel = null; private CUfunction GPU_SET_TILES_OFFSETS_kernel = null;
private CUfunction GPU_IMCLT_ALL_kernel = null; private CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;
// CPU arrays of pointers to GPU memory // CPU arrays of pointers to GPU memory
...@@ -202,6 +201,9 @@ public class GPUTileProcessor { ...@@ -202,6 +201,9 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_geometry_correction= new CUdeviceptr(); private CUdeviceptr gpu_geometry_correction= new CUdeviceptr();
private CUdeviceptr gpu_rByRDist= new CUdeviceptr(); // calculated once for the camera distortion model in CPU (move to GPU?) private CUdeviceptr gpu_rByRDist= new CUdeviceptr(); // calculated once for the camera distortion model in CPU (move to GPU?)
private CUdeviceptr gpu_active_tiles = new CUdeviceptr(); // TILESX*TILESY*sizeof(int)
private CUdeviceptr gpu_num_active_tiles = new CUdeviceptr(); // 1 int
CUmodule module; // to access constants memory CUmodule module; // to access constants memory
private int mclt_stride; private int mclt_stride;
private int corr_stride; private int corr_stride;
...@@ -464,36 +466,40 @@ public class GPUTileProcessor { ...@@ -464,36 +466,40 @@ public class GPUTileProcessor {
// Create the kernel functions (first - just test) // Create the kernel functions (first - just test)
String [] func_names = { String [] func_names = {
GPU_CONVERT_CORRECT_TILES_NAME, // GPU_CONVERT_CORRECT_TILES_NAME,
GPU_IMCLT_RBG_NAME, GPU_CONVERT_DIRECT_NAME,
GPU_IMCLT_ALL_NAME,
GPU_CORRELATE2D_NAME, GPU_CORRELATE2D_NAME,
GPU_TEXTURES_NAME, GPU_TEXTURES_NAME,
GPU_RBGA_NAME, GPU_RBGA_NAME,
GPU_ROT_DERIV, GPU_ROT_DERIV,
SET_TILES_OFFSETS, GPU_SET_TILES_OFFSETS,
GPU_IMCLT_ALL_NAME GPU_CALC_REVERSE_DISTORTION
}; };
CUfunction[] functions = createFunctions(kernelSources, CUfunction[] functions = createFunctions(kernelSources,
func_names, func_names,
capability); // on my - 75 capability); // on my - 75
GPU_CONVERT_CORRECT_TILES_kernel = functions[0]; // GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
GPU_IMCLT_RBG_kernel = functions[1]; GPU_CONVERT_DIRECT_kernel = functions[0];
GPU_IMCLT_ALL_kernel = functions[1];
GPU_CORRELATE2D_kernel = functions[2]; GPU_CORRELATE2D_kernel = functions[2];
GPU_TEXTURES_kernel= functions[3]; GPU_TEXTURES_kernel= functions[3];
GPU_RBGA_kernel= functions[4]; GPU_RBGA_kernel= functions[4];
GPU_ROT_DERIV_kernel = functions[5]; GPU_ROT_DERIV_kernel = functions[5];
SET_TILES_OFFSETS_kernel = functions[6]; GPU_SET_TILES_OFFSETS_kernel = functions[6];
GPU_IMCLT_ALL_kernel = functions[7]; GPU_CALC_REVERSE_DISTORTION_kernel = functions[7];
System.out.println("GPU kernel functions initialized"); System.out.println("GPU kernel functions initialized");
System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString()); // System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
System.out.println(GPU_IMCLT_RBG_kernel.toString()); System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
System.out.println(GPU_IMCLT_ALL_kernel.toString());
System.out.println(GPU_CORRELATE2D_kernel.toString()); System.out.println(GPU_CORRELATE2D_kernel.toString());
System.out.println(GPU_TEXTURES_kernel.toString()); System.out.println(GPU_TEXTURES_kernel.toString());
System.out.println(GPU_RBGA_kernel.toString()); System.out.println(GPU_RBGA_kernel.toString());
System.out.println(GPU_ROT_DERIV_kernel.toString()); System.out.println(GPU_ROT_DERIV_kernel.toString());
System.out.println(SET_TILES_OFFSETS_kernel.toString()); System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());
// Init data arrays for all kernels // Init data arrays for all kernels
int tilesX = IMG_WIDTH / DTT_SIZE; int tilesX = IMG_WIDTH / DTT_SIZE;
...@@ -578,8 +584,10 @@ public class GPUTileProcessor { ...@@ -578,8 +584,10 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER); cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
cuMemAlloc(gpu_port_offsets, NUM_CAMS * 2 * Sizeof.POINTER); cuMemAlloc(gpu_port_offsets, NUM_CAMS * 2 * Sizeof.POINTER);
cuMemAlloc(gpu_woi, 4 * Sizeof.POINTER); // may be hidden in device code as a static array? cuMemAlloc(gpu_woi, 4 * Sizeof.FLOAT);
cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.POINTER); // may be hidden in device code as a static array? cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.FLOAT);
cuMemAlloc(gpu_active_tiles, tilesX * tilesY * Sizeof.FLOAT);
cuMemAlloc(gpu_num_active_tiles, 1 * Sizeof.FLOAT);
cuMemAllocPitch ( cuMemAllocPitch (
gpu_corrs, // CUdeviceptr dptr, gpu_corrs, // CUdeviceptr dptr,
...@@ -611,15 +619,18 @@ public class GPUTileProcessor { ...@@ -611,15 +619,18 @@ public class GPUTileProcessor {
} }
public void setGeometryCorrection(GeometryCorrection gc) { public void setGeometryCorrection(GeometryCorrection gc,
boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions
float [] fgc = gc.toFloatArray(); float [] fgc = gc.toFloatArray();
if (use_java_rByRDist) {
double [] rByRDist = gc.getRByRDist(); double [] rByRDist = gc.getRByRDist();
float [] fFByRDist = new float [rByRDist.length]; float [] fFByRDist = new float [rByRDist.length];
for (int i = 0; i < rByRDist.length; i++) { for (int i = 0; i < rByRDist.length; i++) {
fFByRDist[i] = (float) rByRDist[i]; fFByRDist[i] = (float) rByRDist[i];
} }
cuMemcpyHtoD(gpu_geometry_correction, Pointer.to(fgc), fgc.length * Sizeof.FLOAT);
cuMemcpyHtoD(gpu_rByRDist, Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT); cuMemcpyHtoD(gpu_rByRDist, Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
}
cuMemcpyHtoD(gpu_geometry_correction, Pointer.to(fgc), fgc.length * Sizeof.FLOAT);
cuMemAlloc (gpu_rot_deriv, 5 * NUM_CAMS *3 *3 * Sizeof.FLOAT); // NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera cuMemAlloc (gpu_rot_deriv, 5 * NUM_CAMS *3 *3 * Sizeof.FLOAT); // NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
} }
...@@ -1061,11 +1072,32 @@ public class GPUTileProcessor { ...@@ -1061,11 +1072,32 @@ public class GPUTileProcessor {
kernelParameters, null); // Kernel- and extra parameters kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later cuCtxSynchronize(); // remove later
} }
public void execCalcReverseDistortions() {
if (GPU_CALC_REVERSE_DISTORTION_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CALC_REVERSE_DISTORTION_kernel");
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {NUM_CAMS, 1, 1}; // round up
int [] ThreadsFullWarps = {3, 3, 3};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(gpu_rByRDist)); // float * gpu_rByRDist) // length should match RBYRDIST_LEN
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_CALC_REVERSE_DISTORTION_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later
}
public void execSetTilesOffsets() { public void execSetTilesOffsets() {
if (SET_TILES_OFFSETS_kernel == null) if (GPU_SET_TILES_OFFSETS_kernel == null)
{ {
IJ.showMessage("Error", "No GPU kernel: SET_TILES_OFFSETS_kernel"); IJ.showMessage("Error", "No GPU kernel: GPU_SET_TILES_OFFSETS_kernel");
return; return;
} }
// kernel parameters: pointer to pointers // kernel parameters: pointer to pointers
...@@ -1079,7 +1111,7 @@ public class GPUTileProcessor { ...@@ -1079,7 +1111,7 @@ public class GPUTileProcessor {
Pointer.to(gpu_rByRDist), // float * gpu_rByRDist) // length should match RBYRDIST_LEN Pointer.to(gpu_rByRDist), // float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer.to(gpu_rot_deriv)); // trot_deriv * gpu_rot_deriv); Pointer.to(gpu_rot_deriv)); // trot_deriv * gpu_rot_deriv);
cuCtxSynchronize(); cuCtxSynchronize();
cuLaunchKernel(SET_TILES_OFFSETS_kernel, cuLaunchKernel(GPU_SET_TILES_OFFSETS_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code) 0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
...@@ -1087,15 +1119,15 @@ public class GPUTileProcessor { ...@@ -1087,15 +1119,15 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later cuCtxSynchronize(); // remove later
} }
public void execConverCorrectTiles() { public void execConverDirect() {
if (GPU_CONVERT_CORRECT_TILES_kernel == null) if (GPU_CONVERT_DIRECT_kernel == null)
{ {
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_CORRECT_TILES_kernel"); IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
return; return;
} }
// kernel parameters: pointer to pointers // kernel parameters: pointer to pointers
int [] GridFullWarps = {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1}; // round up int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {THREADSX, TILES_PER_BLOCK, 1}; int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_kernel_offsets), Pointer.to(gpu_kernel_offsets),
Pointer.to(gpu_kernels), Pointer.to(gpu_kernels),
...@@ -1109,12 +1141,14 @@ public class GPUTileProcessor { ...@@ -1109,12 +1141,14 @@ public class GPUTileProcessor {
Pointer.to(new int[] { IMG_WIDTH}), // int woi_width, Pointer.to(new int[] { IMG_WIDTH}), // int woi_width,
Pointer.to(new int[] { IMG_HEIGHT}), // int woi_height, Pointer.to(new int[] { IMG_HEIGHT}), // int woi_height,
Pointer.to(new int[] { KERNELS_HOR}), // int kernels_hor, Pointer.to(new int[] { KERNELS_HOR}), // int kernels_hor,
Pointer.to(new int[] { KERNELS_VERT}) // int kernels_vert); Pointer.to(new int[] { KERNELS_VERT}), // int kernels_vert);
Pointer.to(gpu_active_tiles),
Pointer.to(gpu_num_active_tiles)
); );
cuCtxSynchronize(); cuCtxSynchronize();
// Call the kernel function // Call the kernel function
cuLaunchKernel(GPU_CONVERT_CORRECT_TILES_kernel, cuLaunchKernel(GPU_CONVERT_DIRECT_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code) 0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
...@@ -1122,51 +1156,7 @@ public class GPUTileProcessor { ...@@ -1122,51 +1156,7 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later cuCtxSynchronize(); // remove later
} }
public void execImcltRbg(
boolean is_mono
) {
if (GPU_IMCLT_RBG_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
return;
}
int apply_lpf = 1;
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
int tilesy_half = (tilesY + (v_offs ^ 1)) >> 1;
int tilesx_half = (tilesX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
int [] GridFullWarps = {(tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt_h[ncam]),
Pointer.to(gpu_corr_images_h[ncam]),
Pointer.to(new int[] { apply_lpf }),
Pointer.to(new int[] { is_mono ? 1 : NUM_COLORS }), // now - NUM_COLORS
Pointer.to(new int[] { color }),
Pointer.to(new int[] { v_offs }),
Pointer.to(new int[] { h_offs }),
Pointer.to(new int[] { tilesX }),
Pointer.to(new int[] { tilesY }),
Pointer.to(new int[] { imclt_stride }) // lpf_mask
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_IMCLT_RBG_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
}
}
}
}
cuCtxSynchronize();
}
public void execImcltRbgAll( public void execImcltRbgAll(
boolean is_mono boolean is_mono
...@@ -1349,7 +1339,7 @@ public class GPUTileProcessor { ...@@ -1349,7 +1339,7 @@ public class GPUTileProcessor {
Pointer kernelParameters = Pointer.to( Pointer kernelParameters = Pointer.to(
// Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border // Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border
Pointer.to(gpu_texture_indices), // int * woi, - not used Pointer.to(new int[] {0}), // int * woi, - not used
Pointer.to(gpu_clt), Pointer.to(gpu_clt),
Pointer.to(new int[] { num_texture_tiles }), Pointer.to(new int[] { num_texture_tiles }),
Pointer.to(gpu_texture_indices), Pointer.to(gpu_texture_indices),
......
...@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0. ...@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double minDerivative=0.01; double minDerivative=0.01;
int numIterations=1000; int numIterations=1000;
double drDistDr=1.0; double drDistDr=1.0;
// public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
// public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
// public double distortionB=0.0; // r^3
// public double distortionC=0.0; // r^2
boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0); boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0);
double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC; double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC;
double rPrev=0.0; double rPrev=0.0;
......
...@@ -2087,7 +2087,9 @@ public class TwoQuadCLT { ...@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
tp_tasks); tp_tasks);
gPUTileProcessor.setTextureIndices( gPUTileProcessor.setTextureIndices(
texture_indices); texture_indices);
gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once gPUTileProcessor.setGeometryCorrection(
quadCLT_main.getGeometryCorrection(),
false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
// TODO: calculate from the camera geometry? // TODO: calculate from the camera geometry?
...@@ -2101,6 +2103,10 @@ public class TwoQuadCLT { ...@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
int NREPEAT = 1; // 00; int NREPEAT = 1; // 00;
System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------"); System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
long startGPU=System.nanoTime(); long startGPU=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execCalcReverseDistortions();
}
long startRotDerivs=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) { for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execRotDerivs(); gPUTileProcessor.execRotDerivs();
} }
...@@ -2113,13 +2119,12 @@ public class TwoQuadCLT { ...@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime(); long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) { for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverCorrectTiles(); gPUTileProcessor.execConverDirect();
} }
// run imclt; // run imclt;
long startIMCLT=System.nanoTime(); long startIMCLT=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) { for (int i = 0; i < NREPEAT; i++ ) {
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome()); gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
} }
long endImcltTime = System.nanoTime(); long endImcltTime = System.nanoTime();
...@@ -2159,10 +2164,10 @@ public class TwoQuadCLT { ...@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages) clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove); // boolean dust_remove, clt_parameters.dust_remove); // boolean dust_remove,
long endTexturesRBGA = System.nanoTime(); long endTexturesRBGA = System.nanoTime();
long endGPUTime = System.nanoTime(); long endGPUTime = System.nanoTime();
long rotDerivsTime= (startTasksSetup- startGPU) /NREPEAT; long calcReverseTime= (startRotDerivs- startGPU) /NREPEAT;
long rotDerivsTime= (startTasksSetup- startRotDerivs) /NREPEAT;
long tasksSetupTime= (startDirectConvert- startTasksSetup) /NREPEAT; long tasksSetupTime= (startDirectConvert- startTasksSetup) /NREPEAT;
long firstGPUTime= (startIMCLT- startDirectConvert) /NREPEAT; long firstGPUTime= (startIMCLT- startDirectConvert) /NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT; long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT;
...@@ -2171,9 +2176,10 @@ public class TwoQuadCLT { ...@@ -2171,9 +2176,10 @@ public class TwoQuadCLT {
long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA) /NREPEAT; long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA) /NREPEAT;
long runGPUTime = (endGPUTime - startGPU) /NREPEAT; long runGPUTime = (endGPUTime - startGPU) /NREPEAT;
// run corr2d // run corr2d
//RotDerivs
System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------"); System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
System.out.println("GPU run time ="+ (runGPUTime * 1.0e-6)+"ms"); System.out.println("GPU run time ="+ (runGPUTime * 1.0e-6)+"ms");
System.out.println(" - calc reverse dist.: "+(calcReverseTime*1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms"); System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms"); System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms"); System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
......
...@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17. ...@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define KERNELS_STEP (1 << KERNELS_LSTEP) #define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE) #define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE) #define TILESY (IMG_HEIGHT / DTT_SIZE)
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
// Make TILESYA >= TILESX and a multiple of 4 // Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3)) #define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports // increase row length by 1 so vertical passes will use different ports
...@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list( ...@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP #endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles) // indices to gpu_tasks // should be initialized to zero
{
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile >= num_tiles){
return;
}
if (gpu_tasks[num_tile].task != 0) {
active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
}
}
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles) // indices to gpu_tasks
{
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
*pnum_active_tiles = 0;
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, //int num_tiles, // number of tiles in task
gpu_active_tiles, //int * active_tiles, // pointer to the calculated number of non-zero tiles
pnum_active_tiles); //int * pnum_active_tiles) // indices to gpu_tasks // should be initialized to zero
cudaDeviceSynchronize();
// now call actual convert_correct_tiles
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
convert_correct_tiles<<<grid_tp,threads_tp>>>(
gpu_kernel_offsets, // float ** gpu_kernel_offsets, // [NUM_CAMS],
gpu_kernels, // float ** gpu_kernels, // [NUM_CAMS],
gpu_images, // float ** gpu_images, // [NUM_CAMS],
gpu_tasks, // struct tp_task * gpu_tasks, // array of tasks
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*pnum_active_tiles, // int num_active_tiles, // number of tiles in task
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride, // size_t dstride, // in floats (pixels)
lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width, // int woi_width, // varaible to swict between EO and LWIR
woi_height, // int woi_height, // varaible to swict between EO and LWIR
kernels_hor, // int kernels_hor, // varaible to swict between EO and LWIR
kernels_vert); // int kernels_vert); // varaible to swict between EO and LWIR
}
}
#if 0 // trying to keep the same order
extern "C" __global__ void index_direct_init(
int num_chunks, // number of tiles in task
struct convert_direct_tmp* tmp)
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern "C" extern "C" __global__ void build_index_direct(
__global__ void convert_correct_tiles( struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp)
{
__shared__ int num_active;
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS], float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float ** gpu_kernels, // [NUM_CAMS],
...@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles( ...@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels) size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width, int woi_width,
int woi_height, int woi_height,
...@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles( ...@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
{ {
dim3 t = threadIdx; dim3 t = threadIdx;
int tile_in_block = threadIdx.y; int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block; // int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_num >= num_tiles) return; // nothing to do // if (task_num >= num_tiles) return; // nothing to do
int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_indx >= num_active_tiles){
return; // nothing to do
}
int task_num = gpu_active_tiles[task_indx];
struct tp_task * gpu_task = &gpu_tasks[task_num]; struct tp_task * gpu_task = &gpu_tasks[task_num];
if (!gpu_task->task) return; // NOP tile if (!gpu_task->task) return; // NOP tile
__shared__ struct tp_task tt [TILES_PER_BLOCK]; __shared__ struct tp_task tt [TILES_PER_BLOCK];
...@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles( ...@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
woi_height, // int woi_height, woi_height, // int woi_height,
kernels_hor, // int kernels_hor, kernels_hor, // int kernels_hor,
kernels_vert); //int kernels_vert) kernels_vert); //int kernels_vert)
__syncthreads();
__syncthreads();// __syncwarp();
} }
} }
} }
...@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate( ...@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
int tileX = tile_num - tileY * TILESX; int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4 int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4 int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int height = *(woi + 3) << DTT_SIZE_LOG2; /// int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12 #ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){ if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
......
...@@ -41,9 +41,14 @@ ...@@ -41,9 +41,14 @@
#include "tp_defines.h" #include "tp_defines.h"
#endif #endif
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero
extern "C" extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
__global__ void convert_correct_tiles( // struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS], float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS], float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS], float ** gpu_images, // [NUM_CAMS],
...@@ -51,6 +56,24 @@ __global__ void convert_correct_tiles( ...@@ -51,6 +56,24 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE] float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels) size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 ! int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width, int woi_width,
int woi_height, int woi_height,
......
...@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv); ...@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
inline __device__ float getRByRDist(float rDist, inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}... __constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth { // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}}, {{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
...@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={ ...@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
{-1,-1,-1} // do nothing {-1,-1,-1} // do nothing
}}; }};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__ int offset_rots = 0; //0 __constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next __constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11 __constant__ int offset_matrices = 5; // 5..11
...@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets( ...@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
} }
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist)
{
//int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
double minDerivative=0.01;
int numIterations=1000;
double drDistDr=1.0;
double d=1.0
-geometry_correction -> distortionA8
-geometry_correction -> distortionA7
-geometry_correction -> distortionA6
-geometry_correction -> distortionA5
-geometry_correction -> distortionA
-geometry_correction -> distortionB
-geometry_correction -> distortionC;
double rPrev=0.0;
int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
for (int p = 0; p < num_points; p ++){
int i = indx * num_points +p;
if (i >= RBYRDIST_LEN){
return;
}
if (i == 0){
rByRDist[0]= (float) 1.0/d;
break;
}
double rDist = RBYRDIST_STEP * i;
double r = (p == 0) ? rDist : rPrev;
for (int iteration=0;iteration<numIterations;iteration++){
double k=(((((((
geometry_correction -> distortionA8) * r +
geometry_correction -> distortionA7) * r +
geometry_correction -> distortionA6) * r +
geometry_correction -> distortionA5) * r +
geometry_correction -> distortionA) * r +
geometry_correction -> distortionB) * r +
geometry_correction -> distortionC) * r + d;
drDistDr=(((((((
8 * geometry_correction -> distortionA8) * r +
7 * geometry_correction -> distortionA7) * r +
6 * geometry_correction -> distortionA6) * r +
5 * geometry_correction -> distortionA5) * r +
4 * geometry_correction -> distortionA) * r +
3 * geometry_correction -> distortionB) * r+
2 * geometry_correction -> distortionC) * r+d;
if (drDistDr<minDerivative) { // folds backwards !
return; // too high distortion
}
double rD=r*k;
if (fabs(rD-rDist)<delta){
break;
}
r+=(rDist-rD)/drDistDr;
}
rPrev=r;
rByRDist[i]= (float) r/rDist;
}
}
/** /**
* Calculate non-distorted radius from distorted using table approximation * Calculate non-distorted radius from distorted using table approximation
......
...@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets( ...@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
float * gpu_rByRDist, // length should match RBYRDIST_LEN float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv); trot_deriv * gpu_rot_deriv);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads // uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv( extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector, struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv); trot_deriv * gpu_rot_deriv);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment