Commit 20df596a authored by Andrey Filippov's avatar Andrey Filippov

Changed direct conversion to launch via CUDA Dynamic Parallelism (CDP); added handling of sparse task lists (only non-empty tasks are processed)

parent 0bb31239
......@@ -99,15 +99,14 @@ public class GPUTileProcessor {
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code
static String GPU_CONVERT_DIRECT_NAME = "convert_direct"; // name in C code
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_CORRELATE2D_NAME = "correlate2D"; // name in C code
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_IMCLT_ALL_NAME = "imclt_rbg_all";
static String GPU_SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
static String GPU_CALC_REVERSE_DISTORTION = "calcReverseDistortionTable"; // calculate reverse radial distortion table from gpu_geometry_correction
// pass some defines to gpu source code with #ifdef JCUDA
......@@ -162,14 +161,14 @@ public class GPUTileProcessor {
public boolean kernels_set = false;
public boolean bayer_set = false;
private CUfunction GPU_CONVERT_CORRECT_TILES_kernel = null;
private CUfunction GPU_IMCLT_RBG_kernel = null;
private CUfunction GPU_CORRELATE2D_kernel = null;
private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction SET_TILES_OFFSETS_kernel = null;
private CUfunction GPU_IMCLT_ALL_kernel = null;
private CUfunction GPU_CONVERT_DIRECT_kernel = null;
private CUfunction GPU_IMCLT_ALL_kernel = null;
private CUfunction GPU_CORRELATE2D_kernel = null;
private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction GPU_SET_TILES_OFFSETS_kernel = null;
private CUfunction GPU_CALC_REVERSE_DISTORTION_kernel = null;
// CPU arrays of pointers to GPU memory
......@@ -202,6 +201,9 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_geometry_correction= new CUdeviceptr();
private CUdeviceptr gpu_rByRDist= new CUdeviceptr(); // calculated once for the camera distortion model in CPU (move to GPU?)
private CUdeviceptr gpu_active_tiles = new CUdeviceptr(); // TILESX*TILESY*sizeof(int)
private CUdeviceptr gpu_num_active_tiles = new CUdeviceptr(); // 1 int
CUmodule module; // to access constants memory
private int mclt_stride;
private int corr_stride;
......@@ -464,36 +466,40 @@ public class GPUTileProcessor {
// Create the kernel functions (first - just test)
String [] func_names = {
GPU_CONVERT_CORRECT_TILES_NAME,
GPU_IMCLT_RBG_NAME,
// GPU_CONVERT_CORRECT_TILES_NAME,
GPU_CONVERT_DIRECT_NAME,
GPU_IMCLT_ALL_NAME,
GPU_CORRELATE2D_NAME,
GPU_TEXTURES_NAME,
GPU_RBGA_NAME,
GPU_ROT_DERIV,
SET_TILES_OFFSETS,
GPU_IMCLT_ALL_NAME
GPU_SET_TILES_OFFSETS,
GPU_CALC_REVERSE_DISTORTION
};
CUfunction[] functions = createFunctions(kernelSources,
func_names,
capability); // on my - 75
GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
GPU_IMCLT_RBG_kernel = functions[1];
GPU_CORRELATE2D_kernel = functions[2];
GPU_TEXTURES_kernel= functions[3];
GPU_RBGA_kernel= functions[4];
GPU_ROT_DERIV_kernel = functions[5];
SET_TILES_OFFSETS_kernel = functions[6];
GPU_IMCLT_ALL_kernel = functions[7];
// GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
GPU_CONVERT_DIRECT_kernel = functions[0];
GPU_IMCLT_ALL_kernel = functions[1];
GPU_CORRELATE2D_kernel = functions[2];
GPU_TEXTURES_kernel= functions[3];
GPU_RBGA_kernel= functions[4];
GPU_ROT_DERIV_kernel = functions[5];
GPU_SET_TILES_OFFSETS_kernel = functions[6];
GPU_CALC_REVERSE_DISTORTION_kernel = functions[7];
System.out.println("GPU kernel functions initialized");
System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
System.out.println(GPU_IMCLT_RBG_kernel.toString());
// System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
System.out.println(GPU_CONVERT_DIRECT_kernel.toString());
System.out.println(GPU_IMCLT_ALL_kernel.toString());
System.out.println(GPU_CORRELATE2D_kernel.toString());
System.out.println(GPU_TEXTURES_kernel.toString());
System.out.println(GPU_RBGA_kernel.toString());
System.out.println(GPU_ROT_DERIV_kernel.toString());
System.out.println(SET_TILES_OFFSETS_kernel.toString());
System.out.println(GPU_SET_TILES_OFFSETS_kernel.toString());
System.out.println(GPU_CALC_REVERSE_DISTORTION_kernel.toString());
// Init data arrays for all kernels
int tilesX = IMG_WIDTH / DTT_SIZE;
......@@ -578,8 +584,10 @@ public class GPUTileProcessor {
cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
cuMemAlloc(gpu_port_offsets, NUM_CAMS * 2 * Sizeof.POINTER);
cuMemAlloc(gpu_woi, 4 * Sizeof.POINTER); // may be hidden in device code as a static array?
cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.POINTER); // may be hidden in device code as a static array?
cuMemAlloc(gpu_woi, 4 * Sizeof.FLOAT);
cuMemAlloc(gpu_num_texture_tiles, 8 * Sizeof.FLOAT);
cuMemAlloc(gpu_active_tiles, tilesX * tilesY * Sizeof.FLOAT);
cuMemAlloc(gpu_num_active_tiles, 1 * Sizeof.FLOAT);
cuMemAllocPitch (
gpu_corrs, // CUdeviceptr dptr,
......@@ -611,15 +619,18 @@ public class GPUTileProcessor {
}
public void setGeometryCorrection(GeometryCorrection gc) {
public void setGeometryCorrection(GeometryCorrection gc,
boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions
float [] fgc = gc.toFloatArray();
double [] rByRDist = gc.getRByRDist();
float [] fFByRDist = new float [rByRDist.length];
for (int i = 0; i < rByRDist.length; i++) {
fFByRDist[i] = (float) rByRDist[i];
if (use_java_rByRDist) {
double [] rByRDist = gc.getRByRDist();
float [] fFByRDist = new float [rByRDist.length];
for (int i = 0; i < rByRDist.length; i++) {
fFByRDist[i] = (float) rByRDist[i];
}
cuMemcpyHtoD(gpu_rByRDist, Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
}
cuMemcpyHtoD(gpu_geometry_correction, Pointer.to(fgc), fgc.length * Sizeof.FLOAT);
cuMemcpyHtoD(gpu_rByRDist, Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
cuMemAlloc (gpu_rot_deriv, 5 * NUM_CAMS *3 *3 * Sizeof.FLOAT); // NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
}
......@@ -1061,11 +1072,32 @@ public class GPUTileProcessor {
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later
}
/**
 * Launch the GPU kernel that computes the reverse radial distortion table
 * (rByRDist) on the device from the geometry correction data already copied
 * to gpu_geometry_correction. One grid block per camera with a 3x3x3 thread
 * block, as the kernel expects. Shows an ImageJ error dialog and returns
 * without launching if the kernel function was not initialized.
 */
public void execCalcReverseDistortions() {
	if (GPU_CALC_REVERSE_DISTORTION_kernel == null) {
		IJ.showMessage("Error", "No GPU kernel: GPU_CALC_REVERSE_DISTORTION_kernel");
		return;
	}
	// One block per camera; 3x3x3 threads per block (layout defined by the kernel)
	final int [] grid    = {NUM_CAMS, 1, 1};
	final int [] threads = {3, 3, 3};
	// Kernel arguments: device pointer to the geometry correction structure and
	// device pointer to the output rByRDist table (length should match RBYRDIST_LEN)
	Pointer kernel_args = Pointer.to(
			Pointer.to(gpu_geometry_correction),
			Pointer.to(gpu_rByRDist));
	cuCtxSynchronize();
	cuLaunchKernel(GPU_CALC_REVERSE_DISTORTION_kernel,
			grid[0],    grid[1],    grid[2],      // grid dimensions
			threads[0], threads[1], threads[2],   // block dimensions
			0, null,            // no dynamic shared memory, default stream
			kernel_args, null); // kernel parameters, no extra parameters
	cuCtxSynchronize(); // TODO: remove once no longer needed for debugging
}
public void execSetTilesOffsets() {
if (SET_TILES_OFFSETS_kernel == null)
if (GPU_SET_TILES_OFFSETS_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: SET_TILES_OFFSETS_kernel");
IJ.showMessage("Error", "No GPU kernel: GPU_SET_TILES_OFFSETS_kernel");
return;
}
// kernel parameters: pointer to pointers
......@@ -1079,7 +1111,7 @@ public class GPUTileProcessor {
Pointer.to(gpu_rByRDist), // float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer.to(gpu_rot_deriv)); // trot_deriv * gpu_rot_deriv);
cuCtxSynchronize();
cuLaunchKernel(SET_TILES_OFFSETS_kernel,
cuLaunchKernel(GPU_SET_TILES_OFFSETS_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
......@@ -1087,34 +1119,36 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later
}
public void execConverCorrectTiles() {
if (GPU_CONVERT_CORRECT_TILES_kernel == null)
public void execConverDirect() {
if (GPU_CONVERT_DIRECT_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_CORRECT_TILES_kernel");
IJ.showMessage("Error", "No GPU kernel: GPU_CONVERT_DIRECT_kernel");
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {(num_task_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1, 1}; // round up
int [] ThreadsFullWarps = {THREADSX, TILES_PER_BLOCK, 1};
int [] GridFullWarps = {1, 1, 1};
int [] ThreadsFullWarps = {1, 1, 1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_kernel_offsets),
Pointer.to(gpu_kernels),
Pointer.to(gpu_bayer),
Pointer.to(gpu_tasks),
Pointer.to(gpu_clt),
Pointer.to(new int[] { mclt_stride }),
Pointer.to(new int[] { num_task_tiles }),
// move lpf to 4-image generator kernel - DONE
Pointer.to(new int[] { 0 }), // lpf_mask
Pointer.to(new int[] { IMG_WIDTH}), // int woi_width,
Pointer.to(new int[] { IMG_HEIGHT}), // int woi_height,
Pointer.to(new int[] { KERNELS_HOR}), // int kernels_hor,
Pointer.to(new int[] { KERNELS_VERT}) // int kernels_vert);
);
Pointer.to(gpu_kernel_offsets),
Pointer.to(gpu_kernels),
Pointer.to(gpu_bayer),
Pointer.to(gpu_tasks),
Pointer.to(gpu_clt),
Pointer.to(new int[] { mclt_stride }),
Pointer.to(new int[] { num_task_tiles }),
// move lpf to 4-image generator kernel - DONE
Pointer.to(new int[] { 0 }), // lpf_mask
Pointer.to(new int[] { IMG_WIDTH}), // int woi_width,
Pointer.to(new int[] { IMG_HEIGHT}), // int woi_height,
Pointer.to(new int[] { KERNELS_HOR}), // int kernels_hor,
Pointer.to(new int[] { KERNELS_VERT}), // int kernels_vert);
Pointer.to(gpu_active_tiles),
Pointer.to(gpu_num_active_tiles)
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_CONVERT_CORRECT_TILES_kernel,
cuLaunchKernel(GPU_CONVERT_DIRECT_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
......@@ -1122,51 +1156,7 @@ public class GPUTileProcessor {
cuCtxSynchronize(); // remove later
}
public void execImcltRbg(
boolean is_mono
) {
if (GPU_IMCLT_RBG_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_IMCLT_RBG_kernel");
return;
}
int apply_lpf = 1;
int tilesX = IMG_WIDTH / DTT_SIZE;
int tilesY = IMG_HEIGHT / DTT_SIZE;
int [] ThreadsFullWarps = {IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1};
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
int tilesy_half = (tilesY + (v_offs ^ 1)) >> 1;
int tilesx_half = (tilesX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
int [] GridFullWarps = {(tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_clt_h[ncam]),
Pointer.to(gpu_corr_images_h[ncam]),
Pointer.to(new int[] { apply_lpf }),
Pointer.to(new int[] { is_mono ? 1 : NUM_COLORS }), // now - NUM_COLORS
Pointer.to(new int[] { color }),
Pointer.to(new int[] { v_offs }),
Pointer.to(new int[] { h_offs }),
Pointer.to(new int[] { tilesX }),
Pointer.to(new int[] { tilesY }),
Pointer.to(new int[] { imclt_stride }) // lpf_mask
);
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_IMCLT_RBG_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
}
}
}
}
cuCtxSynchronize();
}
public void execImcltRbgAll(
boolean is_mono
......@@ -1349,7 +1339,7 @@ public class GPUTileProcessor {
Pointer kernelParameters = Pointer.to(
// Pointer.to(new int[] {0}), // 0, // int border_tile, // if 1 - watch for border
Pointer.to(gpu_texture_indices), // int * woi, - not used
Pointer.to(new int[] {0}), // int * woi, - not used
Pointer.to(gpu_clt),
Pointer.to(new int[] { num_texture_tiles }),
Pointer.to(gpu_texture_indices),
......
......@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double minDerivative=0.01;
int numIterations=1000;
double drDistDr=1.0;
// public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
// public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
// public double distortionB=0.0; // r^3
// public double distortionC=0.0; // r^2
boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0);
double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC;
double rPrev=0.0;
......
......@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
tp_tasks);
gPUTileProcessor.setTextureIndices(
texture_indices);
gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once
gPUTileProcessor.setGeometryCorrection(
quadCLT_main.getGeometryCorrection(),
false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
// TODO: calculate from the camera geometry?
......@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
int NREPEAT = 1; // 00;
System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
long startGPU=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execCalcReverseDistortions();
}
long startRotDerivs=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execRotDerivs();
}
......@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverCorrectTiles();
gPUTileProcessor.execConverDirect();
}
// run imclt;
long startIMCLT=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
}
long endImcltTime = System.nanoTime();
......@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove); // boolean dust_remove,
long endTexturesRBGA = System.nanoTime();
long endGPUTime = System.nanoTime();
long rotDerivsTime= (startTasksSetup- startGPU) /NREPEAT;
long calcReverseTime= (startRotDerivs- startGPU) /NREPEAT;
long rotDerivsTime= (startTasksSetup- startRotDerivs) /NREPEAT;
long tasksSetupTime= (startDirectConvert- startTasksSetup) /NREPEAT;
long firstGPUTime= (startIMCLT- startDirectConvert) /NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT;
......@@ -2171,16 +2176,17 @@ public class TwoQuadCLT {
long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA) /NREPEAT;
long runGPUTime = (endGPUTime - startGPU) /NREPEAT;
// run corr2d
//RotDerivs
System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
System.out.println("GPU run time ="+ (runGPUTime * 1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
System.out.println(" - imclt: "+(runImcltTime*1.0e-6)+"ms");
System.out.println(" - corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
System.out.println(" - textures: "+(runTexturesTime*1.0e-6)+"ms");
System.out.println(" - RGBA: "+(runTexturesRBGATime*1.0e-6)+"ms");
System.out.println(" - calc reverse dist.: "+(calcReverseTime*1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
System.out.println(" - imclt: "+(runImcltTime*1.0e-6)+"ms");
System.out.println(" - corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
System.out.println(" - textures: "+(runTexturesTime*1.0e-6)+"ms");
System.out.println(" - RGBA: "+(runTexturesRBGATime*1.0e-6)+"ms");
// get data back from GPU
float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
......
......@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
#define KERNELS_STEP (1 << KERNELS_LSTEP)
#define TILESX (IMG_WIDTH / DTT_SIZE)
#define TILESY (IMG_HEIGHT / DTT_SIZE)
#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
......@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
#endif //#ifdef USE_CDP
//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
//#define CONVERT_DIRECT_NUM_CHUNKS ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
//__global__ int num_active_tiles;
//__global__ int active_tiles [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
// not maintaining order of the tiles to be processed
/**
 * Compact the task list: append the index of every non-empty task
 * (gpu_tasks[i].task != 0) to active_tiles, counting with an atomic.
 * Output order is whatever the atomicAdd race yields, NOT tile order.
 * Launch with one thread per tile; *num_active_tiles must be zeroed
 * by the caller before launch.
 */
extern "C" __global__ void index_direct(
		struct tp_task * gpu_tasks,        // array of tasks
		int              num_tiles,        // number of tiles in task
		int            * active_tiles,     // out: indices into gpu_tasks of non-empty tasks
		int            * num_active_tiles) // in/out: count of entries written (zero on entry)
{
	int tile = blockIdx.x * blockDim.x + threadIdx.x;
	// Bounds guard and emptiness test combined: only in-range, non-empty
	// tasks claim an output slot.
	if ((tile < num_tiles) && (gpu_tasks[tile].task != 0)) {
		int dst = atomicAdd(num_active_tiles, 1);
		active_tiles[dst] = tile;
	}
}
/**
 * Top-level CDP kernel: compacts the task list and then converts only the
 * non-empty tiles. Launched from the host with a single block of
 * CONVERT_DIRECT_INDEXING_THREADS threads; only thread 0 does any work
 * (it zeroes the counter, launches child kernel index_direct to build the
 * compacted index list, synchronizes, then launches convert_correct_tiles
 * over just the active tiles). Requires CUDA Dynamic Parallelism.
 */
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // out: list of indices (into gpu_tasks) of non-empty tasks
int * pnum_active_tiles) // out: number of entries written to gpu_active_tiles
{
// Child launch geometry for index_direct: one thread per tile, rounded up
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
// Counter must be zero before index_direct atomically increments it
*pnum_active_tiles = 0;
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, //int num_tiles, // number of tiles in task
gpu_active_tiles, //int * active_tiles, // out: compacted list of non-empty task indices
pnum_active_tiles); //int * pnum_active_tiles) // out: count of active tiles (zeroed above)
// Must complete index_direct before *pnum_active_tiles is read below
cudaDeviceSynchronize();
// now call actual convert_correct_tiles, sized by the number of active tiles
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
convert_correct_tiles<<<grid_tp,threads_tp>>>(
gpu_kernel_offsets, // float ** gpu_kernel_offsets, // [NUM_CAMS],
gpu_kernels, // float ** gpu_kernels, // [NUM_CAMS],
gpu_images, // float ** gpu_images, // [NUM_CAMS],
gpu_tasks, // struct tp_task * gpu_tasks, // array of tasks
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
*pnum_active_tiles, // int num_active_tiles, // number of tiles in task
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride, // size_t dstride, // in floats (pixels)
lpf_mask, // int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
woi_width, // int woi_width, // varaible to swict between EO and LWIR
woi_height, // int woi_height, // varaible to swict between EO and LWIR
kernels_hor, // int kernels_hor, // varaible to swict between EO and LWIR
kernels_vert); // int kernels_vert); // varaible to swict between EO and LWIR
}
}
#if 0 // trying to keep the same order
/**
 * Disabled (#if 0) order-preserving variant, part 1: zero the per-chunk
 * active-tile counters in the temporary structure. Launched with one thread
 * per counter. The inclusive '<=' bound clears num_chunks+1 entries --
 * presumably num_acive_per_chunk[] has one extra element for the cumulative
 * scan (see the commented-out declaration above); TODO confirm before enabling.
 */
extern "C" __global__ void index_direct_init(
int num_chunks, // number of chunks (num_chunks+1 counters are cleared)
struct convert_direct_tmp* tmp) // temporary storage holding num_acive_per_chunk[]
{
int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
if (chunk_index <= num_chunks){ // inclusive: clears the extra counter element too
tmp->num_acive_per_chunk[chunk_index] = 0;
}
}
/**
 * Disabled (#if 0) order-preserving variant, part 2: count the non-empty
 * tasks in each chunk of the task list, writing the per-block count into
 * num_acive_per_chunk[chunk+1] (element [0] is reserved for the scan base).
 * Presumably launched with blockDim.x == CONVERT_DIRECT_INDEXING_THREADS so
 * each block maps to exactly one chunk -- TODO confirm at the call site.
 */
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp) // per-chunk counters, cleared by index_direct_init
{
__shared__ int num_active; // count of non-empty tasks seen by this block
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
// NOTE(review): this store is executed by every thread, including threads
// with num_tile >= num_tiles -- all threads of a block compute the same
// chunk index only if blockDim.x equals the chunk size; verify the index
// stays within num_acive_per_chunk[] bounds for the tail block.
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
extern "C"
__global__ void convert_correct_tiles(
/**
 * Disabled (#if 0). NOTE(review): this body is an exact duplicate of the
 * disabled index_direct defined just above -- it looks like an unfinished
 * copy/rename (the name suggests it was meant to build the ordered index
 * from the per-chunk counts); confirm the intent before enabling.
 */
extern "C" __global__ void build_index_direct(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct convert_direct_tmp* tmp) // per-chunk counters
{
__shared__ int num_active; // count of non-empty tasks seen by this block
if (threadIdx.x == 0) {
num_active = 0;
}
__syncthreads();
int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
if (num_tile < num_tiles) {
if (gpu_tasks[num_tile].task){
atomicAdd(&num_active, 1);
}
}
__syncthreads();
// NOTE(review): executed by every thread, including out-of-range ones --
// same tail-bounds concern as in the disabled index_direct above.
tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
}
/**
* Top level to call other kernel with CDP
*/
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
......@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * num_active_tiles,
struct convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
{
int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
dim3 blocks0 (num_chunk_blocks,1, 1);
__shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
cudaDeviceSynchronize(); // not needed yet, just for testing
index_direct<<<blocks0,threads0>>>(
gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles)
tmp);
cudaDeviceSynchronize(); // not needed yet, just for testing
// single-threaded - make cumulative
// tmp-> num_acive_per_chunk2[0] = 0;
superchunks[0] = 0;
}
__syncthreads();
// calculate cumulative in 3 steps
//1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
for (int pass = 0; pass < num_passes; pass++){
int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
if (num_cluster2 <= num_chunk_blocks){
superchunks[threadIdx.x+1] = superchunks[0];
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
if (indx <= num_chunks) {
superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
}
}
}
__syncthreads();
// make superchunks cumulative (single-threaded
if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
superchunks[i + 1] += superchunks[i];
}
}
__syncthreads();
// now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
if (num_cluster2 <= num_chunk_blocks){
int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
int prev = tmp -> num_acive_per_chunk[indx++];
if (indx <= num_chunks) {
tmp -> num_acive_per_chunk[indx] += prev;
}
}
}
__syncthreads();
}
}
#endif
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
......@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
{
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_num >= num_tiles) return; // nothing to do
// int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
// if (task_num >= num_tiles) return; // nothing to do
int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
if (task_indx >= num_active_tiles){
return; // nothing to do
}
int task_num = gpu_active_tiles[task_indx];
struct tp_task * gpu_task = &gpu_tasks[task_num];
if (!gpu_task->task) return; // NOP tile
__shared__ struct tp_task tt [TILES_PER_BLOCK];
......@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert); //int kernels_vert)
__syncthreads();// __syncwarp();
__syncthreads();
}
}
}
......@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; // - (DTT_SIZE/2); // may be negative == -4
int height = *(woi + 3) << DTT_SIZE_LOG2;
/// int height = *(woi + 3) << DTT_SIZE_LOG2;
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
......
......@@ -41,21 +41,44 @@
#include "tp_defines.h"
#endif
extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void clear_texture_list(
......
......@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
......@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
{-1,-1,-1} // do nothing
}};
#if 0
// NOTE(review): this whole section is compiled out (#if 0) - legacy kernel,
// presumably superseded by calc_rot_deriv(); kept for reference only.
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
// Compute per-camera composite rotation matrices (roll * (tilt * azimuth),
// with roll scaled by (1 + zoom)) from the extrinsic correction vector and
// store them in the device-global rot_matrices[].
// Expected launch: a single block of (3,3,NUM_CAMS) threads - threadIdx.x/.y
// index the 3x3 matrix element, threadIdx.z selects the camera.
// NOTE: the local array 'sincos' shadows CUDA's sincos() math function.
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
// Flatten (x,y) into an angle selector: nangle picks azimuth/tilt/roll/zoom,
// the low bit picks the sin (1) or cos (0) entry.
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
// NOTE(review): the format string below says "get_tiles_offsets()" - stale
// copy-paste in a debug message (left unchanged: string is runtime output).
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
// Select the source angle array for this nangle (azimuth/tilt/roll/zoom).
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
// Last camera's azimuth/tilt/zoom is the negated sum of the others
// (the correction vector stores only NUM_CAMS-1 independent values).
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
// cos(a) is computed as sin(a + pi/2) so both lanes can share sinf().
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// Fold zoom into the roll sin/cos so the roll matrix also scales by (1+zoom).
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
// Each thread builds one element of each of the 3 axis rotation matrices
// from the {cos, sin, const} template.
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
// Slot indices into the rotation/derivative output structure - presumably
// offsets (in 3x3-matrix units) within trot_deriv as filled by
// calc_rot_deriv(): slot 0 = rotation, 1..4 = derivatives, 5..11 = scratch
// matrices. TODO confirm against calc_rot_deriv()/trot_deriv definition.
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
......@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
}
/**
 * Build the reverse radial-distortion look-up table rByRDist[] (the ratio
 * r/rDist as a function of the distorted radius rDist) by Newton-Raphson
 * inversion of the forward distortion polynomial stored in the GPU geometry
 * correction structure.
 *
 * Launch: total thread count across the grid must equal
 * CALC_REVERSE_TABLE_BLOCK_THREADS (same blocks/threads as calc_rot_deriv(),
 * per the header comment). Each thread fills a contiguous chunk of
 * num_points consecutive table entries, warm-starting each Newton solve from
 * the previous entry's solution.
 *
 * @param geometry_correction  device pointer to struct gc with the distortion
 *                             polynomial coefficients (A8..A5, A, B, C)
 * @param rByRDist             output table of RBYRDIST_LEN floats, sampled
 *                             with step RBYRDIST_STEP in rDist
 */
extern "C" __global__ void calcReverseDistortionTable(
		struct gc * geometry_correction,
		float * rByRDist)
{
	//int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
	// Flat thread index over the whole launch.
	int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
	// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
	// double delta=1E-4; //rByRDist error = 0.000072
	double delta = 1E-10;          // Newton convergence threshold on |rD - rDist|; 0.730000 ms
	double minDerivative = 0.01;   // below this d(rDist)/d(r) the polynomial folds back - give up
	int numIterations = 1000;      // hard cap on Newton iterations per entry
	double drDistDr = 1.0;
	// Linear-term coefficient: coefficients are normalized so they sum to 1.0.
	double d = 1.0
			- geometry_correction -> distortionA8
			- geometry_correction -> distortionA7
			- geometry_correction -> distortionA6
			- geometry_correction -> distortionA5
			- geometry_correction -> distortionA
			- geometry_correction -> distortionB
			- geometry_correction -> distortionC;
	double rPrev = 0.0;
	// Each thread owns entries [indx*num_points, (indx+1)*num_points).
	int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
	for (int p = 0; p < num_points; p ++){
		int i = indx * num_points + p;
		if (i >= RBYRDIST_LEN){
			return;
		}
		if (i == 0){
			rByRDist[0] = (float) 1.0/d; // limit of r/rDist as rDist -> 0
			// BUG FIX: was 'break', which terminated thread 0's whole loop and
			// left entries 1..num_points-1 uninitialized (no other thread
			// covers them - thread 1 starts at i == num_points). 'continue'
			// keeps filling the rest of this thread's chunk.
			continue;
		}
		double rDist = RBYRDIST_STEP * i;
		// Warm start: first entry of a chunk starts at rDist itself, later
		// entries reuse the previous converged r.
		double r = (p == 0) ? rDist : rPrev;
		for (int iteration = 0; iteration < numIterations; iteration++){
			// k = rDist/r evaluated as a polynomial of r (Horner form).
			double k = (((((((
					geometry_correction -> distortionA8) * r +
					geometry_correction -> distortionA7) * r +
					geometry_correction -> distortionA6) * r +
					geometry_correction -> distortionA5) * r +
					geometry_correction -> distortionA) * r +
					geometry_correction -> distortionB) * r +
					geometry_correction -> distortionC) * r + d;
			// drDistDr = d(rDist)/d(r), derivative of r*k.
			drDistDr = (((((((
					8 * geometry_correction -> distortionA8) * r +
					7 * geometry_correction -> distortionA7) * r +
					6 * geometry_correction -> distortionA6) * r +
					5 * geometry_correction -> distortionA5) * r +
					4 * geometry_correction -> distortionA) * r +
					3 * geometry_correction -> distortionB) * r +
					2 * geometry_correction -> distortionC) * r + d;
			if (drDistDr < minDerivative) { // folds backwards !
				return; // too high distortion
			}
			double rD = r * k;
			if (fabs(rD - rDist) < delta){
				break; // converged for this entry
			}
			r += (rDist - rD) / drDistDr; // Newton step
		}
		rPrev = r;
		rByRDist[i] = (float) r/rDist;
	}
}
/**
* Calculate non-distorted radius from distorted using table approximation
......
......@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv);
#if 0
// Disabled legacy prototype - implementation is also under #if 0 in the .cu file.
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// Calculate per-camera rotation matrices and their derivatives into gpu_rot_deriv.
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv);
// Total thread count calcReverseDistortionTable() is launched with; the kernel
// divides RBYRDIST_LEN table entries evenly over this many threads.
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment