Commit 20df596a authored by Andrey Filippov's avatar Andrey Filippov

changing direct conversion to CDP, handling sparse tasks

parent 0bb31239
......@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double minDerivative=0.01;
int numIterations=1000;
double drDistDr=1.0;
// public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
// public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
// public double distortionB=0.0; // r^3
// public double distortionC=0.0; // r^2
boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0);
double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC;
double rPrev=0.0;
......
......@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
tp_tasks);
gPUTileProcessor.setTextureIndices(
texture_indices);
gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once
gPUTileProcessor.setGeometryCorrection(
quadCLT_main.getGeometryCorrection(),
false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
// TODO: calculate from the camera geometry?
......@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
int NREPEAT = 1; // 00;
System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
long startGPU=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execCalcReverseDistortions();
}
long startRotDerivs=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execRotDerivs();
}
......@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverCorrectTiles();
gPUTileProcessor.execConverDirect();
}
// run imclt;
long startIMCLT=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
// gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
}
long endImcltTime = System.nanoTime();
......@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
clt_parameters.min_agree, // double min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
clt_parameters.dust_remove); // boolean dust_remove,
long endTexturesRBGA = System.nanoTime();
long endGPUTime = System.nanoTime();
long rotDerivsTime= (startTasksSetup- startGPU) /NREPEAT;
long calcReverseTime= (startRotDerivs- startGPU) /NREPEAT;
long rotDerivsTime= (startTasksSetup- startRotDerivs) /NREPEAT;
long tasksSetupTime= (startDirectConvert- startTasksSetup) /NREPEAT;
long firstGPUTime= (startIMCLT- startDirectConvert) /NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT;
......@@ -2171,16 +2176,17 @@ public class TwoQuadCLT {
long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA) /NREPEAT;
long runGPUTime = (endGPUTime - startGPU) /NREPEAT;
// run corr2d
//RotDerivs
System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
System.out.println("GPU run time ="+ (runGPUTime * 1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
System.out.println(" - imclt: "+(runImcltTime*1.0e-6)+"ms");
System.out.println(" - corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
System.out.println(" - textures: "+(runTexturesTime*1.0e-6)+"ms");
System.out.println(" - RGBA: "+(runTexturesRBGATime*1.0e-6)+"ms");
System.out.println(" - calc reverse dist.: "+(calcReverseTime*1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
System.out.println(" - imclt: "+(runImcltTime*1.0e-6)+"ms");
System.out.println(" - corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
System.out.println(" - textures: "+(runTexturesTime*1.0e-6)+"ms");
System.out.println(" - RGBA: "+(runTexturesRBGATime*1.0e-6)+"ms");
// get data back from GPU
float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
......
......@@ -41,21 +41,44 @@
#include "tp_defines.h"
#endif
extern "C"
__global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
extern "C" __global__ void index_direct(
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
int * active_tiles, // pointer to the calculated number of non-zero tiles
int * num_active_tiles); // indices to gpu_tasks // should be initialized to zero
extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int * pnum_active_tiles); // indices to gpu_tasks
extern "C" __global__ void convert_correct_tiles(
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
int num_active_tiles, // number of tiles in task
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
// int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert);
extern "C" __global__ void clear_texture_list(
......
......@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
inline __device__ float getRByRDist(float rDist,
float rByRDist [RBYRDIST_LEN]); //shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = {// ...{cos,sin,const}...
{ // azimuth
{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
......@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
{-1,-1,-1} // do nothing
}};
#if 0
__device__ float rot_matrices [NUM_CAMS][3][3];
//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
// threads (3,3,4)
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector)
{
__shared__ float zoom [NUM_CAMS];
__shared__ float sincos [NUM_CAMS][3][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
float angle;
int ncam = threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1;
int is_sin = nangle1 & 1;
#ifdef DEBUG20a
if ((threadIdx.x == 0) && ( threadIdx.y == 0) && ( threadIdx.z == 0)){
printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
printExtrinsicCorrection(gpu_correction_vector);
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle < 4){ // this part only for 1-st 3
float* gangles =
(nangle ==0)?gpu_correction_vector->azimuth:(
(nangle ==1)?gpu_correction_vector->tilt:(
(nangle ==2)?gpu_correction_vector->roll:
gpu_correction_vector->zoom));
if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
angle = *(gangles + ncam);
} else {
angle = 0.0f;
#pragma unroll
for (int n = 0; n < (NUM_CAMS-1); n++){
angle -= *(gangles + n);
}
}
if (!is_sin){
angle += M_PI/2;
}
if (nangle < 3) {
sincos[ncam][nangle][is_sin]=sinf(angle);
} else if (is_sin){
zoom[ncam] = angle;
}
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
if (nangle == 3) {
sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\na Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
printf("a Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
printf("a Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// now 3x3
for (int axis = 0; axis < 3; axis++) {
matrices[ncam][axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20a
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
// tilt * az ->
// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
matrices[ncam][3][threadIdx.y][threadIdx.x] =
matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
__syncthreads();
rot_matrices[ncam][threadIdx.y][threadIdx.x] =
matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
__syncthreads();
#ifdef DEBUG20
if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
for (int n = 0; n < NUM_CAMS; n++){
printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][0][i][j]);
}
printf("\n");
}
printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][1][i][j]);
}
printf("\n");
}
printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", matrices[n][2][i][j]);
}
printf("\n");
}
printf("2 - Rotation matrix for camera %d\n", n);
for (int i = 0; i < 3; i++){
for (int j = 0; j < 3; j++){
printf("%9.6f, ", rot_matrices[n][i][j]);
}
printf("\n");
}
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG20
}
#endif
__constant__ int offset_rots = 0; //0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
......@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
}
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist)
{
//int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
double minDerivative=0.01;
int numIterations=1000;
double drDistDr=1.0;
double d=1.0
-geometry_correction -> distortionA8
-geometry_correction -> distortionA7
-geometry_correction -> distortionA6
-geometry_correction -> distortionA5
-geometry_correction -> distortionA
-geometry_correction -> distortionB
-geometry_correction -> distortionC;
double rPrev=0.0;
int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
for (int p = 0; p < num_points; p ++){
int i = indx * num_points +p;
if (i >= RBYRDIST_LEN){
return;
}
if (i == 0){
rByRDist[0]= (float) 1.0/d;
break;
}
double rDist = RBYRDIST_STEP * i;
double r = (p == 0) ? rDist : rPrev;
for (int iteration=0;iteration<numIterations;iteration++){
double k=(((((((
geometry_correction -> distortionA8) * r +
geometry_correction -> distortionA7) * r +
geometry_correction -> distortionA6) * r +
geometry_correction -> distortionA5) * r +
geometry_correction -> distortionA) * r +
geometry_correction -> distortionB) * r +
geometry_correction -> distortionC) * r + d;
drDistDr=(((((((
8 * geometry_correction -> distortionA8) * r +
7 * geometry_correction -> distortionA7) * r +
6 * geometry_correction -> distortionA6) * r +
5 * geometry_correction -> distortionA5) * r +
4 * geometry_correction -> distortionA) * r +
3 * geometry_correction -> distortionB) * r+
2 * geometry_correction -> distortionC) * r+d;
if (drDistDr<minDerivative) { // folds backwards !
return; // too high distortion
}
double rD=r*k;
if (fabs(rD-rDist)<delta){
break;
}
r+=(rDist-rD)/drDistDr;
}
rPrev=r;
rByRDist[i]= (float) r/rDist;
}
}
/**
* Calculate non-distorted radius from distorted using table approximation
......
......@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
float * gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv * gpu_rot_deriv);
#if 0
// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
extern "C" __global__ void calc_rot_matrices(
struct corr_vector * gpu_correction_vector);
#endif
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
struct corr_vector * gpu_correction_vector,
trot_deriv * gpu_rot_deriv);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc * geometry_correction,
float * rByRDist);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment