changing direct conversion to CDP, handling sparse tasks

20df596a · Andrey Filippov · 0bb31239 · 20df596a · 20df596a · 20df596a
Commit 20df596a authored Apr 16, 2020 by Andrey Filippov
7 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
--- a/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/GeometryCorrection.java
@@ -4210,10 +4210,6 @@ matrix([[-0.125, -0.125,  0.125,  0.125, -0.125,  0.125, -0.   , -0.   ,   -0.
 		double minDerivative=0.01;
 		int numIterations=1000;
 		double drDistDr=1.0;
-		//	public double distortionA5=0.0; //r^5 (normalized to focal length or to sensor half width?)
-		//	public double distortionA=0.0; // r^4 (normalized to focal length or to sensor half width?)
-		//	public double distortionB=0.0; // r^3
-		//	public double distortionC=0.0; // r^2
 		boolean use8=(this.distortionA8!=0.0) || (this.distortionA7!=0.0) || (this.distortionA6!=0.0);
 		double d=1.0-this.distortionA8-this.distortionA7-this.distortionA6-this.distortionA5-this.distortionA-this.distortionB-this.distortionC;
 		double rPrev=0.0;

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2087,7 +2087,9 @@ public class TwoQuadCLT {
 				tp_tasks);
 		gPUTileProcessor.setTextureIndices(
 				texture_indices);
-		gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once
+		gPUTileProcessor.setGeometryCorrection(
+				quadCLT_main.getGeometryCorrection(),
+				false); // boolean use_java_rByRDist) { // false - use newer GPU execCalcReverseDistortions); // once
 		gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
 		// TODO: calculate from the camera geometry?
@@ -2101,6 +2103,10 @@ public class TwoQuadCLT {
 		int NREPEAT = 1; // 00;
 		System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
 		long startGPU=System.nanoTime();
+		for (int i = 0; i < NREPEAT; i++ ) {
+			gPUTileProcessor.execCalcReverseDistortions();
+		}
+		long startRotDerivs=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
 			gPUTileProcessor.execRotDerivs();
 		}
@@ -2113,13 +2119,12 @@ public class TwoQuadCLT {
 		long startDirectConvert=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-			gPUTileProcessor.execConverCorrectTiles();
+			gPUTileProcessor.execConverDirect();
 		}
 // run imclt;
 		long startIMCLT=System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) {
-//			gPUTileProcessor.execImcltRbg(quadCLT_main.isMonochrome());
 			gPUTileProcessor.execImcltRbgAll(quadCLT_main.isMonochrome());
 		}
 		long endImcltTime = System.nanoTime();
@@ -2159,10 +2164,10 @@ public class TwoQuadCLT {
 				clt_parameters.min_agree,      // double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 				clt_parameters.dust_remove);   // boolean   dust_remove,
 		long endTexturesRBGA = System.nanoTime();
 		long endGPUTime = System.nanoTime();
-		long rotDerivsTime=        (startTasksSetup-    startGPU)           /NREPEAT;
+		long calcReverseTime=      (startRotDerivs-     startGPU)           /NREPEAT;
+		long rotDerivsTime=        (startTasksSetup-    startRotDerivs)     /NREPEAT;
 		long tasksSetupTime=       (startDirectConvert- startTasksSetup)    /NREPEAT;
 		long firstGPUTime=         (startIMCLT-         startDirectConvert) /NREPEAT;
 		long runImcltTime =        (endImcltTime -      startIMCLT)         /NREPEAT;
@@ -2171,16 +2176,17 @@ public class TwoQuadCLT {
 		long runTexturesRBGATime = (endTexturesRBGA -   startTexturesRBGA)  /NREPEAT;
 		long runGPUTime =          (endGPUTime -        startGPU)           /NREPEAT;
 		// run corr2d
+//RotDerivs
 		System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
 		System.out.println("GPU run time ="+        (runGPUTime * 1.0e-6)+"ms");
-		System.out.println(" - rot/derivs:        "+(rotDerivsTime*1.0e-6)+"ms");
+		System.out.println(" - calc reverse dist.: "+(calcReverseTime*1.0e-6)+"ms");
-		System.out.println(" - tasks setup:       "+(tasksSetupTime*1.0e-6)+"ms");
+		System.out.println(" - rot/derivs:         "+(rotDerivsTime*1.0e-6)+"ms");
-		System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
+		System.out.println(" - tasks setup:        "+(tasksSetupTime*1.0e-6)+"ms");
-		System.out.println(" - imclt:             "+(runImcltTime*1.0e-6)+"ms");
+		System.out.println(" - direct conversion:  "+(firstGPUTime*1.0e-6)+"ms");
-		System.out.println(" - corr2D:            "+(runCorr2DTime*1.0e-6)+"ms");
+		System.out.println(" - imclt:              "+(runImcltTime*1.0e-6)+"ms");
-		System.out.println(" - textures:          "+(runTexturesTime*1.0e-6)+"ms");
+		System.out.println(" - corr2D:             "+(runCorr2DTime*1.0e-6)+"ms");
-		System.out.println(" - RGBA:              "+(runTexturesRBGATime*1.0e-6)+"ms");
+		System.out.println(" - textures:           "+(runTexturesTime*1.0e-6)+"ms");
+		System.out.println(" - RGBA:               "+(runTexturesRBGATime*1.0e-6)+"ms");
 		// get data back from GPU
 		float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
 		for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -41,21 +41,44 @@
 #include "tp_defines.h"
 #endif
+extern "C" __global__ void index_direct(
-extern "C"
-__global__ void convert_correct_tiles(
-		float           ** gpu_kernel_offsets, // [NUM_CAMS],
-		float           ** gpu_kernels,        // [NUM_CAMS],
-		float           ** gpu_images,         // [NUM_CAMS],
 		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
-		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+		int *              active_tiles,      // indices to gpu_tasks (of non-zero tiles) - NOTE(review): description was swapped with the next parameter, confirm
-		int                woi_width,
+		int *              num_active_tiles);  // pointer to the calculated number of non-zero tiles // should be initialized to zero
-		int                woi_height,
-		int                kernels_hor,
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-		int                kernels_vert);
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              gpu_active_tiles,      // indices to gpu_tasks (of non-zero tiles) - NOTE(review): description was swapped with the next parameter, confirm
+			int *              pnum_active_tiles);  // pointer to the calculated number of non-zero tiles
+extern "C" __global__ void convert_correct_tiles(
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+			int                num_active_tiles,   // number of active (non-zero) tiles, presumably the used length of gpu_active_tiles - TODO confirm
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+//			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert);
 extern "C" __global__ void clear_texture_list(

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
 inline __device__ float getRByRDist(float rDist,
 		float rByRDist [RBYRDIST_LEN]); //shared memory
 __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 		{ // azimuth
 				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
 				{-1,-1,-1} // do nothing
 		}};
-#if 0
-__device__ float rot_matrices       [NUM_CAMS][3][3];
-//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
-// threads (3,3,4)
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector)
-{
-	__shared__ float zoom    [NUM_CAMS];
-	__shared__ float sincos  [NUM_CAMS][3][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
-	__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
-	float angle;
-	int ncam = threadIdx.z;
-	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
-	int nangle =  nangle1 >> 1;
-	int is_sin = nangle1 & 1;
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0)  && ( threadIdx.y == 0)  && ( threadIdx.z == 0)){
-		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
-		printExtrinsicCorrection(gpu_correction_vector);
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	if (nangle < 4){ // this part only for 1-st 3
-		float* gangles =
-				(nangle ==0)?gpu_correction_vector->azimuth:(
-						(nangle ==1)?gpu_correction_vector->tilt:(
-								(nangle ==2)?gpu_correction_vector->roll:
-										gpu_correction_vector->zoom));
-		if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
-			angle = *(gangles + ncam);
-		} else {
-			angle = 0.0f;
-#pragma	unroll
-			for (int n = 0; n < (NUM_CAMS-1); n++){
-				angle -= *(gangles + n);
-			}
-		}
-		if (!is_sin){
-			angle += M_PI/2;
-		}
-		if (nangle < 3) {
-			sincos[ncam][nangle][is_sin]=sinf(angle);
-		} else if (is_sin){
-			zoom[ncam] = angle;
-		}
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	if (nangle == 3) {
-		sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\na    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("a    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("a    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	// now 3x3
-	for (int axis = 0; axis < 3; axis++) {
-		matrices[ncam][axis][threadIdx.y][threadIdx.x] =
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2];                         // const
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-			printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-			printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-    // tilt * az ->
-	// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
-	matrices[ncam][3][threadIdx.y][threadIdx.x] =
-			matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
-	// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
-	__syncthreads();
-	rot_matrices[ncam][threadIdx.y][threadIdx.x] =
-			matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
-	__syncthreads();
-#ifdef DEBUG20
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Rotation matrix for camera %d\n", n);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", rot_matrices[n][i][j]);
-				}
-				printf("\n");
-			}
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-}
-#endif
 __constant__ int offset_rots =     0;                   //0
 __constant__ int offset_derivs =   1;                   // 1..4 // should be next
 __constant__ int offset_matrices = 5;   // 5..11
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
 }
+/**
+ * Fill the reverse radial distortion lookup table rByRDist[] on the GPU.
+ * For every table index i > 0 the distorted radius is rDist = RBYRDIST_STEP * i; Newton's
+ * iterations solve r * k(r) = rDist for r, and the ratio r / rDist is stored in rByRDist[i].
+ * Entry 0 (where the ratio would be 0/0) is set to 1/d, d being the constant term of k(r).
+ * Each thread handles num_points consecutive entries starting at indx * num_points, so the
+ * launch has to provide at least CALC_REVERSE_TABLE_BLOCK_THREADS threads in total.
+ * NOTE(review): when the derivative drops below minDerivative the thread returns and its
+ * remaining entries stay unwritten - confirm that the caller pre-fills or tolerates them.
+ *
+ * @param geometry_correction distortion coefficients (distortionA8 ... distortionC are read)
+ * @param rByRDist            output table, written at indices below RBYRDIST_LEN
+ */
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist)
+{
+	// linear thread index over the whole launch (all blocks, all three thread dimensions)
+	int indx =  ((blockIdx.x * blockDim.z + threadIdx.z) *  blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+//	double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
+//	double delta=1E-4; //rByRDist error = 0.000072
+	double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
+	double minDerivative=0.01;
+	int numIterations=1000;
+	double drDistDr=1.0;
+	double d=1.0
+			-geometry_correction -> distortionA8
+			-geometry_correction -> distortionA7
+			-geometry_correction -> distortionA6
+			-geometry_correction -> distortionA5
+			-geometry_correction -> distortionA
+			-geometry_correction -> distortionB
+			-geometry_correction -> distortionC;
+	double rPrev=0.0;
+	// number of table entries per thread (rounded up)
+	int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
+	for (int p = 0; p < num_points; p ++){
+		int i = indx * num_points +p;
+		if (i >= RBYRDIST_LEN){
+			return;
+		}
+		if (i == 0){
+			rByRDist[0]= (float) 1.0/d;
+			continue; // was "break": that left entries 1..num_points-1 of this thread unwritten
+		}
+		double rDist = RBYRDIST_STEP * i;
+		// start from the previous solution; the first point of a thread starts from rDist
+		// (the thread that owns entry 0 starts its next point from rPrev == 0.0)
+		double r = (p == 0) ? rDist : rPrev;
+		for (int iteration=0;iteration<numIterations;iteration++){
+			// k(r) - ratio rD/r, polynomial in r with constant term d
+			double k=(((((((
+					geometry_correction -> distortionA8) * r +
+					geometry_correction -> distortionA7) * r +
+					geometry_correction -> distortionA6) * r +
+					geometry_correction -> distortionA5) * r +
+					geometry_correction -> distortionA) * r +
+					geometry_correction -> distortionB) * r +
+					geometry_correction -> distortionC) * r + d;
+			// d(r * k(r))/dr
+			drDistDr=(((((((
+					8 * geometry_correction -> distortionA8) * r +
+					7 * geometry_correction -> distortionA7) * r +
+					6 * geometry_correction -> distortionA6) * r +
+					5 * geometry_correction -> distortionA5) * r +
+					4 * geometry_correction -> distortionA) * r +
+					3 * geometry_correction -> distortionB) * r+
+					2 * geometry_correction -> distortionC) * r+d;
+			if (drDistDr<minDerivative) { // folds backwards !
+				return; // too high distortion
+			}
+			double rD=r*k;
+			if (fabs(rD-rDist)<delta){
+				break; // converged
+			}
+			r+=(rDist-rD)/drDistDr; // Newton step
+		}
+		rPrev=r;
+		rByRDist[i]= (float) r/rDist;
+	}
+}
 /**
 * Calculate non-distorted radius from distorted using table approximation

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
 		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
 		trot_deriv   * gpu_rot_deriv);
-#if 0
-// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector);
-#endif
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv   * gpu_rot_deriv);
+#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // total threads of the launch (NUM_CAMS blocks x (3,3,3) threads), must match the actual launch configuration
+// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist);