Commit 514057c6 authored by Andrey Filippov

Implemented/tested per-tile geometric distortions setup

parent 50630abc
@@ -96,7 +96,8 @@ public class GPUTileProcessor {
{"*","dtt8x8.h","dtt8x8.cu"},
{"*","dtt8x8.h","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
*/
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","geometry_correction.cu","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}}; // static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code
...@@ -104,6 +105,8 @@ public class GPUTileProcessor { ...@@ -104,6 +105,8 @@ public class GPUTileProcessor {
// static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code // static String GPU_TEXTURES_NAME = "textures_gen"; // name in C code
static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code static String GPU_TEXTURES_NAME = "textures_accumulate"; // name in C code
static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code static String GPU_RBGA_NAME = "generate_RBGA"; // name in C code
static String GPU_ROT_DERIV = "calc_rot_deriv"; // calculate rotation matrices and derivatives
static String SET_TILES_OFFSETS = "get_tiles_offsets"; // calculate pixel offsets and disparity distortions
// pass some defines to gpu source code with #ifdef JCUDA
@@ -148,7 +151,7 @@ public class GPUTileProcessor {
public static int RBYRDIST_LEN = 5001; //for double, 10001 - float; // length of rByRDist to allocate shared memory
public static double RBYRDIST_STEP = 0.0004; // for double, 0.0002 - for float; // to fit into GPU shared memory (was 0.001);
public static int TILES_PER_BLOCK_GEOM = 32/NUM_CAMS; // blockDim.x = NUM_CAMS; blockDim.y = TILES_PER_BLOCK_GEOM
public static int TASK_TEXTURE_BITS = ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT));
@@ -163,15 +166,18 @@ public class GPUTileProcessor {
private CUfunction GPU_CORRELATE2D_kernel = null;
private CUfunction GPU_TEXTURES_kernel = null;
private CUfunction GPU_RBGA_kernel = null;
private CUfunction GPU_ROT_DERIV_kernel = null;
private CUfunction SET_TILES_OFFSETS_kernel = null;
// CPU arrays of pointers to GPU memory
// These arrays may go to methods, they are here just to be able to free GPU memory if needed
private CUdeviceptr [] gpu_kernels_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_kernel_offsets_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_bayer_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_clt_h = new CUdeviceptr[NUM_CAMS];
private CUdeviceptr [] gpu_corr_images_h= new CUdeviceptr[NUM_CAMS];
// GPU pointers to array of GPU pointers
private CUdeviceptr gpu_kernels = new CUdeviceptr();
private CUdeviceptr gpu_kernel_offsets = new CUdeviceptr();
@@ -187,6 +193,11 @@ public class GPUTileProcessor {
private CUdeviceptr gpu_num_texture_tiles = new CUdeviceptr(); // 8 ints
private CUdeviceptr gpu_textures_rgba = new CUdeviceptr(); // allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
private CUdeviceptr gpu_correction_vector= new CUdeviceptr();
private CUdeviceptr gpu_rot_deriv= new CUdeviceptr(); // used internally by device, may be read to CPU for testing
private CUdeviceptr gpu_geometry_correction= new CUdeviceptr();
private CUdeviceptr gpu_rByRDist= new CUdeviceptr(); // calculated once for the camera distortion model in CPU (move to GPU?)
CUmodule module; // to access constants memory
private int mclt_stride;
private int corr_stride;
@@ -227,15 +238,22 @@ public class GPUTileProcessor {
flt[indx++] = Float.intBitsToFloat(tx + (ty << 16));
float [][] offsets = use_aux? this.xy_aux: this.xy;
for (int i = 0; i < NUM_CAMS; i++) {
if (offsets != null) {
flt[indx++] = offsets[i][0];
flt[indx++] = offsets[i][1];
} else {
indx+= 2;
}
}
flt[indx++] = this.target_disparity;
for (int i = 0; i < NUM_CAMS; i++) { // actually disp_dist will be initialized by the GPU
indx+= 4;
/*
flt[indx++] = disp_dist[i][0];
flt[indx++] = disp_dist[i][1];
flt[indx++] = disp_dist[i][2];
flt[indx++] = disp_dist[i][3];
*/
}
return flt;
}
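For reference (not part of this commit): the TpTask serialization above packs the tile indices tx and ty into a single float by reinterpreting the int bits, so the GPU can recover them exactly from the same 32 bits. A minimal round-trip sketch; the class name TxTyPackDemo and the example values are made up for illustration:

public class TxTyPackDemo {
    public static void main(String[] args) {
        int tx = 123, ty = 45;                                 // example tile indices, each assumed to fit in 16 bits
        float packed = Float.intBitsToFloat(tx + (ty << 16));  // same packing as in the task serialization above
        int bits = Float.floatToRawIntBits(packed);            // the raw bits survive the float round trip
        int tx2 = bits & 0xffff;                               // low 16 bits -> tx
        int ty2 = bits >>> 16;                                 // high 16 bits -> ty
        System.out.println("tx=" + tx2 + ", ty=" + ty2);       // prints tx=123, ty=45
    }
}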
@@ -446,16 +464,22 @@ public class GPUTileProcessor {
GPU_IMCLT_RBG_NAME,
GPU_CORRELATE2D_NAME,
GPU_TEXTURES_NAME,
GPU_RBGA_NAME,
GPU_ROT_DERIV,
SET_TILES_OFFSETS
};
CUfunction[] functions = createFunctions(kernelSources,
func_names,
capability); // on my - 75
GPU_CONVERT_CORRECT_TILES_kernel = functions[0];
GPU_IMCLT_RBG_kernel = functions[1];
GPU_CORRELATE2D_kernel = functions[2];
GPU_TEXTURES_kernel= functions[3];
GPU_RBGA_kernel= functions[4];
GPU_ROT_DERIV_kernel = functions[5];
SET_TILES_OFFSETS_kernel = functions[6];
System.out.println("GPU kernel functions initialized"); System.out.println("GPU kernel functions initialized");
System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString()); System.out.println(GPU_CONVERT_CORRECT_TILES_kernel.toString());
...@@ -463,6 +487,8 @@ public class GPUTileProcessor { ...@@ -463,6 +487,8 @@ public class GPUTileProcessor {
System.out.println(GPU_CORRELATE2D_kernel.toString()); System.out.println(GPU_CORRELATE2D_kernel.toString());
System.out.println(GPU_TEXTURES_kernel.toString()); System.out.println(GPU_TEXTURES_kernel.toString());
System.out.println(GPU_RBGA_kernel.toString()); System.out.println(GPU_RBGA_kernel.toString());
System.out.println(GPU_ROT_DERIV_kernel.toString());
System.out.println(SET_TILES_OFFSETS_kernel.toString());
// Init data arrays for all kernels
int tilesX = IMG_WIDTH / DTT_SIZE;
@@ -522,9 +548,15 @@ public class GPUTileProcessor {
for (int ncam = 0; ncam < NUM_CAMS; ncam++) gpu_clt_l[ncam] = getPointerAddress(gpu_clt_h[ncam]);
cuMemcpyHtoD(gpu_clt, Pointer.to(gpu_clt_l), NUM_CAMS * Sizeof.POINTER);
// Set GeometryCorrection data
cuMemAlloc(gpu_geometry_correction, GeometryCorrection.arrayLength(NUM_CAMS) * Sizeof.FLOAT);
cuMemAlloc(gpu_rByRDist, RBYRDIST_LEN * Sizeof.FLOAT);
cuMemAlloc(gpu_rot_deriv, 5*NUM_CAMS*3*3 * Sizeof.FLOAT);
cuMemAlloc(gpu_correction_vector, GeometryCorrection.CorrVector.LENGTH * Sizeof.FLOAT);
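// Note (from RBYRDIST_LEN/RBYRDIST_STEP above): the rByRDist table is sampled at 0.0004, so 5001
// samples cover relative (to distortionRadius) radii from 0 to 5000*0.0004 = 2.0.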
// Set task array
cuMemAlloc(gpu_tasks, tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT);
//=========== Seems that in many places Sizeof.POINTER (==8) is used instead of Sizeof.FLOAT !!! ============
// Set corrs array
/// cuMemAlloc(gpu_corrs, tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER);
cuMemAlloc(gpu_corr_indices, tilesX * tilesY * NUM_PAIRS * Sizeof.POINTER);
@@ -568,6 +600,27 @@ public class GPUTileProcessor {
}
public void setGeometryCorrection(GeometryCorrection gc) {
float [] fgc = gc.toFloatArray();
double [] rByRDist = gc.getRByRDist();
float [] fFByRDist = new float [rByRDist.length];
for (int i = 0; i < rByRDist.length; i++) {
fFByRDist[i] = (float) rByRDist[i];
}
cuMemcpyHtoD(gpu_geometry_correction, Pointer.to(fgc), fgc.length * Sizeof.FLOAT);
cuMemcpyHtoD(gpu_rByRDist, Pointer.to(fFByRDist), fFByRDist.length * Sizeof.FLOAT);
cuMemAlloc (gpu_rot_deriv, 5 * NUM_CAMS *3 *3 * Sizeof.FLOAT); // NCAM of 3x3 rotation matrices, plus 4 derivative matrices for each camera
}
public void setExtrinsicsVector(GeometryCorrection.CorrVector cv) {
double [] dcv = cv.toFullRollArray();
float [] fcv = new float [dcv.length];
for (int i = 0; i < dcv.length; i++) {
fcv[i] = (float) dcv[i];
}
cuMemcpyHtoD(gpu_correction_vector, Pointer.to(fcv), fcv.length * Sizeof.FLOAT);
}
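For context (not in this commit): a rough sketch of the intended call order, based on the TwoQuadCLT changes further down; the helper name runPerTileGeometry and its parameter list are made up for illustration:

static void runPerTileGeometry(GPUTileProcessor gpu,
        GeometryCorrection gc,
        GPUTileProcessor.TpTask [] tp_tasks,
        boolean use_aux) {
    gpu.setGeometryCorrection(gc);                // once: static camera geometry and rByRDist table
    gpu.setExtrinsicsVector(gc.getCorrVector());  // for each new image: extrinsic correction vector
    gpu.setTasks(tp_tasks, use_aux);              // upload the tile task list
    gpu.execRotDerivs();                          // rotation matrices and their derivatives on the GPU
    gpu.execSetTilesOffsets();                    // per-tile pixel offsets and disparity distortions
    gpu.execConverCorrectTiles();                 // existing direct conversion, now using GPU-computed offsets
}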
public void setTasks(TpTask [] tile_tasks, boolean use_aux) // why is it a class member? - just to be able to free
{
@@ -576,7 +629,7 @@ public class GPUTileProcessor {
for (int i = 0; i < num_task_tiles; i++) {
tile_tasks[i].asFloatArray(ftasks, i* TPTASK_SIZE, use_aux);
}
cuMemcpyHtoD(gpu_tasks, Pointer.to(ftasks), TPTASK_SIZE * num_task_tiles * Sizeof.FLOAT);
}
public void setCorrIndices(int [] corr_indices)
@@ -700,6 +753,7 @@ public class GPUTileProcessor {
// prepare tasks for full frame, same disparity.
// need to run setTasks(TpTask [] tile_tasks, boolean use_aux) to format/transfer to GPU memory
public TpTask [] setFullFrameImages(
boolean calc_offsets, // old way, now not needed with GPU calculation
Rectangle woi,
boolean round_woi,
float target_disparity, // apply same disparity to all tiles
@@ -725,6 +779,7 @@ public class GPUTileProcessor {
corr_masks[i] = corr_mask; // 0x3f; // all 6 correlations
}
return setFullFrameImages(
calc_offsets, // boolean calc_offsets, // old way, now not needed with GPU calculation
woi, // Rectangle woi,
round_woi, // boolean round_woi,
target_disparities, // should be tilesX*tilesY long
@@ -740,6 +795,7 @@ public class GPUTileProcessor {
}
public TpTask [] setFullFrameImages(
boolean calc_offsets, // old way, now not needed with GPU calculation
Rectangle woi, // or null
boolean round_woi,
float [] target_disparities, // should be tilesX*tilesY long
@@ -838,13 +894,15 @@ public class GPUTileProcessor {
indx++;
}
}
if (calc_offsets) {
getTileSubcamOffsets(
tp_tasks, // final TpTask[] tp_tasks, // will use // modify to have offsets for 8 cameras
(use_master? geometryCorrection_main: null), // final GeometryCorrection geometryCorrection_main,
(use_aux? geometryCorrection_aux: null), // final GeometryCorrection geometryCorrection_aux, // if null, will only calculate offsets for the main camera
ers_delay, // final double [][][] ers_delay, // if not null - fill with tile center acquisition delay
threadsMax, // final int threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel)
}
return tp_tasks;
}
@@ -966,6 +1024,58 @@ public class GPUTileProcessor {
// All data is already copied to GPU memory
public void execRotDerivs() {
if (GPU_ROT_DERIV_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: GPU_ROT_DERIV_kernel");
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {NUM_CAMS, 1, 1}; // round up
int [] ThreadsFullWarps = {3, 3, 3};
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_correction_vector),
Pointer.to(gpu_rot_deriv)
);
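// Note (inferred from the 5*NUM_CAMS*3*3 float allocation and the trot_deriv parameter type used
// below): gpu_rot_deriv receives NUM_CAMS combined 3x3 rotation matrices plus 4 derivative matrices
// per camera (presumably by azimuth, tilt, roll and zoom); the exact layout is defined by the
// trot_deriv structure on the CUDA side.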
cuCtxSynchronize();
// Call the kernel function
cuLaunchKernel(GPU_ROT_DERIV_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later
}
public void execSetTilesOffsets() {
if (SET_TILES_OFFSETS_kernel == null)
{
IJ.showMessage("Error", "No GPU kernel: SET_TILES_OFFSETS_kernel");
return;
}
// kernel parameters: pointer to pointers
int [] GridFullWarps = {(num_task_tiles + TILES_PER_BLOCK_GEOM - 1)/TILES_PER_BLOCK_GEOM, 1, 1}; // round up
int [] ThreadsFullWarps = {NUM_CAMS, TILES_PER_BLOCK_GEOM, 1}; // 4,8,1
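// Launch-geometry note (assuming NUM_CAMS == 4 for the quad camera, per the "4,8,1" comment):
// TILES_PER_BLOCK_GEOM = 32/4 = 8, so each block is 4 x 8 x 1 = 32 threads (one warp), with one
// camera per thread.x and one tile per thread.y; the grid is num_task_tiles/8 blocks, rounded up above.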
Pointer kernelParameters = Pointer.to(
Pointer.to(gpu_tasks), // struct tp_task * gpu_tasks,
Pointer.to(new int[] { num_task_tiles }),// int num_tiles, // number of tiles in task list
Pointer.to(gpu_geometry_correction), // struct gc * gpu_geometry_correction,
Pointer.to(gpu_correction_vector), // struct corr_vector * gpu_correction_vector,
Pointer.to(gpu_rByRDist), // float * gpu_rByRDist) // length should match RBYRDIST_LEN
Pointer.to(gpu_rot_deriv)); // trot_deriv * gpu_rot_deriv);
cuCtxSynchronize();
cuLaunchKernel(SET_TILES_OFFSETS_kernel,
GridFullWarps[0], GridFullWarps[1], GridFullWarps[2], // Grid dimension
ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
0, null, // Shared memory size and stream (shared - only dynamic, static is in code)
kernelParameters, null); // Kernel- and extra parameters
cuCtxSynchronize(); // remove later
}
public void execConverCorrectTiles() {
if (GPU_CONVERT_CORRECT_TILES_kernel == null)
{
@@ -1437,6 +1547,7 @@ public class GPUTileProcessor {
// for (String sourceCode: sourceCodeUnits) {
for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
String sourceCode = sourceCodeUnits[cunit];
//System.out.print(sourceCode);
// Use the NVRTC to create a program by compiling the source code
nvrtcProgram program = new nvrtcProgram();
nvrtcCreateProgram( program, sourceCode, null, 0, null, null);
...
@@ -156,6 +156,11 @@ public class GeometryCorrection {
(float) disparityRadius //=150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
}
public static int arrayLength(int ncam) {
return 21+8*ncam;
}
public double [] toDoubleArray() { // for GPU comparison
return new double[] {
pixelCorrectionWidth, // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
@@ -322,6 +327,11 @@ cameraRadius, // average distance from the "mass center" of the sensors to t
return wh;
}
public GeometryCorrection() {
// just to get the length of toFloatArray()
resetCorrVector();
}
public GeometryCorrection(double [] extrinsic_corr)
{
this.extrinsic_corr = new CorrVector(extrinsic_corr);
@@ -1328,7 +1338,7 @@ cameraRadius, // average distance from the "mass center" of the sensors to t
public class CorrVector{
public static final int LENGTH = 19; // 10;
static final int LENGTH_ANGLES =10;
static final int TILT_INDEX = 0;
static final int AZIMUTH_INDEX = 3;
@@ -3307,6 +3317,245 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
return pXY;
}
public double [][] getPortsCoordinatesAndDerivativesDbg( // To print intermediate results for comparison with the GPU
GeometryCorrection gc_main,
boolean use_rig_offsets,
Matrix [] rots,
Matrix [][] deriv_rots,
double [][] pXYderiv, // if not null, should be double[8][] - not used here
double [][] disp_dist, //
double px,
double py,
double disparity)
{
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// moved here so disp_dist and imu will always be created
if (disp_dist == null) {
disp_dist = new double [numSensors][4];
}
double [] imu = null;
if (disp_dist != null) {
imu = extrinsic_corr.getIMU(); // currently it is common for all channels
if ((deriv_rots == null) && ((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0))){
deriv_rots = extrinsic_corr.getRotDeriveMatrices();
}
}
/// if ((disp_dist == null) && (pXYderiv != null)) {
/// disp_dist = new double [numSensors][4];
/// }
double [][] rXY = getRXY(use_rig_offsets); // may include rig offsets
double [][] pXY = new double [numSensors][2];
double pXcd = px - 0.5 * gc_main.pixelCorrectionWidth;
double pYcd = py - 0.5 * gc_main.pixelCorrectionHeight;
double rD = Math.sqrt(pXcd*pXcd + pYcd*pYcd)*0.001*gc_main.pixelSize; // distorted radius in a virtual center camera
double rND2R=gc_main.getRByRDist(rD/gc_main.distortionRadius, (debugLevel > -1));
double pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
double pYc = pYcd * rND2R; // in pixels
System.out.println("px="+px+", py="+py);
System.out.println("pXcd="+pXcd+", pYcd="+pYcd);
System.out.println("rD="+rD+", rND2R="+rND2R);
System.out.println("pXc="+pXc+", pYc="+pYc);
// next radial distortion coefficients are for this, not master camera (may be the same)
double [] rad_coeff={this.distortionC,this.distortionB,this.distortionA,this.distortionA5,this.distortionA6,this.distortionA7,this.distortionA8};
double fl_pix = focalLength/(0.001*pixelSize); // focal length in pixels - this camera
double ri_scale = 0.001 * this.pixelSize / this.distortionRadius;
System.out.println("fl_pix="+fl_pix+", ri_scale="+ri_scale);
double [] xyz = (disparity > 0) ? getWorldCoordinates( // USED in lwir
px, // double px,
py, // double py,
disparity, // double disparity,
true) : null; // boolean correctDistortions)
System.out.println("xyz[0]="+xyz[0]+", xyz[1]="+xyz[1]+", xyz[2]="+xyz[2]);
for (int i = 0; i < numSensors; i++){
// non-distorted XY of the shifted location of the individual sensor
double pXci0 = pXc - disparity * rXY[i][0]; // in pixels
double pYci0 = pYc - disparity * rXY[i][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
System.out.println("ncam="+i+": pXci0="+pXci0+", pYci0="+pYci0);
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
System.out.println("ncam="+i+": vi=");
vi.print(10, 5);
// Apply port-individual combined rotation/zoom matrix
Matrix rvi = rots[i].times(vi);
System.out.println("ncam="+i+": rvi="); rvi.print(10, 5);
// get back to the projection plane by normalizing vector
double norm_z = fl_pix/rvi.get(2, 0);
double pXci = rvi.get(0, 0) * norm_z;
double pYci = rvi.get(1, 0) * norm_z;
System.out.println("ncam="+i+": norm_z="+norm_z+", pXci="+pXci+", pYci="+pYci);
// Re-apply distortion
double rNDi = Math.sqrt(pXci*pXci + pYci*pYci); // in pixels
// Rdist/R=A8*R^7+A7*R^6+A6*R^5+A5*R^4+A*R^3+B*R^2+C*R+(1-A8-A7-A6-A5-A-B-C)
double ri = rNDi* ri_scale; // relative to distortion radius
// double rD2rND = (1.0 - distortionA8 - distortionA7 - distortionA6 - distortionA5 - distortionA - distortionB - distortionC);
System.out.println("ncam="+i+": rNDi="+rNDi+", ri="+ri);
double rD2rND = 1.0;
double rri = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
rri *= ri;
rD2rND += rad_coeff[j]*(rri - 1.0); // Fixed
}
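// The loop above evaluates the same polynomial as in the comment: starting from 1.0 and adding
// coeff_j*(ri^(j+1) - 1) for coeff_j in {C,B,A,A5,A6,A7,A8} gives
// rD2rND = (1-C-B-A-A5-A6-A7-A8) + C*ri + B*ri^2 + A*ri^3 + A5*ri^4 + A6*ri^5 + A7*ri^6 + A8*ri^7.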
System.out.println("ncam="+i+": rri="+rri+", rD2rND="+rD2rND);
// Get port pixel coordinates by scaling the 2d vector with the Rdistorted/Rnondistorted coefficient
double pXid = pXci * rD2rND;
double pYid = pYci * rD2rND;
System.out.println("ncam="+i+": pXid="+pXid+", pYid="+pYid);
pXY[i][0] = pXid + this.pXY0[i][0];
pXY[i][1] = pYid + this.pXY0[i][1];
System.out.println("pXY["+i+"][0]="+pXY[i][0]+", pXY["+i+"][1]="+pXY[i][1]);
// used when calculating derivatives, TODO: combine calculations !
double drD2rND_dri = 0.0;
Matrix drvi_daz = null;
Matrix drvi_dtl = null;
Matrix drvi_drl = null;
double dpXci_dazimuth = 0.0;
double dpYci_dazimuth = 0.0;
double dpXci_dtilt = 0.0;
double dpYci_dtilt = 0.0;
double dpXci_droll = 0.0;
double dpYci_droll = 0.0;
if ((disp_dist != null) || (pXYderiv != null)) {
rri = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
drD2rND_dri += rad_coeff[j] * (j+1) * rri;
rri *= ri;
}
if (deriv_rots != null) {
// needed for derivatives and IMU
drvi_daz = deriv_rots[i][0].times(vi);
drvi_dtl = deriv_rots[i][1].times(vi);
drvi_drl = deriv_rots[i][2].times(vi);
System.out.println("ncam="+i+": drvi_daz="); drvi_daz.print(10, 5);
System.out.println("ncam="+i+": drvi_dtl="); drvi_dtl.print(10, 5);
System.out.println("ncam="+i+": drvi_drl="); drvi_drl.print(10, 5);
dpXci_dazimuth = drvi_daz.get(0, 0) * norm_z - pXci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpYci_dazimuth = drvi_daz.get(1, 0) * norm_z - pYci * drvi_daz.get(2, 0) / rvi.get(2, 0);
dpXci_dtilt = drvi_dtl.get(0, 0) * norm_z - pXci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpYci_dtilt = drvi_dtl.get(1, 0) * norm_z - pYci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
dpXci_droll = drvi_drl.get(0, 0) * norm_z - pXci * drvi_drl.get(2, 0) / rvi.get(2, 0);
dpYci_droll = drvi_drl.get(1, 0) * norm_z - pYci * drvi_drl.get(2, 0) / rvi.get(2, 0);
System.out.println("ncam="+i+": dpXci_dazimuth="+dpXci_dazimuth+", dpYci_dazimuth="+dpYci_dazimuth);
System.out.println("ncam="+i+": dpXci_dtilt="+ dpXci_dtilt+ ", dpYci_dtilt="+ dpYci_dtilt);
System.out.println("ncam="+i+": dpXci_droll="+ dpXci_droll+ ", dpYci_droll="+ dpYci_droll);
}
}
double delta_t = 0.0;
// double [] imu = null;
double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if (disp_dist != null) {
disp_dist[i] = new double [4]; // dx/d_disp, dx_d_ccw_disp
// Not clear - what should be in Z direction before rotation here?
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
//// Matrix dd1 = dd0.getMatrix(0, 1,0,1); // get top left 2x2 sub-matrix
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
System.out.println("ncam="+i+": dd1="); dd1.print(10, 5);
double c_dist = pXci/rNDi;
double s_dist = pYci/rNDi;
double [][] arot2= {
{c_dist, s_dist},
{-s_dist, c_dist}};
Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
System.out.println("ncam="+i+": rot2="); rot2.print(10, 5);
double [][] ascale_distort = {
{rD2rND + ri* drD2rND_dri, 0 },
{0, rD2rND}};
Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
System.out.println("ncam="+i+": scale_distortXrot2Xdd1="); scale_distort.times(rot2).times(dd1).print(10, 5);
System.out.println("ncam="+i+": dd2="); dd2.print(10, 5);
disp_dist[i][0] = dd2.get(0, 0);
disp_dist[i][1] = dd2.get(0, 1);
disp_dist[i][2] = dd2.get(1, 0); // d_py/d_disp
disp_dist[i][3] = dd2.get(1, 1);
System.out.println("disp_dist["+i+"][0]="+ disp_dist[i][0]);
System.out.println("disp_dist["+i+"][1]="+ disp_dist[i][1]);
System.out.println("disp_dist["+i+"][2]="+ disp_dist[i][2]);
System.out.println("disp_dist["+i+"][3]="+ disp_dist[i][3]);
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// ERS linear does not yet use per-port rotations, probably not needed
// double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
if ((imu != null) &&((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0) ||(imu[3] != 0.0) ||(imu[4] != 0.0) ||(imu[5] != 0.0))) {
delta_t = dd2.get(1, 0) * disparity * line_time; // positive for top cameras, negative - for bottom
double ers_Xci = delta_t* (dpXci_dtilt * imu[0] + dpXci_dazimuth * imu[1] + dpXci_droll * imu[2]);
double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1] + dpYci_droll * imu[2]);
if (xyz != null) {
double k = SCENE_UNITS_SCALE * this.disparityRadius;
double wdisparity = disparity;
double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
System.out.println("ncam="+i+": k="+k+", wdisparity="+wdisparity+", dwdisp_dz="+dwdisp_dz);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
System.out.println("ncam="+i+
": dpXci_pYci_imu_lin[0][0]="+dpXci_pYci_imu_lin[0][0]+
", dpXci_pYci_imu_lin[0][2]="+dpXci_pYci_imu_lin[0][2]);
System.out.println("ncam="+i+
": dpXci_pYci_imu_lin[1][1]="+dpXci_pYci_imu_lin[1][1]+
", dpXci_pYci_imu_lin[1][2]="+dpXci_pYci_imu_lin[1][2]);
ers_Xci += delta_t* (dpXci_pYci_imu_lin[0][0] * imu[3] + dpXci_pYci_imu_lin[0][2] * imu[5]);
ers_Yci += delta_t* (dpXci_pYci_imu_lin[1][1] * imu[4] + dpXci_pYci_imu_lin[1][2] * imu[5]);
System.out.println("ncam="+i+": ers_Xci="+ers_Xci+", ers_Yci="+ers_Yci);
}
pXY[i][0] += ers_Xci * rD2rND; // added correction to pixel X
pXY[i][1] += ers_Yci * rD2rND; // added correction to pixel Y
System.out.println("pXY["+i+"][0]="+pXY[i][0]+", pXY["+i+"][1]="+pXY[i][1]);
} else {
imu = null;
}
// TODO: calculate derivatives of pX, pY by 3 imu omegas
}
}
return pXY;
}
// private Matrix m_balance_xy = null; // [2*numSensors][2*numSensors] 8x8 matrix to make XY ports correction to have average == 0
// private Matrix m_balance_dd = null; // [2*numSensors+1)][2*numSensors] 9x8 matrix to extract disparity from dd
...
@@ -9705,6 +9705,23 @@ public class ImageDtt {
centerX,
centerY,
disparity_aux); // + disparity_corr);
if ((tileX == debug_tileX ) && (tileY == debug_tileY )) {
// will just print debug data
geometryCorrection_main.getPortsCoordinatesAndDerivativesDbg(
geometryCorrection_main, // GeometryCorrection gc_main,
false, // boolean use_rig_offsets,
corr_rots_main, // Matrix [] rots,
null, // Matrix [][] deriv_rots,
null, // double [][] pXYderiv, // if not null, should be double[8][]
disp_dist_main, // used to correct 3D correlations
centerX,
centerY,
disparity_main); // + disparity_corr);
}
// acquisition time of the tiles centers in scanline times
if (ers_delay != null) {
for (int i = 0; i < quad_main; i++) ers_delay[0][i][nTile] = centersXY_main[i][1]-geometryCorrection_main.woi_tops[i];
...
@@ -2018,7 +2018,8 @@ public class TwoQuadCLT {
clt_parameters.gpu_woi_twidth,
clt_parameters.gpu_woi_theight);
GPUTileProcessor.TpTask [] tp_tasks = gPUTileProcessor.setFullFrameImages(
false, // boolean calc_offsets, // old way, now not needed with GPU calculation
twoi, // Rectangle woi,
clt_parameters.gpu_woi_round, // boolean round_woi,
(float) clt_parameters.disparity, // float target_disparity, // apply same disparity to all tiles
0xf, // int out_image, // from which tiles to generate image (currently 0/1)
@@ -2031,7 +2032,6 @@ public class TwoQuadCLT {
threadsMax, // final int threadsMax, // maximal number of threads to launch
debugLevel); // final int debugLevel)
// Optionally save offsets here?
// EyesisCorrectionParameters.CorrectionParameters ecp,
boolean save_ports_xy = false; // true; Same files as saved with the kernels
@@ -2087,6 +2087,8 @@ public class TwoQuadCLT {
tp_tasks);
gPUTileProcessor.setTextureIndices(
texture_indices);
gPUTileProcessor.setGeometryCorrection(quadCLT_main.getGeometryCorrection()); // once
gPUTileProcessor.setExtrinsicsVector(quadCLT_main.getGeometryCorrection().getCorrVector()); // for each new image
// TODO: calculate from the camera geometry?
double[][] port_offsets = { // used only in textures to scale differences
@@ -2099,7 +2101,20 @@ public class TwoQuadCLT {
int NREPEAT = 1; // 00;
System.out.println("\n------------ Running GPU "+NREPEAT+" times ----------------");
long startGPU=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execRotDerivs();
}
long startTasksSetup=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execSetTilesOffsets();
}
long startDirectConvert=System.nanoTime();
for (int i = 0; i < NREPEAT; i++ ) {
gPUTileProcessor.execConverCorrectTiles();
}
// run imclt;
long startIMCLT=System.nanoTime();
@@ -2145,18 +2160,26 @@ public class TwoQuadCLT {
long endTexturesRBGA = System.nanoTime();
long endGPUTime = System.nanoTime();
long rotDerivsTime= (startTasksSetup- startGPU) /NREPEAT;
long tasksSetupTime= (startDirectConvert- startTasksSetup) /NREPEAT;
long firstGPUTime= (startIMCLT- startDirectConvert) /NREPEAT;
long runImcltTime = (endImcltTime - startIMCLT) /NREPEAT;
long runCorr2DTime = (endCorr2d - startCorr2d) /NREPEAT;
long runTexturesTime = (endTextures - startTextures) /NREPEAT;
long runTexturesRBGATime = (endTexturesRBGA - startTexturesRBGA) /NREPEAT;
long runGPUTime = (endGPUTime - startGPU) /NREPEAT;
// run corr2d
System.out.println("\n------------ End of running GPU "+NREPEAT+" times ----------------");
System.out.println("GPU run time ="+ (runGPUTime * 1.0e-6)+"ms");
System.out.println(" - rot/derivs: "+(rotDerivsTime*1.0e-6)+"ms");
System.out.println(" - tasks setup: "+(tasksSetupTime*1.0e-6)+"ms");
System.out.println(" - direct conversion: "+(firstGPUTime*1.0e-6)+"ms");
System.out.println(" - imclt: "+(runImcltTime*1.0e-6)+"ms");
System.out.println(" - corr2D: "+(runCorr2DTime*1.0e-6)+"ms");
System.out.println(" - textures: "+(runTexturesTime*1.0e-6)+"ms");
System.out.println(" - RGBA: "+(runTexturesRBGATime*1.0e-6)+"ms");
// get data back from GPU
float [][][] iclt_fimg = new float [GPUTileProcessor.NUM_CAMS][][];
for (int ncam = 0; ncam < iclt_fimg.length; ncam++) {
...