updated GPU code

fa4f3beb · Andrey Filippov · 15cff9c7 · fa4f3beb · fa4f3beb · fa4f3beb
Commit fa4f3beb authored Nov 11, 2020 by Andrey Filippov
8 changed files
--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
@@ -7053,7 +7053,7 @@ private Panel panel1,
 			}
 			dpixels[i] = d;
 		}
-		if (disparity_max > 0) {
+		if (log_mode && (disparity_max > 0)) {
 			mn = 0.0;
 			double d = disparity_max;
 			if (d < 0.0) { //

--- a/src/main/java/com/elphel/imagej/tileprocessor/CLTPass3d.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/CLTPass3d.java
@@ -1019,24 +1019,9 @@ public class CLTPass3d{
 				double     step_threshold,
 				double     min_disparity,
 				double     max_disparity,
-//				double     strength_floor,
-//				double     strength_pow,
 				double     stBlurSigma,
 				boolean    smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 				MeasuredLayersFilterParameters mlfp,
-//				int        smplSide, //        = 2;      // Sample size (side of a square)
-//				int        smplNum, //         = 3;      // Number after removing worst
-//				double     smplRms, //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//				boolean    smplWnd,  // use window functions for the samples
-
-//				double     max_abs_tilt,  //  2.0;   // pix per tile
-//				double     max_rel_tilt,  //  0.2;   // (pix / disparity) per tile
-//				double     damp_tilt,     //  0.001; // Damp tilt to handle insufficient  (co-linear)data
-//				double     min_tilt_disp, //  4.0;   // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//				double     transition,    //  1.0;   // Mode transition range (between tilted and maximal disparity)
-//				int        far_mode,      //  1;     // Far objects filtering mode (0 - off, 1 - power of disparity)
-//				double     far_power,     //  3.0;   // Raise disparity to this power before averaging for far objects
-
 				int        measSel)
 		{
 			this.superTiles = new SuperTiles(
@@ -1046,23 +1031,9 @@ public class CLTPass3d{
 					step_threshold,
 					min_disparity,
 					max_disparity,
-//					strength_floor,
-//					strength_pow,
 					stBlurSigma,
 					smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 					mlfp,
-//					smplSide, //        = 2;      // Sample size (side of a square)
-//					smplNum, //         = 3;      // Number after removing worst
-//					smplRms, //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//					smplWnd,           // final boolean    smplWnd,  // use window functions for the samples
-//					max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//					max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//					damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//					min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//					transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//					far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//					far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
-//					true,          // boolean    null_if_none,
 					measSel);
 			return this.superTiles;
 		}
@@ -1072,19 +1043,6 @@ public class CLTPass3d{

 				boolean    smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 				MeasuredLayersFilterParameters mlfp,
-//				int        smplSide, //        = 2;      // Sample size (side of a square)
-//				int        smplNum,  //         = 3;      // Number after removing worst
-//				double     smplRms,  //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//				boolean    smplWnd,  // use window functions for the samples
-
-//	  			double     max_abs_tilt,  //  2.0;   // pix per tile
-//				double     max_rel_tilt,  //  0.2;   // (pix / disparity) per tile
-//				double     damp_tilt,     //  0.001; // Damp tilt to handle insufficient  (co-linear)data
-//				double     min_tilt_disp, //  4.0;   // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//				double     transition,    //  1.0;   // Mode transition range (between tilted and maximal disparity)
-//				int        far_mode,      //  1;     // Far objects filtering mode (0 - off, 1 - power of disparity)
-//				double     far_power,     //  3.0;   // Raise disparity to this power before averaging for far objects
-
 				int        measSel)
 		{
 			if (this.superTiles == null){
@@ -1096,19 +1054,6 @@ public class CLTPass3d{

 					smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 					mlfp,
-//					smplSide, //        = 2;      // Sample size (side of a square)
-//					smplNum,  //         = 3;      // Number after removing worst
-//					smplRms,  //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//					smplWnd,  // use window functions for the samples
-
-//					max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//					max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//					damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//					min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//					transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//					far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//					far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
-
 					measSel);
 		}


--- a/src/main/java/com/elphel/imagej/tileprocessor/SuperTiles.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/SuperTiles.java
@@ -104,24 +104,6 @@ public class SuperTiles{
 			double                  stBlurSigma,
 			boolean                 smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 			MeasuredLayersFilterParameters  mlfp,
-
-//			double                  strength_floor,
-//			double                  strength_pow,
-//			boolean                 smplMode, //        = true;   // Use sample mode (false - regular tile mode)
-//			int                     smplSide, //        = 2;      // Sample size (side of a square)
-//			int                     smplNum, //         = 3;      // Number after removing worst
-//			double                  smplRms, //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//			boolean                 smplWnd,  // use window functions for the samples
-
-//  			double     max_abs_tilt,  //  2.0;   // pix per tile
-//			double     max_rel_tilt,  //  0.2;   // (pix / disparity) per tile
-//			double     damp_tilt,     //  0.001; // Damp tilt to handle insufficient  (co-linear)data
-//			double     min_tilt_disp, //  4.0;   // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//			double     transition,    //  1.0;   // Mode transition range (between tilted and maximal disparity)
-//			int        far_mode,      //  1;     // Far objects filtering mode (0 - off, 1 - power of disparity)
-//			double     far_power,     //  3.0;   // Raise disparity to this power before averaging for far objects
-//			boolean    null_if_none,
-
 			int                     measSel)
 	{
 		this.cltPass3d =           cltPass3d;
@@ -135,21 +117,6 @@ public class SuperTiles{
 		this.smplMode        = smplMode;   // Use sample mode (false - regular tile mode)
 		this.mlfp = mlfp.clone();

-//		this.strength_floor = strength_floor;
-//		this.strength_pow =   strength_pow;
-//		this.smplSide        = smplSide;   // Sample size (side of a square)
-//		this.smplNum         = smplNum;    // Number after removing worst
-//		this.smplRms         = smplRms;    // Maximal RMS of the remaining tiles in a sample
-//		this.max_abs_tilt =  max_abs_tilt;
-//		this.max_rel_tilt =  max_rel_tilt;
-//		this.damp_tilt =     damp_tilt;
-//		this.min_tilt_disp = min_tilt_disp;
-//		this.transition =    transition;
-//		this.far_mode =      far_mode;
-//		this.far_power =     far_power;
-//		this.smplWnd         = smplWnd;    // Use window functions for the samples
-
-
 		this.measSel =        measSel;
 		this.step_threshold_near = this.step_threshold_far * step_near / this.step_far ;
 		this.bin_far =             this.step_threshold_far / this.step_far;
@@ -197,18 +164,6 @@ public class SuperTiles{
 				null,       // boolean [][] tile_sel, // null  or per-measurement layer, per-tile selection. For each layer null - do not use, {} - use all
 				smplMode,   // final boolean    smplMode, //        = true;   // Use sample mode (false - regular tile mode)
 				mlfp,
-//				smplSide,   // final int        smplSide, //        = 2;      // Sample size (side of a square)
-//				smplNum,    // final int        smplNum,  //         = 3;      // Number after removing worst
-//				smplRms,    // final double     smplRms,  //         = 0.1;    // Maximal RMS of the remaining tiles in a sample
-//				smplWnd,    // final boolean    smplWnd,  // use window functions for the samples
-
-//				max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//				max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//				damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//				min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//				transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//				far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//				far_power,     //    1.0; // Raise disparity to this power before averaging for far objects

 				measSel);   // calculate and blur supertiles (for all, not just selected?)
 		if (tileProcessor.globalDebugLevel > 0){

--- a/src/main/java/com/elphel/imagej/tileprocessor/TileProcessor.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TileProcessor.java
@@ -6796,22 +6796,9 @@ ImageDtt.startAndJoin(threads);
 					clt_parameters.stStepThreshold,  // double     step_threshold,
 					clt_parameters.stMinDisparity,   // double     min_disparity,
 					clt_parameters.grow_disp_max,   // double     max_disparity,
-//					clt_parameters.stFloor,          // double     strength_floor,
-//					clt_parameters.stPow,            // double     strength_pow,
 					0.0, // NO BLUR double     stBlurSigma)
 					false, //clt_parameters.stSmplMode,  // Use sample mode (false - regular tile mode)
 					clt_parameters.mlfp,         // Filter parameters
-//					clt_parameters.stSmplSide,  // Sample size (side of a square)
-//					clt_parameters.stSmplNum,   // Number after removing worst
-//					clt_parameters.stSmplRms,   // Maximal RMS of the remaining tiles in a sample
-//					clt_parameters.stSmplWnd,   // boolean                 smplWnd,  // use window functions for the samples
-//					clt_parameters.fs_max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//					clt_parameters.fs_max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//					clt_parameters.fs_damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//					clt_parameters.fs_min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//					clt_parameters.fs_transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//					clt_parameters.fs_far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//					clt_parameters.fs_far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
 					clt_parameters.stMeasSel); // bitmask of the selected measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert
 			dbg_hist[0] = scan_prev.getSuperTiles().showDisparityHistogram();
 			scan_prev.setSuperTiles(
@@ -6820,22 +6807,9 @@ ImageDtt.startAndJoin(threads);
 					clt_parameters.stStepThreshold,  // double     step_threshold,
 					clt_parameters.stMinDisparity,   // double     min_disparity,
 					clt_parameters.grow_disp_max,   // double     max_disparity,
-//					clt_parameters.stFloor,          // double     strength_floor,
-//					clt_parameters.stPow,            // double     strength_pow,
 					0.0, // NO BLUR double     stBlurSigma)
 					clt_parameters.stSmplMode,  // Use sample mode (false - regular tile mode)
 					clt_parameters.mlfp,         // Filter parameters
-//					clt_parameters.stSmplSide,  // Sample size (side of a square)
-//					clt_parameters.stSmplNum,   // Number after removing worst
-//					clt_parameters.stSmplRms,   // Maximal RMS of the remaining tiles in a sample
-//					clt_parameters.stSmplWnd,   // boolean                 smplWnd,  // use window functions for the samples
-//					clt_parameters.fs_max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//					clt_parameters.fs_max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//					clt_parameters.fs_damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//					clt_parameters.fs_min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//					clt_parameters.fs_transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//					clt_parameters.fs_far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//					clt_parameters.fs_far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
 					clt_parameters.stMeasSel); // bitmask of the selected measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert
 			dbg_hist[1] = scan_prev.getSuperTiles().showDisparityHistogram();
 		}
@@ -6847,22 +6821,9 @@ ImageDtt.startAndJoin(threads);
 				clt_parameters.stStepThreshold,  // double     step_threshold,
 				clt_parameters.stMinDisparity,   // double     min_disparity,
 				clt_parameters.grow_disp_max,   // double     max_disparity,
-//				clt_parameters.stFloor,          // double     strength_floor,
-//				clt_parameters.stPow,            // double     strength_pow,
 				clt_parameters.stSigma,          // with blur double     stBlurSigma)
 				false, //clt_parameters.stSmplMode,  // Use sample mode (false - regular tile mode)
 				clt_parameters.mlfp,         // Filter parameters
-//				clt_parameters.stSmplSide,  // Sample size (side of a square)
-//				clt_parameters.stSmplNum,   // Number after removing worst
-//				clt_parameters.stSmplRms,   // Maximal RMS of the remaining tiles in a sample
-//				clt_parameters.stSmplWnd,   // boolean                 smplWnd,  // use window functions for the samples
-//				clt_parameters.fs_max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//				clt_parameters.fs_max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//				clt_parameters.fs_damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//				clt_parameters.fs_min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//				clt_parameters.fs_transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//				clt_parameters.fs_far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//				clt_parameters.fs_far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
 				clt_parameters.stMeasSel); // bitmask of the selected measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert
 		if (show_st) { // otherwise only blurred version is needed
 			dbg_hist[2] = scan_prev.getSuperTiles().showDisparityHistogram();
@@ -6884,22 +6845,9 @@ ImageDtt.startAndJoin(threads);
 					clt_parameters.stStepThreshold,  // double     step_threshold,
 					clt_parameters.stMinDisparity,   // double     min_disparity,
 					clt_parameters.grow_disp_max,   // double     max_disparity,
-//					clt_parameters.stFloor,          // double     strength_floor,
-//					clt_parameters.stPow,            // double     strength_pow,
 					0.0, // NO BLUR double     stBlurSigma)
 					clt_parameters.stSmplMode,  // Use sample mode (false - regular tile mode)
 					clt_parameters.mlfp,         // Filter parameters
-//					clt_parameters.stSmplSide,  // Sample size (side of a square)
-//					clt_parameters.stSmplNum,   // Number after removing worst
-//					clt_parameters.stSmplRms,   // Maximal RMS of the remaining tiles in a sample
-//					clt_parameters.stSmplWnd,   // boolean                 smplWnd,  // use window functions for the samples
-//					clt_parameters.fs_max_abs_tilt,  // 2.0; // Maximal absolute tilt in pixels/tile
-//					clt_parameters.fs_max_rel_tilt,  // 0.2; // Maximal relative tilt in pixels/tile/disparity
-//					clt_parameters.fs_damp_tilt,     //    0.001; // Damp tilt to handle insufficient  (co-linear)data
-//					clt_parameters.fs_min_tilt_disp, // 4.0; // Disparity switch between filtering modes - near objects use tilts, far - use max disparity
-//					clt_parameters.fs_transition,    // 1.0; // Mode transition range (between tilted and maximal disparity)
-//					clt_parameters.fs_far_mode,      //     1;   // Far objects filtering mode (0 - off, 1 - power of disparity)
-//					clt_parameters.fs_far_power,     //    1.0; // Raise disparity to this power before averaging for far objects
 					clt_parameters.stMeasSel); // bitmask of the selected measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert
 		}

@@ -7063,22 +7011,6 @@ ImageDtt.startAndJoin(threads);
 				debugLevel,                  // final int        debugLevel)
 				clt_parameters.tileX,
 				clt_parameters.tileY);
-/*
-		if (clt_parameters.plSplitApply) {
-			while (true) {
-				int num_added = 0;
-				num_added += st.fillSquares();
-				if (debugLevel > -1) {
-					System.out.println("after fillSquares() added "+num_added);
-				}
-				num_added += st.cutCorners();
-				if (debugLevel > -1) {
-					System.out.println("after plCutCorners() added (cumulative) "+num_added);
-				}
-				if (num_added == 0) break;
-			}
-		}
-*/
 		double [][][]  dispStrength = st.getDisparityStrengths(
 				clt_parameters.stMeasSel); // int        stMeasSel) //            = 1;      // Select measurements for supertiles : +1 - combo, +2 - quad +4 - hor +8 - vert)
 		boolean [][] tileSel =  st.getMeasurementSelections(
@@ -7199,25 +7131,6 @@ ImageDtt.startAndJoin(threads);
 					clt_parameters.tileY);
 		} // if (clt_parameters.plSplitApply)

-/*
-		while (true) {
-			int num_added = 0;
-			if (clt_parameters.plFillSquares){
-				num_added += st.fillSquares();
-			}
-			if (debugLevel > -1) {
-				System.out.println("after fillSquares() added "+num_added);
-			}
-			if (clt_parameters.plCutCorners){
-				num_added += st.cutCorners();
-			}
-			if (debugLevel > -1) {
-				System.out.println("after plCutCorners() added (cumulative) "+num_added);
-			}
-			if (num_added == 0) break;
-		}
-
-*/

 		int max_num_tries = 20;
 		if (clt_parameters.plIterations > 0) {

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -403,6 +403,16 @@ __constant__ float lpf_corr[64]={ // modify if needed
 				0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
 		};

+__constant__ float LoG_corr[64]={ // high-pass filter coefficients applied to the correlation inputs so the products fit into float range; currently all 1.0f (multiplying by them leaves the data unchanged) - modify if needed
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f,
+				1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f, 1.00000000f
+		};

 __constant__ int pairs[6][2]={
 		{0, 1},
@@ -1086,6 +1096,22 @@ extern "C" __global__ void correlate2D_inner(
 		float * clt_tile1i = clt_tile1 + threadIdx.x;
 		float * clt_tile2i = clt_tile2 + threadIdx.x;
 #pragma unroll
+#define USE_LOG
+#ifdef 	USE_LOG
+		// Apply high-pass filter to correlation inputs to reduce dynamic range before multiplication
+		for (int q = 0; q < 4; q++){
+    		float *log = LoG_corr + threadIdx.x;
+			for (int i = 0; i < DTT_SIZE; i++){ // copy DTT_SIZE rows of one quadrant (the outer q loop covers the 4 quadrants)
+				*clt_tile1i= (*gpu_tile1) * (*log);
+				*clt_tile2i= (*gpu_tile2) * (*log);
+				clt_tile1i += DTT_SIZE1;
+				clt_tile2i += DTT_SIZE1;
+				gpu_tile1 += DTT_SIZE;
+				gpu_tile2 += DTT_SIZE;
+				log +=       DTT_SIZE;
+			}
+		}
+#else
 		for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
 			*clt_tile1i= *gpu_tile1;
 			*clt_tile2i= *gpu_tile2;
@@ -1094,6 +1120,7 @@ extern "C" __global__ void correlate2D_inner(
 			gpu_tile1 += DTT_SIZE;
 			gpu_tile2 += DTT_SIZE;
    	}
+#endif //USE_LOG
 		__syncthreads();
 #ifdef DBG_TILE
 #ifdef DEBUG6

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
@@ -281,6 +281,32 @@ extern "C" __global__ void calc_rot_deriv(
 }


+extern "C" __global__ void calculate_tiles_offsets( // wrapper kernel: launches get_tiles_offsets from device code for the whole task list
+		struct tp_task     * gpu_tasks,               // task list, forwarded unchanged to get_tiles_offsets
+		int                  num_tiles,          // number of tiles in task list (determines the child grid size)
+		struct gc          * gpu_geometry_correction, // forwarded unchanged to get_tiles_offsets
+		struct corr_vector * gpu_correction_vector,   // forwarded unchanged to get_tiles_offsets
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv)                 // forwarded unchanged to get_tiles_offsets
+{
+	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1); // child block: NUM_CAMS x TILES_PER_BLOCK_GEOM threads
+	dim3 grid_geom   ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1); // enough child blocks to cover num_tiles (rounded up)
+	if (threadIdx.x == 0) { // only threads with threadIdx.x == 0 launch the child grid. NOTE(review): original note said "always 1" - presumably this wrapper is meant to be launched with a single thread; confirm against the caller
+    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+    			gpu_tasks,               // struct tp_task     * gpu_tasks,
+				num_tiles,               // int                  num_tiles,          // number of tiles in task list
+				gpu_geometry_correction, //	struct gc          * gpu_geometry_correction,
+				gpu_correction_vector,   //	struct corr_vector * gpu_correction_vector,
+				gpu_rByRDist,            //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
+
+	}
+//	__syncthreads();// __syncwarp();
+//	cudaDeviceSynchronize();
+//	cudaDeviceSynchronize();
+}
+
+
 /*
 * blockDim.x = NUM_CAMS
 * blockDim.y = TILES_PER_BLOCK_GEOM
@@ -295,12 +321,7 @@ extern "C" __global__ void get_tiles_offsets(
 		trot_deriv   * gpu_rot_deriv)
 {
 	int task_num = blockIdx.x * blockDim.y + threadIdx.y; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
-	if (task_num >= num_tiles){
-		return;
-	}
 	int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
-	int ncam = threadIdx.x;
-	// threadIdx.x - numcam, used for per-camera
 	__shared__ struct gc geometry_correction;
 	__shared__ float rByRDist [RBYRDIST_LEN];
 	__shared__ struct corr_vector extrinsic_corr;
@@ -355,6 +376,10 @@ extern "C" __global__ void get_tiles_offsets(
 		}
 	}
 	__syncthreads();
+	int ncam = threadIdx.x;
+	if (task_num >= num_tiles){
+		return;
+	}
 	int imu_exists = // todo - calculate once with rot_deriv?
 			(extrinsic_corr.imu_rot[0] != 0.0) ||
 			(extrinsic_corr.imu_rot[1] != 0.0) ||
@@ -418,7 +443,7 @@ extern "C" __global__ void get_tiles_offsets(
 	xyz[0] =  SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
 	xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
 	// next radial distortion coefficients are for this, not master camera (may be the same)
-//	geometry_correction.rad_coeff[i];
+	//	geometry_correction.rad_coeff[i];
 	float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
 	float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;

@@ -486,7 +511,7 @@ extern "C" __global__ void get_tiles_offsets(
 	float pYid = pYci * rD2rND;
 	pXY[0] =  pXid + geometry_correction.pXY0[ncam][0];
 	pXY[1] =  pYid + geometry_correction.pXY0[ncam][1];
-// new for ERS
+	// new for ERS
 	pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam];
 	__syncthreads();
 	// Each thread re-calculate same sum
@@ -511,9 +536,6 @@ extern "C" __global__ void get_tiles_offsets(
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21

-
-
-	//	float rvi[3];
 	float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
 	float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
 	float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
@@ -547,7 +569,7 @@ extern "C" __global__ void get_tiles_offsets(

 	float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
 	float dpXci_pYci_imu_lin[2][3];
-/*
+	/*
 				double [][] add0 = {
 						{-rXY[i][0],  rXY[i][1], 0.0},
 						{-rXY[i][1], -rXY[i][0], 0.0},
@@ -570,12 +592,11 @@ extern "C" __global__ void get_tiles_offsets(
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21

-
 	// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
 	// unity vector in the direction of radius
 	float c_dist = pXci/rNDi;
 	float s_dist = pYci/rNDi;
-//#undef NVRTC_BUG
+	//#undef NVRTC_BUG
 	float drD2rND_dri = 0.0;
 	{
 		float rri = 1.0;
@@ -618,28 +639,16 @@ extern "C" __global__ void get_tiles_offsets(
 	__syncthreads();// __syncwarp();
 #endif // DEBUG21

-
 	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
 	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
 	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
 	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];

-//	imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
-//	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
-//	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
-// ERS linear does not yet use per-port rotations, probably not needed
+	//	imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
+	//	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
+	//	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
+	// ERS linear does not yet use per-port rotations, probably not needed
 	if (imu_exists){
-		/*
-		float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
-		float ers_Xci =	delta_t * (
-				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
-				dpXci_dazimuth * extrinsic_corr.imu_rot[1]  +
-				dpXci_droll * extrinsic_corr.imu_rot[2]);
-		float ers_Yci =	delta_t* (
-				dpYci_dtilt * extrinsic_corr.imu_rot[0] +
-				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
-				dpYci_droll * extrinsic_corr.imu_rot[2]);
-		 */
 		float ers_x =
 				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
 				dpXci_dazimuth * extrinsic_corr.imu_rot[1]  +
@@ -649,11 +658,8 @@ extern "C" __global__ void get_tiles_offsets(
 				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
 				dpYci_droll * extrinsic_corr.imu_rot[2];

-
-
 #ifdef DEBUG21
 		if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
-//			printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
 			printf("ers_x = %f,  ers_y = %f\n", ers_x, ers_y);
 		}
 		__syncthreads();// __syncwarp();
@@ -665,15 +671,8 @@ extern "C" __global__ void get_tiles_offsets(
 			dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
 			dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
 			dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
-			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
-			/*
-			ers_Xci += delta_t* (
-					dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
-					dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
-			ers_Yci += delta_t* (
-					dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
-					dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
-			*/
+////			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+			dpXci_pYci_imu_lin[1][2] =  -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
 			ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
 					dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
 			ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
@@ -700,8 +699,6 @@ extern "C" __global__ void get_tiles_offsets(
 	// copy results to global memory pXY,  disp_dist
 	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
 	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
-
-
 }

 extern "C" __global__ void calcReverseDistortionTable(

--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -149,6 +149,15 @@ extern "C" __global__ void get_tiles_offsets(
 		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
 		trot_deriv   * gpu_rot_deriv);

+extern "C" __global__ void calculate_tiles_offsets( // wrapper kernel that launches get_tiles_offsets (body in geometry_correction.cu)
+		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task list
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv);
+
+
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,