reconciled geometry with java

a6844c60 · Andrey Filippov · dba4dfce · a6844c60 · a6844c60 · a6844c60
Commit a6844c60 authored Apr 13, 2020 by Andrey Filippov
6 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -110,14 +110,6 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
 #define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))
-// Use CORR_OUT_RAD for the correlation output
-//#define DBG_TILE_X     40
-//#define DBG_TILE_Y     80
-#define DBG_TILE_X     161 // 49
-#define DBG_TILE_Y     111 // 66
-#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
-#undef DBG_MARK_DBG_TILE
 //56494
 // struct tp_task
@@ -1150,6 +1142,7 @@ __global__ void generate_RBGA(
 			int                height, // <= TILESY, use for faster processing of LWIR images
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
 			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction
@@ -1900,11 +1893,11 @@ __global__ void textures_gen(
 #endif // ifdef USE_textures_gen
 extern "C"
 __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
 		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -2006,14 +1999,21 @@ __global__ void textures_accumulate(
 		}
 		__syncthreads();// __syncwarp();
 #endif
-		// perform idct
+#ifdef DBG_TILE		// perform idct
 		imclt8threads(
 				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
 				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 				mclt_tile,  // float * mclt_tile )
 				((tile_num == DBG_TILE)  && (threadIdx.x == 0)));
+#else
+		imclt8threads(
+				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
+				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+				mclt_tile,  // float * mclt_tile )
+				0);
+#endif
 		__syncthreads();// __syncwarp();
 #ifdef DEBUG7
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\ntextures_gen mclt color = %d\n",color);
@@ -2024,6 +2024,7 @@ __global__ void textures_accumulate(
 		__syncthreads();// __syncwarp();
 #endif
 		if (colors > 1) {
+#ifdef DBG_TILE
 			debayer_shot(
 					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
 					min_shot,    // float     min_shot,   // 10.0
@@ -2032,6 +2033,16 @@ __global__ void textures_accumulate(
 					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
 					mclt_tmp,    // float   * mclt_tmp,
 					((tile_num == DBG_TILE)  && (threadIdx.x == 0))); // int debug);
+#else
+			debayer_shot(
+					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
+					min_shot,    // float     min_shot,   // 10.0
+					scale_shot,  // float     scale_shot, // 3.0 (0.0 for mono)
+					mclt_tile,   // float   * mclt_src,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_tmp,    // float   * mclt_tmp,
+					0); // int debug);
+#endif
 			__syncthreads();// __syncwarp();
 		} else {
 			// copy? - no, just remember to use mclt_tile, not mclt_dst
@@ -2105,6 +2116,7 @@ __global__ void textures_accumulate(
 	__syncthreads();// __syncwarp();
 #endif
 //	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
+#ifdef DBG_TILE
 	tile_combine_rgba(
 			colors,                    // int     colors,        // number of colors
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
@@ -2120,7 +2132,23 @@ __global__ void textures_accumulate(
 			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
 			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
 			(tile_num == DBG_TILE) );  //int     debug );
+#else
+	tile_combine_rgba(
+			colors,                    // int     colors,        // number of colors
+			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
+			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
+			(float *) shr1.rgbaw,      // float * rgba,          // result
+			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
+			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
+			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
+			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
+			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differs much from the average
+			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
+			0);  //int     debug );
+#endif
 // return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
 // float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 //	size_t texture_tile_offset = + tile_indx * texture_stride;

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -80,12 +80,12 @@ extern "C" __global__ void clear_texture_rbga(
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 extern "C" __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+		// TODO: use geometry_correction rXY !
+		float           * gpu_port_offsets,   // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
 		float             min_shot,           // 10.0
@@ -127,6 +127,7 @@ __global__ void generate_RBGA(
 			int                height, // <= TILESY, use for faster processing of LWIR images
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
 			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -47,7 +47,8 @@
 #define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 #define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 #define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
-#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
 #define DBG_CAM 3
@@ -115,7 +116,7 @@ __constant__ int mm_seq [3][3][3]={
 				{-1,-1,-1} // do nothing
 		}};
+#if 0
 __device__ float rot_matrices       [NUM_CAMS][3][3];
 //__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
@@ -309,7 +310,7 @@ extern "C" __global__ void calc_rot_matrices(
 }
+#endif
 __constant__ int offset_rots =     0;                   //0
 __constant__ int offset_derivs =   1;                   // 1..4 // should be next
 __constant__ int offset_matrices = 5;   // 5..11
@@ -452,6 +453,12 @@ extern "C" __global__ void calc_rot_deriv(
 		gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
 	}
 	__syncthreads();
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+			printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
 // All done - read/verify all arrays
@@ -468,7 +475,8 @@ extern "C" __global__ void get_tiles_offsets(
 		int                  num_tiles,          // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
-		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+		float *              gpu_rByRDist,      // length should match RBYRDIST_LEN
+		union trot_deriv   * gpu_rot_deriv)
 {
 //	int task_num = blockIdx.x * blockDim.x + threadIdx.x; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
 	int task_num = blockIdx.x * blockDim.y + threadIdx.y; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
@@ -478,8 +486,8 @@ extern "C" __global__ void get_tiles_offsets(
 	__shared__ struct gc geometry_correction;
 	__shared__ float rByRDist [RBYRDIST_LEN];
 	__shared__ struct corr_vector extrinsic_corr;
-	__shared__ float rots[NUM_CAMS][3][3];
+	__shared__ trot_deriv rot_deriv;
-	__shared__ float pXY[NUM_CAMS][2]; // result to be copied to task
+	float pXY[2]; // result to be copied to task
 	// copy data common to all threads
 	{
 		float * gcp_local =  (float *) &geometry_correction;
@@ -515,31 +523,37 @@ extern "C" __global__ void get_tiles_offsets(
 			offset += THREADS_PER_BLOCK_GEOM;
 		}
 	}
-	// copy rotational matrices
+	// copy rotational  matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
-	//	__shared__ float rots[NUM_CAMS][3][3];
-	//__device__ float rot_matrices [NUM_CAMS][3][3];
 	{
-		float * rots_local =  (float *) rots;
+		float * rots_local =  (float *) &rot_deriv;
-		float * rots_global = (float *) rot_matrices;
+		float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
 		int offset = thread_xy;
 		for (int i = 0; i < CYCLES_COPY_ROTS; i++){
-			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
+			if (offset < sizeof(trot_deriv)/sizeof(float)) {
 				*(rots_local + offset) = *(rots_global + offset);
 			}
 			offset += THREADS_PER_BLOCK_GEOM;
 		}
 	}
 	__syncthreads();
+	int imu_exists = // todo - calculate once with rot_deriv?
-#ifdef DEBUG20
+			(extrinsic_corr.imu_rot[0] != 0.0) ||
-	if ((threadIdx.x == 0)  && ( blockIdx.x == 0)){
+			(extrinsic_corr.imu_rot[1] != 0.0) ||
-		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
+			(extrinsic_corr.imu_rot[2] != 0.0) ||
+			(extrinsic_corr.imu_move[0] != 0.0) ||
+			(extrinsic_corr.imu_move[1] != 0.0) ||
+			(extrinsic_corr.imu_move[2] != 0.0);
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("\nTile = %d, camera= %d\n", task_num, ncam);
+		printf("\nget_tiles_offsets() threadIdx.x = %d,  threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
 		printGeometryCorrection(&geometry_correction);
 		printExtrinsicCorrection(&extrinsic_corr);
 	}
 	__syncthreads();// __syncwarp();
-#endif // DEBUG20
+#endif // DEBUG21
 	//		String dbg_s = corr_vector.toString();
 	/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
 	 * 1) unapply common distortion (maybe for different - master camera)
@@ -561,11 +575,14 @@ extern "C" __global__ void get_tiles_offsets(
 	float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
 	float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
-	float rXY [NUM_CAMS][2];
+//	float rXY [NUM_CAMS][2];
+	float rXY [2];
 //	for (int i = 0; i < NUM_CAMS;i++){
-	rXY[ncam][0] = geometry_correction.rXY[ncam][0];
+//	rXY[ncam][0] = geometry_correction.rXY[ncam][0];
-	rXY[ncam][1] = geometry_correction.rXY[ncam][1];
+//	rXY[ncam][1] = geometry_correction.rXY[ncam][1];
+	rXY[0] = geometry_correction.rXY[ncam][0];
+	rXY[1] = geometry_correction.rXY[ncam][1];
 //	}
 	float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
@@ -573,7 +590,8 @@ extern "C" __global__ void get_tiles_offsets(
 	float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
 	float pYc = pYcd * rND2R; // in pixels
 	float xyz [3]; // getWorldCoordinates
-	xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius / (disparity * 0.001*geometry_correction.pixelSize); // "+" - near, "-" far
+	xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
+			(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
 	xyz[0] =  SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
 	xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
 	// next radial distortion coefficients are for this, not master camera (may be the same)
@@ -581,23 +599,40 @@ extern "C" __global__ void get_tiles_offsets(
 	float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
 	float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
-//	for (int ncam = 0; ncam < NUM_CAMS; ncam++){
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("\nTile = %d, camera= %d\n", task_num, ncam);
+		printf("tileX = %d,  tileY = %d\n", tileX, tileY);
+		printf("px = %f,  py = %f\n", px, py);
+		printf("pXcd = %f,  pYcd = %f\n", pXcd, pYcd);
+		printf("rXY[0] = %f,  rXY[1] = %f\n", rXY[0], rXY[1]);
+		printf("rD = %f,  rND2R = %f\n", rD, rND2R);
+		printf("pXc = %f,  pYc = %f\n", pXc, pYc);
+		printf("fl_pix = %f,  ri_scale = %f\n", fl_pix, ri_scale);
+		printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+	// above is common code, below - per camera (was a loop in Java, here individual threads): //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
 			// non-distorted XY of the shifted location of the individual sensor
 	// -------------- Each camera calculated by its own thread ----------------
-	float pXci0 = pXc - disparity *  rXY[ncam][0]; // in pixels
+	float pXci0 = pXc - disparity *  rXY[0]; // [ncam][0]; // in pixels
-	float pYci0 = pYc - disparity *  rXY[ncam][1];
+	float pYci0 = pYc - disparity *  rXY[1]; // [ncam][1];
 	// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
 	// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
 	///		double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
 	///		Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
 	// Apply port-individual combined rotation/zoom matrix
 	///		Matrix rvi = rots[i].times(vi);
 	float rvi[3];
 #pragma unroll
 	for (int j = 0; j< 3; j++){
-		rvi[j] = rots[ncam][j][0] * pXci0 + rots[ncam][j][1] * pYci0 + rots[ncam][j][2] * fl_pix;
+		rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
 	}
 	// get back to the projection plane by normalizing vector
 	float norm_z = fl_pix/rvi[2];
@@ -619,66 +654,93 @@ extern "C" __global__ void get_tiles_offsets(
 	// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
 	float pXid = pXci * rD2rND;
 	float pYid = pYci * rD2rND;
-	pXY[ncam][0] =  pXid + geometry_correction.pXY0[ncam][0];
+	pXY[0] =  pXid + geometry_correction.pXY0[ncam][0];
-	pXY[ncam][1] =  pYid + geometry_correction.pXY0[ncam][1];
+	pXY[1] =  pYid + geometry_correction.pXY0[ncam][1];
+	// used when calculating derivatives, TODO: combine calculations !
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("pXci0 = %f,  pYci0 = %f\n", pXci0, pYci0);
+		printf("rvi[0] = %f,  rvi[1] = %f,  rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
+		printf("norm_z = %f,  pXci = %f,  pYci = %f\n", norm_z, pXci, pYci);
+		printf("rNDi = %f,  ri = %f\n", rNDi, ri);
+		printf("rD2rND = %f\n", rD2rND);
+		printf("pXid = %f,  pYid = %f\n", pXid, pYid);
+		printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+	//	float rvi[3];
+	float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
+	float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
+	float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
+#pragma unroll
+	for (int j = 0; j< 3; j++){
+//		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  rvi[0] + rot_deriv.d_daz[ncam][j][1] *  rvi[1] + rot_deriv.d_daz[ncam][j][2] *  rvi[2];
+//		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
+//		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
+		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  pXci0 + rot_deriv.d_daz[ncam][j][1] *  pYci0 + rot_deriv.d_daz[ncam][j][2] *  fl_pix;
+		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
+		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
+	}
+//			double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
+	float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
+	float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
+	float dpXci_dtilt =    drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
+	float dpYci_dtilt =    drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
+	float dpXci_droll =    drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
+	float dpYci_droll =    drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("drvi_daz[0] = %f,  drvi_daz[1] = %f,  drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
+		printf("drvi_dtl[0] = %f,  drvi_dtl[1] = %f,  drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
+		printf("drvi_drl[0] = %f,  drvi_drl[1] = %f,  drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
+		printf("dpXci_dazimuth = %f,  dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
+		printf("dpXci_dtilt = %f,     dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
+		printf("dpXci_droll = %f,     dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
-//	}//for (int i = 0; i < NUM_CAMS; i++){
+	float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
+	float dpXci_pYci_imu_lin[2][3];
 /*
+ 				double [][] add0 = {
-			// used when calculating derivatives, TODO: combine calculations !
-			double drD2rND_dri = 0.0;
-			Matrix drvi_daz = null;
-			Matrix drvi_dtl = null;
-			Matrix drvi_drl = null;
-			double dpXci_dazimuth = 0.0;
-			double dpYci_dazimuth = 0.0;
-			double dpXci_dtilt =    0.0;
-			double dpYci_dtilt =    0.0;
-			double dpXci_droll =    0.0;
-			double dpYci_droll =    0.0;
-			if ((disp_dist != null) || (pXYderiv != null)) {
-				rri = 1.0;
-				for (int j = 0; j < rad_coeff.length; j++){
-					drD2rND_dri += rad_coeff[j] * (j+1) * rri;
-					rri *= ri;
-				}
-				if (deriv_rots != null) {
-					// needed for derivatives and IMU
-					drvi_daz = deriv_rots[i][0].times(vi);
-					drvi_dtl = deriv_rots[i][1].times(vi);
-					drvi_drl = deriv_rots[i][2].times(vi);
-					dpXci_dazimuth = drvi_daz.get(0, 0) * norm_z - pXci * drvi_daz.get(2, 0) / rvi.get(2, 0);
-					dpYci_dazimuth = drvi_daz.get(1, 0) * norm_z - pYci * drvi_daz.get(2, 0) / rvi.get(2, 0);
-					dpXci_dtilt =    drvi_dtl.get(0, 0) * norm_z - pXci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
-					dpYci_dtilt =    drvi_dtl.get(1, 0) * norm_z - pYci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
-					dpXci_droll =    drvi_drl.get(0, 0) * norm_z - pXci * drvi_drl.get(2, 0) / rvi.get(2, 0);
-					dpYci_droll =    drvi_drl.get(1, 0) * norm_z - pYci * drvi_drl.get(2, 0) / rvi.get(2, 0);
-				}
-			}
-			double delta_t = 0.0;
-			double [] imu =  null;
-			double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
-			if (disp_dist != null) {
-				disp_dist[i] =   new double [4]; // dx/d_disp, dx_d_ccw_disp
-				// Not clear - what should be in Z direction before rotation here?
-				double [][] add0 = {
 						{-rXY[i][0],  rXY[i][1], 0.0},
 						{-rXY[i][1], -rXY[i][0], 0.0},
-						{ 0.0,                     0.0,                    0.0}}; // what is last element???
+						{ 0.0,        0.0,       0.0}}; // what is last element???
 				Matrix dd0 = new Matrix(add0);
 				Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
-////				Matrix dd1 = dd0.getMatrix(0, 1,0,1); // get top left 2x2 sub-matrix
-				// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
-				// unity vector in the direction of radius
-				double c_dist = pXci/rNDi;
-				double s_dist = pYci/rNDi;
+ */
+	float dd1[2][2];// get top left 2x2 sub-matrix
+	dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
+	dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
+	dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
+	dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("dd1[0][0] = %f,  dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
+		printf("dd1[1][0] = %f,  dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+	// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
+	// unit vector in the direction of radius
+	float c_dist = pXci/rNDi;
+	float s_dist = pYci/rNDi;
+/*
 				double [][] arot2= {
 						{c_dist, s_dist},
 						{-s_dist, c_dist}};
@@ -696,43 +758,112 @@ extern "C" __global__ void get_tiles_offsets(
 				disp_dist[i][2] =   dd2.get(1, 0); // d_py/d_disp
 				disp_dist[i][3] =   dd2.get(1, 1);
-				imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
+ */
-				// ERS linear does not yet use per-port rotations, probably not needed
-//				double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
-				if ((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0) ||(imu[3] != 0.0) ||(imu[4] != 0.0) ||(imu[5] != 0.0)) {
-					delta_t = dd2.get(1, 0) * disparity * line_time; // positive for top cameras, negative - for bottom
-					double ers_Xci = delta_t* (dpXci_dtilt * imu[0] + dpXci_dazimuth * imu[1]  + dpXci_droll * imu[2]);
-					double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1]  + dpYci_droll * imu[2]);
-					if (xyz != null) {
-						double k = SCENE_UNITS_SCALE * this.disparityRadius;
-						double wdisparity = disparity;
-						double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
-						dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
-						dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
-						dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
-						dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
-						ers_Xci += delta_t* (dpXci_pYci_imu_lin[0][0] * imu[3] + dpXci_pYci_imu_lin[0][2] * imu[5]);
-						ers_Yci += delta_t* (dpXci_pYci_imu_lin[1][1] * imu[4] + dpXci_pYci_imu_lin[1][2] * imu[5]);
-					}
-					pXY[i][0] +=  ers_Xci * rD2rND; // added correction to pixel X
-					pXY[i][1] +=  ers_Yci * rD2rND; // added correction to pixel Y
-				} else {
-					imu = null;
-				}
+	float drD2rND_dri = 0.0;
+	{
+		float rri = 1.0;
+#pragma unroll
+		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+			drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
+			rri *= ri;
+		}
+	}
+	float scale_distort00 = rD2rND + ri* drD2rND_dri;
+	float scale_distort11 = rD2rND;
+//	float rot2Xdd1[2][2];
+//	rot2Xdd1[0][0] =  c_dist * dd1[0][0] + s_dist * dd1[1][0];
+//	rot2Xdd1[0][1] =  c_dist * dd1[0][1] + s_dist * dd1[1][1];
+//	rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
+//	rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
+	float scale_distortXrot2Xdd1[2][2];
+	scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
+	scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
+	scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
+	scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
+	disp_dist[0] =    c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
+	disp_dist[1] =    c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
+	disp_dist[2] =    s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
+	disp_dist[3] =    s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("scale_distortXrot2Xdd1[0][0] = %f,  scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
+		printf("scale_distortXrot2Xdd1[1][0] = %f,  scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
+		printf("disp_dist[0] = %f\n", disp_dist[0]);
+		printf("disp_dist[1] = %f\n", disp_dist[1]);
+		printf("disp_dist[2] = %f\n", disp_dist[2]);
+		printf("disp_dist[3] = %f\n", disp_dist[3]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
+	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
+	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
+	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+//	imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
+//	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
+//	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..18 extrinsic_corr.imu_move
+// ERS linear does not yet use per-port rotations, probably not needed
+	if (imu_exists){
+		float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
+		float ers_Xci =	delta_t * (
+				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpXci_dazimuth * extrinsic_corr.imu_rot[1]  +
+				dpXci_droll * extrinsic_corr.imu_rot[2]);
+		float ers_Yci =	delta_t* (
+				dpYci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
+				dpYci_droll * extrinsic_corr.imu_rot[2]);
+#ifdef DEBUG21
+		if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+			printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG21
+		if (disparity >= MIN_DISPARITY){ // all threads together
+			float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
+			float wdisparity = disparity;
+			float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
+			dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
+			dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
+			dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
+			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+			ers_Xci += delta_t* (
+					dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
+					dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
+			ers_Yci += delta_t* (
+					dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
+					dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
+			pXY[0] +=  ers_Xci * rD2rND; // added correction to pixel X
+			pXY[1] +=  ers_Yci * rD2rND; // added correction to pixel Y
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("k = %f,  wdisparity = %f,  dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
+		printf("dpXci_pYci_imu_lin[0][0] = %f,  dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
+		printf("dpXci_pYci_imu_lin[1][1] = %f,  dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
+		printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+		printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
+	}
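+	__syncthreads();// __syncwarp(); NOTE(review): barrier is inside the (disparity >= MIN_DISPARITY) branch - confirm disparity is uniform across the block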
+#endif // DEBUG21
+		}
+	}
+	// copy results to global memory pXY,  disp_dist
+	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
+	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
-// TODO: calculate derivatives of pX, pY by 3 imu omegas
-			}
+}
- */
-}
 /**
 * Calculate non-distorted radius from distorted using table approximation

--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -42,6 +42,7 @@
 #endif
 #define SCENE_UNITS_SCALE  0.001 // meters from mm
+#define MIN_DISPARITY      0.01  // minimal disparity to try to convert to world coordinates
 struct tp_task {
 	int   task;
 	union {
@@ -114,17 +115,19 @@ struct gc {
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
 };
 extern "C" __global__ void get_tiles_offsets(
 		struct tp_task     * gpu_tasks,          // task array; test_tp.cu copies it back after the launch and reads xy[ncam][0..1]
 		int                  num_tiles,          // number of tiles in task
 		struct gc          * gpu_geometry_correction,
 		struct corr_vector * gpu_correction_vector,
-		float *              gpu_rByRDist); // length should match RBYRDIST_LEN
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		union trot_deriv   * gpu_rot_deriv); // new parameter; test_tp.cu passes the same buffer it hands to calc_rot_deriv
+#if 0 // calc_rot_matrices declaration is compiled out by this change
 // uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
 extern "C" __global__ void calc_rot_matrices(
 		struct corr_vector * gpu_correction_vector);
+#endif // 0 (calc_rot_matrices)
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -339,7 +339,8 @@ struct tp_task {
    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
 // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
-    static struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
+    static struct tp_task     task_data  [TILESX*TILESY]; // maximal length - each tile
+    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
    union  trot_deriv  rot_deriv;
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
 //    int                texture_indices      [TILESX*TILESY];
@@ -634,8 +635,8 @@ struct tp_task {
 //    			gpu_correction_vector);   // 		struct corr_vector * gpu_correction_vector,
    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
-    			(corr_vector * ) gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
+    			gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
-    			(trot_deriv  * ) gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
+    			gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
    	getLastCudaError("Kernel failure");
@@ -683,7 +684,7 @@ struct tp_task {
 #define TEST_GEOM_CORR
 #ifdef  TEST_GEOM_CORR
-    dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
+    dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1); // block shape: NUM_CAMS threads in x, TILES_PER_BLOCK_GEOM threads in y
    dim3 grid_geom   ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1); // ceil(tp_task_size / TILES_PER_BLOCK_GEOM) blocks
    printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
    printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
@@ -703,7 +704,8 @@ struct tp_task {
 				tp_task_size,             // int                num_tiles,          // number of tiles in task list
 				gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
 				gpu_correction_vector,   // 		struct corr_vector * gpu_correction_vector,
-				gpu_rByRDist); // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rByRDist, // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rot_deriv); // union trot_deriv   * gpu_rot_deriv);
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -714,6 +716,38 @@ struct tp_task {
    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations; // timerGEOM reading divided by the iteration count
    sdkDeleteTimer(&timerGEOM);
    printf("Average TextureList run time =%f ms\n",  avgTimeGEOM); // NOTE(review): label says "TextureList" but the value is the GEOM timer - looks like a copy-paste leftover, confirm and rename
+//    gpu_tasks = (struct tp_task  *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
+//    static struct tp_task     task_data1 [TILESX*TILESY]; // maximal length - each tile
+/// DBG_TILE
+	checkCudaErrors(cudaMemcpy( // copy modified/calculated tasks
+			&task_data1,
+			gpu_tasks,
+			tp_task_size * sizeof(struct tp_task),
+			cudaMemcpyDeviceToHost));
+	struct tp_task * old_task = &task_data [DBG_TILE]; // host-side task for the debug tile; NOTE(review): old_task/new_task are not referenced in the lines shown
+	struct tp_task * new_task = &task_data1[DBG_TILE]; // same tile in task_data1, which the cudaMemcpy above filled from gpu_tasks
+    printf("old_task txy = 0x%x\n",  task_data [DBG_TILE].txy);
+    printf("new_task txy = 0x%x\n",  task_data1[DBG_TILE].txy);
+    for (int ncam = 0; ncam < NUM_CAMS; ncam++){ // per camera: print host value, read-back value and their difference
+        printf("camera %d pX old %f new %f diff = %f\n", ncam,
+        		task_data [DBG_TILE].xy[ncam][0],  task_data1[DBG_TILE].xy[ncam][0],
+				task_data [DBG_TILE].xy[ncam][0] - task_data1[DBG_TILE].xy[ncam][0]);
+        printf("camera %d pY old %f new %f diff = %f\n", ncam,
+        		task_data [DBG_TILE].xy[ncam][1],  task_data1[DBG_TILE].xy[ncam][1],
+				task_data [DBG_TILE].xy[ncam][1]-  task_data1[DBG_TILE].xy[ncam][1]);
+    }
+#if 0
+    // temporarily restore tasks
+    checkCudaErrors(cudaMemcpy(
+    		gpu_tasks,
+			&task_data,
+			tp_task_size * sizeof(struct tp_task),
+            cudaMemcpyHostToDevice));
+#endif
 #endif // TEST_GEOM_CORR

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -72,10 +72,22 @@
 #define THREADS_DYNAMIC_BITS      5 // threads in block for CDP creation of the texture list
-#define DBG_DISPARITY            32.0 // disparity for which to calculate offsets (not needed in Java)
+#define DBG_DISPARITY            56.0 // disparity for which to calculate offsets (not needed in Java)
 #define RBYRDIST_LEN           5001   // 5001 for doubles, 10001 for floats: length of rByRDist to allocate shared memory
 #define RBYRDIST_STEP             0.0004 // 0.0004 for doubles, 0.0002 for floats: chosen to fit into GPU shared memory (was 0.001)
-#define TILES_PER_BLOCK_GEOM     32   // each tile has NUM_CAMS threads
+#define TILES_PER_BLOCK_GEOM     (32/NUM_CAMS)   // each tile has NUM_CAMS threads
+// Use CORR_OUT_RAD for the correlation output
+//#define DBG_TILE_X     40
+//#define DBG_TILE_Y     80
+#define DBG_TILE_X     151 // 161 // 49
+#define DBG_TILE_Y     69  // 111 // 66
+#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X) // flat index of the debug tile; NOTE(review): 324 is a hard-coded row stride, presumably TILESX - confirm
+#undef DBG_MARK_DBG_TILE
 //#undef HAS_PRINTF
@@ -99,7 +111,9 @@
 //#define USE_textures_gen
 //#define DEBUG_OOB1 1
 // geom
-#define DEBUG20 1
+//#define DEBUG20 1
+#define DEBUG21 1 // enables the printf diagnostics guarded by "#ifdef DEBUG21"
 #endif //#ifndef JCUDA