implemented calc_rot_deriv()

dba4dfce · Andrey Filippov · b0f7d665 · dba4dfce · dba4dfce · dba4dfce
Commit dba4dfce authored Apr 11, 2020 by Andrey Filippov
7 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -45,6 +45,15 @@
 #include "TileProcessor.h"
 #endif // #ifndef JCUDA

+// CUDA fast math is slower!
+//#define FASTMATH 1
+/*
+ fast
+GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
+nofast
+GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
+ */
+
 #define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))

 //#define IMCLT14
@@ -1019,19 +1028,7 @@ __global__ void correlate2D(
        	__syncthreads();// __syncwarp();
 #endif
 #endif
-
-
-
-
-
-
-
-
        } // if (color == 1){ // LPF only after B (nothing in mono)
-
-
-
-
    } // for (int color = 0; color < colors; color++){
    normalizeTileAmplitude(
    		clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -1083,23 +1080,6 @@ __global__ void correlate2D(
 #endif
 #endif
     dttii_2d(clt_corr);
-/*
-    // change to 16-32 threads?? in next iteration
-    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
-    for (int q = 0; q < 4; q++){
-    	int is_sin = (q >> 1) & 1;
-    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
-    }
-    __syncthreads();
-
-    // hor pass, corresponding to vert pass in Java
-    for (int q = 0; q < 4; q++){
-    	int is_sin = q & 1;
-    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
-    }
-    __syncthreads();
-*/
-

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -2655,7 +2635,11 @@ __device__ void normalizeTileAmplitude(
 				*(clt_tile_j1) * *(clt_tile_j1) +
 				*(clt_tile_j2) * *(clt_tile_j2) +
 				*(clt_tile_j3) * *(clt_tile_j3);
+#ifdef FASTMATH
+		float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
+#else
 		float scale = rsqrtf(s2); // 1.0/sqrt(s2)
+#endif
 		*(clt_tile_j0) *= scale;
 		*(clt_tile_j1) *= scale;
 		*(clt_tile_j2) *= scale;
@@ -3333,7 +3317,12 @@ __device__ void debayer_shot(


 	if (scale_shot > 0.0) {
+
+#ifdef FASTMATH
+		float k = __frsqrt_rn(min_shot);
+#else
 		float k = rsqrtf(min_shot);
+#endif

 		// double k = 1.0/Math.sqrt(min_shot); //sqrtf
 		//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
@@ -3343,7 +3332,14 @@ __device__ void debayer_shot(
 #pragma unroll
 			for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
 				float d = *mcltp;
+#ifdef FASTMATH
+				*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
+#else
 				*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
+#endif
+
+
+
 				mcltp += DTT_SIZE;
 			}
 			mcltp += (DTT_SIZE21-DTT_SIZE2);
@@ -3549,10 +3545,19 @@ __device__ void tile_combine_rgba(
 					s2 += d * d;
 				}
 				float mse = (s0*s2 - s1*s1) / (s0 * s0);
+#ifdef FASTMATH
+				* crms_col_i = __fsqrt_rn(mse);
+#else
 				* crms_col_i = sqrtf(mse);
+#endif
+
 				sw += *(chn_weights +ncol) * mse;
 			}
+#ifdef FASTMATH
+			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
+#else
 			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
+#endif
 		}
 #ifdef DEBUG9
 	}
@@ -3605,7 +3610,12 @@ __device__ void tile_combine_rgba(
 				dc *= wnd2_inv; // to compensate fading near the edges
 				d+= *(chn_weights + ncol) * dc * dc;
 			}
+#ifdef FASTMATH
+			d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#else
 			d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#endif
+
 			// Add weight to both channels in a pair
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
@@ -3711,7 +3721,13 @@ __device__ void tile_combine_rgba(
 			}
 			// TODO: Should it use pair_dist2r ? no as it is relative?
 			//				port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
+
+#ifdef FASTMATH
+			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#else
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#endif
+
 		}
 		// and now make a new average with those weights
 		// Inserting dust remove here
@@ -3879,7 +3895,11 @@ __device__ void tile_combine_rgba(
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
 				mx = fmaxf(mx, max_diff_tmp[cam][i]);
 			}
+#ifdef FASTMATH
+			max_diff[cam] = __fsqrt_rn(mx);
+#else
 			max_diff[cam] = sqrtf(mx);
+#endif
 		}
 	}


--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -41,6 +41,18 @@
 #include "tp_defines.h"
 #endif

+
+extern "C"
+__global__ void convert_correct_tiles(
+		float           ** gpu_kernel_offsets, // [NUM_CAMS],
+		float           ** gpu_kernels,        // [NUM_CAMS],
+		float           ** gpu_images,         // [NUM_CAMS],
+		struct tp_task   * gpu_tasks,
+		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t             dstride,            // in floats (pixels)
+		int                num_tiles,          // number of tiles in task
+		int                lpf_mask);          // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+
 extern "C" __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
@@ -102,5 +114,34 @@ extern "C" __global__ void imclt_rbg(
 		int               h_offset,
 		const size_t      dstride);            // in floats (pixels)

+extern "C"
+__global__ void generate_RBGA(
+// Parameters to generate texture tasks
+			struct tp_task   * gpu_tasks,
+			int                num_tiles,          // number of tiles in task list
+// declare arrays in device code?
+			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+			int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
+			int              * woi,                // x,y,width,height of the woi
+			int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+			int                height, // <= TILESY, use for faster processing of LWIR images
+// Parameters for the texture generation
+			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+			int               colors,             // number of colors (3/1)
+			int               is_lwir,            // do not perform shot correction
+			float             min_shot,           // 10.0
+			float             scale_shot,         // 3.0
+			float             diff_sigma,         // pixel value/pixel change
+			float             diff_threshold,     // pixel value/pixel change
+			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			float             weight0,            // scale for R
+			float             weight1,            // scale for B
+			float             weight2,            // scale for G
+			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+			int               keep_weights,       // return channel weights after A in RGBA (was removed)
+			const size_t      texture_rbga_stride,     // in floats
+			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+


--- a/src/dtt8x8.h
+++ b/src/dtt8x8.h
@@ -72,9 +72,9 @@


 // kernels (not used so far)
-#ifdef BBBB
+#if 0
 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
-#endif// #ifdef BBBB
+#endif// #if 0

 //=========================== 2D functions ===============
 extern __device__ void corrUnfoldTile(

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
+/**
+ **
+ ** geometry_correction.cu
+ **
+ ** Copyright (C) 2020 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  geometry_correction.cu is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+
+/**
+**************************************************************************
+* \file geometry_correction.cu
+* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
+
+*/
+#ifndef JCUDA
+#include "tp_defines.h"
+#include "dtt8x8.h"
+#include "geometry_correction.h"
+#endif // #ifndef JCUDA
+
+// Using NUM_CAMS threads per tile
+#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
+#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+
+#define DBG_CAM 3
+
+__device__ void printGeometryCorrection(struct gc * g);
+__device__ void printExtrinsicCorrection(corr_vector * cv);
+/**
+ * Calculate non-distorted radius from distorted using table approximation
+ * @param rDist distorted radius
+ * @return corresponding non-distorted radius
+ */
+inline __device__ float getRByRDist(float rDist,
+		float rByRDist [RBYRDIST_LEN]); //shared memory
+
+__constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
+		{ // azimuth
+				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
+				{{ 0, 0,0},{0, 0,1},{ 0, 0,0}},
+				{{ 0, 1,0},{0, 0,0},{ 1, 0,0}},
+
+		},{ // tilt
+				{{ 0, 0,1},{0, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{1, 0,0},{ 0, 1,0}},
+				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
+		},{ // roll*zoom
+				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,1}},
+
+		},{ // d_azimuth
+				{{ 0,-1,0},{0, 0,0},{-1, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
+		},{ // d_tilt
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
+				{{ 0, 0,0},{-1,0,0},{ 0,-1,0}},
+		},{ // d_roll
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{-1, 0,0},{0,-1,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+		},{ // d_zoom
+				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+		}
+};
+
+__constant__ int angles_offsets [4] = {
+		offsetof(corr_vector, azimuth)/sizeof(float),
+		offsetof(corr_vector, tilt)   /sizeof(float),
+		offsetof(corr_vector, roll)   /sizeof(float),
+		offsetof(corr_vector, roll)   /sizeof(float)};
+__constant__ int mm_seq [3][3][3]={
+		{
+				{6,5,12}, // a_t * a_z -> tmp0
+				{7,6,13}, // a_r * a_t -> tmp1
+				{7,9,14}, // a_r * a_dt -> tmp2
+		}, {
+				{7,12,0}, // a_r * tmp0 -> rot          - bad
+				{13,8,1}, // tmp1 * a_daz -> deriv0     - good
+				{14,5,2}, // tmp2 * a_az  -> deriv1     - good
+		}, {
+				{10,12,3}, // a_dr * tmp0 -> deriv2     - good
+				{11,12,4}, // a_dzoom * tnmp0 -> deriv3 - good
+				{-1,-1,-1} // do nothing
+		}};
+
+
+__device__ float rot_matrices       [NUM_CAMS][3][3];
+//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
+
+
+// threads (3,3,4)
+extern "C" __global__ void calc_rot_matrices(
+		struct corr_vector * gpu_correction_vector)
+{
+	__shared__ float zoom    [NUM_CAMS];
+	__shared__ float sincos  [NUM_CAMS][3][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
+	__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
+
+	float angle;
+	int ncam = threadIdx.z;
+	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
+	int nangle =  nangle1 >> 1;
+	int is_sin = nangle1 & 1;
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0)  && ( threadIdx.y == 0)  && ( threadIdx.z == 0)){
+		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
+		printExtrinsicCorrection(gpu_correction_vector);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+	if (nangle < 4){ // this part only for 1-st 3
+		float* gangles =
+				(nangle ==0)?gpu_correction_vector->azimuth:(
+						(nangle ==1)?gpu_correction_vector->tilt:(
+								(nangle ==2)?gpu_correction_vector->roll:
+										gpu_correction_vector->zoom));
+		if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
+			angle = *(gangles + ncam);
+
+		} else {
+			angle = 0.0f;
+
+#pragma	unroll
+			for (int n = 0; n < (NUM_CAMS-1); n++){
+				angle -= *(gangles + n);
+			}
+		}
+		if (!is_sin){
+			angle += M_PI/2;
+		}
+		if (nangle < 3) {
+			sincos[ncam][nangle][is_sin]=sinf(angle);
+		} else if (is_sin){
+			zoom[ncam] = angle;
+		}
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+			printf("\n    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
+			printf("    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			printf("    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+	if (nangle == 3) {
+		sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+			printf("\na    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
+			printf("a    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			printf("a    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+
+
+	// now 3x3
+	for (int axis = 0; axis < 3; axis++) {
+		matrices[ncam][axis][threadIdx.y][threadIdx.x] =
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2];                         // const
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+
+			printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][0][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][1][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][2][i][j]);
+				}
+				printf("\n");
+			}
+
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+
+
+    // tilt * az ->
+	// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
+	matrices[ncam][3][threadIdx.y][threadIdx.x] =
+			matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
+			matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
+			matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
+	// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
+	__syncthreads();
+	rot_matrices[ncam][threadIdx.y][threadIdx.x] =
+			matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
+			matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
+			matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
+	__syncthreads();
+
+
+#ifdef DEBUG20
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+
+			printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][0][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][1][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][2][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Rotation matrix for camera %d\n", n);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", rot_matrices[n][i][j]);
+				}
+				printf("\n");
+			}
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+}
+
+__constant__ int offset_rots =     0;                   //0
+__constant__ int offset_derivs =   1;                   // 1..4 // should be next
+__constant__ int offset_matrices = 5;   // 5..11
+__constant__ int offset_tmp =      12; // 12..15
+
+/**
+ * Calculate rotation matrices and derivatives by az, tilt, roll, zoom
+ * NUM_CAMS blocks of 3,3,3 tiles
+ */
+extern "C" __global__ void calc_rot_deriv(
+		struct corr_vector * gpu_correction_vector,
+		union trot_deriv   * gpu_rot_deriv)
+{
+//	__shared__ float zoom;
+	__shared__ float sincos  [4][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
+	__shared__ float matrices[5 + 7 +4][3][3];
+	float angle;
+	float zoom;
+	int ncam =    blockIdx.x; // threadIdx.z;
+	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
+	int nangle =  nangle1 >> 1;
+	int is_sin = nangle1 & 1;
+	if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
+		float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
+		if (ncam == (NUM_CAMS-1)){ // for the whole block
+			angle = 0.0;
+			zoom = 0.0;
+#pragma	unroll
+			for (int n = 0; n < (NUM_CAMS-1); n++){
+				angle -= *(gangles + n);
+				zoom -= gpu_correction_vector->zoom[n];
+			}
+			if (nangle >= 2){ // diverging for roll (last two)
+				angle = *(gangles + ncam);
+			}
+
+		} else {
+			angle = *(gangles + ncam);
+			zoom =   gpu_correction_vector->zoom[ncam];
+		}
+		if (!is_sin){
+			angle += M_PI/2;
+		}
+		float sc = sinf(angle);
+		if (nangle ==2) {
+			sc *= 1.0 + zoom;
+		}
+		sincos[nangle][is_sin]= sc;
+	}
+	__syncthreads();
+
+#ifdef DEBUG20
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		printf("\n    Azimuth matrix for   camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
+		printf(  "    Tilt matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[1][0], sincos[1][1]);
+		printf(  "    Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[2][0], sincos[2][1]);
+		printf(  "    Roll matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[3][0], sincos[3][1]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+// Create 3 3x3 matrices for az, tilt, roll/zoom:
+	int axis = offset_matrices+threadIdx.z; // 0..2
+	int const_index = threadIdx.z; // 0..2
+	matrices[axis][threadIdx.y][threadIdx.x] =
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                   // const
+	axis += 3; // skip index == 3
+	const_index +=3;
+	matrices[axis][threadIdx.y][threadIdx.x] =
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                          // const
+	if (threadIdx.z == 0){
+		axis += 3;
+		const_index +=3;
+		matrices[axis][threadIdx.y][threadIdx.x] =
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                // const
+
+	}
+	__syncthreads();
+
+#ifdef DEBUG20
+	const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
+
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int i = 0; i < 7; i++) {
+			printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
+			for (int row = 0; row < 3; row++){
+				for (int col = 0; col < 3; col++){
+					printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
+				}
+				printf("\n");
+			}
+
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+/*
+	__constant__ int mm_seq [3][3][3]={
+			{
+					{6,5,12}, // a_t * a_z -> tmp0
+					{7,6,13}, // a_r * a_t -> tmp1
+					{7,9,14}, // a_r * a_dt -> tmp2
+			}, {
+					{7,12,0}, // a_r * tmp0 -> rot
+					{13,8,1}, // tmp1 * a_daz -> deriv0
+					{14,5,2}, // tmp2 * a_az  -> deriv1
+			}, {
+					{10,12,3}, // a_dr * tmp0 -> deriv2
+					{11,12,4}, // a_dzoom * tnmp0 -> deriv3
+			}};
+*/
+	for (int i = 0; i < 3; i++){
+		int srcl = mm_seq[i][threadIdx.z][0];
+		int srcr = mm_seq[i][threadIdx.z][1];
+		int dst =  mm_seq[i][threadIdx.z][2];
+		if (srcl >= 0){
+			matrices[dst][threadIdx.y][threadIdx.x] =
+					matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
+					matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
+					matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
+		}
+		__syncthreads();
+	}
+// copy results to global memory
+	int gindx = threadIdx.z;
+	int lindx = offset_rots + threadIdx.z;
+	gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+	gindx +=3;
+	lindx+=3;
+	if (lindx < 5) {
+		gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+	}
+	__syncthreads();
+
+// All done - read/verify all arrays
+
+}
+
+
+/*
+ * blockDim.x = NUM_CAMS
+ * blockDim.y = TILES_PER_BLOCK_GEOM
+ */
+
+extern "C" __global__ void get_tiles_offsets(
+		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+{
+//	int task_num = blockIdx.x * blockDim.x + threadIdx.x; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
+	int task_num = blockIdx.x * blockDim.y + threadIdx.y; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
+	int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
+	int ncam = threadIdx.x;
+	// threadIdx.x - numcam, used for per-camera
+	__shared__ struct gc geometry_correction;
+	__shared__ float rByRDist [RBYRDIST_LEN];
+	__shared__ struct corr_vector extrinsic_corr;
+	__shared__ float rots[NUM_CAMS][3][3];
+	__shared__ float pXY[NUM_CAMS][2]; // result to be copied to task
+	// copy data common to all threads
+	{
+		float * gcp_local =  (float *) &geometry_correction;
+		float * gcp_global = (float *) gpu_geometry_correction;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_GC; i++){
+			if (offset < sizeof(struct gc)/sizeof(float)) {
+				*(gcp_local + offset) = *(gcp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	{
+		float * cvp_local =  (float *) &extrinsic_corr;
+		float * cvp_global = (float *) gpu_correction_vector;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_CV; i++){
+			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
+				*(cvp_local + offset) = *(cvp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	// TODO: maybe it is better to use system memory and not read all table?
+	{
+		float * rByRDistp_local =  (float *) rByRDist;
+		float * rByRDistp_global = (float *) gpu_rByRDist;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_RBRD; i++){
+			if (offset < RBYRDIST_LEN) {
+				*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	// copy rotational matrices
+	//	__shared__ float rots[NUM_CAMS][3][3];
+	//__device__ float rot_matrices [NUM_CAMS][3][3];
+
+	{
+		float * rots_local =  (float *) rots;
+		float * rots_global = (float *) rot_matrices;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_ROTS; i++){
+			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
+				*(rots_local + offset) = *(rots_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	__syncthreads();
+
+#ifdef DEBUG20
+	if ((threadIdx.x == 0)  && ( blockIdx.x == 0)){
+		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
+		printGeometryCorrection(&geometry_correction);
+		printExtrinsicCorrection(&extrinsic_corr);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+	//		String dbg_s = corr_vector.toString();
+	/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
+	 * 1) unapply common distortion (maybe for different - master camera)
+	 * 2) apply disparity
+	 * 3) apply rotations and zoom
+	 * 4) re-apply distortion
+	 * 5) return port center X and Y
+	 * line_time
+	 */
+
+	// common code, calculated in parallel
+	int cxy = gpu_tasks[task_num].txy;
+	float disparity = gpu_tasks[task_num].target_disparity;
+	int tileX = (cxy & 0xffff);
+	int tileY = (cxy >> 16);
+	float px = tileX * DTT_SIZE + DTT_SIZE/2; //  - shiftX;
+	float py = tileY * DTT_SIZE + DTT_SIZE/2; //  - shiftY;
+
+	float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
+	float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
+
+	float rXY [NUM_CAMS][2];
+
+//	for (int i = 0; i < NUM_CAMS;i++){
+	rXY[ncam][0] = geometry_correction.rXY[ncam][0];
+	rXY[ncam][1] = geometry_correction.rXY[ncam][1];
+//	}
+
+	float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
+	float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
+	float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
+	float pYc = pYcd * rND2R; // in pixels
+	float xyz [3]; // getWorldCoordinates
+	xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius / (disparity * 0.001*geometry_correction.pixelSize); // "+" - near, "-" far
+	xyz[0] =  SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
+	xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
+	// next radial distortion coefficients are for this, not master camera (may be the same)
+//	geometry_correction.rad_coeff[i];
+	float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
+	float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
+
+//	for (int ncam = 0; ncam < NUM_CAMS; ncam++){
+
+			// non-distorted XY of the shifted location of the individual sensor
+
+	// -------------- Each camera calculated by its own thread ----------------
+	float pXci0 = pXc - disparity *  rXY[ncam][0]; // in pixels
+	float pYci0 = pYc - disparity *  rXY[ncam][1];
+	// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
+	// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
+	///		double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
+	///		Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
+	// Apply port-individual combined rotation/zoom matrix
+	///		Matrix rvi = rots[i].times(vi);
+	float rvi[3];
+#pragma unroll
+	for (int j = 0; j< 3; j++){
+		rvi[j] = rots[ncam][j][0] * pXci0 + rots[ncam][j][1] * pYci0 + rots[ncam][j][2] * fl_pix;
+	}
+	// get back to the projection plane by normalizing vector
+	float norm_z = fl_pix/rvi[2];
+	float pXci =  rvi[0] * norm_z;
+	float pYci =  rvi[1] * norm_z;
+	// Re-apply distortion
+	float rNDi =  sqrtf(pXci*pXci + pYci*pYci); // in pixels
+	float ri =    rNDi* ri_scale; // relative to distortion radius
+
+	float rD2rND = 1.0;
+	{
+		float rri = 1.0;
+#pragma unroll
+		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+			rri *= ri;
+			rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
+		}
+	}
+	// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
+	float pXid = pXci * rD2rND;
+	float pYid = pYci * rD2rND;
+	pXY[ncam][0] =  pXid + geometry_correction.pXY0[ncam][0];
+	pXY[ncam][1] =  pYid + geometry_correction.pXY0[ncam][1];
+
+
+
+
+
+//	}//for (int i = 0; i < NUM_CAMS; i++){
+/*
+
+
+			// used when calculating derivatives, TODO: combine calculations !
+			double drD2rND_dri = 0.0;
+			Matrix drvi_daz = null;
+			Matrix drvi_dtl = null;
+			Matrix drvi_drl = null;
+			double dpXci_dazimuth = 0.0;
+			double dpYci_dazimuth = 0.0;
+			double dpXci_dtilt =    0.0;
+			double dpYci_dtilt =    0.0;
+			double dpXci_droll =    0.0;
+			double dpYci_droll =    0.0;
+
+			if ((disp_dist != null) || (pXYderiv != null)) {
+				rri = 1.0;
+				for (int j = 0; j < rad_coeff.length; j++){
+					drD2rND_dri += rad_coeff[j] * (j+1) * rri;
+					rri *= ri;
+				}
+				if (deriv_rots != null) {
+					// needed for derivatives and IMU
+					drvi_daz = deriv_rots[i][0].times(vi);
+					drvi_dtl = deriv_rots[i][1].times(vi);
+					drvi_drl = deriv_rots[i][2].times(vi);
+					dpXci_dazimuth = drvi_daz.get(0, 0) * norm_z - pXci * drvi_daz.get(2, 0) / rvi.get(2, 0);
+					dpYci_dazimuth = drvi_daz.get(1, 0) * norm_z - pYci * drvi_daz.get(2, 0) / rvi.get(2, 0);
+					dpXci_dtilt =    drvi_dtl.get(0, 0) * norm_z - pXci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
+					dpYci_dtilt =    drvi_dtl.get(1, 0) * norm_z - pYci * drvi_dtl.get(2, 0) / rvi.get(2, 0);
+					dpXci_droll =    drvi_drl.get(0, 0) * norm_z - pXci * drvi_drl.get(2, 0) / rvi.get(2, 0);
+					dpYci_droll =    drvi_drl.get(1, 0) * norm_z - pYci * drvi_drl.get(2, 0) / rvi.get(2, 0);
+				}
+			}
+			double delta_t = 0.0;
+			double [] imu =  null;
+			double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
+			if (disp_dist != null) {
+				disp_dist[i] =   new double [4]; // dx/d_disp, dx_d_ccw_disp
+				// Not clear - what should be in Z direction before rotation here?
+				double [][] add0 = {
+						{-rXY[i][0],  rXY[i][1], 0.0},
+						{-rXY[i][1], -rXY[i][0], 0.0},
+						{ 0.0,                     0.0,                    0.0}}; // what is last element???
+				Matrix dd0 = new Matrix(add0);
+				Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
+////				Matrix dd1 = dd0.getMatrix(0, 1,0,1); // get top left 2x2 sub-matrix
+				// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
+				// unity vector in the direction of radius
+				double c_dist = pXci/rNDi;
+				double s_dist = pYci/rNDi;
+
+				double [][] arot2= {
+						{c_dist, s_dist},
+						{-s_dist, c_dist}};
+				Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
+
+				double [][] ascale_distort = {
+						{rD2rND + ri* drD2rND_dri, 0     },
+						{0,                       rD2rND}};
+				Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
+
+				Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
+
+				disp_dist[i][0] =   dd2.get(0, 0);
+				disp_dist[i][1] =   dd2.get(0, 1);
+				disp_dist[i][2] =   dd2.get(1, 0); // d_py/d_disp
+				disp_dist[i][3] =   dd2.get(1, 1);
+
+				imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
+
+				// ERS linear does not yet use per-port rotations, probably not needed
+
+//				double [][] dpXci_pYci_imu_lin = new double[2][3]; // null
+				if ((imu[0] != 0.0) || (imu[1] != 0.0) ||(imu[2] != 0.0) ||(imu[3] != 0.0) ||(imu[4] != 0.0) ||(imu[5] != 0.0)) {
+					delta_t = dd2.get(1, 0) * disparity * line_time; // positive for top cameras, negative - for bottom
+					double ers_Xci = delta_t* (dpXci_dtilt * imu[0] + dpXci_dazimuth * imu[1]  + dpXci_droll * imu[2]);
+					double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1]  + dpYci_droll * imu[2]);
+					if (xyz != null) {
+						double k = SCENE_UNITS_SCALE * this.disparityRadius;
+						double wdisparity = disparity;
+						double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
+						dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
+						dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
+						dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
+						dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+						ers_Xci += delta_t* (dpXci_pYci_imu_lin[0][0] * imu[3] + dpXci_pYci_imu_lin[0][2] * imu[5]);
+						ers_Yci += delta_t* (dpXci_pYci_imu_lin[1][1] * imu[4] + dpXci_pYci_imu_lin[1][2] * imu[5]);
+					}
+					pXY[i][0] +=  ers_Xci * rD2rND; // added correction to pixel X
+					pXY[i][1] +=  ers_Yci * rD2rND; // added correction to pixel Y
+
+
+				} else {
+					imu = null;
+				}
+
+
+
+// TODO: calculate derivatives of pX, pY by 3 imu omegas
+			}
+
+
+ */
+
+}
+
+/**
+ * Calculate non-distorted radius from distorted using table approximation
+ * @param rDist distorted radius
+ * @return corresponding non-distorted radius
+ */
+inline __device__ float getRByRDist(float rDist,
+		float rByRDist [RBYRDIST_LEN]) //shared memory
+{
+	if (rDist < 0) {
+		return 0.0f; // normally should not happen
+	}
+	float findex = rDist/RBYRDIST_STEP;
+	int index= (int) floorf(findex);
+	if (index < 0){
+		index = 0;
+	}
+	if (index > (RBYRDIST_LEN - 3)) {
+		index = RBYRDIST_LEN - 3;
+	}
+	float mu = fmaxf(findex - index, 0.0f);
+	float mu2 = mu * mu;
+	float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
+	// use Catmull-Rom
+	float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
+	float a1 =        y0 - 2.5 * rByRDist[index] + 2   * rByRDist[index+1] - 0.5 * rByRDist[index+2];
+	float a2 = -0.5 * y0                              + 0.5 * rByRDist[index+1];
+	float a3 =  rByRDist[index];
+	float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
+	return result;
+}
+
+__device__ void printGeometryCorrection(struct gc * g){
+#ifndef JCUDA
+	printf("\nGeometry Correction\n------------------\n");
+	printf("%22s: %f\n","pixelCorrectionWidth",  g->pixelCorrectionWidth);
+	printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
+	printf("%22s: %f\n","line_time",             g->line_time);
+
+	printf("%22s: %f\n","focalLength", g->focalLength);
+	printf("%22s: %f\n","pixelSize",   g->pixelSize);
+	printf("%22s: %f\n","distortionRadius",g->distortionRadius);
+
+	printf("%22s: %f\n","distortionC", g->distortionC);
+	printf("%22s: %f\n","distortionB", g->distortionB);
+	printf("%22s: %f\n","distortionA", g->distortionA);
+	printf("%22s: %f\n","distortionA5",g->distortionA5);
+	printf("%22s: %f\n","distortionA6",g->distortionA6);
+	printf("%22s: %f\n","distortionA7",g->distortionA7);
+	printf("%22s: %f\n","distortionA8",g->distortionA8);
+
+	printf("%22s: %f\n","elevation",   g->elevation);
+	printf("%22s: %f\n","heading",     g->heading);
+
+	printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
+	printf("%22s: %f, %f, %f, %f \n","right",   g->right[0],   g->right[1],   g->right[2],   g->right[3]);
+	printf("%22s: %f, %f, %f, %f \n","height",  g->height[0],  g->height[1],  g->height[2],  g->height[3]);
+	printf("%22s: %f, %f, %f, %f \n","roll",    g->roll[0],    g->roll[1],    g->roll[2],    g->roll[3]);
+	printf("%22s: %f, %f \n",        "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
+	printf("%22s: %f, %f \n",        "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
+	printf("%22s: %f, %f \n",        "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
+	printf("%22s: %f, %f \n",        "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
+
+	printf("%22s: %f\n","common_right",   g->common_right);
+	printf("%22s: %f\n","common_forward", g->common_forward);
+	printf("%22s: %f\n","common_height",  g->common_height);
+	printf("%22s: %f\n","common_roll",    g->common_roll);
+
+	printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
+
+	printf("%22s: %f\n","cameraRadius",    g->cameraRadius);
+	printf("%22s: %f\n","disparityRadius", g->disparityRadius);
+#endif //ifndef JCUDA
+}
+
+__device__ void printExtrinsicCorrection(corr_vector * cv)
+{
+#ifndef JCUDA
+	printf("\nExtrinsic Correction Vector\n---------------------------\n");
+	printf("%22s: %f, %f, %f\n",     "tilt",    cv->tilt[0],    cv->tilt[1],    cv->tilt[2]);
+	printf("%22s: %f, %f, %f\n",     "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
+	printf("%22s: %f, %f, %f, %f\n", "roll",    cv->roll[0],    cv->roll[1],    cv->roll[2],      cv->roll[3]);
+	printf("%22s: %f, %f, %f\n",     "zoom",    cv->zoom[0],    cv->zoom[1],    cv->zoom[2]);
+
+	printf("%22s: %f(t), %f(a), %f(r)\n",     "imu_rot",    cv->imu_rot[0],    cv->imu_rot[1],    cv->imu_rot[2]);
+	printf("%22s: %f(x), %f(y), %f(z)\n",     "imu_move",    cv->imu_move[0],    cv->imu_move[1],    cv->imu_move[2]);
+#endif //ifndef JCUDA
+}
+
+
+
--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -41,6 +41,7 @@
 #include "tp_defines.h"
 #endif

+#define SCENE_UNITS_SCALE  0.001 // meters from mm
 struct tp_task {
 	int   task;
 	union {
@@ -62,18 +63,36 @@ struct corr_vector{
 	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
 };

+union trot_deriv{
+	struct {
+		float rots    [NUM_CAMS][3][3];
+		float d_daz   [NUM_CAMS][3][3];
+		float d_tilt  [NUM_CAMS][3][3];
+		float d_roll  [NUM_CAMS][3][3];
+		float d_zoom  [NUM_CAMS][3][3];
+	};
+	float matrices [5][NUM_CAMS][3][3];
+};
+
 struct gc {
+	float pixelCorrectionWidth; //  =2592;   // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
+	float pixelCorrectionHeight; // =1936;
+	float line_time;        // duration of one scan line readout (for ERS)
 	float focalLength;      // =FOCAL_LENGTH;
 	float pixelSize;        // =  PIXEL_SIZE; //um
 	float distortionRadius; // =  DISTORTION_RADIUS; // mm - half width of the sensor
-
-	float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
-	float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
-	float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
-	float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
-	float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
-	float distortionB;      // r^3
-	float distortionC;      // r^2
+	union {
+		struct {
+			float distortionC;      // r^2
+			float distortionB;      // r^3
+			float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
+			float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
+			float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
+			float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
+			float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
+		};
+		float rad_coeff [7];
+	};
 	// parameters, common for all sensors
 	float    elevation;     // degrees, up - positive;
 	float    heading;       // degrees, CW (from top) - positive
@@ -81,19 +100,34 @@ struct gc {
 	float forward    [NUM_CAMS];
 	float right      [NUM_CAMS];
 	float height     [NUM_CAMS];
-	float roll       [NUM_CAMS];  // degrees, CW (to target) - positive
-
+	float roll       [NUM_CAMS];    // degrees, CW (to target) - positive
+	float pXY0       [NUM_CAMS][2];
 	float common_right;    // mm right, camera center
 	float common_forward;  // mm forward (to target), camera center
 	float common_height;   // mm up, camera center
 	float common_roll;     // degrees CW (to target) camera as a whole
 //	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
 //	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
-	float rXY        [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
+	float rXY        [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
 //	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
 // only used for the multi-quad systems
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
 };

+extern "C" __global__ void get_tiles_offsets(
+		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist); // length should match RBYRDIST_LEN
+
+// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
+extern "C" __global__ void calc_rot_matrices(
+		struct corr_vector * gpu_correction_vector);
+// uses NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calc_rot_deriv(
+		struct corr_vector * gpu_correction_vector,
+		union trot_deriv   * gpu_rot_deriv);
+

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -44,6 +44,7 @@

 //#include "dtt8x8.cuh"
 #include "dtt8x8.h"
+#include "geometry_correction.h"
 #include "TileProcessor.cuh"
 ///#include "cuda_profiler_api.h"
 //#include "cudaProfiler.h"
@@ -339,6 +340,7 @@ struct tp_task {
    float            * host_kern_buf =  (float *)malloc(KERN_SIZE * sizeof(float));
 // static - see https://stackoverflow.com/questions/20253267/segmentation-fault-before-main
    static struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
+    union  trot_deriv  rot_deriv;
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
 //    int                texture_indices      [TILESX*TILESY];
    int                texture_indices      [TILESX*TILESYA];
@@ -386,13 +388,13 @@ struct tp_task {
    struct gc fgeometry_correction;
    float*  correction_vector;
    int     correction_vector_length;
-//    float rByRDist
    float * rByRDist;
    int     rByRDist_length;

-    float            * gpu_geometry_correction;
-    float            * gpu_correction_vector;
-    float            * gpu_rByRDist;
+    struct gc          * gpu_geometry_correction;
+    struct corr_vector * gpu_correction_vector;
+    float              * gpu_rByRDist;
+    union trot_deriv   * gpu_rot_deriv;

    readFloatsFromFile(
    		(float *) &fgeometry_correction, // float * data, // allocated array
@@ -405,11 +407,11 @@ struct tp_task {
    		correction_vector_file, // const char *  path,
    		&correction_vector_length); // int * len_in_floats)

-    gpu_geometry_correction =  copyalloc_kernel_gpu(
+    gpu_geometry_correction =  (struct gc *) copyalloc_kernel_gpu(
    		(float *) &fgeometry_correction,
    		sizeof(fgeometry_correction)/sizeof(float));

-    gpu_correction_vector =  copyalloc_kernel_gpu(
+    gpu_correction_vector =  (struct corr_vector * ) copyalloc_kernel_gpu(
    		correction_vector,
 			correction_vector_length);

@@ -417,6 +419,8 @@ struct tp_task {
    		rByRDist,
 			rByRDist_length);

+    checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
+
    float lpf_rbg[3][64]; // not used
    for (int ncol = 0; ncol < 3; ncol++) {
    	if (lpf_sigmas[ncol] > 0.0) {
@@ -597,6 +601,125 @@ struct tp_task {
    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         NUM_CAMS);
 //    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);

+
+#ifdef DBG_TILE
+    const int numIterations = 1; //0;
+    const int i0 =  0; // -1;
+#else
+    const int numIterations = 10; // 0; //0;
+    const int i0 = -1; // 0; // -1;
+#endif
+
+#define TEST_ROT_MATRICES
+#ifdef  TEST_ROT_MATRICES
+//    dim3 threads_rot(3,3,NUM_CAMS);
+//   dim3 grid_rot   (1, 1, 1);
+    dim3 threads_rot(3,3,3);
+    dim3 grid_rot   (NUM_CAMS, 1, 1);
+
+    printf("ROT_MATRICES: threads_list=(%d, %d, %d)\n",threads_rot.x,threads_rot.y,threads_rot.z);
+    printf("ROT_MATRICES: grid_list=(%d, %d, %d)\n",grid_rot.x,grid_rot.y,grid_rot.z);
+    StopWatchInterface *timerROT_MATRICES = 0;
+    sdkCreateTimer(&timerROT_MATRICES);
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerROT_MATRICES);
+    		sdkStartTimer(&timerROT_MATRICES);
+    	}
+
+//    	calc_rot_matrices<<<grid_rot,threads_rot>>> (
+//    			gpu_correction_vector);   // 		struct corr_vector * gpu_correction_vector,
+
+    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
+    			(corr_vector * ) gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
+    			(trot_deriv  * ) gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
+
+
+    	getLastCudaError("Kernel failure");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("test pass: %d\n",i);
+    }
+    ///	cudaProfilerStop();
+    sdkStopTimer(&timerROT_MATRICES);
+    float avgTimeROT_MATRICES = (float)sdkGetTimerValue(&timerROT_MATRICES) / (float)numIterations;
+    sdkDeleteTimer(&timerROT_MATRICES);
+    printf("Average calc_rot_matrices run time =%f ms\n",  avgTimeROT_MATRICES);
+	checkCudaErrors(cudaMemcpy(
+			&rot_deriv,
+			gpu_rot_deriv,
+			sizeof(trot_deriv),
+			cudaMemcpyDeviceToHost));
+	const char* matrices_names[] = {
+	    		"rot","d_daz","d_tilt","d_roll","d_zoom"};
+    for (int i = 0; i < 5;i++){
+		printf("Matrix %s for camera\n",matrices_names[i]);
+		for (int row = 0; row<3; row++){
+			for (int ncam = 0; ncam<NUM_CAMS;ncam++){
+				for (int col = 0; col <3; col++){
+					printf("%9.6f,",rot_deriv.matrices[i][ncam][row][col]);
+					if (col == 2){
+						if (ncam == (NUM_CAMS-1)){
+							printf("\n");
+						} else {
+							printf("   ");
+						}
+					} else {
+						printf(" ");
+					}
+				}
+			}
+		}
+    }
+
+
+#endif // TEST_ROT_MATRICES
+
+
+
+
+
+#define TEST_GEOM_CORR
+#ifdef  TEST_GEOM_CORR
+    dim3 threads_geom(TILES_PER_BLOCK_GEOM,1, 1);
+    dim3 grid_geom   ((tp_task_size+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
+    printf("GEOM: threads_list=(%d, %d, %d)\n",threads_geom.x,threads_geom.y,threads_geom.z);
+    printf("GEOM: grid_list=(%d, %d, %d)\n",grid_geom.x,grid_geom.y,grid_geom.z);
+    StopWatchInterface *timerGEOM = 0;
+    sdkCreateTimer(&timerGEOM);
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerGEOM);
+    		sdkStartTimer(&timerGEOM);
+    	}
+
+    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+    			gpu_tasks,                // struct tp_task   * gpu_tasks,
+				tp_task_size,             // int                num_tiles,          // number of tiles in task list
+				gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
+				gpu_correction_vector,   // 		struct corr_vector * gpu_correction_vector,
+				gpu_rByRDist); // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+
+    	getLastCudaError("Kernel failure");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("test pass: %d\n",i);
+    }
+    ///	cudaProfilerStop();
+    sdkStopTimer(&timerGEOM);
+    float avgTimeGEOM = (float)sdkGetTimerValue(&timerGEOM) / (float)numIterations;
+    sdkDeleteTimer(&timerGEOM);
+    printf("Average TextureList run time =%f ms\n",  avgTimeGEOM);
+#endif // TEST_GEOM_CORR
+
+
+
+
+
    //create and start CUDA timer
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);
@@ -607,28 +730,23 @@ struct tp_task {
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);

-#ifdef DBG_TILE
-    const int numIterations = 1; //0;
-    const int i0 =  0; // -1;
-#else
-    const int numIterations = 10; // 0; //0;
-    const int i0 = -1; // 0; // -1;
-#endif
+
+
    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
-///    cudaProfilerStart();
+    ///    cudaProfilerStart();
    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //    [NUM_CAMS];

    for (int i = i0; i < numIterations; i++)
    {
-        if (i == 0)
-        {
-            checkCudaErrors(cudaDeviceSynchronize());
-            sdkResetTimer(&timerTP);
-            sdkStartTimer(&timerTP);
-        }
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerTP);
+    		sdkStartTimer(&timerTP);
+    	}

-        convert_correct_tiles<<<grid_tp,threads_tp>>>(
-        		fgpu_kernel_offsets,    // struct CltExtra      ** gpu_kernel_offsets,
+    	convert_correct_tiles<<<grid_tp,threads_tp>>>(
+    			fgpu_kernel_offsets,    // struct CltExtra      ** gpu_kernel_offsets,
 				gpu_kernels,           // 		float           ** gpu_kernels,
 				gpu_images,            // 		float           ** gpu_images,
 				gpu_tasks,             // 		struct tp_task  * gpu_tasks,
@@ -638,11 +756,11 @@ struct tp_task {
 				0); // 7); // 0); // 7);                    //       int               lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green


-        getLastCudaError("Kernel execution failed");
-        checkCudaErrors(cudaDeviceSynchronize());
-        printf("%d\n",i);
+    	getLastCudaError("Kernel execution failed");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("%d\n",i);
    }
-//    checkCudaErrors(cudaDeviceSynchronize());
+    //    checkCudaErrors(cudaDeviceSynchronize());
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
    sdkDeleteTimer(&timerTP);
@@ -1154,6 +1272,8 @@ struct tp_task {
 	checkCudaErrors(cudaFree(gpu_geometry_correction));
    checkCudaErrors(cudaFree(gpu_correction_vector));
    checkCudaErrors(cudaFree(gpu_rByRDist));
+    checkCudaErrors(cudaFree(gpu_rot_deriv));
+

 	free (rByRDist);
 	free (correction_vector);

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -39,6 +39,7 @@
 // Avoiding includes in jcuda, all source files will be merged
 #pragma once
 #ifndef JCUDA
+#include <stdio.h>
 #define THREADSX         (DTT_SIZE)
 #define NUM_CAMS                  4
 #define NUM_PAIRS                 6
@@ -72,7 +73,11 @@
 #define THREADS_DYNAMIC_BITS      5 // treads in block for CDP creation of the texture list

 #define DBG_DISPARITY            32.0 // disparity for which to calculate offsets (not needed in Java)
-#define RBYRDIST_LEN          20001   // length of
+#define RBYRDIST_LEN           5001   // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
+#define RBYRDIST_STEP             0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
+#define TILES_PER_BLOCK_GEOM     32   // each tile has NUM_CAMS threads
+
+
 //#undef HAS_PRINTF
 #define HAS_PRINTF
 //7
@@ -87,10 +92,15 @@
 #define DEBUG8 1
 #define DEBUG9 1
 */
-#define DEBUG10 1
-#define DEBUG11 1
-#define DEBUG12 1
+//textures
+//#define DEBUG10 1
+//#define DEBUG11 1
+//#define DEBUG12 1
 //#define USE_textures_gen
-#define DEBUG_OOB1 1
+//#define DEBUG_OOB1 1
+// geom
+#define DEBUG20 1
+
+
 #endif //#ifndef JCUDA