updated kernels

3c033192 · Andrey Filippov · 514057c6 · 3c033192 · 3c033192 · 3c033192
Commit 3c033192 authored Apr 13, 2020 by Andrey Filippov
5 changed files
--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -45,6 +45,15 @@
 #include "TileProcessor.h"
 #endif // #ifndef JCUDA

+// CUDA fast math is slower!
+//#define FASTMATH 1
+/*
+ fast
+GPU run time =620.210698ms, (direct conversion: 24.077195999999997ms, imclt: 17.218263ms), corr2D: 85.503204ms), textures: 237.225665ms, RGBA: 256.185703ms
+nofast
+GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.090526999999998ms), corr2D: 30.623282999999997ms), textures: 231.154339ms, RGBA: 220.503017ms
+ */
+
 #define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))

 //#define IMCLT14
@@ -101,14 +110,6 @@

 #define MCLT_UNION_LEN   (DTT_SIZE2 * (DTT_SIZE2 + 2))

-// Use CORR_OUT_RAD for the correlation output
-//#define DBG_TILE_X     40
-//#define DBG_TILE_Y     80
-#define DBG_TILE_X     161 // 49
-#define DBG_TILE_Y     111 // 66
-
-#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
-#undef DBG_MARK_DBG_TILE

 //56494
 // struct tp_task
@@ -1019,19 +1020,7 @@ __global__ void correlate2D(
        	__syncthreads();// __syncwarp();
 #endif
 #endif
-
-
-
-
-
-
-
-
        } // if (color == 1){ // LPF only after B (nothing in mono)
-
-
-
-
    } // for (int color = 0; color < colors; color++){
    normalizeTileAmplitude(
    		clt_corr, // float * clt_tile, //       [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
@@ -1083,23 +1072,6 @@ __global__ void correlate2D(
 #endif
 #endif
     dttii_2d(clt_corr);
-/*
-    // change to 16-32 threads?? in next iteration
-    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
-    for (int q = 0; q < 4; q++){
-    	int is_sin = (q >> 1) & 1;
-    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
-    }
-    __syncthreads();
-
-    // hor pass, corresponding to vert pass in Java
-    for (int q = 0; q < 4; q++){
-    	int is_sin = q & 1;
-    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
-    }
-    __syncthreads();
-*/
-

 #ifdef DBG_TILE
 #ifdef DEBUG6
@@ -1170,6 +1142,7 @@ __global__ void generate_RBGA(
 			int                height, // <= TILESY, use for faster processing of LWIR images
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
 			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction
@@ -1920,11 +1893,11 @@ __global__ void textures_gen(
 #endif // ifdef USE_textures_gen
 extern "C"
 __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
 		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -2026,14 +1999,21 @@ __global__ void textures_accumulate(
 		}
 		__syncthreads();// __syncwarp();
 #endif
-		// perform idct
+
+#ifdef DBG_TILE		// perform idct
 		imclt8threads(
 				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
 				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 				mclt_tile,  // float * mclt_tile )
 				((tile_num == DBG_TILE)  && (threadIdx.x == 0)));
+#else
+		imclt8threads(
+				0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
+				clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+				mclt_tile,  // float * mclt_tile )
+				0);
+#endif
 		__syncthreads();// __syncwarp();
-
 #ifdef DEBUG7
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\ntextures_gen mclt color = %d\n",color);
@@ -2044,6 +2024,7 @@ __global__ void textures_accumulate(
 		__syncthreads();// __syncwarp();
 #endif
 		if (colors > 1) {
+#ifdef DBG_TILE
 			debayer_shot(
 					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
 					min_shot,    // float     min_shot,   // 10.0
@@ -2052,6 +2033,16 @@ __global__ void textures_accumulate(
 					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
 					mclt_tmp,    // float   * mclt_tmp,
 					((tile_num == DBG_TILE)  && (threadIdx.x == 0))); // int debug);
+#else
+			debayer_shot(
+					(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
+					min_shot,    // float     min_shot,   // 10.0
+					scale_shot,  // float     scale_shot, // 3.0 (0.0 for mono)
+					mclt_tile,   // float   * mclt_src,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_dst,    // float   * mclt_dst,   // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+					mclt_tmp,    // float   * mclt_tmp,
+					0); // int debug);
+#endif
 			__syncthreads();// __syncwarp();
 		} else {
 			// copy? - no, just remember to use mclt_tile, not mclt_dst
@@ -2125,6 +2116,7 @@ __global__ void textures_accumulate(
 	__syncthreads();// __syncwarp();
 #endif
 //	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];
+#ifdef DBG_TILE
 	tile_combine_rgba(
 			colors,                    // int     colors,        // number of colors
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
@@ -2140,7 +2132,23 @@ __global__ void textures_accumulate(
 			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
 			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
 			(tile_num == DBG_TILE) );  //int     debug );
-
+#else
+	tile_combine_rgba(
+			colors,                    // int     colors,        // number of colors
+			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
+			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
+			(float *) shr1.rgbaw,      // float * rgba,          // result
+			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
+			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
+			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
+			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
+			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
+			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
+			0);  //int     debug );
+#endif
 // return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
 // float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 //	size_t texture_tile_offset = + tile_indx * texture_stride;
@@ -2655,7 +2663,11 @@ __device__ void normalizeTileAmplitude(
 				*(clt_tile_j1) * *(clt_tile_j1) +
 				*(clt_tile_j2) * *(clt_tile_j2) +
 				*(clt_tile_j3) * *(clt_tile_j3);
+#ifdef FASTMATH
+		float scale = __frsqrt_rn(s2); // 1.0/sqrt(s2)
+#else
 		float scale = rsqrtf(s2); // 1.0/sqrt(s2)
+#endif
 		*(clt_tile_j0) *= scale;
 		*(clt_tile_j1) *= scale;
 		*(clt_tile_j2) *= scale;
@@ -3333,7 +3345,12 @@ __device__ void debayer_shot(


 	if (scale_shot > 0.0) {
+
+#ifdef FASTMATH
+		float k = __frsqrt_rn(min_shot);
+#else
 		float k = rsqrtf(min_shot);
+#endif

 		// double k = 1.0/Math.sqrt(min_shot); //sqrtf
 		//for (int i = 0; i < tile.length; i++) tile_db[i] = scale_shot* ((tile_db[i] > min_shot)? Math.sqrt(tile_db[i]) : (k*tile_db[i]));
@@ -3343,7 +3360,14 @@ __device__ void debayer_shot(
 #pragma unroll
 			for (int col = 0; col < DTT_SIZE2; col += DTT_SIZE){
 				float d = *mcltp;
+#ifdef FASTMATH
+				*mcltp = scale_shot * (( d > min_shot)? __fsqrt_rn(d) : (k * d));
+#else
 				*mcltp = scale_shot * (( d > min_shot)? sqrtf(d) : (k * d));
+#endif
+
+
+
 				mcltp += DTT_SIZE;
 			}
 			mcltp += (DTT_SIZE21-DTT_SIZE2);
@@ -3549,10 +3573,19 @@ __device__ void tile_combine_rgba(
 					s2 += d * d;
 				}
 				float mse = (s0*s2 - s1*s1) / (s0 * s0);
+#ifdef FASTMATH
+				* crms_col_i = __fsqrt_rn(mse);
+#else
 				* crms_col_i = sqrtf(mse);
+#endif
+
 				sw += *(chn_weights +ncol) * mse;
 			}
+#ifdef FASTMATH
+			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = __fsqrt_rn(sw); // will fade as window
+#else
 			*(crms_i + (DTT_SIZE2*DTT_SIZE21) * colors) = sqrtf(sw); // will fade as window
+#endif
 		}
 #ifdef DEBUG9
 	}
@@ -3605,7 +3638,12 @@ __device__ void tile_combine_rgba(
 				dc *= wnd2_inv; // to compensate fading near the edges
 				d+= *(chn_weights + ncol) * dc * dc;
 			}
+#ifdef FASTMATH
+			d = __expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#else
 			d = expf(-pair_dist2r[ipair] * d) + (FAT_ZERO_WEIGHT); // 0.5 for exact match, lower for mismatch. Add this weight to both ports involved
+#endif
+
 			// Add weight to both channels in a pair
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][0]) +=d;
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * pair_ports[ipair][1]) +=d;
@@ -3711,7 +3749,13 @@ __device__ void tile_combine_rgba(
 			}
 			// TODO: Should it use pair_dist2r ? no as it is relative?
 			//				port_weights[ip][i] = Math.exp(-ksigma * d2[ip]);
+
+#ifdef FASTMATH
+			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = __expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#else
 			*(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam) = expf(-ksigma * d2_ip) + (FAT_ZERO_WEIGHT);
+#endif
+
 		}
 		// and now make a new average with those weights
 		// Inserting dust remove here
@@ -3879,7 +3923,11 @@ __device__ void tile_combine_rgba(
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
 				mx = fmaxf(mx, max_diff_tmp[cam][i]);
 			}
+#ifdef FASTMATH
+			max_diff[cam] = __fsqrt_rn(mx);
+#else
 			max_diff[cam] = sqrtf(mx);
+#endif
 		}
 	}


--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -41,6 +41,18 @@
 #include "tp_defines.h"
 #endif

+
+extern "C"
+__global__ void convert_correct_tiles(
+		float           ** gpu_kernel_offsets, // [NUM_CAMS],
+		float           ** gpu_kernels,        // [NUM_CAMS],
+		float           ** gpu_images,         // [NUM_CAMS],
+		struct tp_task   * gpu_tasks,
+		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		size_t             dstride,            // in floats (pixels)
+		int                num_tiles,          // number of tiles in task
+		int                lpf_mask);          // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+
 extern "C" __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
@@ -68,11 +80,11 @@ extern "C" __global__ void clear_texture_rbga(
 		const size_t      texture_rbga_stride,     // in floats 8*stride
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
 extern "C" __global__ void textures_accumulate(
-//		int               border_tile,        // if 1 - watch for border
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+		// TODO: use geometry_correction rXY !
 		float           * gpu_port_offsets,   // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -102,5 +114,35 @@ extern "C" __global__ void imclt_rbg(
 		int               h_offset,
 		const size_t      dstride);            // in floats (pixels)

+extern "C"
+__global__ void generate_RBGA(
+// Parameters to generate texture tasks
+			struct tp_task   * gpu_tasks,
+			int                num_tiles,          // number of tiles in task list
+// declare arrays in device code?
+			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+			int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
+			int              * woi,                // x,y,width,height of the woi
+			int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+			int                height, // <= TILESY, use for faster processing of LWIR images
+// Parameters for the texture generation
+			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			// TODO: use geometry_correction rXY !
+			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+			int               colors,             // number of colors (3/1)
+			int               is_lwir,            // do not perform shot correction
+			float             min_shot,           // 10.0
+			float             scale_shot,         // 3.0
+			float             diff_sigma,         // pixel value/pixel change
+			float             diff_threshold,     // pixel value/pixel change
+			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+			float             weight0,            // scale for R
+			float             weight1,            // scale for B
+			float             weight2,            // scale for G
+			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+			int               keep_weights,       // return channel weights after A in RGBA (was removed)
+			const size_t      texture_rbga_stride,     // in floats
+			float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+


--- a/src/main/resources/kernels/dtt8x8.h
+++ b/src/main/resources/kernels/dtt8x8.h
@@ -72,9 +72,9 @@


 // kernels (not used so far)
-#ifdef BBBB
+#if 0
 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
-#endif// #ifdef BBBB
+#endif// #if 0

 //=========================== 2D functions ===============
 extern __device__ void corrUnfoldTile(

--- a/src/main/resources/kernels/geometry_correction.cu
+++ b/src/main/resources/kernels/geometry_correction.cu
+/**
+ **
+ ** geometry_correction.cu
+ **
+ ** Copyright (C) 2020 Elphel, Inc.
+ **
+ ** -----------------------------------------------------------------------------**
+ **
+ **  geometry_correction.cu is free software: you can redistribute it and/or modify
+ **  it under the terms of the GNU General Public License as published by
+ **  the Free Software Foundation, either version 3 of the License, or
+ **  (at your option) any later version.
+ **
+ **  This program is distributed in the hope that it will be useful,
+ **  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ **  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ **  GNU General Public License for more details.
+ **
+ **  You should have received a copy of the GNU General Public License
+ **  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ **
+ **  Additional permission under GNU GPL version 3 section 7
+ **
+ **  If you modify this Program, or any covered work, by linking or
+ **  combining it with NVIDIA Corporation's CUDA libraries from the
+ **  NVIDIA CUDA Toolkit (or a modified version of those libraries),
+ **  containing parts covered by the terms of NVIDIA CUDA Toolkit
+ **  EULA, the licensors of this Program grant you additional
+ **  permission to convey the resulting work.
+ ** -----------------------------------------------------------------------------**
+ */
+
+/**
+**************************************************************************
+* \file geometry_correction.cu
+* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
+
+*/
+#ifndef JCUDA
+#include "tp_defines.h"
+#include "dtt8x8.h"
+#include "geometry_correction.h"
+#endif // #ifndef JCUDA
+
+// Using NUM_CAMS threads per tile
+#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
+#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_CV   ((sizeof(struct corr_vector)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_RBRD ((RBYRDIST_LEN + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+//#define CYCLES_COPY_ROTS ((NUM_CAMS * 3 *3 + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+#define CYCLES_COPY_ROTS (((sizeof(trot_deriv)/sizeof(float)) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
+
+#define DBG_CAM 3
+
+__device__ void printGeometryCorrection(struct gc * g);
+__device__ void printExtrinsicCorrection(corr_vector * cv);
+/**
+ * Calculate non-distorted radius from distorted using table approximation
+ * @param rDist distorted radius
+ * @return corresponding non-distorted radius
+ */
+inline __device__ float getRByRDist(float rDist,
+		float rByRDist [RBYRDIST_LEN]); //shared memory
+
+__constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
+		{ // azimuth
+				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
+				{{ 0, 0,0},{0, 0,1},{ 0, 0,0}},
+				{{ 0, 1,0},{0, 0,0},{ 1, 0,0}},
+
+		},{ // tilt
+				{{ 0, 0,1},{0, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{1, 0,0},{ 0, 1,0}},
+				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
+		},{ // roll*zoom
+				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,1}},
+
+		},{ // d_azimuth
+				{{ 0,-1,0},{0, 0,0},{-1, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
+		},{ // d_tilt
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
+				{{ 0, 0,0},{-1,0,0},{ 0,-1,0}},
+		},{ // d_roll
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{-1, 0,0},{0,-1,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+		},{ // d_zoom
+				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
+				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
+				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
+		}
+};
+
+__constant__ int angles_offsets [4] = {
+		offsetof(corr_vector, azimuth)/sizeof(float),
+		offsetof(corr_vector, tilt)   /sizeof(float),
+		offsetof(corr_vector, roll)   /sizeof(float),
+		offsetof(corr_vector, roll)   /sizeof(float)};
+__constant__ int mm_seq [3][3][3]={
+		{
+				{6,5,12}, // a_t * a_z -> tmp0
+				{7,6,13}, // a_r * a_t -> tmp1
+				{7,9,14}, // a_r * a_dt -> tmp2
+		}, {
+				{7,12,0}, // a_r * tmp0 -> rot          - bad
+				{13,8,1}, // tmp1 * a_daz -> deriv0     - good
+				{14,5,2}, // tmp2 * a_az  -> deriv1     - good
+		}, {
+				{10,12,3}, // a_dr * tmp0 -> deriv2     - good
+				{11,12,4}, // a_dzoom * tnmp0 -> deriv3 - good
+				{-1,-1,-1} // do nothing
+		}};
+
+#if 0
+__device__ float rot_matrices       [NUM_CAMS][3][3];
+//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
+
+
+// threads (3,3,4)
+extern "C" __global__ void calc_rot_matrices(
+		struct corr_vector * gpu_correction_vector)
+{
+	__shared__ float zoom    [NUM_CAMS];
+	__shared__ float sincos  [NUM_CAMS][3][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
+	__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
+
+	float angle;
+	int ncam = threadIdx.z;
+	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
+	int nangle =  nangle1 >> 1;
+	int is_sin = nangle1 & 1;
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0)  && ( threadIdx.y == 0)  && ( threadIdx.z == 0)){
+		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
+		printExtrinsicCorrection(gpu_correction_vector);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+	if (nangle < 4){ // this part only for 1-st 3
+		float* gangles =
+				(nangle ==0)?gpu_correction_vector->azimuth:(
+						(nangle ==1)?gpu_correction_vector->tilt:(
+								(nangle ==2)?gpu_correction_vector->roll:
+										gpu_correction_vector->zoom));
+		if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
+			angle = *(gangles + ncam);
+
+		} else {
+			angle = 0.0f;
+
+#pragma	unroll
+			for (int n = 0; n < (NUM_CAMS-1); n++){
+				angle -= *(gangles + n);
+			}
+		}
+		if (!is_sin){
+			angle += M_PI/2;
+		}
+		if (nangle < 3) {
+			sincos[ncam][nangle][is_sin]=sinf(angle);
+		} else if (is_sin){
+			zoom[ncam] = angle;
+		}
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+			printf("\n    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
+			printf("    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			printf("    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+	if (nangle == 3) {
+		sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+			printf("\na    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
+			printf("a    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			printf("a    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+
+
+	// now 3x3
+	for (int axis = 0; axis < 3; axis++) {
+		matrices[ncam][axis][threadIdx.y][threadIdx.x] =
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
+				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2];                         // const
+	}
+	__syncthreads();
+
+
+#ifdef DEBUG20a
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+
+			printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][0][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][1][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][2][i][j]);
+				}
+				printf("\n");
+			}
+
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+
+
+    // tilt * az ->
+	// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
+	matrices[ncam][3][threadIdx.y][threadIdx.x] =
+			matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
+			matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
+			matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
+	// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
+	__syncthreads();
+	rot_matrices[ncam][threadIdx.y][threadIdx.x] =
+			matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
+			matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
+			matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
+	__syncthreads();
+
+
+#ifdef DEBUG20
+	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int n = 0; n < NUM_CAMS; n++){
+
+			printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][0][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][1][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", matrices[n][2][i][j]);
+				}
+				printf("\n");
+			}
+
+			printf("2 - Rotation matrix for camera %d\n", n);
+			for (int i = 0; i < 3; i++){
+				for (int j = 0; j < 3; j++){
+					printf("%9.6f, ", rot_matrices[n][i][j]);
+				}
+				printf("\n");
+			}
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+}
+#endif
+__constant__ int offset_rots =     0;                   //0
+__constant__ int offset_derivs =   1;                   // 1..4 // should be next
+__constant__ int offset_matrices = 5;   // 5..11
+__constant__ int offset_tmp =      12; // 12..15
+
+/**
+ * Calculate rotation matrices and derivatives by az, tilt, roll, zoom
+ * NUM_CAMS blocks of 3,3,3 tiles
+ */
+extern "C" __global__ void calc_rot_deriv(
+		struct corr_vector * gpu_correction_vector,
+		trot_deriv   * gpu_rot_deriv)
+{
+//	__shared__ float zoom;
+	__shared__ float sincos  [4][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
+	__shared__ float matrices[5 + 7 +4][3][3];
+	float angle;
+	float zoom;
+	int ncam =    blockIdx.x; // threadIdx.z;
+	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
+	int nangle =  nangle1 >> 1;
+	int is_sin = nangle1 & 1;
+	if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
+		float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
+		if (ncam == (NUM_CAMS-1)){ // for the whole block
+			angle = 0.0;
+			zoom = 0.0;
+#pragma	unroll
+			for (int n = 0; n < (NUM_CAMS-1); n++){
+				angle -= *(gangles + n);
+				zoom -= gpu_correction_vector->zoom[n];
+			}
+			if (nangle >= 2){ // diverging for roll (last two)
+				angle = *(gangles + ncam);
+			}
+
+		} else {
+			angle = *(gangles + ncam);
+			zoom =   gpu_correction_vector->zoom[ncam];
+		}
+		if (!is_sin){
+			angle += M_PI/2;
+		}
+		float sc = sinf(angle);
+		if (nangle ==2) {
+			sc *= 1.0 + zoom;
+		}
+		sincos[nangle][is_sin]= sc;
+	}
+	__syncthreads();
+
+#ifdef DEBUG20
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		printf("\n    Azimuth matrix for   camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
+		printf(  "    Tilt matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[1][0], sincos[1][1]);
+		printf(  "    Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[2][0], sincos[2][1]);
+		printf(  "    Roll matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[3][0], sincos[3][1]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+
+
+// Create 3 3x3 matrices for az, tilt, roll/zoom:
+	int axis = offset_matrices+threadIdx.z; // 0..2
+	int const_index = threadIdx.z; // 0..2
+	matrices[axis][threadIdx.y][threadIdx.x] =
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                   // const
+	axis += 3; // skip index == 3
+	const_index +=3;
+	matrices[axis][threadIdx.y][threadIdx.x] =
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                          // const
+	if (threadIdx.z == 0){
+		axis += 3;
+		const_index +=3;
+		matrices[axis][threadIdx.y][threadIdx.x] =
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
+				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                // const
+
+	}
+	__syncthreads();
+
+#ifdef DEBUG20
+	const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
+
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+		for (int i = 0; i < 7; i++) {
+			printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
+			for (int row = 0; row < 3; row++){
+				for (int col = 0; col < 3; col++){
+					printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
+				}
+				printf("\n");
+			}
+
+		}
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+/*
+	__constant__ int mm_seq [3][3][3]={
+			{
+					{6,5,12}, // a_t * a_z -> tmp0
+					{7,6,13}, // a_r * a_t -> tmp1
+					{7,9,14}, // a_r * a_dt -> tmp2
+			}, {
+					{7,12,0}, // a_r * tmp0 -> rot
+					{13,8,1}, // tmp1 * a_daz -> deriv0
+					{14,5,2}, // tmp2 * a_az  -> deriv1
+			}, {
+					{10,12,3}, // a_dr * tmp0 -> deriv2
+					{11,12,4}, // a_dzoom * tnmp0 -> deriv3
+			}};
+*/
+	for (int i = 0; i < 3; i++){
+		int srcl = mm_seq[i][threadIdx.z][0];
+		int srcr = mm_seq[i][threadIdx.z][1];
+		int dst =  mm_seq[i][threadIdx.z][2];
+		if (srcl >= 0){
+			matrices[dst][threadIdx.y][threadIdx.x] =
+					matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
+					matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
+					matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
+		}
+		__syncthreads();
+	}
+// copy results to global memory
+	int gindx = threadIdx.z;
+	int lindx = offset_rots + threadIdx.z;
+#ifdef NVRTC_BUG
+	// going beyond first dimension
+	gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+#else
+	gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+#endif
+	gindx +=3;
+	lindx+=3;
+	if (lindx < 5) {
+#ifdef NVRTC_BUG
+	// going beyond first dimension
+		gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+#else
+		gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
+#endif
+	}
+	__syncthreads();
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+			printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG20
+
+// All done - read/verify all arrays
+
+}
+
+
+/*
+ * blockDim.x = NUM_CAMS
+ * blockDim.y = TILES_PER_BLOCK_GEOM
+ */
+
+extern "C" __global__ void get_tiles_offsets(
+		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist,      // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv)
+{
+//	int task_num = blockIdx.x * blockDim.x + threadIdx.x; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.x
+	int task_num = blockIdx.x * blockDim.y + threadIdx.y; //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
+	if (task_num >= num_tiles){
+		return;
+	}
+	int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
+	int ncam = threadIdx.x;
+	// threadIdx.x - numcam, used for per-camera
+	__shared__ struct gc geometry_correction;
+	__shared__ float rByRDist [RBYRDIST_LEN];
+	__shared__ struct corr_vector extrinsic_corr;
+	__shared__ trot_deriv rot_deriv;
+	float pXY[2]; // result to be copied to task
+	// copy data common to all threads
+	{
+		float * gcp_local =  (float *) &geometry_correction;
+		float * gcp_global = (float *) gpu_geometry_correction;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_GC; i++){
+			if (offset < sizeof(struct gc)/sizeof(float)) {
+				*(gcp_local + offset) = *(gcp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	{
+		float * cvp_local =  (float *) &extrinsic_corr;
+		float * cvp_global = (float *) gpu_correction_vector;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_CV; i++){
+			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
+				*(cvp_local + offset) = *(cvp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	// TODO: maybe it is better to use system memory and not read all table?
+	{
+		float * rByRDistp_local =  (float *) rByRDist;
+		float * rByRDistp_global = (float *) gpu_rByRDist;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_RBRD; i++){
+			if (offset < RBYRDIST_LEN) {
+				*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	// copy rotational  matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
+	{
+		float * rots_local =  (float *) &rot_deriv;
+		float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
+		int offset = thread_xy;
+		for (int i = 0; i < CYCLES_COPY_ROTS; i++){
+			if (offset < sizeof(trot_deriv)/sizeof(float)) {
+				*(rots_local + offset) = *(rots_global + offset);
+			}
+			offset += THREADS_PER_BLOCK_GEOM;
+		}
+	}
+	__syncthreads();
+	int imu_exists = // todo - calculate once with rot_deriv?
+			(extrinsic_corr.imu_rot[0] != 0.0) ||
+			(extrinsic_corr.imu_rot[1] != 0.0) ||
+			(extrinsic_corr.imu_rot[2] != 0.0) ||
+			(extrinsic_corr.imu_move[0] != 0.0) ||
+			(extrinsic_corr.imu_move[1] != 0.0) ||
+			(extrinsic_corr.imu_move[2] != 0.0);
+
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("\nTile = %d, camera= %d\n", task_num, ncam);
+		printf("\nget_tiles_offsets() threadIdx.x = %d,  threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
+		printGeometryCorrection(&geometry_correction);
+		printExtrinsicCorrection(&extrinsic_corr);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+	//		String dbg_s = corr_vector.toString();
+	/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
+	 * 1) unapply common distortion (maybe for different - master camera)
+	 * 2) apply disparity
+	 * 3) apply rotations and zoom
+	 * 4) re-apply distortion
+	 * 5) return port center X and Y
+	 * line_time
+	 */
+
+	// common code, calculated in parallel
+	int cxy = gpu_tasks[task_num].txy;
+	float disparity = gpu_tasks[task_num].target_disparity;
+	int tileX = (cxy & 0xffff);
+	int tileY = (cxy >> 16);
+	float px = tileX * DTT_SIZE + DTT_SIZE/2; //  - shiftX;
+	float py = tileY * DTT_SIZE + DTT_SIZE/2; //  - shiftY;
+
+	float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
+	float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
+
+//	float rXY [NUM_CAMS][2];
+	float rXY [2];
+
+//	for (int i = 0; i < NUM_CAMS;i++){
+//	rXY[ncam][0] = geometry_correction.rXY[ncam][0];
+//	rXY[ncam][1] = geometry_correction.rXY[ncam][1];
+	rXY[0] = geometry_correction.rXY[ncam][0];
+	rXY[1] = geometry_correction.rXY[ncam][1];
+//	}
+
+	float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
+	float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
+	float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
+	float pYc = pYcd * rND2R; // in pixels
+	float xyz [3]; // getWorldCoordinates
+	xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
+			(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
+	xyz[0] =  SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
+	xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
+	// next radial distortion coefficients are for this, not master camera (may be the same)
+//	geometry_correction.rad_coeff[i];
+	float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
+	float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
+
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("\nTile = %d, camera= %d\n", task_num, ncam);
+		printf("tileX = %d,  tileY = %d\n", tileX, tileY);
+		printf("px = %f,  py = %f\n", px, py);
+		printf("pXcd = %f,  pYcd = %f\n", pXcd, pYcd);
+		printf("rXY[0] = %f,  rXY[1] = %f\n", rXY[0], rXY[1]);
+		printf("rD = %f,  rND2R = %f\n", rD, rND2R);
+		printf("pXc = %f,  pYc = %f\n", pXc, pYc);
+		printf("fl_pix = %f,  ri_scale = %f\n", fl_pix, ri_scale);
+		printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+
+	// above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
+			// non-distorted XY of the shifted location of the individual sensor
+
+	// -------------- Each camera calculated by its own thread ----------------
+	float pXci0 = pXc - disparity *  rXY[0]; // [ncam][0]; // in pixels
+	float pYci0 = pYc - disparity *  rXY[1]; // [ncam][1];
+	// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
+	// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
+	///		double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
+	///		Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
+	// Apply port-individual combined rotation/zoom matrix
+	///		Matrix rvi = rots[i].times(vi);
+
+	float rvi[3];
+#pragma unroll
+	for (int j = 0; j< 3; j++){
+		rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
+	}
+	// get back to the projection plane by normalizing vector
+	float norm_z = fl_pix/rvi[2];
+	float pXci =  rvi[0] * norm_z;
+	float pYci =  rvi[1] * norm_z;
+	// Re-apply distortion
+	float rNDi =  sqrtf(pXci*pXci + pYci*pYci); // in pixels
+	float ri =    rNDi* ri_scale; // relative to distortion radius
+
+	float rD2rND = 1.0;
+	{
+		float rri = 1.0;
+#ifdef NVRTC_BUG
+#pragma unroll
+		for (int j = 0; j < RAD_COEFF_LEN; j++){
+			rri *= ri;
+			rD2rND +=  ((float *) &geometry_correction.distortionC)[j]*(rri - 1.0);
+		}
+#else
+		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+			rri *= ri;
+			rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
+		}
+#endif
+	}
+	// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
+	float pXid = pXci * rD2rND;
+	float pYid = pYci * rD2rND;
+	pXY[0] =  pXid + geometry_correction.pXY0[ncam][0];
+	pXY[1] =  pYid + geometry_correction.pXY0[ncam][1];
+
+	// used when calculating derivatives, TODO: combine calculations !
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("pXci0 = %f,  pYci0 = %f\n", pXci0, pYci0);
+		printf("rvi[0] = %f,  rvi[1] = %f,  rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
+		printf("norm_z = %f,  pXci = %f,  pYci = %f\n", norm_z, pXci, pYci);
+		printf("rNDi = %f,  ri = %f\n", rNDi, ri);
+		printf("rD2rND = %f\n", rD2rND);
+		printf("pXid = %f,  pYid = %f\n", pXid, pYid);
+		printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+
+
+	//	float rvi[3];
+	float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
+	float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
+	float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
+
+#pragma unroll
+	for (int j = 0; j< 3; j++){
+//		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  rvi[0] + rot_deriv.d_daz[ncam][j][1] *  rvi[1] + rot_deriv.d_daz[ncam][j][2] *  rvi[2];
+//		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * rvi[0] + rot_deriv.d_tilt[ncam][j][1] * rvi[1] + rot_deriv.d_tilt[ncam][j][2] * rvi[2];
+//		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * rvi[0] + rot_deriv.d_roll[ncam][j][1] * rvi[1] + rot_deriv.d_roll[ncam][j][2] * rvi[2];
+		drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] *  pXci0 + rot_deriv.d_daz[ncam][j][1] *  pYci0 + rot_deriv.d_daz[ncam][j][2] *  fl_pix;
+		drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
+		drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
+	}
+//			double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
+
+	float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
+	float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
+	float dpXci_dtilt =    drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
+	float dpYci_dtilt =    drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
+	float dpXci_droll =    drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
+	float dpYci_droll =    drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("drvi_daz[0] = %f,  drvi_daz[1] = %f,  drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
+		printf("drvi_dtl[0] = %f,  drvi_dtl[1] = %f,  drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
+		printf("drvi_drl[0] = %f,  drvi_drl[1] = %f,  drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
+
+		printf("dpXci_dazimuth = %f,  dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
+		printf("dpXci_dtilt = %f,     dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
+		printf("dpXci_droll = %f,     dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+	float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
+	float dpXci_pYci_imu_lin[2][3];
+/*
+ 				double [][] add0 = {
+						{-rXY[i][0],  rXY[i][1], 0.0},
+						{-rXY[i][1], -rXY[i][0], 0.0},
+						{ 0.0,        0.0,       0.0}}; // what is last element???
+				Matrix dd0 = new Matrix(add0);
+				Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
+
+ */
+	float dd1[2][2];// get top left 2x2 sub-matrix
+	dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
+	dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
+	dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
+	dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("dd1[0][0] = %f,  dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
+		printf("dd1[1][0] = %f,  dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+
+	// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
+	// unity vector in the direction of radius
+	float c_dist = pXci/rNDi;
+	float s_dist = pYci/rNDi;
+/*
+				double [][] arot2= {
+						{c_dist, s_dist},
+						{-s_dist, c_dist}};
+				Matrix rot2 = new Matrix(arot2); // convert from non-distorted X,Y to parallel and perpendicular (CCW) to the radius
+
+				double [][] ascale_distort = {
+						{rD2rND + ri* drD2rND_dri, 0     },
+						{0,                       rD2rND}};
+				Matrix scale_distort = new Matrix(ascale_distort); // scale component parallel to radius as distortion derivative, perpendicular - as distortion
+
+				Matrix dd2 = rot2.transpose().times(scale_distort).times(rot2).times(dd1);
+
+				disp_dist[i][0] =   dd2.get(0, 0);
+				disp_dist[i][1] =   dd2.get(0, 1);
+				disp_dist[i][2] =   dd2.get(1, 0); // d_py/d_disp
+				disp_dist[i][3] =   dd2.get(1, 1);
+
+ */
+//#undef NVRTC_BUG
+	float drD2rND_dri = 0.0;
+	{
+		float rri = 1.0;
+#ifdef NVRTC_BUG
+#pragma unroll
+		for (int j = 0; j < RAD_COEFF_LEN; j++){
+			drD2rND_dri += ((float *) &geometry_correction.distortionC)[j] * (j+1) * rri;
+			rri *= ri;
+		}
+#else
+#pragma unroll
+		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+			drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
+			rri *= ri;
+		}
+#endif
+	}
+	float scale_distort00 = rD2rND + ri* drD2rND_dri;
+	float scale_distort11 = rD2rND;
+//	float rot2Xdd1[2][2];
+//	rot2Xdd1[0][0] =  c_dist * dd1[0][0] + s_dist * dd1[1][0];
+//	rot2Xdd1[0][1] =  c_dist * dd1[0][1] + s_dist * dd1[1][1];
+//	rot2Xdd1[1][0] = -s_dist * dd1[0][0] + c_dist * dd1[1][0];
+//	rot2Xdd1[1][1] = -s_dist * dd1[0][1] + c_dist * dd1[1][1];
+	float scale_distortXrot2Xdd1[2][2];
+	scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
+	scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
+	scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
+	scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
+
+	disp_dist[0] =    c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
+	disp_dist[1] =    c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
+	disp_dist[2] =    s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
+	disp_dist[3] =    s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("scale_distortXrot2Xdd1[0][0] = %f,  scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
+		printf("scale_distortXrot2Xdd1[1][0] = %f,  scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
+		printf("disp_dist[0] = %f\n", disp_dist[0]);
+		printf("disp_dist[1] = %f\n", disp_dist[1]);
+		printf("disp_dist[2] = %f\n", disp_dist[2]);
+		printf("disp_dist[3] = %f\n", disp_dist[3]);
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+
+	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
+	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
+	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
+	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+
+//	imu =  extrinsic_corr.getIMU(i); // currently it is common for all channels
+//	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
+//	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
+// ERS linear does not yet use per-port rotations, probably not needed
+	if (imu_exists){
+		float delta_t = disp_dist[2] * disparity * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
+		float ers_Xci =	delta_t * (
+				dpXci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpXci_dazimuth * extrinsic_corr.imu_rot[1]  +
+				dpXci_droll * extrinsic_corr.imu_rot[2]);
+		float ers_Yci =	delta_t* (
+				dpYci_dtilt * extrinsic_corr.imu_rot[0] +
+				dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
+				dpYci_droll * extrinsic_corr.imu_rot[2]);
+#ifdef DEBUG21
+		if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+			printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+		}
+		__syncthreads();// __syncwarp();
+#endif // DEBUG21
+		if (disparity >= MIN_DISPARITY){ // all threads together
+			float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
+			float wdisparity = disparity;
+			float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
+			dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
+			dpXci_pYci_imu_lin[1][1] =  wdisparity / k; // dpy/ dworld_Y
+			dpXci_pYci_imu_lin[0][2] =  (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
+			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+			ers_Xci += delta_t* (
+					dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
+					dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2]);
+			ers_Yci += delta_t* (
+					dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
+					dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2]);
+			pXY[0] +=  ers_Xci * rD2rND; // added correction to pixel X
+			pXY[1] +=  ers_Yci * rD2rND; // added correction to pixel Y
+
+#ifdef DEBUG21
+	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+		printf("k = %f,  wdisparity = %f,  dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
+		printf("dpXci_pYci_imu_lin[0][0] = %f,  dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
+		printf("dpXci_pYci_imu_lin[1][1] = %f,  dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
+
+		printf("delta_t = %f,  ers_Xci = %f,  ers_Yci = %f\n", delta_t, ers_Xci, ers_Yci);
+		printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]); // OK
+	}
+	__syncthreads();// __syncwarp();
+#endif // DEBUG21
+
+		}
+	}
+	// copy results to global memory pXY,  disp_dist
+	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
+	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+
+
+}
+
+
+
+/**
+ * Calculate non-distorted radius from distorted using table approximation
+ * @param rDist distorted radius
+ * @return corresponding non-distorted radius
+ */
+inline __device__ float getRByRDist(float rDist,
+		float rByRDist [RBYRDIST_LEN]) //shared memory
+{
+	if (rDist < 0) {
+		return 0.0f; // normally should not happen
+	}
+	float findex = rDist/RBYRDIST_STEP;
+	int index= (int) floorf(findex);
+	if (index < 0){
+		index = 0;
+	}
+	if (index > (RBYRDIST_LEN - 3)) {
+		index = RBYRDIST_LEN - 3;
+	}
+	float mu = fmaxf(findex - index, 0.0f);
+	float mu2 = mu * mu;
+	float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
+	// use Catmull-Rom
+	float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
+	float a1 =        y0 - 2.5 * rByRDist[index] + 2   * rByRDist[index+1] - 0.5 * rByRDist[index+2];
+	float a2 = -0.5 * y0                              + 0.5 * rByRDist[index+1];
+	float a3 =  rByRDist[index];
+	float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
+	return result;
+}
+
+__device__ void printGeometryCorrection(struct gc * g){
+#ifndef JCUDA
+	printf("\nGeometry Correction\n------------------\n");
+	printf("%22s: %f\n","pixelCorrectionWidth",  g->pixelCorrectionWidth);
+	printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
+	printf("%22s: %f\n","line_time",             g->line_time);
+
+	printf("%22s: %f\n","focalLength", g->focalLength);
+	printf("%22s: %f\n","pixelSize",   g->pixelSize);
+	printf("%22s: %f\n","distortionRadius",g->distortionRadius);
+
+	printf("%22s: %f\n","distortionC", g->distortionC);
+	printf("%22s: %f\n","distortionB", g->distortionB);
+	printf("%22s: %f\n","distortionA", g->distortionA);
+	printf("%22s: %f\n","distortionA5",g->distortionA5);
+	printf("%22s: %f\n","distortionA6",g->distortionA6);
+	printf("%22s: %f\n","distortionA7",g->distortionA7);
+	printf("%22s: %f\n","distortionA8",g->distortionA8);
+
+	printf("%22s: %f\n","elevation",   g->elevation);
+	printf("%22s: %f\n","heading",     g->heading);
+
+	printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
+	printf("%22s: %f, %f, %f, %f \n","right",   g->right[0],   g->right[1],   g->right[2],   g->right[3]);
+	printf("%22s: %f, %f, %f, %f \n","height",  g->height[0],  g->height[1],  g->height[2],  g->height[3]);
+	printf("%22s: %f, %f, %f, %f \n","roll",    g->roll[0],    g->roll[1],    g->roll[2],    g->roll[3]);
+	printf("%22s: %f, %f \n",        "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
+	printf("%22s: %f, %f \n",        "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
+	printf("%22s: %f, %f \n",        "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
+	printf("%22s: %f, %f \n",        "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
+
+	printf("%22s: %f\n","common_right",   g->common_right);
+	printf("%22s: %f\n","common_forward", g->common_forward);
+	printf("%22s: %f\n","common_height",  g->common_height);
+	printf("%22s: %f\n","common_roll",    g->common_roll);
+
+	printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
+	printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
+
+	printf("%22s: %f\n","cameraRadius",    g->cameraRadius);
+	printf("%22s: %f\n","disparityRadius", g->disparityRadius);
+#endif //ifndef JCUDA
+}
+
+__device__ void printExtrinsicCorrection(corr_vector * cv)
+{
+#ifndef JCUDA
+	printf("\nExtrinsic Correction Vector\n---------------------------\n");
+	printf("%22s: %f, %f, %f\n",     "tilt",    cv->tilt[0],    cv->tilt[1],    cv->tilt[2]);
+	printf("%22s: %f, %f, %f\n",     "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
+	printf("%22s: %f, %f, %f, %f\n", "roll",    cv->roll[0],    cv->roll[1],    cv->roll[2],      cv->roll[3]);
+	printf("%22s: %f, %f, %f\n",     "zoom",    cv->zoom[0],    cv->zoom[1],    cv->zoom[2]);
+
+	printf("%22s: %f(t), %f(a), %f(r)\n",     "imu_rot",    cv->imu_rot[0],    cv->imu_rot[1],    cv->imu_rot[2]);
+	printf("%22s: %f(x), %f(y), %f(z)\n",     "imu_move",    cv->imu_move[0],    cv->imu_move[1],    cv->imu_move[2]);
+#endif //ifndef JCUDA
+}
+
+
+
--- a/src/main/resources/kernels/geometry_correction.h
+++ b/src/main/resources/kernels/geometry_correction.h
@@ -41,6 +41,19 @@
 #include "tp_defines.h"
 #endif

+#define NVRTC_BUG 1
+#ifndef M_PI
+#define M_PI  3.14159265358979323846 /* pi */
+#endif
+#ifndef offsetof
+#define offsetof(st, m) \
+    ((size_t)&(((st *)0)->m))
+//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
+#endif
+
+
+#define SCENE_UNITS_SCALE  0.001 // meters from mm
+#define MIN_DISPARITY      0.01  // minimal disparity to try to convert to world coordinates
 struct tp_task {
 	int   task;
 	union {
@@ -61,19 +74,50 @@ struct corr_vector{
 	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
 	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
 };
+#ifdef NVRTC_BUG
+struct trot_deriv{
+	float rots    [NUM_CAMS][3][3];
+	float d_daz   [NUM_CAMS][3][3];
+	float d_tilt  [NUM_CAMS][3][3];
+	float d_roll  [NUM_CAMS][3][3];
+	float d_zoom  [NUM_CAMS][3][3];
+};
+#else
+union trot_deriv{
+	struct {
+		float rots    [NUM_CAMS][3][3];
+		float d_daz   [NUM_CAMS][3][3];
+		float d_tilt  [NUM_CAMS][3][3];
+		float d_roll  [NUM_CAMS][3][3];
+		float d_zoom  [NUM_CAMS][3][3];
+	};
+	float matrices [5][NUM_CAMS][3][3];
+};
+#endif

 struct gc {
+	float pixelCorrectionWidth; //  =2592;   // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
+	float pixelCorrectionHeight; // =1936;
+	float line_time;        // duration of one scan line readout (for ERS)
 	float focalLength;      // =FOCAL_LENGTH;
 	float pixelSize;        // =  PIXEL_SIZE; //um
 	float distortionRadius; // =  DISTORTION_RADIUS; // mm - half width of the sensor
-
-	float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
-	float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
-	float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
-	float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
-	float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
-	float distortionB;      // r^3
+#ifndef	NVRTC_BUG
+	union {
+		struct {
+#endif
 			float distortionC;      // r^2
+			float distortionB;      // r^3
+			float distortionA;      // r^4 (normalized to focal length or to sensor half width?)
+			float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
+			float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
+			float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
+			float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
+#ifndef	NVRTC_BUG
+//		};
+//		float rad_coeff [7];
+//	};
+#endif
 	// parameters, common for all sensors
 	float    elevation;     // degrees, up - positive;
 	float    heading;       // degrees, CW (from top) - positive
@@ -82,18 +126,36 @@ struct gc {
 	float right      [NUM_CAMS];
 	float height     [NUM_CAMS];
 	float roll       [NUM_CAMS];    // degrees, CW (to target) - positive
-
+	float pXY0       [NUM_CAMS][2];
 	float common_right;    // mm right, camera center
 	float common_forward;  // mm forward (to target), camera center
 	float common_height;   // mm up, camera center
 	float common_roll;     // degrees CW (to target) camera as a whole
 //	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
 //	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
-	float rXY        [NUM_CAMS][3]; // XY pairs of the in a normal plane, relative to disparityRadius
+	float rXY        [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
 //	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
 // only used for the multi-quad systems
 	float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
 	float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
 };
+#define RAD_COEFF_LEN 7
+extern "C" __global__ void get_tiles_offsets(
+		struct tp_task     * gpu_tasks,
+		int                  num_tiles,          // number of tiles in task
+		struct gc          * gpu_geometry_correction,
+		struct corr_vector * gpu_correction_vector,
+		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+		trot_deriv   * gpu_rot_deriv);
+
+#if 0
+// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
+extern "C" __global__ void calc_rot_matrices(
+		struct corr_vector * gpu_correction_vector);
+#endif
+// uses NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calc_rot_deriv(
+		struct corr_vector * gpu_correction_vector,
+		trot_deriv   * gpu_rot_deriv);