Commit 6c76931e authored by Palani Johnson

ran formatter

parent 4648cb20
@@ -41,147 +41,152 @@
#include "tp_defines.h"
#endif
extern "C" __global__ void convert_direct( // called with a single block, single thread
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float** gpu_kernel_offsets, // [NUM_CAMS],
float** gpu_kernels, // [NUM_CAMS],
float** gpu_images, // [NUM_CAMS],
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask, // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
int woi_width,
int woi_height,
int kernels_hor,
int kernels_vert,
int* gpu_active_tiles, // pointer to the calculated number of non-zero tiles
int* pnum_active_tiles, // indices to gpu_tasks
int tilesx);
extern "C" __global__ void correlate2D(
int num_cams,
// int * sel_pairs,
int sel_pairs0,
int sel_pairs1,
int sel_pairs2,
int sel_pairs3,
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero2, // here - absolute, squared
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks, // array of per-tile tasks (now bits 4..9 - correlation pairs)
int num_tiles, // number of tiles in task
int tilesx, // number of tile rows
int* gpu_corr_indices, // packed tile+pair
int* pnum_corr_tiles, // pointer to a number of correlation tiles to process
size_t corr_stride, // in floats
// int corr_stride, // in floats
int corr_radius, // radius of the output correlation (7 for 15x15)
float* gpu_corrs); // correlation output data
extern "C" __global__ void corr2D_normalize(
int num_corr_tiles, // number of correlation tiles to process
const size_t corr_stride_td, // in floats
float* gpu_corrs_td, // correlation tiles in transform domain
float* corr_weights, // null or per-tile weight (fat_zero2 will be divided by it)
const size_t corr_stride, // in floats
float* gpu_corrs, // correlation output data (either pixel domain or transform domain)
float fat_zero2, // here - absolute, squared
int corr_radius); // radius of the output correlation (7 for 15x15)
extern "C" __global__ void corr2D_combine(
int num_tiles, // number of tiles to process (each with num_pairs)
int num_pairs, // num pairs per tile (should be the same)
int init_output, // !=0 - reset output tiles to zero before accumulating
int pairs_mask, // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
int* gpu_corr_indices, // packed tile+pair
int* gpu_combo_indices, // output if not null: packed tile+pairs_mask (will point to the first used pair)
const size_t corr_stride, // (in floats) stride for the input TD correlations
float* gpu_corrs, // input correlation tiles
const size_t corr_stride_combo, // (in floats) stride for the output TD correlations (same as input)
float* gpu_corrs_combo); // combined correlation output (one per tile)
extern "C" __global__ void textures_nonoverlap(
int num_cams, // number of cameras
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// int num_tilesx, // number of tiles in a row
// declare arrays in device code?
int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int* pnum_texture_tiles, // returns total number of elements in gpu_texture_indices array
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float params[5],
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_stride, // in floats (now 256*4 = 1024) // may be 0 if not needed
float* gpu_texture_tiles, // (number of colors +1 + ?)*16*16 rgba texture tiles // may be 0 if not needed
int linescan_order, // 0 - low-res tiles have the same order as gpu_texture_indices, 1 - in linescan order
float* gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
int num_tilesx);
extern "C" __global__ void imclt_rbg_all(
int num_cams,
float** gpu_clt, // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float** gpu_corr_images, // [NUM_CAMS][WIDTH, 3 * HEIGHT]
int apply_lpf,
int colors,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C" __global__ void erase8x8(
float* gpu_top_left,
const size_t dstride);
extern "C" __global__ void imclt_rbg(
float* gpu_clt, // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float* gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
int woi_twidth,
int woi_theight,
const size_t dstride); // in floats (pixels)
extern "C" __global__ void generate_RBGA(
int num_cams, // number of cameras used
// Parameters to generate texture tasks
float* gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
// declare arrays in device code?
int* gpu_texture_indices, // packed tile + bits (now only (1 << 7))
int* num_texture_tiles, // number of texture tiles to process (8 separate elements for accumulation)
int* woi, // x,y,width,height of the woi
int width, // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
int height, // <= TILES-Y, use for faster processing of LWIR images
// Parameters for the texture generation
float** gpu_clt, // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// TODO: use geometry_correction rXY !
struct gc* gpu_geometry_correction,
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float params[5], // mitigating CUDA_ERROR_INVALID_PTX
float weights[3], // scale for R,B,G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed)
const size_t texture_rbga_stride, // in floats
float* gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void accumulate_correlations(
int tilesY,
int tilesX,
int pairs,
float* num_acc, // number of accumulated tiles [tilesY][tilesX][pair]
float* fcorr_td, // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
float* fcorr_td_acc); // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
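For reference, the declarations above are prototypes only; no host code is part of this commit. A minimal launch sketch, assuming hypothetical device buffers and a wrapper name of our own (the header documents convert_direct as "called with a single block, single thread", hence <<<1, 1>>>):
// Hypothetical host-side wrapper, not from this repository; all pointer
// arguments are assumed to be device buffers prepared elsewhere.
void run_convert_direct(
	int num_cams, int num_colors,
	float **gpu_kernel_offsets, float **gpu_kernels, float **gpu_images,
	float *gpu_ftasks, float **gpu_clt, size_t dstride, int num_tiles,
	int woi_width, int woi_height, int kernels_hor, int kernels_vert,
	int *gpu_active_tiles, int *pnum_active_tiles, int tilesx) {
	convert_direct<<<1, 1>>>(num_cams, num_colors,
		gpu_kernel_offsets, gpu_kernels, gpu_images, gpu_ftasks, gpu_clt,
		dstride, num_tiles,
		0, // lpf_mask: "Now - always 0 !" per the declaration above
		woi_width, woi_height, kernels_hor, kernels_vert,
		gpu_active_tiles, pnum_active_tiles, tilesx);
	cudaDeviceSynchronize(); // simplistic; production code would also check errors
}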
@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f;
__constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f;
__constant__ float SQRT1_8 = 0.353553f;
__constant__ float COSN1[] = {0.980785f, 0.831470f};
__constant__ float COSN2[] = {0.995185f, 0.956940f, 0.881921f, 0.773010f};
__constant__ float SINN1[] = {0.195090f, 0.555570f};
__constant__ float SINN2[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f};
__constant__ int imclt_indx9[16] = {0x28, 0x29, 0x2a, 0x2b, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x24, 0x25, 0x26, 0x27};
__constant__ float idct_signs[4][4][4] = {
{// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, -1, -1, -1},
{-1, 1, 1, 1},
{-1, 1, 1, 1},
{-1, 1, 1, 1}},
{// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, 1, 1, -1},
{-1, -1, -1, 1},
{-1, -1, -1, 1},
{-1, -1, -1, 1}},
{// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, -1, -1, -1},
{1, -1, -1, -1},
{1, -1, -1, -1},
{-1, 1, 1, 1}},
{// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{1, 1, 1, -1},
{1, 1, 1, -1},
{1, 1, 1, -1},
{-1, -1, -1, 1}}};
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
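For reference (an inference from the printed values, not stated in the source): COSN1[k] = cos((2k+1)*pi/16) with SINN1[k] the matching sine, COSN2[k] = cos((2k+1)*pi/32) with SINN2[k] its sine, and HWINDOW2[k] = 0.5*sin((2k+1)*pi/32). A small host-side generator reproduces the tables:
#include <math.h>
#include <stdio.h>

// Regenerates the constant tables above from their apparent closed forms.
int main(void) {
	for (int k = 0; k < 2; k++)
		printf("COSN1[%d]=%.6ff SINN1[%d]=%.6ff\n", k, cos((2 * k + 1) * M_PI / 16), k, sin((2 * k + 1) * M_PI / 16));
	for (int k = 0; k < 4; k++)
		printf("COSN2[%d]=%.6ff SINN2[%d]=%.6ff\n", k, cos((2 * k + 1) * M_PI / 32), k, sin((2 * k + 1) * M_PI / 32));
	for (int k = 0; k < 8; k++)
		printf("HWINDOW2[%d]=%.6ff\n", k, 0.5 * sin((2 * k + 1) * M_PI / 32));
	return 0;
}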
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct); // not used
inline __device__ void dctiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg(float *x0, int inc); // used in TP
inline __device__ void dct_ii8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
* \return None
*/
#ifdef BBBB
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) {
int dtt_mode0 = dtt_mode & 1;
int dtt_mode1 = (dtt_mode >> 1) & 1;
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
@@ -162,1185 +157,1151 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
__syncthreads();
// horizontal pass
if (dtt_mode > 3) {
dttii_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else {
dttiv_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
}
__syncthreads();
// vertical pass
if (dtt_mode > 3) {
dttii_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else {
dttiv_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
}
__syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
#endif //#ifdef BBBB
inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00 = (COSN2[0] * x[0] + SINN2[0] * x[7]);
float u10 = (-SINN2[3] * x[3] + COSN2[3] * x[4]);
float u01 = (COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11 = -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float u02 = (COSN2[2] * x[2] + SINN2[2] * x[5]);
float u12 = (-SINN2[1] * x[1] + COSN2[1] * x[6]);
float u03 = (COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13 = -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
y[0] = SQRT_2 * v00; // w0[0];
y[1] = v01 - vb11; // w1[0];
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
__device__ void _dttiv(float x0, float x1, float x2, float x3, float x4, float x5, float x6, float x7,
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) {
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00 = (COSN2[0] * x7 + SINN2[0] * x0);
u10 = (-SINN2[3] * x4 + COSN2[3] * x3);
u01 = (COSN2[1] * x6 + SINN2[1] * x1);
u11 = -(-SINN2[2] * x5 + COSN2[2] * x2);
u02 = (COSN2[2] * x5 + SINN2[2] * x2);
u12 = (-SINN2[1] * x6 + COSN2[1] * x1);
u03 = (COSN2[3] * x4 + SINN2[3] * x3);
u13 = -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00 = (COSN2[0] * x0 + SINN2[0] * x7);
u10 = (-SINN2[3] * x3 + COSN2[3] * x4);
u01 = (COSN2[1] * x1 + SINN2[1] * x6);
u11 = -(-SINN2[2] * x2 + COSN2[2] * x5);
u02 = (COSN2[2] * x2 + SINN2[2] * x5);
u12 = (-SINN2[1] * x1 + COSN2[1] * x6);
u03 = (COSN2[3] * x3 + SINN2[3] * x4);
u13 = -(-SINN2[0] * x0 + COSN2[0] * x7);
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
// j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
*y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
} else {
*y1 = (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
} else {
*x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00 = ((*x0) - (*x7));
u10 = ((*x0) + (*x7));
u01 = (-(*x1) + (*x6));
u11 = (-(*x1) - (*x6));
u02 = ((*x2) - (*x5));
u12 = ((*x2) + (*x5));
u03 = (-(*x3) + (*x4));
u13 = (-(*x3) - (*x4));
} else { // DCTII
u00 = ((*x0) + (*x7));
u10 = ((*x0) - (*x7));
u01 = ((*x1) + (*x6));
u11 = ((*x1) - (*x6));
u02 = ((*x2) + (*x5));
u12 = ((*x2) - (*x5));
u03 = ((*x3) + (*x4));
u13 = ((*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31) * 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
u00 = (COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10 = (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
u01 = (COSN2[1] * (*x6) + SINN2[1] * (*x1));
u11 = -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
u02 = (COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12 = (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
u03 = (COSN2[3] * (*x4) + SINN2[3] * (*x3));
u13 = -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
} else { // DCTIV
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*x1 = (vb11 - v01) * SQRT1_8; // w1[0];
*x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = -vb00 * 0.5f; // w1[3];
} else {
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
}
inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct) {
float sgn = 1 - 2 * dst_not_dct;
float *y0 = x;
float *y1 = y0 + inc;
float *y2 = y1 + inc;
float *y3 = y2 + inc;
float *y4 = y3 + inc;
float *y5 = y4 + inc;
float *y6 = y5 + inc;
float *y7 = y6 + inc;
float *x0 = x + dst_not_dct * 7 * inc;
// negate inc, replace
inc *= sgn;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
*y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = sgn * vb00 * 0.5f; // w1[3];
}
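Aside: dttiv_nodiverg above folds the DST-IV/DCT-IV difference into data instead of a divergent branch: sgn = 1 - 2 * dst_not_dct maps 0 to +1 and 1 to -1, and x0 = x + dst_not_dct * 7 * inc together with inc *= sgn reads the input in reverse for the DST case. A self-contained host demonstration of just that indexing trick (hypothetical test code, not from the repository):
#include <stdio.h>

// dst_not_dct = 0 reads elements 0..7 forward; 1 reads 7..0 backward.
int main(void) {
	const int inc0 = 1;
	for (int dst_not_dct = 0; dst_not_dct < 2; dst_not_dct++) {
		int sgn = 1 - 2 * dst_not_dct;      // 0 -> +1, 1 -> -1
		int start = dst_not_dct * 7 * inc0; // 0 -> index 0, 1 -> index 7
		int inc = inc0 * sgn;
		printf("dst_not_dct=%d:", dst_not_dct);
		for (int k = 0; k < 8; k++)
			printf(" %d", start + k * inc);
		printf("\n");
	}
	return 0;
}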
inline __device__ void dctiv_nodiverg(float *x0, int inc) {
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x0 = v00 * 0.5f; // w0[0];
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
inline __device__ void dstiv_nodiverg(float *x, int inc) {
float *x0 = x + 7 * inc;
// negate inc, replace
inc = -inc;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00 = u00 + u03;
float ua10 = u00 - u03;
float ua01 = u01 + u02;
float ua11 = u01 - u02;
float v00 = ua00 + ua01;
float v02 = ua00 - ua01;
float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00 = u10 + u13;
float ub10 = u10 - u13;
float ub01 = u11 + u12;
float ub11 = u11 - u12;
float vb00 = ub00 + ub01;
float vb01 = ub00 - ub01;
float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
*x7 = v00 * 0.5f; // w0[0];
*x5 = (v01 + vb11) * SQRT1_8; // w0[1];
*x3 = (v02 - vb01) * SQRT1_8; // w0[2];
*x1 = (v03 + vb10) * SQRT1_8; // w0[3];
*x6 = (vb11 - v01) * SQRT1_8; // w1[0];
*x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x2 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x0 = -vb00 * 0.5f; // w1[3];
}
inline __device__ void _dctii_nrecurs8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float u00 = (x[0] + x[7]);
float u10 = (x[0] - x[7]);
float u01 = (x[1] + x[6]);
float u11 = (x[1] - x[6]);
float u02 = (x[2] + x[5]);
float u12 = (x[2] - x[5]);
float u03 = (x[3] + x[4]);
float u13 = (x[3] - x[4]);
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float w00 = u00 + u03;
float w10 = u00 - u03;
float w01 = (u01 + u02);
float w11 = (u01 - u02);
float v00 = w00 + w01;
float v02 = w00 - w01;
float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
// _dctii_nrecurs2(u00, u01, &v00, &v01);
float z00 = w20 + w21;
float z01 = w20 - w21;
// _dctii_nrecurs2(u10, u11, &v10, &v11);
float z10 = w30 + w31;
float z11 = w30 - w31;
float v10 = SQRT_2 * z00;
float v11 = z01 - z11;
float v12 = z01 + z11;
float v13 = SQRT_2 * z10;
y[0] = v00;
y[1] = v10;
y[2] = v01;
y[3] = v11;
y[4] = v02;
y[5] = v12;
y[6] = v03;
y[7] = v13;
}
inline __device__ void dct_ii8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctii_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8; i++) {
y[i] *= SQRT1_8;
}
}
__device__ void dct_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctiv_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8; i++) {
y[i] *= SQRT1_8;
}
}
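Aside: dct_iv8 is the recursive butterfly _dctiv_nrecurs8 followed by a 1/sqrt(8) scale, which appears to match the orthonormal DCT-IV convention (spot-checked for a constant input; treat the claim as an inference). A naive reference for testing, under that assumption and with a function name of our own:
#include <math.h>

// Hypothetical orthonormal 8-point DCT-IV reference, not from the repository:
// y[k] = sqrt(2/8) * sum_n x[n] * cos(pi/8 * (n + 0.5) * (k + 0.5))
static void dct_iv8_ref(const float x[8], float y[8]) {
	for (int k = 0; k < 8; k++) {
		double s = 0.0;
		for (int n = 0; n < 8; n++)
			s += x[n] * cos(M_PI / 8.0 * (n + 0.5) * (k + 0.5));
		y[k] = (float)(0.5 * s); // sqrt(2/8) == 0.5
	}
}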
inline __device__ void dst_iv8(float x[8], float y[8]) // x,y point to 8-element arrays each
{
float xr[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
xr[i] = x[7 - i];
}
_dctiv_nrecurs8(xr, y);
#pragma unroll
for (int i = 0; i < 8; i += 2) {
y[i] *= SQRT1_8;
y[i + 1] *= -SQRT1_8;
}
}
//=========================== 2D functions ===============
__device__ void corrUnfoldTile(
int corr_radius,
float *qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float *rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int size2r1 = 2 * corr_radius + 1; // 15
int crp1 = corr_radius + 1; // 8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
float *qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float *qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float *qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
if (i > corr_radius) {
return; // not needed, only use inner
}
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
float corr_pixscale = 0.25f;
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
/// int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_row_offs = i * size2r1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
/// for (int j = 1; j < DTT_SIZE; j++) {
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
rslt[rslt_base_index_pp] = corr_pixscale * (qdata0[i_transform_size + j] +
qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (qdata0[i_transform_size + j] -
qdata1[i_transform_size + j - 1]); // incomplete, will only be used for thread i=0
}
if (i == 0) {
return;
}
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
float d = corr_pixscale * qdata2[im1_transform_size];
rslt[rslt_base_index_p] += d;
rslt[rslt_base_index_m] -= d;
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
int rslt_base_index_mp = rslt_base_index_m + j;
int rslt_base_index_mm = rslt_base_index_m - j;
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
float d3 = corr_pixscale * qdata3[im1_transform_size + j - 1];
// rslt[rslt_base_index_mp], rslt[rslt_base_index_mm] are partially calculated in the cycle common with i=0
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
rslt[rslt_base_index_pp] += d2 + d3;
rslt[rslt_base_index_pm] += d2 - d3;
}
}
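// Worked index check (illustrative, not from the original source): for
// corr_radius == 7 the unfolded tile is 15x15 and
// rslt_base_index == 15 * 8 - 8 == 112 == 7 * 15 + 7, i.e. the tile center;
// thread i then fills the +i and -i rows symmetrically from the four quadrants.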
__device__ void dttii_2d(
float *clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
{
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++) {
int is_sin = (q >> 1) & 1;
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x, DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++) {
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1, 1, is_sin); // horizontal pass, thread is row
}
__syncthreads();
}
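// (the 2D DTT-II is separable: each of the 4 quadrants gets 8 column
// transforms with stride DTT_SIZE1, then 8 row transforms with stride 1; the
// +1 padding of DTT_SIZE1 staggers columns across shared-memory ports)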
__device__ void dttiv_color_2d(
float *clt_tile,
int color) {
dctiv_nodiverg( // all colors
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
1); // int inc);
// __syncthreads();// worsened
if (color == BAYER_GREEN) {
dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
1); // int inc);
}
__syncthreads(); // __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
// __syncthreads();// worsened
if (color == BAYER_GREEN) {
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
}
__syncthreads(); // __syncwarp();
}
__device__ void dttiv_mono_2d(
float *clt_tile) {
// Copy 0-> 1
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dctiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
dstiv_nodiverg(
clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
1); // int inc);
__syncthreads(); // __syncwarp();
#ifdef DEBUG222
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after horizontal pass (mono)\n");
debug_print_clt1(clt_tile, -1, 0xf); // all 4 quadrants for mono
}
__syncthreads(); // __syncwarp();
#endif
dctiv_nodiverg( // CC
clt_tile + threadIdx.x,
DTT_SIZE1); // int inc,
dctiv_nodiverg( // SC
clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
DTT_SIZE1); // int inc,
dstiv_nodiverg( // CS
clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
dstiv_nodiverg( // SS
clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
__syncthreads(); // __syncwarp();
}
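// (mono path: unlike dttiv_color_2d(), all four quadrants are produced - the
// horizontal passes are C/S/C/S and the vertical passes C/C/S/S, yielding the
// CC, SC, CS and SS combinations labeled above)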
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping quadrants 1 and 2) and then unfolds with the window,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be zeroed
// before the first pass.
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile) // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
int thr3 = threadIdx.x >> 3;
int column = threadIdx.x; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7;
int column4 = threadIdx.x >> 2;
// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
int wcolumn = ((thr3 << 3) - thr3) ^ thr012; // 0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG3
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
clt_tile + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
clt_tile1 + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
DTT_SIZE1);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg(
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
__syncthreads(); // __syncwarp();
#ifdef DEBUG3
if ((threadIdx.x) == 0) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float *rslt = mclt_tile + column;
#pragma unroll
for (int i = 0; i < 4; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
val = __fmaf_rd(w, d0, val); // w*d0 + val
*rslt = val;
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
*rslt = __fmaf_rd(w, d0, val); // w*d0 + val
rslt += DTT_SIZE21;
}
#ifdef DEBUG3
__syncthreads(); // __syncwarp();
if ((threadIdx.x) == 0) {
printf("\nMCLT Tiles after IMCLT\n");
debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping quadrants 1 and 2) and then unfolds to the 16x16 tile,
// adding to the output 16x16 tile (to use read-modify-write with 4 passes over the frame). The output should be zeroed
// before the first pass.
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt8threads(
int do_acc, // 1 - add to previous value, 0 - overwrite
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float *mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug) {
// int thr3 = threadIdx.x >> 3;
// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
// int thr012 = threadIdx.x & 7;
// int column4 = threadIdx.x >> 2;
// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles before IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg( // quadrant 0
clt_tile + threadIdx.x, // pointer to start of row for quadrant 0
DTT_SIZE1);
dctiv_nodiverg( // quadrant 1
clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
DTT_SIZE1);
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // quadrant 2
clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
DTT_SIZE1);
dstiv_nodiverg( // quadrant 3
clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
DTT_SIZE1);
__syncthreads(); // __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg( // quadrant 0
clt_tile + DTT_SIZE1 * threadIdx.x, // pointer to start of row for quadrant 0
1);
dctiv_nodiverg( // quadrant 2
clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
1);
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg( // quadrant 1
clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
1);
dstiv_nodiverg( // quadrant 3
clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
1);
__syncthreads(); // __syncwarp();
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nDTT Tiles after IDTT\n");
debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
// re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
for (int thr3 = 0; thr3 < 2; thr3++) {
int thr3m = (thr3 << 3);
int column = threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
int thr012 = threadIdx.x & 7; // == threadIdx.x
int column4 = column >> 2; // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
int wcolumn = (thr3m - thr3) ^ thr012; // 0..7,7,..0
float hw = HWINDOW2[wcolumn];
int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
float *rslt = mclt_tile + column;
#ifdef DEBUG7
if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
}
__syncthreads(); // __syncwarp();
#endif
#pragma unroll
for (int i = 0; i < 4; i++) {
float val = *rslt;
// facc
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i < 3) {
clt_offset += DTT_SIZE1;
}
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
// val =__fmaf_rd(w,d0,val); // w*d0 + val
// *rslt = val;
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 4; i < 8; i++) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
// if (i < 7){
clt_offset -= DTT_SIZE1;
// }
// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 7; i >= 4; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 4) {
clt_offset -= DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
#pragma unroll
for (int i = 3; i >= 0; i--) {
float val = *rslt;
float w = HWINDOW2[i] * hw;
float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
d0 += d1;
d2 += d3;
d0 += d2;
if (i > 0) {
clt_offset += DTT_SIZE1;
}
//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
*rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0; // w*d0 + val; do_acc is common for all threads
rslt += DTT_SIZE21;
}
}
#ifdef DEBUG7
__syncthreads(); // __syncwarp();
for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)) {
printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
debug_print_mclt(
mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-1);
}
__syncthreads(); // __syncwarp();
}
__syncthreads(); // __syncwarp();
#endif
}
//#endif
@@ -45,57 +45,56 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
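// Derived values for DTT_SIZE_LOG2 == 3 (illustrative, not from the original
// source): DTT_SIZE = 8, DTT_SIZE1 = 9, DTT_SIZE2 = 16, DTT_SIZE21 = 17,
// DTT_SIZE4 = 32, DTT_SIZE2M1 = 15 - the 15x15 correlation and 16x16 unfolded
// MCLT tile dimensions used by the functions above.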
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH + 1)
// extern __constant__ float idct_signs[4][4][4];
// extern __constant__ int imclt_indx9[16];
// extern __constant__ float HWINDOW2[];
// kernels (not used so far)
#if 0
extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
#endif // #if 0
//=========================== 2D functions ===============
extern __device__ void corrUnfoldTile(
int corr_radius,
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
extern __device__ void dttii_2d(
float* clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]
extern __device__ void dttiv_color_2d(
float* clt_tile,
int color);
extern __device__ void dttiv_mono_2d(
float* clt_tile);
extern __device__ void imclt(
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float* mclt_tile);
extern __device__ void imclt8threads(
int do_acc, // 1 - add to previous value, 0 - overwrite
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float* mclt_tile, // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
int debug);
@@ -37,16 +37,15 @@
*/
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#endif // #ifndef JCUDA
#ifndef get_task_size
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
#endif
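#if 0
// Worked example (illustrative sketch, not used by the build): assuming
// NUM_CAMS == 16 and sizeof(struct tp_task) / sizeof(float) == 101, each
// camera missing from a task saves 6 floats, so
// get_task_size(16) == 101 (the "101 floats for LWIR16" elsewhere) and
// get_task_size(4) == 101 - 6 * 12 == 29 (the "29 floats for quad EO").
static_assert(get_task_size(NUM_CAMS) == sizeof(struct tp_task) / sizeof(float),
"a full-camera task spans the whole tp_task struct");
#endif // illustration only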
// Using NUM_CAMS threads per tile
#define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
///#define CYCLES_COPY_GC ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
@@ -57,9 +56,8 @@
#define DBG_CAM 3
__device__ void printGeometryCorrection(struct gc *g, int num_cams);
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams);
/**
* Calculate non-distorted radius from distorted using table approximation
@@ -67,774 +65,777 @@ __device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist[RBYRDIST_LEN]); // shared memory
__constant__ float ROTS_TEMPLATE[7][3][3][3] = { // ...{cos,sin,const}...
{
// azimuth
{{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
{{0, 0, 0}, {0, 0, 1}, {0, 0, 0}},
{{0, 1, 0}, {0, 0, 0}, {1, 0, 0}},
},
{
// tilt
{{0, 0, 1}, {0, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {1, 0, 0}, {0, 1, 0}},
{{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
},
{
// roll*zoom
{{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 1}},
},
{
// d_azimuth
{{0, -1, 0}, {0, 0, 0}, {-1, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
{{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
},
{
// d_tilt
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
{{0, 0, 0}, {-1, 0, 0}, {0, -1, 0}},
},
{
// d_roll
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{-1, 0, 0}, {0, -1, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
},
{
// d_zoom
{{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
{{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
{{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
}};
__constant__ int angles_offsets[4] = {
offsetof(corr_vector, azimuth) / sizeof(float),
offsetof(corr_vector, tilt) / sizeof(float),
offsetof(corr_vector, roll) / sizeof(float),
offsetof(corr_vector, roll) / sizeof(float)};
__constant__ int mm_seq[3][3][3] = {
{
{6, 5, 12}, // a_t * a_z -> tmp0
{7, 6, 13}, // a_r * a_t -> tmp1
{7, 9, 14}, // a_r * a_dt -> tmp2
},
{
{7, 12, 0}, // a_r * tmp0 -> rot - bad
{13, 8, 1}, // tmp1 * a_daz -> deriv0 - good
{14, 5, 2}, // tmp2 * a_az -> deriv1 - good
},
{
{10, 12, 3}, // a_dr * tmp0 -> deriv2 - good
{11, 12, 4}, // a_dzoom * tmp0 -> deriv3 - good
{-1, -1, -1} // do nothing
}};
__constant__ int offset_rots = 0; // 0
__constant__ int offset_derivs = 1; // 1..4 // should be next
__constant__ int offset_matrices = 5; // 5..11
__constant__ int offset_tmp = 12; // 12..15
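// Reading of mm_seq (illustrative, not from the original source): with
// matrices[5..11] = {az, tilt, roll*zoom, d_az, d_tilt, d_roll, d_zoom} and
// matrices[12..15] as temporaries, the three passes compute one 3x3 product
// per (pass, threadIdx.z) step:
// rot = (roll*zoom) * (tilt * az)
// deriv0 = ((roll*zoom) * tilt) * d_az
// deriv1 = ((roll*zoom) * d_tilt) * az
// deriv2 = d_roll * (tilt * az)
// deriv3 = d_zoom * (tilt * az)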
// inline __device__ int get_task_size_gc(int num_cams);
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams);
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams);
// inline __device__ int get_task_size_gc(int num_cams){
// return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
// }
inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams) {
return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile);
}
inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams) {
return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
}
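#if 0
// Decoding sketch (illustrative, not used by the build): the txy word returned
// by get_task_txy_gc() packs the tile grid position as (tileY << 16) | tileX,
// matching the decode used in get_tiles_offsets():
int cxy = get_task_txy_gc(num_tile, gpu_ftasks, num_cams);
int tileX = (cxy & 0xffff); // low 16 bits - horizontal tile index
int tileY = (cxy >> 16); // high 16 bits - vertical tile index
#endif // illustration only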
/**
* Calculate rotation matrices and derivatives by az, tilt, roll, zoom
* NUM_CAMS blocks of 3,3,3 tiles
*/
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv) {
__shared__ float sincos[4][2]; // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
__shared__ float matrices[5 + 7 + 4][3][3];
float angle;
float zoom;
int ncam = blockIdx.x; // threadIdx.z;
int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
int nangle = nangle1 >> 1; // 0: az, 1: tilt, 2: roll, 3:roll
int is_sin = nangle1 & 1;
if ((threadIdx.z == 0) && (nangle < 4)) { // others just idle here
float *gangles = (float *)gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
/// if (ncam == (NUM_CAMS-1)){ // for the whole block
if (ncam == (num_cams - 1)) { // for the whole block
angle = 0.0;
zoom = 0.0;
/// for (int n = 0; n < (NUM_CAMS-1); n++){
for (int n = 0; n < (num_cams - 1); n++) {
angle -= *(gangles + n);
zoom -= gpu_correction_vector->zoom[n];
}
if (nangle >= 2) { // diverging for roll (last two)
angle = *(gangles + ncam);
}
} else {
angle = *(gangles + ncam);
zoom = gpu_correction_vector->zoom[ncam];
}
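// cos(a) == sin(a + pi/2): the is_sin == 0 lane pre-offsets the angle so the
// single sinf() below yields both the cosine and the sine entries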
if (!is_sin) {
angle += M_PI / 2;
}
float sc = sinf(angle);
if (nangle == 2) {
sc *= 1.0 + zoom;
}
sincos[nangle][is_sin] = sc;
}
__syncthreads();
#ifdef DEBUG20
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
printf(" Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
printf(" Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
printf(" Roll matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG20
// Create 3 3x3 matrices for az, tilt, roll/zoom:
int axis = offset_matrices + threadIdx.z; // 0..2
int const_index = threadIdx.z; // 0..2
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
axis += 3; // skip index == 3
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
if (threadIdx.z == 0) {
axis += 3;
const_index += 3;
matrices[axis][threadIdx.y][threadIdx.x] =
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0] + // cos
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1] + // sin
ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2]; // const
}
__syncthreads();
#ifdef DEBUG20
const char *matrices_names[] = {"az", "tilt", "roll*zoom", "d_daz", "d_tilt", "d_roll", "d_zoom"};
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
for (int i = 0; i < 7; i++) {
printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
for (int row = 0; row < 3; row++) {
for (int col = 0; col < 3; col++) {
printf("%9.6f, ", matrices[offset_matrices + i][row][col]);
}
printf("\n");
}
}
}
__syncthreads(); // __syncwarp();
#endif // DEBUG20
/*
__constant__ int mm_seq [3][3][3]={
{
{6,5,12}, // a_t * a_z -> tmp0
{7,6,13}, // a_r * a_t -> tmp1
{7,9,14}, // a_r * a_dt -> tmp2
}, {
{7,12,0}, // a_r * tmp0 -> rot
{13,8,1}, // tmp1 * a_daz -> deriv0
{14,5,2}, // tmp2 * a_az -> deriv1
}, {
{10,12,3}, // a_dr * tmp0 -> deriv2
{11,12,4}, // a_dzoom * tmp0 -> deriv3
}};
*/
for (int i = 0; i < 3; i++) {
int srcl = mm_seq[i][threadIdx.z][0];
int srcr = mm_seq[i][threadIdx.z][1];
int dst = mm_seq[i][threadIdx.z][2];
if (srcl >= 0) {
matrices[dst][threadIdx.y][threadIdx.x] =
matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x] +
matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x] +
matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
}
__syncthreads();
}
// copy results to global memory
int gindx = threadIdx.z;
int lindx = offset_rots + threadIdx.z;
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
gindx += 3;
lindx += 3;
if (lindx < 5) {
#ifdef NVRTC_BUG
// going beyond first dimension
gpu_rot_deriv->rots[ncam + gindx * NUM_CAMS][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#else
gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
#endif
}
__syncthreads();
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// All done - read/verify all arrays
}
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv) {
/// dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
/// dim3 grid_geom ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
int tiles_per_block_geom = NUM_THREADS / num_cams;
dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);
//#define NUM_THREADS 32
if (threadIdx.x == 0) { // always 1
get_tiles_offsets<<<grid_geom, threads_geom>>>(
uniform_grid, // int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
num_cams, // int num_cams,
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
// gpu_tasks, // struct tp_task * gpu_tasks,
num_tiles, // int num_tiles, // number of tiles in task list
gpu_geometry_correction, // struct gc * gpu_geometry_correction,
gpu_correction_vector, // struct corr_vector * gpu_correction_vector,
gpu_rByRDist, // float * gpu_rByRDist) // length should match RBYRDIST_LEN
gpu_rot_deriv); // union trot_deriv * gpu_rot_deriv);
}
// __syncthreads();// __syncwarp();
// cudaDeviceSynchronize();
// cudaDeviceSynchronize();
}
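// (calculate_tiles_offsets() is a single-thread wrapper: it sizes the grid as
// NUM_THREADS / num_cams tiles per block and launches get_tiles_offsets()
// from the device via CUDA dynamic parallelism)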
/*
* blockDim.x = NUM_CAMS
* blockDim.y = TILES_PER_BLOCK_GEOM
*/
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv) {
int task_size = get_task_size(num_cams);
int task_num = blockIdx.x * blockDim.y + threadIdx.y; // blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
int dim_xy = blockDim.x * blockDim.y; // number of parallel threads (<=32)
__shared__ struct gc geometry_correction;
__shared__ float rByRDist[RBYRDIST_LEN];
__shared__ struct corr_vector extrinsic_corr;
__shared__ trot_deriv rot_deriv;
/// __shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
__shared__ float pY_offsets[NUM_THREADS][NUM_CAMS]; // maximal dimensions, actual will be smaller
float pXY[2]; // result to be copied to task
// blockDim.y
// copy data common to all threads
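// (each block below is a grid-stride copy: the dim_xy threads of the block
// step through the source structure dim_xy floats at a time until the whole
// struct is mirrored in shared memory)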
{
int cycles_copy_gc = ((sizeof(struct gc) / sizeof(float) + dim_xy - 1) / dim_xy);
float *gcp_local = (float *)&geometry_correction;
float *gcp_global = (float *)gpu_geometry_correction;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_gc; i++) {
if (offset < sizeof(struct gc) / sizeof(float)) {
*(gcp_local + offset) = *(gcp_global + offset);
}
offset += dim_xy;
}
}
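// Cooperative strided copy: all dim_xy threads copy a flat float array from
// global to shared memory, thread t handling elements t, t + dim_xy, t + 2*dim_xy, ...
// For example (illustrative sizes only): for a 60-float struct and dim_xy = 32,
// cycles_copy = ceil(60 / 32) = 2, so thread 5 copies elements 5 and 37.
// The same pattern repeats for the three blocks below.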
{
int cycles_copy_cv = ((sizeof(struct corr_vector) / sizeof(float) + dim_xy - 1) / dim_xy);
float *cvp_local = (float *)&extrinsic_corr;
float *cvp_global = (float *)gpu_correction_vector;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_cv; i++) {
if (offset < sizeof(struct corr_vector) / sizeof(float)) {
*(cvp_local + offset) = *(cvp_global + offset);
}
offset += dim_xy;
}
}
// TODO: maybe it is better to use system memory and not read the whole table?
{
int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
float *rByRDistp_local = (float *)rByRDist;
float *rByRDistp_global = (float *)gpu_rByRDist;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rbrd; i++) {
if (offset < RBYRDIST_LEN) {
*(rByRDistp_local + offset) = *(rByRDistp_global + offset);
}
offset += dim_xy;
}
}
// copy rotational matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
{
int cycles_copy_rot = ((sizeof(trot_deriv) / sizeof(float)) + dim_xy - 1) / dim_xy;
float *rots_local = (float *)&rot_deriv;
float *rots_global = (float *)gpu_rot_deriv; // rot_matrices;
int offset = thread_xy;
for (int i = 0; i < cycles_copy_rot; i++) {
if (offset < sizeof(trot_deriv) / sizeof(float)) {
*(rots_local + offset) = *(rots_global + offset);
}
offset += dim_xy;
}
}
__syncthreads();
int ncam = threadIdx.x;
if (task_num >= num_tiles) {
return;
}
int imu_exists = // todo - calculate once with rot_deriv?
(extrinsic_corr.imu_rot[0] != 0.0) ||
(extrinsic_corr.imu_rot[1] != 0.0) ||
(extrinsic_corr.imu_rot[2] != 0.0) ||
(extrinsic_corr.imu_move[0] != 0.0) ||
(extrinsic_corr.imu_move[1] != 0.0) ||
(extrinsic_corr.imu_move[2] != 0.0);
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nTile = %d, camera= %d\n", task_num, ncam);
printf("\nget_tiles_offsets() threadIdx.x = %d, threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int)blockIdx.x);
printGeometryCorrection(&geometry_correction, num_cams);
printExtrinsicCorrection(&extrinsic_corr, num_cams);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// String dbg_s = corr_vector.toString();
/* Starting with required tile center X, Y and nominal distortion, for each sensor port:
* 1) unapply common distortion (maybe for different - master camera)
* 2) apply disparity
* 3) apply rotations and zoom
* 4) re-apply distortion
* 5) return port center X and Y
* line_time
*/
// common code, calculated in parallel
/// int cxy = gpu_tasks[task_num].txy;
/// float disparity = gpu_tasks[task_num].target_disparity;
float disparity = *(gpu_ftasks + task_size * task_num + 2);
float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
float px = *(centerXY);
float py = *(centerXY + 1);
int cxy = *(int *)(gpu_ftasks + task_size * task_num + 1);
int tileX = (cxy & 0xffff);
int tileY = (cxy >> 16);
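// txy packs both tile indices into one int: tileX in the low 16 bits, tileY in
// the high 16. E.g. cxy = 0x00130032 decodes to tileX = 0x32 = 50, tileY = 0x13 = 19.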
// if (isnan(px)) {
// if (__float_as_int(px) == 0x7fffffff) {
if (uniform_grid) {
#ifdef DEBUG23
if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)) {
printf("\n get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX, tileY, ncam);
printf("\n");
__syncthreads();
}
#endif //#ifdef DEBUG23
px = tileX * DTT_SIZE + DTT_SIZE / 2; // - shiftX;
py = tileY * DTT_SIZE + DTT_SIZE / 2; // - shiftY;
*(centerXY) = px;
*(centerXY + 1) = py;
}
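// With uniform_grid set, the tile center falls on the regular DTT grid and is
// written back to the task. E.g., assuming DTT_SIZE = 8, tile (50, 19) gets
// center px = 50*8 + 4 = 404, py = 19*8 + 4 = 156.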
__syncthreads();
float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
float rXY[2];
rXY[0] = geometry_correction.rXY[ncam][0];
rXY[1] = geometry_correction.rXY[ncam][1];
float rD = sqrtf(pXcd * pXcd + pYcd * pYcd) * 0.001 * geometry_correction.pixelSize; // distorted radius in a virtual center camera
float rND2R = getRByRDist(rD / geometry_correction.distortionRadius, rByRDist);
float pXc = pXcd * rND2R; // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
float pYc = pYcd * rND2R; // in pixels
float xyz[3]; // getWorldCoordinates
xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
(disparity * 0.001 * geometry_correction.pixelSize); // "+" - near, "-" far
xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
// next radial distortion coefficients are for this, not master camera (may be the same)
// geometry_correction.rad_coeff[i];
float fl_pix = geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
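// Note: rND2R converts the distorted center-camera radius to its non-distorted
// equivalent via the shared reverse table, and xyz[] is the implied world point:
// |Z| = SCENE_UNITS_SCALE * focalLength * disparityRadius / (disparity * 0.001 * pixelSize),
// so doubling the disparity halves the distance.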
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("\nuniform_grid=%d\n", uniform_grid);
printf("Tile = %d, camera= %d\n", task_num, ncam);
printf("TargetDisparity = %f\n", disparity);
printf("tileX = %d, tileY = %d\n", tileX, tileY);
printf("px = %f, py = %f\n", px, py);
printf("centerXY[0] = %f, centerXY[1] = %f\n", *(centerXY), *(centerXY + 1));
printf("pXcd = %f, pYcd = %f\n", pXcd, pYcd);
printf("rXY[0] = %f, rXY[1] = %f\n", rXY[0], rXY[1]);
printf("rD = %f, rND2R = %f\n", rD, rND2R);
printf("pXc = %f, pYc = %f\n", pXc, pYc);
printf("fl_pix = %f, ri_scale = %f\n", fl_pix, ri_scale);
printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0], xyz[1], xyz[2]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// above is common code, below - per camera (was a loop in Java: for (int ncam = 0; ncam < NUM_CAMS; ncam++){...}; here each camera is its own thread)
// non-distorted XY of the shifted location of the individual sensor
// -------------- Each camera calculated by its own thread ----------------
float pXci0 = pXc - disparity * rXY[0]; // [ncam][0]; // in pixels
float pYci0 = pYc - disparity * rXY[1]; // [ncam][1];
// rectilinear, end of dealing with possibly other (master) camera, below all is for this camera distortions
// Convert a 2-d non-distorted vector to 3d at fl_pix distance in z direction
/// double [][] avi = {{pXci0}, {pYci0},{fl_pix}};
/// Matrix vi = new Matrix(avi); // non-distorted sensor channel view vector in pixels (z -along the common axis)
// Apply port-individual combined rotation/zoom matrix
/// Matrix rvi = rots[i].times(vi);
float rvi[3];
#pragma unroll
for (int j = 0; j < 3; j++) {
rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
}
// get back to the projection plane by normalizing vector
float norm_z = fl_pix / rvi[2];
float pXci = rvi[0] * norm_z;
float pYci = rvi[1] * norm_z;
// Re-apply distortion
float rNDi = sqrtf(pXci * pXci + pYci * pYci); // in pixels
float ri = rNDi * ri_scale; // relative to distortion radius
float rD2rND = 1.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++) {
rri *= ri;
rD2rND += ((float *)&geometry_correction.distortionC)[j] * (rri - 1.0);
}
#else
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
rri *= ri;
rD2rND += geometry_correction.rad_coeff[j] * (rri - 1.0);
}
#endif
}
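// The block above evaluates the radial distortion ratio rD/rND as a power
// series: rD2rND = 1 + sum_j c[j] * (ri^(j+1) - 1) with c[] = {C, B, A, A5, A6, A7, A8}.
// This equals d + C*ri + B*ri^2 + ... + A8*ri^7 (d = 1 - sum(c)), i.e. the same
// polynomial k(r) used in calcReverseDistortionTable(), and is exactly 1.0 at
// ri == 1 (the distortion radius).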
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
float pXid = pXci * rD2rND;
float pYid = pYci * rD2rND;
pXY[0] = pXid + geometry_correction.pXY0[ncam][0];
pXY[1] = pYid + geometry_correction.pXY0[ncam][1];
// new for ERS
pY_offsets[threadIdx.y][ncam] = pXY[1] - geometry_correction.woi_tops[ncam];
__syncthreads();
// Each thread re-calculate same sum
float lines_avg = 0;
for (int i = 0; i < num_cams; i++) {
lines_avg += pY_offsets[threadIdx.y][i];
}
lines_avg *= (1.0 / num_cams);
// used when calculating derivatives, TODO: combine calculations !
float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
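// pY_offset is this camera's scan line relative to the average over all cameras
// (each thread redundantly recomputes the same sum from shared pY_offsets); it
// becomes the time base, in scan lines, for the ERS correction below.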
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("pXci0 = %f, pYci0 = %f\n", pXci0, pYci0);
printf("rvi[0] = %f, rvi[1] = %f, rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
printf("norm_z = %f, pXci = %f, pYci = %f\n", norm_z, pXci, pYci);
printf("rNDi = %f, ri = %f\n", rNDi, ri);
printf("rD2rND = %f\n", rD2rND);
printf("pXid = %f, pYid = %f\n", pXid, pYid);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
printf("lines_avg = %f, pY_offset = %f\n", lines_avg, pY_offset); // *
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float drvi_daz[3]; // drvi_daz = deriv_rots[i][0].times(vi);
float drvi_dtl[3]; // drvi_dtl = deriv_rots[i][1].times(vi);
float drvi_drl[3]; // drvi_drl = deriv_rots[i][2].times(vi);
#pragma unroll
for (int j = 0; j < 3; j++) {
drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
}
float dpXci_dazimuth = drvi_daz[0] * norm_z - pXci * drvi_daz[2] / rvi[2];
float dpYci_dazimuth = drvi_daz[1] * norm_z - pYci * drvi_daz[2] / rvi[2];
float dpXci_dtilt = drvi_dtl[0] * norm_z - pXci * drvi_dtl[2] / rvi[2];
float dpYci_dtilt = drvi_dtl[1] * norm_z - pYci * drvi_dtl[2] / rvi[2];
float dpXci_droll = drvi_drl[0] * norm_z - pXci * drvi_drl[2] / rvi[2];
float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
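// Quotient rule: since pXci = fl_pix * rvi[0] / rvi[2], each derivative is
// d(pXci)/dp = (drvi[0]/dp) * norm_z - pXci * (drvi[2]/dp) / rvi[2], and
// likewise for pYci - matching the six expressions above.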
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("drvi_daz[0] = %f, drvi_daz[1] = %f, drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
printf("drvi_dtl[0] = %f, drvi_dtl[1] = %f, drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
printf("drvi_drl[0] = %f, drvi_drl[1] = %f, drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
printf("dpXci_dazimuth = %f, dpYci_dazimuth = %f\n", dpXci_dazimuth, dpYci_dazimuth);
printf("dpXci_dtilt = %f, dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
printf("dpXci_droll = %f, dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
float disp_dist[4]; // only for this channel, to be copied to global gpu_tasks in the end
float dpXci_pYci_imu_lin[2][3];
/*
double [][] add0 = {
{-rXY[i][0], rXY[i][1], 0.0},
{-rXY[i][1], -rXY[i][0], 0.0},
{ 0.0, 0.0, 0.0}}; // what is last element???
Matrix dd0 = new Matrix(add0);
Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
*/
float dd1[2][2]; // get top left 2x2 sub-matrix
dd1[0][0] = (-rot_deriv.rots[ncam][0][0] * rXY[0] - rot_deriv.rots[ncam][0][1] * rXY[1]) * norm_z;
dd1[0][1] = (rot_deriv.rots[ncam][0][0] * rXY[1] - rot_deriv.rots[ncam][0][1] * rXY[0]) * norm_z;
dd1[1][0] = (-rot_deriv.rots[ncam][1][0] * rXY[0] - rot_deriv.rots[ncam][1][1] * rXY[1]) * norm_z;
dd1[1][1] = (rot_deriv.rots[ncam][1][0] * rXY[1] - rot_deriv.rots[ncam][1][1] * rXY[0]) * norm_z;
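// dd1 is the top-left 2x2 of rots[ncam] * dd0, scaled by norm_z (see the Java
// excerpt above): its first column is d(pXci,pYci)/d(disparity), the second the
// derivative along the direction orthogonal to disparity, as noted after the
// debug block below.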
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("dd1[0][0] = %f, dd1[0][1] = %f\n", dd1[0][0], dd1[0][1]);
printf("dd1[1][0] = %f, dd1[1][1] = %f\n", dd1[1][0], dd1[1][1]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
// now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
// unity vector in the direction of radius
float c_dist = pXci / rNDi;
float s_dist = pYci / rNDi;
//#undef NVRTC_BUG
float drD2rND_dri = 0.0;
{
float rri = 1.0;
#ifdef NVRTC_BUG
#pragma unroll
for (int j = 0; j < RAD_COEFF_LEN; j++) {
drD2rND_dri += ((float *)&geometry_correction.distortionC)[j] * (j + 1) * rri;
rri *= ri;
}
#else
#pragma unroll
for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
drD2rND_dri += geometry_correction.rad_coeff[j] * (j + 1) * rri;
rri *= ri;
}
#endif
}
float scale_distort00 = rD2rND + ri * drD2rND_dri;
float scale_distort11 = rD2rND;
float scale_distortXrot2Xdd1[2][2];
scale_distortXrot2Xdd1[0][0] = (c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
scale_distortXrot2Xdd1[0][1] = (c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
disp_dist[0] = c_dist * scale_distortXrot2Xdd1[0][0] - s_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[1] = c_dist * scale_distortXrot2Xdd1[0][1] - s_dist * scale_distortXrot2Xdd1[1][1];
disp_dist[2] = s_dist * scale_distortXrot2Xdd1[0][0] + c_dist * scale_distortXrot2Xdd1[1][0];
disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
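// disp_dist is built by rotating dd1 into the radial/tangential frame
// (c_dist, s_dist), scaling radially by d(rD)/d(rND) = rD2rND + ri * drD2rND_dri
// and tangentially by rD2rND, then rotating back - a 2x2 transform stored
// row-major as {[0],[1]; [2],[3]}.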
#ifdef DEBUG210
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("scale_distortXrot2Xdd1[0][0] = %f, scale_distortXrot2Xdd1[0][1] = %f\n", scale_distortXrot2Xdd1[0][0], scale_distortXrot2Xdd1[0][1]);
printf("scale_distortXrot2Xdd1[1][0] = %f, scale_distortXrot2Xdd1[1][1] = %f\n", scale_distortXrot2Xdd1[1][0], scale_distortXrot2Xdd1[1][1]);
printf("disp_dist[0] = %f\n", disp_dist[0]);
printf("disp_dist[1] = %f\n", disp_dist[1]);
printf("disp_dist[2] = %f\n", disp_dist[2]);
printf("disp_dist[3] = %f\n", disp_dist[3]);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
/// gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
/// gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
/// gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
/// gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
float *disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams * 2 + ncam * 4; // ncam = threadIdx.x, so each thread will have different offset
*(disp_dist_p++) = disp_dist[0]; // global memory
*(disp_dist_p++) = disp_dist[1];
*(disp_dist_p++) = disp_dist[2];
*(disp_dist_p++) = disp_dist[3];
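// Per-task float layout (see struct tp_task / tp_task_xy_offset): task, txy,
// target_disparity, centerXY[2], then xy[num_cams][2], then
// disp_dist[num_cams][4] - hence the offset 5 + 2*num_cams + 4*ncam above.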
// imu = extrinsic_corr.getIMU(i); // currently it is common for all channels
// float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
// float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
// ERS linear does not yet use per-port rotations, probably not needed
if (imu_exists) {
float ers_x =
dpXci_dtilt * extrinsic_corr.imu_rot[0] +
dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
dpXci_droll * extrinsic_corr.imu_rot[2];
float ers_y =
dpYci_dtilt * extrinsic_corr.imu_rot[0] +
dpYci_dazimuth * extrinsic_corr.imu_rot[1] +
dpYci_droll * extrinsic_corr.imu_rot[2];
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("ers_x = %f, ers_y = %f\n", ers_x, ers_y);
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
if (disparity >= MIN_DISPARITY) { // all threads together
float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
float wdisparity = disparity;
float dwdisp_dz = (k * geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
dpXci_pYci_imu_lin[1][1] = wdisparity / k; // dpy/ dworld_Y
dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz; // dpx/ dworld_Z
//// dpXci_pYci_imu_lin[1][2] = (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
float delta_t = (pY_offset / (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
pXY[0] += delta_t * ers_x * rD2rND; // added correction to pixel X
pXY[1] += delta_t * ers_y * rD2rND; // added correction to pixel Y
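// ERS: delta_t estimates when this pixel row is read out - pY_offset scan lines
// at line_time seconds each, with the 1/(1 - line_time * ers_y) factor
// apparently compensating for the row itself drifting at ers_y pixels/s during
// readout; the point is then shifted by delta_t * (ers_x, ers_y), scaled back
// to distorted pixels by rD2rND.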
#ifdef DEBUG21
if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
printf("k = %f, wdisparity = %f, dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
printf("dpXci_pYci_imu_lin[0][0] = %f, dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0], dpXci_pYci_imu_lin[0][2]);
printf("dpXci_pYci_imu_lin[1][1] = %f, dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1], dpXci_pYci_imu_lin[1][2]);
printf("delta_t = %f, ers_x = %f, ers_y = %f\n", delta_t, ers_x, ers_y);
printf("pXY[0] = %f, pXY[1] = %f\n", pXY[0], pXY[1]); // OK
}
__syncthreads(); // __syncwarp();
#endif // DEBUG21
}
}
// copy results to global memory pXY, disp_dist (already copied)
// gpu_tasks[task_num].xy[ncam][0] = pXY[0];
// gpu_tasks[task_num].xy[ncam][1] = pXY[1];
// float * tile_xy_p = gpu_ftasks + task_size * task_num + 3 + num_cams * 4 + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
// .xy goes right after the 5 common floats (task, txy, target_disparity and centerXY[2])
float *tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2; // ncam = threadIdx.x, so each thread will have different offset
*(tile_xy_p++) = pXY[0]; // global memory
*(tile_xy_p++) = pXY[1]; // global memory
}
extern "C" __global__ void calcReverseDistortionTable(
struct gc *geometry_correction,
float *rByRDist) {
// int num_threads = NUM_CAMS * blockDim.z * blockDim.y * blockDim.x; // 36
int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
// double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
// double delta=1E-4; //rByRDist error = 0.000072
double delta = 1E-10; // 12; // 10; // -8; 0.730000 ms
double minDerivative = 0.01;
int numIterations = 1000;
double drDistDr = 1.0;
double d = 1.0 - geometry_correction->distortionA8 - geometry_correction->distortionA7 -
geometry_correction->distortionA6 - geometry_correction->distortionA5 -
geometry_correction->distortionA - geometry_correction->distortionB -
geometry_correction->distortionC;
double rPrev = 0.0;
int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
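// Each thread inverts rDist = r * k(r) for its chunk of the table by Newton's
// method: r <- r + (rDist - r*k(r)) / (d(r*k(r))/dr), warm-started from the
// previous entry (rPrev) and stopping once |rD - rDist| < delta.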
for (int p = 0; p < num_points; p++) {
int i = indx * num_points + p;
if (i >= RBYRDIST_LEN) {
return;
}
if (i == 0) {
rByRDist[0] = (float)1.0 / d;
continue;
}
double rDist = RBYRDIST_STEP * i;
double r = (p == 0) ? rDist : rPrev;
for (int iteration = 0; iteration < numIterations; iteration++) {
double k = (((((((geometry_correction->distortionA8) * r +
geometry_correction->distortionA7) * r +
geometry_correction->distortionA6) * r +
geometry_correction->distortionA5) * r +
geometry_correction->distortionA) * r +
geometry_correction->distortionB) * r +
geometry_correction->distortionC) * r + d;
drDistDr = (((((((8 * geometry_correction->distortionA8) * r +
7 * geometry_correction->distortionA7) * r +
6 * geometry_correction->distortionA6) * r +
5 * geometry_correction->distortionA5) * r +
4 * geometry_correction->distortionA) * r +
3 * geometry_correction->distortionB) * r +
2 * geometry_correction->distortionC) * r + d;
if (drDistDr < minDerivative) { // folds backwards !
return; // too high distortion
}
double rD = r * k;
if (fabs(rD - rDist) < delta) {
break;
}
r += (rDist - rD) / drDistDr;
}
rPrev = r;
rByRDist[i] = (float)r / rDist;
}
}
/**
* Interpolates the reverse distortion table with a Catmull-Rom spline.
* @param rDist distorted radius, relative to distortionRadius
* @param rByRDist reverse distortion table in shared memory (RBYRDIST_LEN entries)
* @return corresponding non-distorted radius
*/
inline __device__ float getRByRDist(float rDist,
float rByRDist[RBYRDIST_LEN]) // shared memory
{
if (rDist < 0) {
return 0.0f; // normally should not happen
}
float findex = rDist / RBYRDIST_STEP;
int index = (int)floorf(findex);
if (index < 0) {
index = 0;
}
if (index > (RBYRDIST_LEN - 3)) {
index = RBYRDIST_LEN - 3;
}
float mu = fmaxf(findex - index, 0.0f);
float mu2 = mu * mu;
float y0 = (index > 0) ? rByRDist[index - 1] : (2 * rByRDist[index] - rByRDist[index + 1]);
// use Catmull-Rom
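// Catmull-Rom spline through the 4 table samples around findex; at index 0 the
// missing left neighbor y0 is linearly extrapolated as 2*y[0] - y[1] above. The
// cubic a0*mu^3 + a1*mu^2 + a2*mu + a3 reproduces y[index] at mu = 0 and
// y[index+1] at mu = 1, with continuous first derivatives across segments.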
float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index + 1] + 0.5 * rByRDist[index + 2];
float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index + 1] - 0.5 * rByRDist[index + 2];
float a2 = -0.5 * y0 + 0.5 * rByRDist[index + 1];
float a3 = rByRDist[index];
float result = a0 * mu * mu2 + a1 * mu2 + a2 * mu + a3;
return result;
}
__device__ void printGeometryCorrection(struct gc *g, int num_cams) {
#ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n","pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n","line_time", g->line_time);
printf("%22s: %f\n","focalLength", g->focalLength);
printf("%22s: %f\n","pixelSize", g->pixelSize);
printf("%22s: %f\n","distortionRadius",g->distortionRadius);
printf("%22s: %f\n","distortionC", g->distortionC);
printf("%22s: %f\n","distortionB", g->distortionB);
printf("%22s: %f\n","distortionA", g->distortionA);
printf("%22s: %f\n","distortionA5",g->distortionA5);
printf("%22s: %f\n","distortionA6",g->distortionA6);
printf("%22s: %f\n","distortionA7",g->distortionA7);
printf("%22s: %f\n","distortionA8",g->distortionA8);
printf("%22s: %f\n","elevation", g->elevation);
printf("%22s: %f\n","heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s:","forward"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]); printf("\n");
printf("%22s:","right"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right [ncam]); printf("\n");
printf("%22s:","height"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height [ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll [ncam]); printf("\n");
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "pXY0[",ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
}
printf("%22s: %f\n","common_right", g->common_right);
printf("%22s: %f\n","common_forward", g->common_forward);
printf("%22s: %f\n","common_height", g->common_height);
printf("%22s: %f\n","common_roll", g->common_roll);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
}
printf("%22s: %f\n","cameraRadius", g->cameraRadius);
printf("%22s: %f\n","disparityRadius", g->disparityRadius);
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:","woi_tops"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]); printf("\n");
#endif //ifndef JCUDA
printf("\nGeometry Correction\n------------------\n");
printf("%22s: %f\n", "pixelCorrectionWidth", g->pixelCorrectionWidth);
printf("%22s: %f\n", "pixelCorrectionHeight", g->pixelCorrectionHeight);
printf("%22s: %f\n", "line_time", g->line_time);
printf("%22s: %f\n", "focalLength", g->focalLength);
printf("%22s: %f\n", "pixelSize", g->pixelSize);
printf("%22s: %f\n", "distortionRadius", g->distortionRadius);
printf("%22s: %f\n", "distortionC", g->distortionC);
printf("%22s: %f\n", "distortionB", g->distortionB);
printf("%22s: %f\n", "distortionA", g->distortionA);
printf("%22s: %f\n", "distortionA5", g->distortionA5);
printf("%22s: %f\n", "distortionA6", g->distortionA6);
printf("%22s: %f\n", "distortionA7", g->distortionA7);
printf("%22s: %f\n", "distortionA8", g->distortionA8);
printf("%22s: %f\n", "elevation", g->elevation);
printf("%22s: %f\n", "heading", g->heading);
// printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
// printf("%22s: %f, %f, %f, %f \n","right", g->right[0], g->right[1], g->right[2], g->right[3]);
// printf("%22s: %f, %f, %f, %f \n","height", g->height[0], g->height[1], g->height[2], g->height[3]);
// printf("%22s: %f, %f, %f, %f \n","roll", g->roll[0], g->roll[1], g->roll[2], g->roll[3]);
// printf("%22s: %f, %f \n", "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
// printf("%22s: %f, %f \n", "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
// printf("%22s: %f, %f \n", "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
// printf("%22s: %f, %f \n", "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
printf("%22s:", "forward");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]);
printf("\n");
printf("%22s:", "right");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right[ncam]);
printf("\n");
printf("%22s:", "height");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll[ncam]);
printf("\n");
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "pXY0[", ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
}
printf("%22s: %f\n", "common_right", g->common_right);
printf("%22s: %f\n", "common_forward", g->common_forward);
printf("%22s: %f\n", "common_height", g->common_height);
printf("%22s: %f\n", "common_roll", g->common_roll);
// printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
// printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
// printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
// printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
for (int ncam = 0; ncam < num_cams; ncam++) {
printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
}
printf("%22s: %f\n", "cameraRadius", g->cameraRadius);
printf("%22s: %f\n", "disparityRadius", g->disparityRadius);
// printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
printf("%22s:", "woi_tops");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]);
printf("\n");
#endif // ifndef JCUDA
}
__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams) {
#ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s:","tilt"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->tilt[ncam]); printf("\n");
printf("%22s:","azimuth"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->azimuth[ncam]); printf("\n");
printf("%22s:","roll"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]); printf("\n");
printf("%22s:","zoom"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->zoom[ncam]); printf("\n");
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif //ifndef JCUDA
printf("\nExtrinsic Correction Vector\n---------------------------\n");
// printf("%22s: %f, %f, %f\n", "tilt", cv->tilt[0], cv->tilt[1], cv->tilt[2]);
// printf("%22s: %f, %f, %f\n", "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
// printf("%22s: %f, %f, %f, %f\n", "roll", cv->roll[0], cv->roll[1], cv->roll[2], cv->roll[3]);
// printf("%22s: %f, %f, %f\n", "zoom", cv->zoom[0], cv->zoom[1], cv->zoom[2]);
printf("%22s:", "tilt");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->tilt[ncam]);
printf("\n");
printf("%22s:", "azimuth");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->azimuth[ncam]);
printf("\n");
printf("%22s:", "roll");
for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]);
printf("\n");
printf("%22s:", "zoom");
for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->zoom[ncam]);
printf("\n");
printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
#endif // ifndef JCUDA
}
@@ -41,147 +41,141 @@
#include "tp_defines.h"
#endif
#define NVRTC_BUG 1
#ifndef M_PI
#define M_PI 3.14159265358979323846 /* pi */
#endif
#ifndef offsetof
#define offsetof(st, m) \
((size_t)&(((st *)0)->m))
((size_t) & (((st *)0)->m))
//#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
#endif
#define SCENE_UNITS_SCALE 0.001 // meters from mm
#define MIN_DISPARITY 0.01 // minimal disparity to try to convert to world coordinates
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float target_disparity;
float centerXY[2]; // "ideal" centerX, centerY to use instead of the uniform tile centers (txy) for interscene accumulation
// if isnan(centerXY[0]), then txy is used to calculate centerXY and all xy
float xy[NUM_CAMS][2];
float disp_dist[NUM_CAMS][4]; // calculated with getPortsCoordinates()
};
#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
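// get_task_size(x) trims the unused camera slots: each camera contributes
// 6 floats (xy[2] + disp_dist[4]). With NUM_CAMS = 16 the full struct is
// 101 floats (1 + 1 + 1 + 2 + 32 + 64), so get_task_size(16) = 101 (LWIR16)
// and get_task_size(4) = 101 - 6*12 = 29 (quad EO), matching the
// "29/101 floats" notes on gpu_ftasks.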
#define tp_task_xy_offset 5
#define tp_task_centerXY_offset 3
struct corr_vector {
float tilt[NUM_CAMS - 1]; // 0..2
float azimuth[NUM_CAMS - 1]; // 3..5
float roll[NUM_CAMS]; // 6..9
float zoom[NUM_CAMS - 1]; // 10..12
// for ERS correction:
float imu_rot[3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19
};
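// Note: tilt, azimuth and zoom carry only NUM_CAMS-1 entries - the last
// camera's value is presumably constrained by the others (e.g. so they sum to
// zero), while roll is stored per camera; the index ranges in the comments
// (0..19) give the flattened float layout used when copying to shared memory.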
#ifdef NVRTC_BUG
struct trot_deriv {
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
#else
union trot_deriv {
struct {
float rots[NUM_CAMS][3][3];
float d_daz[NUM_CAMS][3][3];
float d_tilt[NUM_CAMS][3][3];
float d_roll[NUM_CAMS][3][3];
float d_zoom[NUM_CAMS][3][3];
};
float matrices[5][NUM_CAMS][3][3];
};
#endif
struct gc {
float pixelCorrectionWidth; // =2592; // virtual camera center is at (pixelCorrectionWidth/2, pixelCorrectionHeight/2)
float pixelCorrectionHeight; // =1936;
float line_time; // duration of one scan line readout (for ERS)
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
#ifndef NVRTC_BUG
union {
struct {
#endif
float distortionC; // r^2
float distortionB; // r^3
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionA5; // r^5 (normalized to focal length or to sensor half width?)
float distortionA6; // r^6 (normalized to focal length or to sensor half width?)
float distortionA7; // r^7 (normalized to focal length or to sensor half width?)
float distortionA8; // r^8 (normalized to focal length or to sensor half width?)
#ifndef NVRTC_BUG
};
float rad_coeff[7];
};
#endif
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
float forward[NUM_CAMS];
float right[NUM_CAMS];
float height[NUM_CAMS];
float roll[NUM_CAMS]; // degrees, CW (to target) - positive
float pXY0[NUM_CAMS][2];
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for elevation, heading, common_roll)
float rXY[NUM_CAMS][2]; // XY pairs of the lenses in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
float woi_tops[NUM_CAMS]; // used to calculate scanline timing
};
#define RAD_COEFF_LEN 7
extern "C" __global__ void get_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
// struct tp_task * gpu_tasks,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
extern "C" __global__ void calculate_tiles_offsets(
int uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
int num_cams,
float *gpu_ftasks, // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
// struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task
struct gc *gpu_geometry_correction,
struct corr_vector *gpu_correction_vector,
float *gpu_rByRDist, // length should match RBYRDIST_LEN
trot_deriv *gpu_rot_deriv);
// uses NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calc_rot_deriv(
int num_cams,
struct corr_vector *gpu_correction_vector,
trot_deriv *gpu_rot_deriv);
#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
extern "C" __global__ void calcReverseDistortionTable(
struct gc *geometry_correction,
float *rByRDist);
@@ -39,61 +39,61 @@
// Avoiding includes in jcuda, all source files will be merged
#pragma once
#ifndef JCUDA
#define TEST_LWIR 1
#include <stdio.h>
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 16 // now maximal number of cameras
//#define NUM_PAIRS 6
//#define NUM_COLORS 1 //3
// kernels [num_cams][num_colors][KERNELS_HOR][KERNELS_VERT][4][64]
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define CORR_TILES_PER_BLOCK_NORMALIZE 4 // increase to 8?
#define CORR_TILES_PER_BLOCK_COMBINE 4 // increase to 16?
//#define TEXTURE_THREADS 32 //
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - tile number, lower bits - pair number
#define NUM_THREADS 32
#define TEXTURE_THREADS_PER_TILE 8
#define TEXTURE_TILES_PER_BLOCK 1
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_NTILE_SHIFT 8 // higher bits - tile number, lower bits - pair number
// only the lower bit will be used to request correlations; the correlation mask will be common for the whole scene
//#define CORR_PAIRS_MASK 0x3f// lower bits used to address correlation pair for the selected tile
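// Packing sketch for entries of gpu_corr_indices (an assumed convention,
// consistent with the CORR_PAIRS_MASK comment above):
//   int corr_index = (tile_num << CORR_NTILE_SHIFT) | pair;    // pack
//   int pair2 = corr_index & ((1 << CORR_NTILE_SHIFT) - 1);    // unpack pair
//   int tile2 = corr_index >> CORR_NTILE_SHIFT;                // unpack tile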
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
#define CORR_TEXTURE_BIT 7 // bit 7 used to request texture for the tile
#define TASK_CORR_BITS 4
#define TASK_TEXTURE_N_BIT 0 // Texture with North neighbor
#define TASK_TEXTURE_E_BIT 1 // Texture with East neighbor
#define TASK_TEXTURE_S_BIT 2 // Texture with South neighbor
#define TASK_TEXTURE_W_BIT 3 // Texture with West neighbor
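// Example of combining the texture-neighbor bits into a per-tile task word
// (illustrative; requests texture generation with all four neighbors):
//   int task = (1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT)
//            | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT);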
//#define TASK_TEXTURE_BIT 3 // bit to request texture calculation in the task field of struct tp_task
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
#define LIST_TEXTURE_BIT 7 // bit to request texture calculation
//#define CORR_OUT_RAD 7 // full tile (15x15), was 4 (9x9)
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
#define FAT_ZERO_WEIGHT 0.0001 // add to port weights to avoid nan
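// Illustrative use of FAT_ZERO_WEIGHT (a sketch; port_weights and the
// normalization context are assumed names, not from this file):
//   float w_sum = FAT_ZERO_WEIGHT; // seed the sum so it can never be zero
//   for (int port = 0; port < num_cams; port++) w_sum += port_weights[port];
//   float scale = 1.0f / w_sum; // safe: w_sum >= FAT_ZERO_WEIGHT > 0, no NaN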
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#define RBYRDIST_LEN 5001 // for doubles (use 10001 for floats) // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles (use 0.0002 for floats) // to fit into GPU shared memory (was 0.001)
#define TILES_PER_BLOCK_GEOM (32/NUM_CAMS) // each tile has NUM_CAMS threads
#define RBYRDIST_LEN 5001 // for doubles (use 10001 for floats) // length of rByRDist to allocate shared memory
#define RBYRDIST_STEP 0.0004 // for doubles (use 0.0002 for floats) // to fit into GPU shared memory (was 0.001)
#define TILES_PER_BLOCK_GEOM (32 / NUM_CAMS) // each tile has NUM_CAMS threads
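// Two usage sketches for the defines above (num_tiles, rDist and the host-side
// variable names are assumptions for these examples):
// (1) sizing geometry kernels - each block handles TILES_PER_BLOCK_GEOM tiles,
//     NUM_CAMS threads per tile:
//   dim3 threads_geom(num_cams, TILES_PER_BLOCK_GEOM, 1);
//   dim3 grid_geom((num_tiles + TILES_PER_BLOCK_GEOM - 1) / TILES_PER_BLOCK_GEOM, 1, 1);
// (2) linear interpolation into the rByRDist table, indexed in steps of
//     RBYRDIST_STEP and clamped to its RBYRDIST_LEN entries:
//   float fi = rDist / RBYRDIST_STEP; // fractional table index
//   int i = (int) fi;
//   if (i > (RBYRDIST_LEN - 2)) i = RBYRDIST_LEN - 2; // clamp to table range
//   float a = fi - i; // interpolation fraction
//   float rByR = rByRDist[i] * (1.0f - a) + rByRDist[i + 1] * a;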
#define DEBUG_ANY 1
#ifdef DEBUG_ANY
#ifdef DEBUG_ANY
//#define DEBUG_OOB1 1
// Use CORR_OUT_RAD for the correlation output
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
#if TEST_LWIR
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#define DBG_TILE_X 50 // 52 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 19 // 5 // 36 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
#else
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE_X 114 // 32 // 162 // 151 // 161 // 49
#define DBG_TILE_Y 51 // 52 // 88 // 121 // 69 // 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#endif
#undef DBG_MARK_DBG_TILE
//#undef DBG_TILE
......@@ -101,8 +101,7 @@
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
// 7
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
......@@ -118,7 +117,7 @@
#define DEBUG9 1
*/
//#define DEBUG8A 1 // generate_RBGA_host
//textures
// textures
//#define DEBUG10 1
//#define DEBUG11 1
//#define DEBUG12 1
......@@ -127,7 +126,6 @@
// geom
//#define DEBUG20 1
#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
......@@ -136,10 +134,8 @@
//#define DEBUG22 1
//#define DEBUG23 1
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifdef DEBUG_ANY
#endif //#if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
#endif //#ifndef JCUDA
#endif //#ifdef DEBUG_ANY
#endif //#ifndef JCUDA