ran formatter

6c76931e · Palani Johnson · 4648cb20 · 6c76931e · 6c76931e · 6c76931e
Commit 6c76931e authored Feb 26, 2022 by Palani Johnson
8 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -41,17 +41,16 @@
 #include "tp_defines.h"
 #endif
 extern "C" __global__ void convert_direct(  // called with a single block, single thread
                                            //		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
    int num_cams,                           // actual number of cameras
    int num_colors,                         // actual number of colors: 3 for RGB, 1 for LWIR/mono
-		float           ** gpu_kernel_offsets, // [NUM_CAMS],
+    float** gpu_kernel_offsets,             // [NUM_CAMS],
-		float           ** gpu_kernels,        // [NUM_CAMS],
+    float** gpu_kernels,                    // [NUM_CAMS],
-		float           ** gpu_images,         // [NUM_CAMS],
+    float** gpu_images,                     // [NUM_CAMS],
-		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+    float* gpu_ftasks,                      // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
-//		struct tp_task   * gpu_tasks,
+                                            //		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float** gpu_clt,                        // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    size_t dstride,                         // in floats (pixels)
    int num_tiles,                          // number of tiles in task
    int lpf_mask,                           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
@@ -59,41 +58,41 @@ extern "C" __global__ void convert_direct( // called with a single block, single
    int woi_height,
    int kernels_hor,
    int kernels_vert,
-		int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+    int* gpu_active_tiles,   // pointer to the calculated number of non-zero tiles
-		int *              pnum_active_tiles,  //  indices to gpu_tasks
+    int* pnum_active_tiles,  //  indices to gpu_tasks
    int tilesx);
 extern "C" __global__ void correlate2D(
    int num_cams,
-//		int *             sel_pairs,
+    //		int *             sel_pairs,
    int sel_pairs0,
    int sel_pairs1,
    int sel_pairs2,
    int sel_pairs3,
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float** gpu_clt,        // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    int colors,             // number of colors (3/1)
    float scale0,           // scale for R
    float scale1,           // scale for B
    float scale2,           // scale for G
    float fat_zero2,        // here - absolute, squared
-		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+    float* gpu_ftasks,      // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
-//		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
+                            //		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
    int num_tiles,          // number of tiles in task
    int tilesx,             // number of tile rows
-		int             * gpu_corr_indices,   // packed tile+pair
+    int* gpu_corr_indices,  // packed tile+pair
-		int             * pnum_corr_tiles,    // pointer to a number of correlation tiles to process
+    int* pnum_corr_tiles,   // pointer to a number of correlation tiles to process
    size_t corr_stride,     // in floats
-//		int               corr_stride,        // in floats
+                            //		int               corr_stride,        // in floats
    int corr_radius,        // radius of the output correlation (7 for 15x15)
-		float           * gpu_corrs);          // correlation output data
+    float* gpu_corrs);      // correlation output data
 extern "C" __global__ void corr2D_normalize(
    int num_corr_tiles,           // number of correlation tiles to process
    const size_t corr_stride_td,  // in floats
-		float           * gpu_corrs_td,       // correlation tiles in transform domain
+    float* gpu_corrs_td,          // correlation tiles in transform domain
-		float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
+    float* corr_weights,          // null or per-tile weight (fat_zero2 will be divided by it)
    const size_t corr_stride,     // in floats
-		float           * gpu_corrs,          // correlation output data (either pixel domain or transform domain
+    float* gpu_corrs,             // correlation output data (either pixel domain or transform domain
    float fat_zero2,              // here - absolute, squared
    int corr_radius);             // radius of the output correlation (7 for 15x15)
@@ -102,42 +101,41 @@ extern "C" __global__ void corr2D_combine(
    int num_pairs,                   // num pairs per tile (should be the same)
    int init_output,                 // !=0 - reset output tiles to zero before accumulating
    int pairs_mask,                  // selected pairs (0x3 - horizontal, 0xc - vertical, 0xf - quad, 0x30 - cross)
-		int             * gpu_corr_indices,   // packed tile+pair
+    int* gpu_corr_indices,           // packed tile+pair
-		int             * gpu_combo_indices,  // output if noty null: packed tile+pairs_mask (will point to the first used pair
+    int* gpu_combo_indices,          // output if noty null: packed tile+pairs_mask (will point to the first used pair
    const size_t corr_stride,        // (in floats) stride for the input TD correlations
-		float           * gpu_corrs,          // input correlation tiles
+    float* gpu_corrs,                // input correlation tiles
    const size_t corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
-		float           * gpu_corrs_combo);   // combined correlation output (one per tile)
+    float* gpu_corrs_combo);         // combined correlation output (one per tile)
 extern "C" __global__ void textures_nonoverlap(
    int num_cams,       // number of cameras
-		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats
+    float* gpu_ftasks,  // flattened tasks, 29 floats for quad EO, 101 floats
    //		struct tp_task  * gpu_tasks,
    int num_tiles,             // number of tiles in task list
-//		int               num_tilesx,         // number of tiles in a row
+                               //		int               num_tilesx,         // number of tiles in a row
-// declare arrays in device code?
+                               // declare arrays in device code?
-		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+    int* gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
-		int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
+    int* pnum_texture_tiles,   // returns total number of elements in gpu_texture_indices array
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float** gpu_clt,           // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    // TODO: use geometry_correction rXY !
-		struct gc       * gpu_geometry_correction,
+    struct gc* gpu_geometry_correction,
    int colors,   // number of colors (3/1)
    int is_lwir,  // do not perform shot correction
    float params[5],
    float weights[3],           // scale for R,B,G
    int dust_remove,            // Do not reduce average weight when only one image differs much from the average
-// combining both non-overlap and overlap (each calculated if pointer is not null )
+                                // combining both non-overlap and overlap (each calculated if pointer is not null )
    size_t texture_stride,      // in floats (now 256*4 = 1024)  // may be 0 if not needed
-		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
+    float* gpu_texture_tiles,   // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
    int linescan_order,         // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
-		float           * gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
+    float* gpu_diff_rgb_combo,  //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
    int num_tilesx);
-extern "C"
+extern "C" __global__ void imclt_rbg_all(
-__global__ void imclt_rbg_all(
    int num_cams,
-		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float** gpu_clt,          // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
+    float** gpu_corr_images,  // [NUM_CAMS][WIDTH, 3 * HEIGHT]
    int apply_lpf,
    int colors,
    int woi_twidth,
@@ -145,12 +143,12 @@ __global__ void imclt_rbg_all(
    const size_t dstride);  // in floats (pixels)
 extern "C" __global__ void erase8x8(
-		float           * gpu_top_left,
+    float* gpu_top_left,
    const size_t dstride);
 extern "C" __global__ void imclt_rbg(
-		float           * gpu_clt,            // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float* gpu_clt,  // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
+    float* gpu_rbg,  // WIDTH, 3 * HEIGHT
    int apply_lpf,
    int mono,   // defines lpf filter
    int color,  // defines location of clt data
@@ -163,19 +161,19 @@ extern "C" __global__ void imclt_rbg(
 extern "C" __global__ void generate_RBGA(
    int num_cams,  // number of cameras used
    // Parameters to generate texture tasks
-		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
+    float* gpu_ftasks,  // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
-//		struct tp_task   * gpu_tasks,
+                        //		struct tp_task   * gpu_tasks,
    int num_tiles,      // number of tiles in task list
    // declare arrays in device code?
-		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+    int* gpu_texture_indices,  // packed tile + bits (now only (1 << 7)
-		int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
+    int* num_texture_tiles,    // number of texture tiles to process  (8 separate elements for accumulation)
-		int              * woi,                // x,y,width,height of the woi
+    int* woi,                  // x,y,width,height of the woi
    int width,                 // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
    int height,                // <= TILES-Y, use for faster processing of LWIR images
    // Parameters for the texture generation
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+    float** gpu_clt,  // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
    // TODO: use geometry_correction rXY !
-		struct gc       * gpu_geometry_correction,
+    struct gc* gpu_geometry_correction,
    int colors,                        // number of colors (3/1)
    int is_lwir,                       // do not perform shot correction
    float params[5],                   // mitigating CUDA_ERROR_INVALID_PTX
@@ -183,5 +181,12 @@ extern "C" __global__ void generate_RBGA(
    int dust_remove,                   // Do not reduce average weight when only one image differs much from the average
    int keep_weights,                  // return channel weights after A in RGBA (was removed)
    const size_t texture_rbga_stride,  // in floats
-		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+    float* gpu_texture_tiles);         // (number of colors +1 + ?)*16*16 rgba texture tiles
+extern "C" __global__ void accumulate_correlations(
+    int tilesY,
+    int tilesX,
+    int pairs,
+    float* num_acc,        // number of accumulated tiles [tilesY][tilesX][pair]
+    float* fcorr_td,       // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
+    float* fcorr_td_acc);  // [tilesY][tilesX][pair][256] sparse transform domain representation of corr pairs
--- a/src/dtt8x8.cu
+++ b/src/dtt8x8.cu
@@ -74,50 +74,47 @@ __constant__ float COSPI_3_8_SQRT2 = 0.541196f;
 __constant__ float SQRT_2 = 1.414214f;
 __constant__ float SQRT1_2 = 0.707107f;
 __constant__ float SQRT1_8 = 0.353553f;
-__constant__ float COSN1[] = {0.980785f,0.831470f};
+__constant__ float COSN1[] = {0.980785f, 0.831470f};
-__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f};
+__constant__ float COSN2[] = {0.995185f, 0.956940f, 0.881921f, 0.773010f};
-__constant__ float SINN1[] = {0.195090f,0.555570f};
+__constant__ float SINN1[] = {0.195090f, 0.555570f};
-__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f};
+__constant__ float SINN2[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f};
-__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
+__constant__ int imclt_indx9[16] = {0x28, 0x29, 0x2a, 0x2b, 0x2b, 0x2a, 0x29, 0x28, 0x27, 0x26, 0x25, 0x24, 0x24, 0x25, 0x26, 0x27};
-__constant__ float idct_signs[4][4][4] ={
+__constant__ float idct_signs[4][4][4] = {
-		{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
+    {// quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1,-1,-1,-1},
+     {1, -1, -1, -1},
     {-1, 1, 1, 1},
     {-1, 1, 1, 1},
-				{-1, 1, 1, 1}
+     {-1, 1, 1, 1}},
-		},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
+    {// quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1, 1, 1,-1},
+     {1, 1, 1, -1},
-				{-1,-1,-1, 1},
+     {-1, -1, -1, 1},
-				{-1,-1,-1, 1},
+     {-1, -1, -1, 1},
-				{-1,-1,-1, 1}
+     {-1, -1, -1, 1}},
-		},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
+    {// quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1,-1,-1,-1},
+     {1, -1, -1, -1},
-				{ 1,-1,-1,-1},
+     {1, -1, -1, -1},
-				{ 1,-1,-1,-1},
+     {1, -1, -1, -1},
-				{-1, 1, 1, 1}
+     {-1, 1, 1, 1}},
-		},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
+    {// quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
-				{ 1, 1, 1,-1},
+     {1, 1, 1, -1},
-				{ 1, 1, 1,-1},
+     {1, 1, 1, -1},
-				{ 1, 1, 1,-1},
+     {1, 1, 1, -1},
-				{-1,-1,-1, 1}
+     {-1, -1, -1, 1}}};
-		}};
 __constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
                                 0.386505f, 0.440961f, 0.478470f, 0.497592f};
+inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct);  // does not scale by y[0] (y[7]) by 1/sqrt[0]
+inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct);           // used in GPU_DTT24_DRV
+inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct);           // used in GPU_DTT24_DRV
+inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct);              // not used
+inline __device__ void dctiv_nodiverg(float *x0, int inc);                              // used in TP
+inline __device__ void dstiv_nodiverg(float *x0, int inc);                              // used in TP
-inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
+inline __device__ void dct_ii8(float x[8], float y[8]);          // x,y point to 8-element arrays each // not used
-inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dct_iv8(float x[8], float y[8]);          // x,y point to 8-element arrays each // not used
-inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct);   // used in GPU_DTT24_DRV
+inline __device__ void dst_iv8(float x[8], float y[8]);          // x,y point to 8-element arrays each // not used
-inline __device__ void dttiv_nodiverg  (float * x,   int inc, int dst_not_dct);   // not used
+inline __device__ void _dctii_nrecurs8(float x[8], float y[8]);  // x,y point to 8-element arrays each // not used
-inline __device__ void dctiv_nodiverg  (float * x0,  int inc);                    // used in TP
+inline __device__ void _dctiv_nrecurs8(float x[8], float y[8]);  // x,y point to 8-element arrays each // not used
-inline __device__ void dstiv_nodiverg  (float * x0,  int inc);                    // used in TP
-inline __device__ void dct_ii8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
-inline __device__ void dct_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
-inline __device__ void dst_iv8         ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
-inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
-inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
 /**
 **************************************************************************
@@ -140,11 +137,9 @@ inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point t
 * \return None
 */
 #ifdef BBBB
-extern "C"
+extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode) {
-__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
-{
    int dtt_mode0 = dtt_mode & 1;
-	int dtt_mode1 = (dtt_mode >>1) & 1;
+    int dtt_mode1 = (dtt_mode >> 1) & 1;
    __shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
@@ -162,17 +157,17 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
    __syncthreads();
    // horizontal pass
    if (dtt_mode > 3) {
-    	dttii_shared_mem                   (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
+        dttii_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    } else {
-    	dttiv_shared_mem                   (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
+        dttiv_shared_mem(block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    }
    __syncthreads();
    // vertical pass
    if (dtt_mode > 3) {
-    	dttii_shared_mem                    (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
+        dttii_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    } else {
-    	dttiv_shared_mem                    (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
+        dttiv_shared_mem(bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    }
    __syncthreads();
    for (unsigned int i = 0; i < DTT_SIZE; i++)
@@ -180,50 +175,47 @@ __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mo
 }
 #endif  //#ifdef BBBB
+inline __device__ void _dctiv_nrecurs8(float x[8], float y[8])  // x,y point to 8-element arrays each
-inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
 {
-	float u00=            ( COSN2[0] * x[0] + SINN2[0] * x[7]);
+    float u00 = (COSN2[0] * x[0] + SINN2[0] * x[7]);
-	float u10=            (-SINN2[3] * x[3] + COSN2[3] * x[4]);
+    float u10 = (-SINN2[3] * x[3] + COSN2[3] * x[4]);
-	float u01=            ( COSN2[1] * x[1] + SINN2[1] * x[6]);
+    float u01 = (COSN2[1] * x[1] + SINN2[1] * x[6]);
-	float u11=           -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
+    float u11 = -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
-	float u02=            ( COSN2[2] * x[2] + SINN2[2] * x[5]);
+    float u02 = (COSN2[2] * x[2] + SINN2[2] * x[5]);
-	float u12=            (-SINN2[1] * x[1] + COSN2[1] * x[6]);
+    float u12 = (-SINN2[1] * x[1] + COSN2[1] * x[6]);
-	float u03=            ( COSN2[3] * x[3] + SINN2[3] * x[4]);
+    float u03 = (COSN2[3] * x[3] + SINN2[3] * x[4]);
-	float u13=           -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
+    float u13 = -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua00= u00 + u03;
+    float ua00 = u00 + u03;
-	float ua10= u00 - u03;
+    float ua10 = u00 - u03;
-	float ua01= u01 + u02;
+    float ua01 = u01 + u02;
-	float ua11= u01 - u02;
+    float ua11 = u01 - u02;
-	float v00= ua00 + ua01;
+    float v00 = ua00 + ua01;
-	float v02= ua00 - ua01;
+    float v02 = ua00 - ua01;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub00= u10 + u13;
+    float ub00 = u10 + u13;
-	float ub10= u10 - u13;
+    float ub10 = u10 - u13;
-	float ub01= u11 + u12;
+    float ub01 = u11 + u12;
-	float ub11= u11 - u12;
+    float ub11 = u11 - u12;
-	float vb00= ub00 + ub01;
+    float vb00 = ub00 + ub01;
-	float vb01= ub00 - ub01;
+    float vb01 = ub00 - ub01;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    y[0] = SQRT_2 * v00;  // w0[0];
    y[1] = v01 - vb11;    // w1[0];
@@ -238,63 +230,62 @@ inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to
    y[7] = SQRT_2 * vb00;  // w1[3];
 }
-__device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
+__device__ void _dttiv(float x0, float x1, float x2, float x3, float x4, float x5, float x6, float x7,
-		float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
+                       float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct) {
-{
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) {  // DSTIV
-		u00=  ( COSN2[0] * x7 + SINN2[0] * x0);
+        u00 = (COSN2[0] * x7 + SINN2[0] * x0);
-		u10=  (-SINN2[3] * x4 + COSN2[3] * x3);
+        u10 = (-SINN2[3] * x4 + COSN2[3] * x3);
-		u01=  ( COSN2[1] * x6 + SINN2[1] * x1);
+        u01 = (COSN2[1] * x6 + SINN2[1] * x1);
-		u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
+        u11 = -(-SINN2[2] * x5 + COSN2[2] * x2);
-		u02=  ( COSN2[2] * x5 + SINN2[2] * x2);
+        u02 = (COSN2[2] * x5 + SINN2[2] * x2);
-		u12=  (-SINN2[1] * x6 + COSN2[1] * x1);
+        u12 = (-SINN2[1] * x6 + COSN2[1] * x1);
-		u03=  ( COSN2[3] * x4 + SINN2[3] * x3);
+        u03 = (COSN2[3] * x4 + SINN2[3] * x3);
-		u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
+        u13 = -(-SINN2[0] * x7 + COSN2[0] * x0);
    } else {  // DCTIV
-		u00=  ( COSN2[0] * x0 + SINN2[0] * x7);
+        u00 = (COSN2[0] * x0 + SINN2[0] * x7);
-		u10=  (-SINN2[3] * x3 + COSN2[3] * x4);
+        u10 = (-SINN2[3] * x3 + COSN2[3] * x4);
-		u01=  ( COSN2[1] * x1 + SINN2[1] * x6);
+        u01 = (COSN2[1] * x1 + SINN2[1] * x6);
-		u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
+        u11 = -(-SINN2[2] * x2 + COSN2[2] * x5);
-		u02=  ( COSN2[2] * x2 + SINN2[2] * x5);
+        u02 = (COSN2[2] * x2 + SINN2[2] * x5);
-		u12=  (-SINN2[1] * x1 + COSN2[1] * x6);
+        u12 = (-SINN2[1] * x1 + COSN2[1] * x6);
-		u03=  ( COSN2[3] * x3 + SINN2[3] * x4);
+        u03 = (COSN2[3] * x3 + SINN2[3] * x4);
-		u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
+        u13 = -(-SINN2[0] * x0 + COSN2[0] * x7);
    }
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua00= u00 + u03;
+    float ua00 = u00 + u03;
-	float ua10= u00 - u03;
+    float ua10 = u00 - u03;
-	float ua01= u01 + u02;
+    float ua01 = u01 + u02;
-	float ua11= u01 - u02;
+    float ua11 = u01 - u02;
-	float v00= ua00 + ua01;
+    float v00 = ua00 + ua01;
-	float v02= ua00 - ua01;
+    float v02 = ua00 - ua01;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub00= u10 + u13;
+    float ub00 = u10 + u13;
-	float ub10= u10 - u13;
+    float ub10 = u10 - u13;
-	float ub01= u11 + u12;
+    float ub01 = u11 + u12;
-	float ub11= u11 - u12;
+    float ub11 = u11 - u12;
-	float vb00= ub00 + ub01;
+    float vb00 = ub00 + ub01;
-	float vb01= ub00 - ub01;
+    float vb01 = ub00 - ub01;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *y0 = v00 * 0.5f;  // w0[0];
    // j == 1
@@ -316,8 +307,7 @@ __device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,
    }
 }
-inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct)
+inline __device__ void dttii_shared_mem(float *x0, int inc, int dst_not_dct) {
-{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
@@ -328,52 +318,52 @@ inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct)
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) {  // DSTII
        // invert odd input samples
-		u00= ( (*x0) - (*x7));
+        u00 = ((*x0) - (*x7));
-		u10= ( (*x0) + (*x7));
+        u10 = ((*x0) + (*x7));
-		u01= (-(*x1) + (*x6));
+        u01 = (-(*x1) + (*x6));
-		u11= (-(*x1) - (*x6));
+        u11 = (-(*x1) - (*x6));
-		u02= ( (*x2) - (*x5));
+        u02 = ((*x2) - (*x5));
-		u12= ( (*x2) + (*x5));
+        u12 = ((*x2) + (*x5));
-		u03= (-(*x3) + (*x4));
+        u03 = (-(*x3) + (*x4));
-		u13= (-(*x3) - (*x4));
+        u13 = (-(*x3) - (*x4));
    } else {  // DCTII
-		u00= ( (*x0) + (*x7));
+        u00 = ((*x0) + (*x7));
-		u10= ( (*x0) - (*x7));
+        u10 = ((*x0) - (*x7));
-		u01= ( (*x1) + (*x6));
+        u01 = ((*x1) + (*x6));
-		u11= ( (*x1) - (*x6));
+        u11 = ((*x1) - (*x6));
-		u02= ( (*x2) + (*x5));
+        u02 = ((*x2) + (*x5));
-		u12= ( (*x2) - (*x5));
+        u12 = ((*x2) - (*x5));
-		u03= ( (*x3) + (*x4));
+        u03 = ((*x3) + (*x4));
-		u13= ( (*x3) - (*x4));
+        u13 = ((*x3) - (*x4));
    }
    //	_dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
-		float w00= u00 + u03;
+    float w00 = u00 + u03;
-		float w10= u00 - u03;
+    float w10 = u00 - u03;
-		float w01= (u01 + u02);
+    float w01 = (u01 + u02);
-		float w11= (u01 - u02);
+    float w11 = (u01 - u02);
-		float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
+    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
-		float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
+    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    //	_dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-		float w20=            ( COSN1[0] * u10 + SINN1[0] * u13);
+    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
-		float w30=            (-SINN1[1] * u11 + COSN1[1] * u12);
+    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
-		float w21=            ( COSN1[1] * u11 + SINN1[1] * u12);
+    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
-		float w31=           -(-SINN1[0] * u10 + COSN1[0] * u13);
+    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    if (dst_not_dct) {  // DSTII
        // Invert output sequence
-		*x0 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+        *x0 = (w30 + w31) * 0.5f;  // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
@@ -395,12 +385,11 @@ inline __device__ void dttii_shared_mem(float * x0,  int inc, int dst_not_dct)
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
-		*x7 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+        *x7 = (w30 + w31) * 0.5f;  // v13 * SQRT1_8; z10 * 0.5f
    }
 }
-inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_not_dct)
+inline __device__ void dttii_shared_mem_nonortho(float *x0, int inc, int dst_not_dct) {
-{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
@@ -411,52 +400,52 @@ inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_n
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) {  // DSTII
        // invert odd input samples
-		u00= ( (*x0) - (*x7));
+        u00 = ((*x0) - (*x7));
-		u10= ( (*x0) + (*x7));
+        u10 = ((*x0) + (*x7));
-		u01= (-(*x1) + (*x6));
+        u01 = (-(*x1) + (*x6));
-		u11= (-(*x1) - (*x6));
+        u11 = (-(*x1) - (*x6));
-		u02= ( (*x2) - (*x5));
+        u02 = ((*x2) - (*x5));
-		u12= ( (*x2) + (*x5));
+        u12 = ((*x2) + (*x5));
-		u03= (-(*x3) + (*x4));
+        u03 = (-(*x3) + (*x4));
-		u13= (-(*x3) - (*x4));
+        u13 = (-(*x3) - (*x4));
    } else {  // DCTII
-		u00= ( (*x0) + (*x7));
+        u00 = ((*x0) + (*x7));
-		u10= ( (*x0) - (*x7));
+        u10 = ((*x0) - (*x7));
-		u01= ( (*x1) + (*x6));
+        u01 = ((*x1) + (*x6));
-		u11= ( (*x1) - (*x6));
+        u11 = ((*x1) - (*x6));
-		u02= ( (*x2) + (*x5));
+        u02 = ((*x2) + (*x5));
-		u12= ( (*x2) - (*x5));
+        u12 = ((*x2) - (*x5));
-		u03= ( (*x3) + (*x4));
+        u03 = ((*x3) + (*x4));
-		u13= ( (*x3) - (*x4));
+        u13 = ((*x3) - (*x4));
    }
    //	_dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
-		float w00= u00 + u03;
+    float w00 = u00 + u03;
-		float w10= u00 - u03;
+    float w10 = u00 - u03;
-		float w01= (u01 + u02);
+    float w01 = (u01 + u02);
-		float w11= (u01 - u02);
+    float w11 = (u01 - u02);
-		float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
+    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
-		float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
+    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    //	_dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-		float w20=            ( COSN1[0] * u10 + SINN1[0] * u13);
+    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
-		float w30=            (-SINN1[1] * u11 + COSN1[1] * u12);
+    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
-		float w21=            ( COSN1[1] * u11 + SINN1[1] * u12);
+    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
-		float w31=           -(-SINN1[0] * u10 + COSN1[0] * u13);
+    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    if (dst_not_dct) {  // DSTII
        // Invert output sequence
-		*x0 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+        *x0 = (w30 + w31) * 0.5f;  // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
@@ -478,12 +467,11 @@ inline __device__ void dttii_shared_mem_nonortho(float * x0,  int inc, int dst_n
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
-		*x7 =   (w30 + w31)*  0.5f;    // v13 * SQRT1_8; z10 * 0.5f
+        *x7 = (w30 + w31) * 0.5f;  // v13 * SQRT1_8; z10 * 0.5f
    }
 }
-inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct)
+inline __device__ void dttiv_shared_mem(float *x0, int inc, int dst_not_dct) {
-{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
@@ -493,59 +481,58 @@ inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct)
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    if (dst_not_dct) {  // DSTIV
-		u00=  ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
+        u00 = (COSN2[0] * (*x7) + SINN2[0] * (*x0));
-		u10=  (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
+        u10 = (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
-		u01=  ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
+        u01 = (COSN2[1] * (*x6) + SINN2[1] * (*x1));
-		u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
+        u11 = -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
-		u02=  ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
+        u02 = (COSN2[2] * (*x5) + SINN2[2] * (*x2));
-		u12=  (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
+        u12 = (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
-		u03=  ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
+        u03 = (COSN2[3] * (*x4) + SINN2[3] * (*x3));
-		u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
+        u13 = -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
    } else {  // DCTIV
-		u00=  ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
+        u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
-		u10=  (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
+        u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
-		u01=  ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
+        u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
-		u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
+        u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-		u02=  ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
+        u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
-		u12=  (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
+        u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
-		u03=  ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
+        u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
-		u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
+        u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    }
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua00= u00 + u03;
+    float ua00 = u00 + u03;
-	float ua10= u00 - u03;
+    float ua10 = u00 - u03;
-	float ua01= u01 + u02;
+    float ua01 = u01 + u02;
-	float ua11= u01 - u02;
+    float ua11 = u01 - u02;
-	float v00= ua00 + ua01;
+    float v00 = ua00 + ua01;
-	float v02= ua00 - ua01;
+    float v02 = ua00 - ua01;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub00= u10 + u13;
+    float ub00 = u10 + u13;
-	float ub10= u10 - u13;
+    float ub10 = u10 - u13;
-	float ub01= u11 + u12;
+    float ub01 = u11 + u12;
-	float ub11= u11 - u12;
+    float ub11 = u11 - u12;
-	float vb00= ub00 + ub01;
+    float vb00 = ub00 + ub01;
-	float vb01= ub00 - ub01;
+    float vb01 = ub00 - ub01;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *x0 = v00 * 0.5f;                   // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8;       // w0[1];
@@ -564,9 +551,8 @@ inline __device__ void dttiv_shared_mem(float * x0,  int inc, int dst_not_dct)
    }
 }
-inline __device__ void dttiv_nodiverg(float * x,  int inc, int dst_not_dct)
+inline __device__ void dttiv_nodiverg(float *x, int inc, int dst_not_dct) {
-{
+    float sgn = 1 - 2 * dst_not_dct;
-	float sgn = 1 - 2* dst_not_dct;
    float *y0 = x;
    float *y1 = y0 + inc;
    float *y2 = y1 + inc;
@@ -587,46 +573,45 @@ inline __device__ void dttiv_nodiverg(float * x,  int inc, int dst_not_dct)
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
-	u00=  ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
+    u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
-	u10=  (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
+    u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
-	u01=  ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u02=  ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
+    u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u12=  (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
+    u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u03=  ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
-	u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
+    u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
-	float ua00= u00 + u03;
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua10= u00 - u03;
-	float ua01= u01 + u02;
+    float ua00 = u00 + u03;
-	float ua11= u01 - u02;
+    float ua10 = u00 - u03;
-	float v00= ua00 + ua01;
+    float ua01 = u01 + u02;
-	float v02= ua00 - ua01;
+    float ua11 = u01 - u02;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v00 = ua00 + ua01;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v02 = ua00 - ua01;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-	float ub00= u10 + u13;
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub10= u10 - u13;
-	float ub01= u11 + u12;
+    float ub00 = u10 + u13;
-	float ub11= u11 - u12;
+    float ub10 = u10 - u13;
-	float vb00= ub00 + ub01;
+    float ub01 = u11 + u12;
-	float vb01= ub00 - ub01;
+    float ub11 = u11 - u12;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
+    float vb00 = ub00 + ub01;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb01 = ub00 - ub01;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *y0 = v00 * 0.5f;                    // w0[0];
    *y2 = (v01 + vb11) * SQRT1_8;        // w0[1];
@@ -638,8 +623,7 @@ inline __device__ void dttiv_nodiverg(float * x,  int inc, int dst_not_dct)
    *y7 = sgn * vb00 * 0.5f;             // w1[3];
 }
-inline __device__ void dctiv_nodiverg(float * x0,  int inc)
+inline __device__ void dctiv_nodiverg(float *x0, int inc) {
-{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
@@ -648,46 +632,45 @@ inline __device__ void dctiv_nodiverg(float * x0,  int inc)
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
-	u00=  ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
+    u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
-	u10=  (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
+    u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
-	u01=  ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u02=  ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
+    u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u12=  (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
+    u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u03=  ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
-	u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
+    u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
-	float ua00= u00 + u03;
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua10= u00 - u03;
-	float ua01= u01 + u02;
+    float ua00 = u00 + u03;
-	float ua11= u01 - u02;
+    float ua10 = u00 - u03;
-	float v00= ua00 + ua01;
+    float ua01 = u01 + u02;
-	float v02= ua00 - ua01;
+    float ua11 = u01 - u02;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v00 = ua00 + ua01;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v02 = ua00 - ua01;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-	float ub00= u10 + u13;
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub10= u10 - u13;
-	float ub01= u11 + u12;
+    float ub00 = u10 + u13;
-	float ub11= u11 - u12;
+    float ub10 = u10 - u13;
-	float vb00= ub00 + ub01;
+    float ub01 = u11 + u12;
-	float vb01= ub00 - ub01;
+    float ub11 = u11 - u12;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
+    float vb00 = ub00 + ub01;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb01 = ub00 - ub01;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *x0 = v00 * 0.5f;              // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8;  // w0[1];
@@ -699,8 +682,7 @@ inline __device__ void dctiv_nodiverg(float * x0,  int inc)
    *x7 = vb00 * 0.5f;             // w1[3];
 }
-inline __device__ void dstiv_nodiverg(float * x,  int inc)
+inline __device__ void dstiv_nodiverg(float *x, int inc) {
-{
    float *x0 = x + 7 * inc;
    // negate inc, replace
    inc = -inc;
@@ -712,46 +694,45 @@ inline __device__ void dstiv_nodiverg(float * x,  int inc)
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
-	u00=  ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
+    u00 = (COSN2[0] * (*x0) + SINN2[0] * (*x7));
-	u10=  (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
+    u10 = (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
-	u01=  ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u02=  ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
+    u01 = (COSN2[1] * (*x1) + SINN2[1] * (*x6));
-	u12=  (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
+    u11 = -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
-	u03=  ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u02 = (COSN2[2] * (*x2) + SINN2[2] * (*x5));
-	u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
+    u12 = (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    u03 = (COSN2[3] * (*x3) + SINN2[3] * (*x4));
+    u13 = -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
-	float ua00= u00 + u03;
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float ua10= u00 - u03;
-	float ua01= u01 + u02;
+    float ua00 = u00 + u03;
-	float ua11= u01 - u02;
+    float ua10 = u00 - u03;
-	float v00= ua00 + ua01;
+    float ua01 = u01 + u02;
-	float v02= ua00 - ua01;
+    float ua11 = u01 - u02;
-	float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v00 = ua00 + ua01;
-	float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
+    float v02 = ua00 - ua01;
-//	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    float v01 = COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
+    float v03 = COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
-	float ub00= u10 + u13;
+    //	_dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float ub10= u10 - u13;
-	float ub01= u11 + u12;
+    float ub00 = u10 + u13;
-	float ub11= u11 - u12;
+    float ub10 = u10 - u13;
-	float vb00= ub00 + ub01;
+    float ub01 = u11 + u12;
-	float vb01= ub00 - ub01;
+    float ub11 = u11 - u12;
-	float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
+    float vb00 = ub00 + ub01;
-	float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
+    float vb01 = ub00 - ub01;
+    float vb10 = COSPI_1_8_SQRT2 * ub10 + COSPI_3_8_SQRT2 * ub11;
+    float vb11 = COSPI_3_8_SQRT2 * ub10 - COSPI_1_8_SQRT2 * ub11;
    *x7 = v00 * 0.5f;              // w0[0];
    *x5 = (v01 + vb11) * SQRT1_8;  // w0[1];
@@ -764,49 +745,47 @@ inline __device__ void dstiv_nodiverg(float * x,  int inc)
    *x0 = -vb00 * 0.5f;             // w1[3];
 }
+inline __device__ void _dctii_nrecurs8(float x[8], float y[8])  // x,y point to 8-element arrays each
-inline  __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
 {
-	float u00= (x[0] + x[7]);
+    float u00 = (x[0] + x[7]);
-	float u10= (x[0] - x[7]);
+    float u10 = (x[0] - x[7]);
-	float u01= (x[1] + x[6]);
+    float u01 = (x[1] + x[6]);
-	float u11= (x[1] - x[6]);
+    float u11 = (x[1] - x[6]);
-	float u02= (x[2] + x[5]);
+    float u02 = (x[2] + x[5]);
-	float u12= (x[2] - x[5]);
+    float u12 = (x[2] - x[5]);
-	float u03= (x[3] + x[4]);
+    float u03 = (x[3] + x[4]);
-	float u13= (x[3] - x[4]);
+    float u13 = (x[3] - x[4]);
-//	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
+    //	_dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
-	float w00= u00 + u03;
+    float w00 = u00 + u03;
-	float w10= u00 - u03;
+    float w10 = u00 - u03;
-	float w01= (u01 + u02);
+    float w01 = (u01 + u02);
-	float w11= (u01 - u02);
+    float w11 = (u01 - u02);
-	float v00= w00 + w01;
+    float v00 = w00 + w01;
-	float v02= w00 - w01;
+    float v02 = w00 - w01;
-	float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
+    float v01 = COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
-	float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
+    float v03 = COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
-//	_dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
+    //	_dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
-	float w20=            ( COSN1[0] * u10 + SINN1[0] * u13);
+    float w20 = (COSN1[0] * u10 + SINN1[0] * u13);
-	float w30=            (-SINN1[1] * u11 + COSN1[1] * u12);
+    float w30 = (-SINN1[1] * u11 + COSN1[1] * u12);
-	float w21=            ( COSN1[1] * u11 + SINN1[1] * u12);
+    float w21 = (COSN1[1] * u11 + SINN1[1] * u12);
-	float w31=           -(-SINN1[0] * u10 + COSN1[0] * u13);
+    float w31 = -(-SINN1[0] * u10 + COSN1[0] * u13);
-//	_dctii_nrecurs2(u00, u01, &v00, &v01);
+    //	_dctii_nrecurs2(u00, u01, &v00, &v01);
-	float z00= w20 + w21;
+    float z00 = w20 + w21;
-	float z01= w20 - w21;
+    float z01 = w20 - w21;
-//	_dctii_nrecurs2(u10, u11, &v10, &v11);
+    //	_dctii_nrecurs2(u10, u11, &v10, &v11);
-	float z10= w30 + w31;
+    float z10 = w30 + w31;
-	float z11= w30 - w31;
+    float z11 = w30 - w31;
    float v10 = SQRT_2 * z00;
    float v11 = z01 - z11;
@@ -827,85 +806,80 @@ inline  __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to
    y[7] = v13;
 }
-inline  __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-element arrays each
+inline __device__ void dct_ii8(float x[8], float y[8])  // x,y point to 8-element arrays each
 {
    _dctii_nrecurs8(x, y);
 #pragma unroll
-	for (int i = 0; i < 8 ; i++) {
+    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8;
    }
 }
+__device__ void dct_iv8(float x[8], float y[8])  // x,y point to 8-element arrays each
-__device__ void dct_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
 {
    _dctiv_nrecurs8(x, y);
 #pragma unroll
-	for (int i = 0; i < 8 ; i++) {
+    for (int i = 0; i < 8; i++) {
        y[i] *= SQRT1_8;
    }
 }
-inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
+inline __device__ void dst_iv8(float x[8], float y[8])  // x,y point to 8-element arrays each
 {
    float xr[8];
 #pragma unroll
-	for (int i=0; i < 8;i++){
+    for (int i = 0; i < 8; i++) {
        xr[i] = x[7 - i];
    }
    _dctiv_nrecurs8(xr, y);
 #pragma unroll
-	for (int i=0; i < 8;i+=2){
+    for (int i = 0; i < 8; i += 2) {
        y[i] *= SQRT1_8;
-		y[i+1] *= -SQRT1_8;
+        y[i + 1] *= -SQRT1_8;
    }
 }
 //=========================== 2D functions ===============
 __device__ void corrUnfoldTile(
    int corr_radius,
-		float* qdata0, //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
+    float *qdata0,  //    [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
-		float* rslt)  //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
+    float *rslt)    //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
 {
    int size2r1 = 2 * corr_radius + 1;  // 15
-	int crp1 = corr_radius + 1;        //8
+    int crp1 = corr_radius + 1;         // 8
-///	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
+    ///	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
    int rslt_base_index = size2r1 * crp1 - crp1;  // offset of the center
-	float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
+    float *qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
-	float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
+    float *qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
-	float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
+    float *qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
    int i = threadIdx.x;
    if (i > corr_radius) {
        return;  // not needed, only use inner
    }
-//	printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
+    //	printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
    float corr_pixscale = 0.25f;
    int i_transform_size = i * DTT_SIZE1;                   // used to address source rows which are 9 long
    int im1_transform_size = i_transform_size - DTT_SIZE1;  // negative for i = 0, use only after divergence
-///	int rslt_row_offs = i * DTT_SIZE2M1;
+                                                            ///	int rslt_row_offs = i * DTT_SIZE2M1;
    int rslt_row_offs = i * size2r1;
    int rslt_base_index_p = rslt_base_index + rslt_row_offs;             // i * DTT_SIZE2M1;
    int rslt_base_index_m = rslt_base_index - rslt_row_offs;             // i * DTT_SIZE2M1;
    rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size];  // incomplete, will only be used for thread i=0
    rslt[rslt_base_index_m] = rslt[rslt_base_index_p];                   // nop for i=0 incomplete, will only be used for thread i=0
-///	for (int j = 1; j < DTT_SIZE; j++) {
+                                                                         ///	for (int j = 1; j < DTT_SIZE; j++) {
    for (int j = 1; j <= corr_radius; j++) {
        int rslt_base_index_pp = rslt_base_index_p + j;
        int rslt_base_index_pm = rslt_base_index_p - j;
-		rslt[rslt_base_index_pp] = corr_pixscale * (
+        rslt[rslt_base_index_pp] = corr_pixscale * (qdata0[i_transform_size + j] +
-				 qdata0[i_transform_size + j] +
+                                                    qdata1[i_transform_size + j - 1]);  // incomplete, will only be used for thread i=0
-				 qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
+        rslt[rslt_base_index_pm] = corr_pixscale * (qdata0[i_transform_size + j] +
-		rslt[rslt_base_index_pm] = corr_pixscale * (
+                                                    -qdata1[i_transform_size + j - 1]);  // incomplete, will only be used for thread i=0
-				 qdata0[i_transform_size + j] +
-				-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
    }
    if (i == 0) {
        return;
    }
-///	im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
+    ///	im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
    float d = corr_pixscale * qdata2[im1_transform_size];
    rslt[rslt_base_index_p] += d;
    rslt[rslt_base_index_m] -= d;
@@ -915,8 +889,8 @@ __device__ void corrUnfoldTile(
        int rslt_base_index_mp = rslt_base_index_m + j;
        int rslt_base_index_mm = rslt_base_index_m - j;
        float d2 = corr_pixscale * qdata2[im1_transform_size + j];
-		float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
+        float d3 = corr_pixscale * qdata3[im1_transform_size + j - 1];
-		//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
+        // rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
        rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
        rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
        rslt[rslt_base_index_pp] += d2 + d3;
@@ -925,85 +899,81 @@ __device__ void corrUnfoldTile(
 }
 __device__ void dttii_2d(
-		float * clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
+    float *clt_corr)  // shared memory, [4][DTT_SIZE1][DTT_SIZE]
 {
    // change to 16-32 threads?? in next iteration
    // vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
-    for (int q = 0; q < 4; q++){
+    for (int q = 0; q < 4; q++) {
        int is_sin = (q >> 1) & 1;
-    	dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
+        dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x, DTT_SIZE1, is_sin);  // vertical pass, thread is column
    }
    __syncthreads();
    // hor pass, corresponding to vert pass in Java
-    for (int q = 0; q < 4; q++){
+    for (int q = 0; q < 4; q++) {
        int is_sin = q & 1;
-    	dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 ,  1, is_sin); // horizontal pass, tread is row
+        dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1, 1, is_sin);  // horizontal pass, tread is row
    }
    __syncthreads();
 }
 __device__ void dttiv_color_2d(
-		float * clt_tile,
+    float *clt_tile,
-		int color)
+    int color) {
-{
    dctiv_nodiverg(                            // all colors
        clt_tile + (DTT_SIZE1 * threadIdx.x),  // [0][threadIdx.x], // pointer to start of row
-			1); //int inc);
+        1);                                    // int inc);
-//	__syncthreads();// worsened
+    //	__syncthreads();// worsened
-    if (color == BAYER_GREEN){
+    if (color == BAYER_GREEN) {
        dstiv_nodiverg(                                                 // all colors
            clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE,  // clt_tile[1][threadIdx.x], // pointer to start of row
-    			1); //int inc);
+            1);                                                         // int inc);
    }
-  	 __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #ifdef DEBUG222
-    if ((threadIdx.x) == 0){
+    if ((threadIdx.x) == 0) {
-        printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
+        printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
-    	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+        debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1);  // only 1 quadrant for R,B and 2 - for G
    }
-     __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    dctiv_nodiverg(              // all colors
        clt_tile + threadIdx.x,  //  &clt_tile[0][0][threadIdx.x], // pointer to start of column
        DTT_SIZE1);              // int inc,
-//	__syncthreads();// worsened
+                                 //	__syncthreads();// worsened
-    if (color == BAYER_GREEN){
+    if (color == BAYER_GREEN) {
        dctiv_nodiverg(                                       // all colors
            clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE),  // &clt_tile[1][0][threadIdx.x], // pointer to start of column
            DTT_SIZE1);                                       // int inc,
    }
-  	 __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 }
 __device__ void dttiv_mono_2d(
-		float * clt_tile)
+    float *clt_tile) {
-{
    // Copy 0-> 1
    dctiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
-			1); //int inc);
+        1);  // int inc);
    dstiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
-			1); //int inc);
+        1);  // int inc);
    dctiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
-			1); //int inc);
+        1);  // int inc);
    dstiv_nodiverg(
        clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
-			1); //int inc);
+        1);           // int inc);
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #ifdef DEBUG222
-    if ((threadIdx.x) == 0){
+    if ((threadIdx.x) == 0) {
-        printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
+        printf("\nDTT Tiles after horizontal pass, color=%d\n", color);
-    	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+        debug_print_clt1(clt_tile, color, (color == BAYER_GREEN) ? 3 : 1);  // only 1 quadrant for R,B and 2 - for G
    }
-     __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    dctiv_nodiverg(  // CC
@@ -1018,47 +988,45 @@ __device__ void dttiv_mono_2d(
    dstiv_nodiverg(                                           // SS
        clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE),  // &clt_tile[1][0][threadIdx.x], // pointer to start of column
        DTT_SIZE1);                                           // int inc,
-  	 __syncthreads();// __syncwarp();
+    __syncthreads();                                          // __syncwarp();
 }
 //
 // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
 // adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
 // first pass
 //__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
 __device__ void imclt(
-		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+    float *clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-		float * mclt_tile ) //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+    float *mclt_tile)  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
 {
    int thr3 = threadIdx.x >> 3;
    int column = threadIdx.x;  // modify to use 2*8 threads, if needed.
    int thr012 = threadIdx.x & 7;
    int column4 = threadIdx.x >> 2;
-//	int wcolumn =column ^ (7 * thr3); //0..7,7,..0
+    //	int wcolumn =column ^ (7 * thr3); //0..7,7,..0
-//	int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
+    //	int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
-	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
+    int wcolumn = ((thr3 << 3) - thr3) ^ thr012;  // 0..7,7,..0
-	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE);
+    float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
+    float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
+    float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
 #ifdef DEBUG3
-    if ((threadIdx.x) == 0){
+    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles before IDTT\n");
        debug_print_clt1(clt_tile, -1, 0xf);  // only 1 quadrant for R,B and 2 - for G
    }
-     __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    // perform horizontal dct-iv on quadrants 0 and 1
    dctiv_nodiverg(
-    		clt_tile +  DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
+        clt_tile + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3),  // pointer to start of row for quadrants 0 and 2
        1);
    // perform horizontal dst-iv on quadrants 2 and 3
    dstiv_nodiverg(                                              // all colors
-    		clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
+        clt_tile1 + DTT_SIZE1 * (thr012 + 2 * DTT_SIZE * thr3),  // pointer to start of row for quadrants 1 and 3
        1);
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
                      // perform vertical   dct-iv on quadrants 0 and 2
    dctiv_nodiverg(
        clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3,  // pointer to start of row for quadrants 0 and 1
@@ -1067,101 +1035,99 @@ __device__ void imclt(
    dstiv_nodiverg(
        clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3,  // pointer to start of row for quadrants 2 and 3
        DTT_SIZE1);
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #ifdef DEBUG3
-    if ((threadIdx.x) == 0){
+    if ((threadIdx.x) == 0) {
        printf("\nDTT Tiles after IDTT\n");
        debug_print_clt1(clt_tile, -1, 0xf);  // only 1 quadrant for R,B and 2 - for G
    }
-     __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    float hw = HWINDOW2[wcolumn];
    int clt_offset = imclt_indx9[column];  // index in each of the 4 iclt quadrants, accounting for stride=9
-    float * rslt = mclt_tile + column;
+    float *rslt = mclt_tile + column;
 #pragma unroll
-    for (int i = 0; i < 4; i++){
+    for (int i = 0; i < 4; i++) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][0][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
+        d0 += d1;
-    	d2+=d3;
+        d2 += d3;
-    	d0+= d2;
+        d0 += d2;
-    	if (i < 3){
+        if (i < 3) {
            clt_offset += DTT_SIZE1;
        }
-//    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
+        //    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    	val = __fmaf_rd(w,d0,val); // w*d0 + val
+        val = __fmaf_rd(w, d0, val);  // w*d0 + val
        *rslt = val;
        rslt += DTT_SIZE21;
    }
 #pragma unroll
-    for (int i = 4; i < 8; i++){
+    for (int i = 4; i < 8; i++) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
+        d0 += d1;
-    	d2+=d3;
+        d2 += d3;
-    	d0+= d2;
+        d0 += d2;
-//    	if (i < 7){
+        //    	if (i < 7){
        clt_offset -= DTT_SIZE1;
-//    	}
+        //    	}
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
+        *rslt = __fmaf_rd(w, d0, val);  // w*d0 + val
        rslt += DTT_SIZE21;
    }
 #pragma unroll
-    for (int i = 7; i >= 4; i--){
+    for (int i = 7; i >= 4; i--) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
+        d0 += d1;
-    	d2+=d3;
+        d2 += d3;
-    	d0+= d2;
+        d0 += d2;
-    	if (i > 4){
+        if (i > 4) {
            clt_offset -= DTT_SIZE1;
        }
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
+        *rslt = __fmaf_rd(w, d0, val);  // w*d0 + val
        rslt += DTT_SIZE21;
    }
 #pragma unroll
-    for (int i = 3; i >= 0; i--){
+    for (int i = 3; i >= 0; i--) {
        float val = *rslt;
        float w = HWINDOW2[i] * hw;
        float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
        float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
        float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
        float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
-    	d0+=d1;
+        d0 += d1;
-    	d2+=d3;
+        d2 += d3;
-    	d0+= d2;
+        d0 += d2;
-    	if (i > 0){
+        if (i > 0) {
            clt_offset += DTT_SIZE1;
        }
-    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
+        *rslt = __fmaf_rd(w, d0, val);  // w*d0 + val
        rslt += DTT_SIZE21;
    }
 #ifdef DEBUG3
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
-    if ((threadIdx.x) == 0){
+    if ((threadIdx.x) == 0) {
        printf("\nMCLT Tiles after IMCLT\n");
        debug_print_mclt(mclt_tile, -1);  // only 1 quadrant for R,B and 2 - for G
    }
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
 }
 // Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
 // adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
 // first pass
@@ -1169,24 +1135,23 @@ __device__ void imclt(
 __device__ void imclt8threads(
    int do_acc,        // 1 - add to previous value, 0 - overwrite
-		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+    float *clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-		float * mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+    float *mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
-		int debug)
+    int debug) {
-{
+    //	int thr3 =    threadIdx.x >> 3;
-//	int thr3 =    threadIdx.x >> 3;
+    //	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
-//	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
+    //	int thr012 =  threadIdx.x & 7;
-//	int thr012 =  threadIdx.x & 7;
+    //	int column4 = threadIdx.x >> 2;
-//	int column4 = threadIdx.x >> 2;
+    //	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
-//	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
+    float *clt_tile1 = clt_tile + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE);
+    float *clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
+    float *clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
-	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
 #ifdef DEBUG7
-    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
+    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
        printf("\nDTT Tiles before IDTT\n");
        debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25);  // only 1 quadrant for R,B and 2 - for G
    }
-     __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    // perform horizontal dct-iv on quadrants 0 and 1
@@ -1203,7 +1168,7 @@ __device__ void imclt8threads(
    dstiv_nodiverg(                                           // quadrant 3
        clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1),  // pointer to start of row for quadrant 3
        DTT_SIZE1);
-    __syncthreads();// __syncwarp();
+    __syncthreads();                         // __syncwarp();
                                             // perform vertical   dct-iv on quadrants 0 and 2
    dctiv_nodiverg(                          // quadrant 0
        clt_tile + DTT_SIZE1 * threadIdx.x,  // pointer to start of row for quadrant 0
@@ -1218,37 +1183,37 @@ __device__ void imclt8threads(
    dstiv_nodiverg(                                                       // quadrant 3
        clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1),  // pointer to start of row for quadrant 3
        1);
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #ifdef DEBUG7
-    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
+    if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
        printf("\nDTT Tiles after IDTT\n");
        debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25);  // only 1 quadrant for R,B and 2 - for G
    }
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
    // re-using 16-thread code (thr3 was bit 3 of threadIdx.x).
-    for (int thr3 = 0; thr3 < 2; thr3++){
+    for (int thr3 = 0; thr3 < 2; thr3++) {
        int thr3m = (thr3 << 3);
        int column = threadIdx.x + thr3m;       // modify to use 2*8 threads, if needed.
        int thr012 = threadIdx.x & 7;           // == threadIdx.x
        int column4 = column >> 2;              // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
-    	int wcolumn = (thr3m - thr3) ^ thr012; //0..7,7,..0
+        int wcolumn = (thr3m - thr3) ^ thr012;  // 0..7,7,..0
        float hw = HWINDOW2[wcolumn];
        int clt_offset = imclt_indx9[column];  // index in each of the 4 iclt quadrants, accounting for stride=9
-    	float * rslt = mclt_tile + column;
+        float *rslt = mclt_tile + column;
 #ifdef DEBUG7
-        if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
+        if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)) {
            printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
                   thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
            debug_print_clt1(clt_tile, -1, 0xf);  // only 1 quadrant for R,B and 2 - for G
        }
-    __syncthreads();// __syncwarp();
+        __syncthreads();  // __syncwarp();
 #endif
 #pragma unroll
-    	for (int i = 0; i < 4; i++){
+        for (int i = 0; i < 4; i++) {
            float val = *rslt;
            // facc
            float w = HWINDOW2[i] * hw;
@@ -1256,91 +1221,87 @@ __device__ void imclt8threads(
            float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
-    		d0+=d1;
+            d0 += d1;
-    		d2+=d3;
+            d2 += d3;
-    		d0+= d2;
+            d0 += d2;
-    		if (i < 3){
+            if (i < 3) {
                clt_offset += DTT_SIZE1;
            }
            //    	*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
            // val =__fmaf_rd(w,d0,val); // w*d0 + val
            // *rslt = val;
-    		*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
+            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0;  // w*d0 + val do_acc - common for all thereads
            rslt += DTT_SIZE21;
        }
 #pragma unroll
-    	for (int i = 4; i < 8; i++){
+        for (int i = 4; i < 8; i++) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][1][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
-    		d0+=d1;
+            d0 += d1;
-    		d2+=d3;
+            d2 += d3;
-    		d0+= d2;
+            d0 += d2;
            //    	if (i < 7){
            clt_offset -= DTT_SIZE1;
            //    	}
-//    		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
+            //    		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    		*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
+            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0;  // w*d0 + val do_acc - common for all thereads
            rslt += DTT_SIZE21;
        }
 #pragma unroll
-    	for (int i = 7; i >= 4; i--){
+        for (int i = 7; i >= 4; i--) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][2][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
-    		d0+=d1;
+            d0 += d1;
-    		d2+=d3;
+            d2 += d3;
-    		d0+= d2;
+            d0 += d2;
-    		if (i > 4){
+            if (i > 4) {
                clt_offset -= DTT_SIZE1;
            }
            //*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    		*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
+            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0;  // w*d0 + val do_acc - common for all thereads
            rslt += DTT_SIZE21;
        }
 #pragma unroll
-    	for (int i = 3; i >= 0; i--){
+        for (int i = 3; i >= 0; i--) {
            float val = *rslt;
            float w = HWINDOW2[i] * hw;
            float d0 = idct_signs[0][3][column4] * (*(clt_tile + clt_offset));
            float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
            float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
            float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
-    		d0+=d1;
+            d0 += d1;
-    		d2+=d3;
+            d2 += d3;
-    		d0+= d2;
+            d0 += d2;
-    		if (i > 0){
+            if (i > 0) {
                clt_offset += DTT_SIZE1;
            }
            //*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
-    		*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val do_acc - common for all thereads
+            *rslt = do_acc ? __fmaf_rd(w, d0, val) : w * d0;  // w*d0 + val do_acc - common for all thereads
            rslt += DTT_SIZE21;
        }
    }
 #ifdef DEBUG7
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
    for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
-		if (debug  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+        if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)) {
            printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
            debug_print_mclt(
                mclt_tile,  //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
                -1);
        }
-		__syncthreads();// __syncwarp();
+        __syncthreads();  // __syncwarp();
    }
-    __syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif
 }
 //#endif
--- a/src/dtt8x8.h
+++ b/src/dtt8x8.h
@@ -64,17 +64,16 @@
 #define DTTTEST_BLOCK_WIDTH 32
 #define DTTTEST_BLOCK_HEIGHT 16
-#define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1)
+#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH + 1)
-//extern __constant__ float idct_signs[4][4][4];
-//extern __constant__ int imclt_indx9[16];
-//extern __constant__ float HWINDOW2[];
+// extern __constant__ float idct_signs[4][4][4];
+// extern __constant__ int imclt_indx9[16];
+// extern __constant__ float HWINDOW2[];
 // kernels (not used so far)
 #if 0
 extern "C" __global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode);
-#endif// #if 0
+#endif  // #if 0
 //=========================== 2D functions ===============
 extern __device__ void corrUnfoldTile(
@@ -83,19 +82,19 @@ extern __device__ void corrUnfoldTile(
    float* rslt);   //   [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
 extern __device__ void dttii_2d(
-		float * clt_corr); // shared memory, [4][DTT_SIZE1][DTT_SIZE]
+    float* clt_corr);  // shared memory, [4][DTT_SIZE1][DTT_SIZE]
 extern __device__ void dttiv_color_2d(
-		float * clt_tile,
+    float* clt_tile,
    int color);
 extern __device__ void dttiv_mono_2d(
-		float * clt_tile);
+    float* clt_tile);
 extern __device__ void imclt(
-		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+    float* clt_tile,  //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-		float * mclt_tile );
+    float* mclt_tile);
 extern __device__ void imclt8threads(
    int do_acc,        // 1 - add to previous value, 0 - overwrite
-		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
+    float* clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
-		float * mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
+    float* mclt_tile,  //           [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
    int debug);
--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -37,16 +37,15 @@
 */
 #ifndef JCUDA
-	#include "tp_defines.h"
+#include "tp_defines.h"
-	#include "dtt8x8.h"
+#include "dtt8x8.h"
-	#include "geometry_correction.h"
+#include "geometry_correction.h"
 #endif  // #ifndef JCUDA
 #ifndef get_task_size
-	#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
 #endif
 // Using NUM_CAMS threads per tile
 #define THREADS_PER_BLOCK_GEOM (TILES_PER_BLOCK_GEOM * NUM_CAMS)
 ///#define CYCLES_COPY_GC   ((sizeof(struct gc)/sizeof(float) + THREADS_PER_BLOCK_GEOM - 1) / THREADS_PER_BLOCK_GEOM)
@@ -57,9 +56,8 @@
 #define DBG_CAM 3
-__device__ void printGeometryCorrection(struct gc * g, int num_cams);
+__device__ void printGeometryCorrection(struct gc *g, int num_cams);
-__device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
+__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams);
 /**
 * Calculate non-distorted radius from distorted using table approximation
@@ -67,114 +65,124 @@ __device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams);
 * @return corresponding non-distorted radius
 */
 inline __device__ float getRByRDist(float rDist,
-		float rByRDist [RBYRDIST_LEN]); //shared memory
+                                    float rByRDist[RBYRDIST_LEN]);  // shared memory
+__constant__ float ROTS_TEMPLATE[7][3][3][3] = {  //  ...{cos,sin,const}...
-__constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
-		{ // azimuth
-				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
-				{{ 0, 0,0},{0, 0,1},{ 0, 0,0}},
-				{{ 0, 1,0},{0, 0,0},{ 1, 0,0}},
-		},{ // tilt
-				{{ 0, 0,1},{0, 0,0},{ 0, 0,0}},
-				{{ 0, 0,0},{1, 0,0},{ 0, 1,0}},
-				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
-		},{ // roll*zoom
-				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
-				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
-				{{ 0, 0,0},{0, 0,0},{ 0, 0,1}},
-		},{ // d_azimuth
-				{{ 0,-1,0},{0, 0,0},{-1, 0,0}},
-				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
-				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
-		},{ // d_tilt
-				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
-				{{ 0, 0,0},{0,-1,0},{ 1, 0,0}},
-				{{ 0, 0,0},{-1,0,0},{ 0,-1,0}},
-		},{ // d_roll
-				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
-				{{-1, 0,0},{0,-1,0},{ 0, 0,0}},
-				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
-		},{ // d_zoom
-				{{ 1, 0,0},{0, 1,0},{ 0, 0,0}},
-				{{ 0,-1,0},{1, 0,0},{ 0, 0,0}},
-				{{ 0, 0,0},{0, 0,0},{ 0, 0,0}},
-		}
-};
-__constant__ int angles_offsets [4] = {
-		offsetof(corr_vector, azimuth)/sizeof(float),
-		offsetof(corr_vector, tilt)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float),
-		offsetof(corr_vector, roll)   /sizeof(float)};
-__constant__ int mm_seq [3][3][3]={
    {
-				{6,5,12}, // a_t * a_z -> tmp0
+        // azimuth
-				{7,6,13}, // a_r * a_t -> tmp1
+        {{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
-				{7,9,14}, // a_r * a_dt -> tmp2
+        {{0, 0, 0}, {0, 0, 1}, {0, 0, 0}},
-		}, {
+        {{0, 1, 0}, {0, 0, 0}, {1, 0, 0}},
-				{7,12,0}, // a_r * tmp0 -> rot          - bad
-				{13,8,1}, // tmp1 * a_daz -> deriv0     - good
+    },
-				{14,5,2}, // tmp2 * a_az  -> deriv1     - good
+    {
-		}, {
+        // tilt
-				{10,12,3}, // a_dr * tmp0 -> deriv2     - good
+        {{0, 0, 1}, {0, 0, 0}, {0, 0, 0}},
-				{11,12,4}, // a_dzoom * tnmp0 -> deriv3 - good
+        {{0, 0, 0}, {1, 0, 0}, {0, 1, 0}},
-				{-1,-1,-1} // do nothing
+        {{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
+    },
+    {
+        // roll*zoom
+        {{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
+        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
+        {{0, 0, 0}, {0, 0, 0}, {0, 0, 1}},
+    },
+    {
+        // d_azimuth
+        {{0, -1, 0}, {0, 0, 0}, {-1, 0, 0}},
+        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
+        {{1, 0, 0}, {0, 0, 0}, {0, -1, 0}},
+    },
+    {
+        // d_tilt
+        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
+        {{0, 0, 0}, {0, -1, 0}, {1, 0, 0}},
+        {{0, 0, 0}, {-1, 0, 0}, {0, -1, 0}},
+    },
+    {
+        // d_roll
+        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
+        {{-1, 0, 0}, {0, -1, 0}, {0, 0, 0}},
+        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
+    },
+    {
+        // d_zoom
+        {{1, 0, 0}, {0, 1, 0}, {0, 0, 0}},
+        {{0, -1, 0}, {1, 0, 0}, {0, 0, 0}},
+        {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}},
    }};
-__constant__ int offset_rots =     0;                   //0
+__constant__ int angles_offsets[4] = {
+    offsetof(corr_vector, azimuth) / sizeof(float),
+    offsetof(corr_vector, tilt) / sizeof(float),
+    offsetof(corr_vector, roll) / sizeof(float),
+    offsetof(corr_vector, roll) / sizeof(float)};
+__constant__ int mm_seq[3][3][3] = {
     // Matrix-multiply schedule consumed by calc_rot_deriv: mm_seq[pass][threadIdx.z] = {left, right, dst}.
     // Each value indexes that kernel's shared `matrices` array and the kernel computes
     //     matrices[dst] = matrices[left] * matrices[right]   (3x3, one element per thread),
     // with a __syncthreads() after every pass, so later passes may read results of earlier ones.
     // A negative `left` marks an idle slot (nothing is computed for that threadIdx.z in that pass).
     // Slot numbering (see offset_rots / offset_derivs / offset_matrices / offset_tmp):
     //     0 = rot, 1..4 = derivatives, 5..11 = az, tilt, roll*zoom, d_az, d_tilt, d_roll, d_zoom,
     //     12 and up = temporaries.
+    {
+        {6, 5, 12},  // a_t * a_z -> tmp0
+        {7, 6, 13},  // a_r * a_t -> tmp1
+        {7, 9, 14},  // a_r * a_dt -> tmp2
+    },
+    {
+        {7, 12, 0},  // a_r * tmp0 -> rot          - bad
+        {13, 8, 1},  // tmp1 * a_daz -> deriv0     - good
+        {14, 5, 2},  // tmp2 * a_az  -> deriv1     - good
+    },
+    {
+        {10, 12, 3},  // a_dr * tmp0 -> deriv2     - good
+        {11, 12, 4},  // a_dzoom * tnmp0 -> deriv3 - good
+        {-1, -1, -1}  // do nothing
+    }};
+__constant__ int offset_rots = 0;      // 0
 __constant__ int offset_derivs = 1;    // 1..4 // should be next
 __constant__ int offset_matrices = 5;  // 5..11
 __constant__ int offset_tmp = 12;      // 12..15
-//inline __device__ int get_task_size_gc(int num_cams);
+// inline __device__ int get_task_size_gc(int num_cams);
-inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams);
+inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams);
-inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams);
+inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams);
-//inline __device__ int get_task_size_gc(int num_cams){
+// inline __device__ int get_task_size_gc(int num_cams){
 //	return sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - num_cams);
-//}
+// }
-inline __device__ int get_task_task_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+inline __device__ int get_task_task_gc(int num_tile, float *gpu_ftasks, int num_cams) {
     // Returns word 0 of task record `num_tile` from the flattened task array, read as an int:
     // the 32 bits are reinterpreted through an (int *) cast, not converted from a float value.
     // Consecutive records are get_task_size(num_cams) floats apart.
     // NOTE(review): presumably word 0 is the task/flags field of struct tp_task - confirm in tp_defines.h.
-	return *(int *) (gpu_ftasks +  get_task_size(num_cams) * num_tile);
+    return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile);
 }
-inline __device__ int get_task_txy_gc(int num_tile, float * gpu_ftasks, int num_cams) {
+inline __device__ int get_task_txy_gc(int num_tile, float *gpu_ftasks, int num_cams) {
     // Returns word 1 (the float slot right after the one read by get_task_task_gc) of task record
     // `num_tile`, reinterpreted as an int through an (int *) cast rather than converted from float.
     // Consecutive records are get_task_size(num_cams) floats apart.
     // NOTE(review): presumably this word packs the tile x/y indices - confirm in tp_defines.h.
-	return *(int *) (gpu_ftasks +  get_task_size(num_cams) * num_tile + 1);
+    return *(int *)(gpu_ftasks + get_task_size(num_cams) * num_tile + 1);
 }
 /**
 * Calculate rotation matrices and derivatives by az, tilt, roll, zoom
 * NUM_CAMS blocks of 3,3,3 tiles
 */
 extern "C" __global__ void calc_rot_deriv(
    int num_cams,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		trot_deriv   * gpu_rot_deriv)
+    trot_deriv *gpu_rot_deriv) {
-{
+    __shared__ float sincos[4][2];  // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
-	__shared__ float sincos  [4][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
+    __shared__ float matrices[5 + 7 + 4][3][3];
-	__shared__ float matrices[5 + 7 +4][3][3];
    float angle;
    float zoom;
    int ncam = blockIdx.x;                                 // threadIdx.z;
    int nangle1 = threadIdx.x + threadIdx.y * blockDim.x;  // * >> 1;
    int nangle = nangle1 >> 1;                             // 0: az, 1: tilt, 2: roll, 3:roll
    int is_sin = nangle1 & 1;
-	if ((threadIdx.z == 0) && (nangle < 4)){ // others just idle here
+    if ((threadIdx.z == 0) && (nangle < 4)) {                                      // others just idle here
-		float * gangles = (float *) gpu_correction_vector + angles_offsets[nangle]; // pointer for channel 0
+        float *gangles = (float *)gpu_correction_vector + angles_offsets[nangle];  // pointer for channel 0
-///		if (ncam == (NUM_CAMS-1)){ // for the whole block
+                                                                                   ///		if (ncam == (NUM_CAMS-1)){ // for the whole block
-		if (ncam == (num_cams-1)){ // for the whole block
+        if (ncam == (num_cams - 1)) {                                              // for the whole block
            angle = 0.0;
            zoom = 0.0;
-///			for (int n = 0; n < (NUM_CAMS-1); n++){
+            ///			for (int n = 0; n < (NUM_CAMS-1); n++){
-			for (int n = 0; n < (num_cams-1); n++){
+            for (int n = 0; n < (num_cams - 1); n++) {
                angle -= *(gangles + n);
                zoom -= gpu_correction_vector->zoom[n];
            }
-			if (nangle >= 2){ // diverging for roll (last two)
+            if (nangle >= 2) {  // diverging for roll (last two)
                angle = *(gangles + ncam);
            }
@@ -182,72 +190,68 @@ extern "C" __global__ void calc_rot_deriv(
            angle = *(gangles + ncam);
            zoom = gpu_correction_vector->zoom[ncam];
        }
-		if (!is_sin){
+        if (!is_sin) {
-			angle += M_PI/2;
+            angle += M_PI / 2;
        }
        float sc = sinf(angle);
-		if (nangle ==2) {
+        if (nangle == 2) {
            sc *= 1.0 + zoom;
        }
-		sincos[nangle][is_sin]= sc;
+        sincos[nangle][is_sin] = sc;
    }
    __syncthreads();
 #ifdef DEBUG20
-	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        printf("\n    Azimuth matrix for   camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", ncam, sincos[0][0], sincos[0][1], zoom);
-		printf(  "    Tilt matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[1][0], sincos[1][1]);
+        printf("    Tilt matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[1][0], sincos[1][1]);
-		printf(  "    Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[2][0], sincos[2][1]);
+        printf("    Roll*Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[2][0], sincos[2][1]);
-		printf(  "    Roll matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n",      ncam, sincos[3][0], sincos[3][1]);
+        printf("    Roll matrix for      camera %d, sincos[0] = %f, sincos[1] = %f\n", ncam, sincos[3][0], sincos[3][1]);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG20
+    // Create 3 3x3 matrices for az, tilt, roll/zoom:
+    int axis = offset_matrices + threadIdx.z;  // 0..2
-// Create 3 3x3 matrices for az, tilt, roll/zoom:
-	int axis = offset_matrices+threadIdx.z; // 0..2
    int const_index = threadIdx.z;             // 0..2
    matrices[axis][threadIdx.y][threadIdx.x] =
-			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] +  // cos
-			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] +  // sin
        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                            // const
    axis += 3;                                                                              // skip index == 3
-	const_index +=3;
+    const_index += 3;
    matrices[axis][threadIdx.y][threadIdx.x] =
-			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0]+ // cos
+        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[threadIdx.z][0] +  // cos
-			ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1]+ // sin
+        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[threadIdx.z][1] +  // sin
        ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                            // const
-	if (threadIdx.z == 0){
+    if (threadIdx.z == 0) {
        axis += 3;
-		const_index +=3;
+        const_index += 3;
        matrices[axis][threadIdx.y][threadIdx.x] =
-				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0]+ // cos
+            ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][0] * sincos[3][0] +  // cos
-				ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1]+ // sin
+            ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][1] * sincos[3][1] +  // sin
            ROTS_TEMPLATE[const_index][threadIdx.y][threadIdx.x][2];                  // const
    }
    __syncthreads();
 #ifdef DEBUG20
-	const char* matrices_names[] = {"az","tilt","roll*zoom","d_daz","d_tilt","d_roll","d_zoom"};
+    const char *matrices_names[] = {"az", "tilt", "roll*zoom", "d_daz", "d_tilt", "d_roll", "d_zoom"};
-	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        for (int i = 0; i < 7; i++) {
            printf("\n----Matrix %s for camera %d:\n", matrices_names[i], ncam);
-			for (int row = 0; row < 3; row++){
+            for (int row = 0; row < 3; row++) {
-				for (int col = 0; col < 3; col++){
+                for (int col = 0; col < 3; col++) {
-					printf("%9.6f, ",matrices[offset_matrices + i][row][col]);
+                    printf("%9.6f, ", matrices[offset_matrices + i][row][col]);
                }
                printf("\n");
            }
        }
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG20
-/*
+    /*
            __constant__ int mm_seq [3][3][3]={
                            {
                                            {6,5,12}, // a_t * a_z -> tmp0
@@ -261,20 +265,20 @@ extern "C" __global__ void calc_rot_deriv(
                                            {10,12,3}, // a_dr * tmp0 -> deriv2
                                            {11,12,4}, // a_dzoom * tnmp0 -> deriv3
                            }};
-*/
+    */
-	for (int i = 0; i < 3; i++){
+    for (int i = 0; i < 3; i++) {
        int srcl = mm_seq[i][threadIdx.z][0];
        int srcr = mm_seq[i][threadIdx.z][1];
        int dst = mm_seq[i][threadIdx.z][2];
-		if (srcl >= 0){
+        if (srcl >= 0) {
            matrices[dst][threadIdx.y][threadIdx.x] =
-					matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x]+
+                matrices[srcl][threadIdx.y][0] * matrices[srcr][0][threadIdx.x] +
-					matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x]+
+                matrices[srcl][threadIdx.y][1] * matrices[srcr][1][threadIdx.x] +
                matrices[srcl][threadIdx.y][2] * matrices[srcr][2][threadIdx.x];
        }
        __syncthreads();
    }
-// copy results to global memory
+    // copy results to global memory
    int gindx = threadIdx.z;
    int lindx = offset_rots + threadIdx.z;
 #ifdef NVRTC_BUG
@@ -283,8 +287,8 @@ extern "C" __global__ void calc_rot_deriv(
 #else
    gpu_rot_deriv->matrices[gindx][ncam][threadIdx.y][threadIdx.x] = matrices[lindx][threadIdx.y][threadIdx.x];
 #endif
-	gindx +=3;
+    gindx += 3;
-	lindx+=3;
+    lindx += 3;
    if (lindx < 5) {
 #ifdef NVRTC_BUG
        // going beyond first dimension
@@ -295,54 +299,49 @@ extern "C" __global__ void calc_rot_deriv(
    }
    __syncthreads();
 #ifdef DEBUG21
-	if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
+    if ((ncam == DBG_CAM) && (threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)) {
        printf("\n----All Done with calc_rot_deriv() for ncam=%d\n", ncam);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG20
-// All done - read/verify all arrays
+    // All done - read/verify all arrays
 }
 // Launcher kernel: derives the launch geometry for get_tiles_offsets from num_cams / num_tiles and starts
 // it as a child grid from device code (a <<<...>>> launch inside a __global__ function), forwarding every
 // argument unchanged.
 //   child block = (num_cams, NUM_THREADS / num_cams, 1)
 //   child grid  = (ceil(num_tiles / tiles_per_block_geom), 1, 1)
 // Only threads with threadIdx.x == 0 issue the launch. The kernel does not wait for the child grid: the
 // synchronization calls at the end of the body are commented out.
 // NOTE(review): presumably launched with a single block of a single thread (see the "always 1" remark
 // below); with more blocks, thread 0 of each block would launch its own duplicate child grid - confirm
 // against the host-side launch code.
 extern "C" __global__ void calculate_tiles_offsets(
    int uniform_grid,  //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
-		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+    float *gpu_ftasks,  // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task     * gpu_tasks,
+                        //		struct tp_task     * gpu_tasks,
    int num_tiles,      // number of tiles in task
-		struct gc          * gpu_geometry_correction,
+    struct gc *gpu_geometry_correction,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+    float *gpu_rByRDist,  // length should match RBYRDIST_LEN
-		trot_deriv   * gpu_rot_deriv)
+    trot_deriv *gpu_rot_deriv) {
-{
+    ///	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
-///	dim3 threads_geom(NUM_CAMS,TILES_PER_BLOCK_GEOM, 1);
+    ///	dim3 grid_geom   ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
-///	dim3 grid_geom   ((num_tiles+TILES_PER_BLOCK_GEOM-1)/TILES_PER_BLOCK_GEOM, 1, 1);
    // NOTE(review): no guard on num_cams. num_cams == 0 divides by zero here, and num_cams > NUM_THREADS
    // makes tiles_per_block_geom 0, which then divides by zero in the grid_geom computation below.
+    int tiles_per_block_geom = NUM_THREADS / num_cams;
-	int tiles_per_block_geom = NUM_THREADS/ num_cams;
+    dim3 threads_geom(num_cams, tiles_per_block_geom, 1);
-	dim3 threads_geom(num_cams,tiles_per_block_geom, 1);
+    dim3 grid_geom((num_tiles + tiles_per_block_geom - 1) / tiles_per_block_geom, 1, 1);
-	dim3 grid_geom   ((num_tiles + tiles_per_block_geom - 1)/tiles_per_block_geom, 1, 1);
+    //#define NUM_THREADS                   32
-//#define NUM_THREADS                   32
    if (threadIdx.x == 0) {  // always 1
-    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
+        get_tiles_offsets<<<grid_geom, threads_geom>>>(
            uniform_grid,             // int                  uniform_grid, //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
            num_cams,                 // int                  num_cams,
            gpu_ftasks,               // float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//    			gpu_tasks,               // struct tp_task     * gpu_tasks,
+                                      //    			gpu_tasks,               // struct tp_task     * gpu_tasks,
            num_tiles,                // int                  num_tiles,          // number of tiles in task list
            gpu_geometry_correction,  //	struct gc          * gpu_geometry_correction,
            gpu_correction_vector,    //	struct corr_vector * gpu_correction_vector,
            gpu_rByRDist,             //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
            gpu_rot_deriv);           // union trot_deriv   * gpu_rot_deriv);
    }
-//	__syncthreads();// __syncwarp();
+    //	__syncthreads();// __syncwarp();
-//	cudaDeviceSynchronize();
+    //	cudaDeviceSynchronize();
-//	cudaDeviceSynchronize();
+    //	cudaDeviceSynchronize();
 }
 /*
 * blockDim.x = NUM_CAMS
 * blockDim.y = TILES_PER_BLOCK_GEOM
@@ -351,46 +350,45 @@ extern "C" __global__ void calculate_tiles_offsets(
 extern "C" __global__ void get_tiles_offsets(
    int uniform_grid,  //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
-//		struct tp_task     * gpu_tasks,
+    //		struct tp_task     * gpu_tasks,
-		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+    float *gpu_ftasks,  // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
    int num_tiles,      // number of tiles in task
-		struct gc          * gpu_geometry_correction,
+    struct gc *gpu_geometry_correction,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		float *              gpu_rByRDist,      // length should match RBYRDIST_LEN
+    float *gpu_rByRDist,  // length should match RBYRDIST_LEN
-		trot_deriv *         gpu_rot_deriv)
+    trot_deriv *gpu_rot_deriv) {
-{
    int task_size = get_task_size(num_cams);
    int task_num = blockIdx.x * blockDim.y + threadIdx.y;  //  blockIdx.x * TILES_PER_BLOCK_GEOM + threadIdx.y
    int thread_xy = blockDim.x * threadIdx.y + threadIdx.x;
    int dim_xy = blockDim.x * blockDim.y;  // number of parallel threads (<=32)
    __shared__ struct gc geometry_correction;
-	__shared__ float rByRDist [RBYRDIST_LEN];
+    __shared__ float rByRDist[RBYRDIST_LEN];
    __shared__ struct corr_vector extrinsic_corr;
    __shared__ trot_deriv rot_deriv;
-///	__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
+    ///	__shared__ float pY_offsets[TILES_PER_BLOCK_GEOM][NUM_CAMS];
    __shared__ float pY_offsets[NUM_THREADS][NUM_CAMS];  // maximal dimensions, actual will be smaller
    float pXY[2];                                        // result to be copied to task
-	//blockDim.y
+    // blockDim.y
    //  copy data common to all threads
    {
-		int cycles_copy_gc = ((sizeof(struct gc)/sizeof(float) + dim_xy - 1) / dim_xy);
+        int cycles_copy_gc = ((sizeof(struct gc) / sizeof(float) + dim_xy - 1) / dim_xy);
-		float * gcp_local =  (float *) &geometry_correction;
+        float *gcp_local = (float *)&geometry_correction;
-		float * gcp_global = (float *) gpu_geometry_correction;
+        float *gcp_global = (float *)gpu_geometry_correction;
        int offset = thread_xy;
-		for (int i = 0; i < cycles_copy_gc; i++){
+        for (int i = 0; i < cycles_copy_gc; i++) {
-			if (offset < sizeof(struct gc)/sizeof(float)) {
+            if (offset < sizeof(struct gc) / sizeof(float)) {
                *(gcp_local + offset) = *(gcp_global + offset);
            }
            offset += dim_xy;
        }
    }
    {
-		int cycles_copy_cv = ((sizeof(struct corr_vector)/sizeof(float) + dim_xy - 1) / dim_xy);
+        int cycles_copy_cv = ((sizeof(struct corr_vector) / sizeof(float) + dim_xy - 1) / dim_xy);
-		float * cvp_local =  (float *) &extrinsic_corr;
+        float *cvp_local = (float *)&extrinsic_corr;
-		float * cvp_global = (float *) gpu_correction_vector;
+        float *cvp_global = (float *)gpu_correction_vector;
        int offset = thread_xy;
-		for (int i = 0; i < cycles_copy_cv; i++){
+        for (int i = 0; i < cycles_copy_cv; i++) {
-			if (offset < sizeof(struct corr_vector)/sizeof(float)) {
+            if (offset < sizeof(struct corr_vector) / sizeof(float)) {
                *(cvp_local + offset) = *(cvp_global + offset);
            }
            offset += dim_xy;
@@ -399,10 +397,10 @@ extern "C" __global__ void get_tiles_offsets(
    // TODO: maybe it is better to use system memory and not read the whole table?
    {
        int cycles_copy_rbrd = (RBYRDIST_LEN + dim_xy - 1) / dim_xy;
-		float * rByRDistp_local =  (float *) rByRDist;
+        float *rByRDistp_local = (float *)rByRDist;
-		float * rByRDistp_global = (float *) gpu_rByRDist;
+        float *rByRDistp_global = (float *)gpu_rByRDist;
        int offset = thread_xy;
-		for (int i = 0; i < cycles_copy_rbrd; i++){
+        for (int i = 0; i < cycles_copy_rbrd; i++) {
            if (offset < RBYRDIST_LEN) {
                *(rByRDistp_local + offset) = *(rByRDistp_global + offset);
            }
@@ -411,12 +409,12 @@ extern "C" __global__ void get_tiles_offsets(
    }
    // copy rotational  matrices (with their derivatives by azimuth, tilt, roll and zoom - for ERS correction)
    {
-		int cycles_copy_rot = ((sizeof(trot_deriv)/sizeof(float)) + dim_xy - 1) / dim_xy;
+        int cycles_copy_rot = ((sizeof(trot_deriv) / sizeof(float)) + dim_xy - 1) / dim_xy;
-		float * rots_local =  (float *) &rot_deriv;
+        float *rots_local = (float *)&rot_deriv;
-		float * rots_global = (float *) gpu_rot_deriv; // rot_matrices;
+        float *rots_global = (float *)gpu_rot_deriv;  // rot_matrices;
        int offset = thread_xy;
-		for (int i = 0; i < cycles_copy_rot; i++){
+        for (int i = 0; i < cycles_copy_rot; i++) {
-			if (offset < sizeof(trot_deriv)/sizeof(float)) {
+            if (offset < sizeof(trot_deriv) / sizeof(float)) {
                *(rots_local + offset) = *(rots_global + offset);
            }
            offset += dim_xy;
@@ -424,7 +422,7 @@ extern "C" __global__ void get_tiles_offsets(
    }
    __syncthreads();
    int ncam = threadIdx.x;
-	if (task_num >= num_tiles){
+    if (task_num >= num_tiles) {
        return;
    }
    int imu_exists =  // todo - calculate once with rot_deriv?
@@ -436,17 +434,15 @@ extern "C" __global__ void get_tiles_offsets(
        (extrinsic_corr.imu_move[2] != 0.0);
 #ifdef DEBUG21
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
        printf("\nTile = %d, camera= %d\n", task_num, ncam);
-		printf("\nget_tiles_offsets() threadIdx.x = %d,  threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int) blockIdx.x);
+        printf("\nget_tiles_offsets() threadIdx.x = %d,  threadIdx.y = %d,blockIdx.x= %d\n", (int)threadIdx.x, (int)threadIdx.y, (int)blockIdx.x);
        printGeometryCorrection(&geometry_correction, num_cams);
-		printExtrinsicCorrection(&extrinsic_corr,num_cams);
+        printExtrinsicCorrection(&extrinsic_corr, num_cams);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG21
    //		String dbg_s = corr_vector.toString();
    /* Starting with required tile center X, Y and nominal distortion, for each sensor port:
     * 1) unapply common distortion (maybe for different - master camera)
@@ -458,28 +454,28 @@ extern "C" __global__ void get_tiles_offsets(
     */
    // common code, calculated in parallel
-///	int cxy = gpu_tasks[task_num].txy;
+    ///	int cxy = gpu_tasks[task_num].txy;
-///	float disparity = gpu_tasks[task_num].target_disparity;
+    ///	float disparity = gpu_tasks[task_num].target_disparity;
-	float disparity = * (gpu_ftasks +  task_size * task_num + 2);
+    float disparity = *(gpu_ftasks + task_size * task_num + 2);
    float *centerXY = gpu_ftasks + task_size * task_num + tp_task_centerXY_offset;
    float px = *(centerXY);
    float py = *(centerXY + 1);
-	int cxy =  *(int *) (gpu_ftasks +  task_size * task_num + 1);
+    int cxy = *(int *)(gpu_ftasks + task_size * task_num + 1);
    int tileX = (cxy & 0xffff);
    int tileY = (cxy >> 16);
-//	if (isnan(px)) {
+    //	if (isnan(px)) {
-//	if (__float_as_int(px) == 0x7fffffff) {
+    //	if (__float_as_int(px) == 0x7fffffff) {
    if (uniform_grid) {
 #ifdef DEBUG23
-		if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
+        if ((ncam == 0) && (tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)) {
-			printf ("\n  get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX,tileY,ncam);
+            printf("\n  get_tiles_offsets(): Debugging tileX=%d, tileY=%d, ncam = %d\n", tileX, tileY, ncam);
            printf("\n");
            __syncthreads();
        }
 #endif                                         //#ifdef DEBUG23
-		px = tileX * DTT_SIZE + DTT_SIZE/2; //  - shiftX;
+        px = tileX * DTT_SIZE + DTT_SIZE / 2;  //  - shiftX;
-		py = tileY * DTT_SIZE + DTT_SIZE/2; //  - shiftY;
+        py = tileY * DTT_SIZE + DTT_SIZE / 2;  //  - shiftY;
        *(centerXY) = px;
        *(centerXY + 1) = py;
    }
@@ -488,28 +484,27 @@ extern "C" __global__ void get_tiles_offsets(
    float pXcd = px - 0.5 * geometry_correction.pixelCorrectionWidth;
    float pYcd = py - 0.5 * geometry_correction.pixelCorrectionHeight;
-	float rXY [2];
+    float rXY[2];
    rXY[0] = geometry_correction.rXY[ncam][0];
    rXY[1] = geometry_correction.rXY[ncam][1];
-	float rD = sqrtf(pXcd*pXcd + pYcd*pYcd)*0.001*geometry_correction.pixelSize; // distorted radius in a virtual center camera
+    float rD = sqrtf(pXcd * pXcd + pYcd * pYcd) * 0.001 * geometry_correction.pixelSize;  // distorted radius in a virtual center camera
-	float rND2R=getRByRDist(rD/geometry_correction.distortionRadius, rByRDist);
+    float rND2R = getRByRDist(rD / geometry_correction.distortionRadius, rByRDist);
    float pXc = pXcd * rND2R;  // non-distorted coordinates relative to the (0.5 * this.pixelCorrectionWidth, 0.5 * this.pixelCorrectionHeight)
    float pYc = pYcd * rND2R;  // in pixels
-	float xyz [3]; // getWorldCoordinates
+    float xyz[3];              // getWorldCoordinates
    xyz[2] = -SCENE_UNITS_SCALE * geometry_correction.focalLength * geometry_correction.disparityRadius /
             (disparity * 0.001 * geometry_correction.pixelSize);  // "+" - near, "-" far
    xyz[0] = SCENE_UNITS_SCALE * pXc * geometry_correction.disparityRadius / disparity;
    xyz[1] = -SCENE_UNITS_SCALE * pYc * geometry_correction.disparityRadius / disparity;
    // next radial distortion coefficients are for this, not master camera (may be the same)
    //	geometry_correction.rad_coeff[i];
-	float fl_pix = geometry_correction.focalLength/(0.001 * geometry_correction.pixelSize); // focal length in pixels - this camera
+    float fl_pix = geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize);  // focal length in pixels - this camera
    float ri_scale = 0.001 * geometry_correction.pixelSize / geometry_correction.distortionRadius;
 #ifdef DEBUG21
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
        printf("\nuniform_grid=%d\n", uniform_grid);
        printf("Tile = %d, camera= %d\n", task_num, ncam);
        printf("TargetDisparity = %f\n", disparity);
@@ -521,12 +516,11 @@ extern "C" __global__ void get_tiles_offsets(
        printf("rD = %f,  rND2R = %f\n", rD, rND2R);
        printf("pXc = %f,  pYc = %f\n", pXc, pYc);
        printf("fl_pix = %f,  ri_scale = %f\n", fl_pix, ri_scale);
-		printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0],xyz[1],xyz[2]);
+        printf("xyz[0] = %f, xyz[1] = %f, xyz[2] = %f\n", xyz[0], xyz[1], xyz[2]);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG21
    // above is common code, below - per camera (was cycle in Java, here individual threads //for (int ncam = 0; ncam < NUM_CAMS; ncam++){
    // non-distorted XY of the shifted location of the individual sensor
@@ -542,30 +536,30 @@ extern "C" __global__ void get_tiles_offsets(
    float rvi[3];
 #pragma unroll
-	for (int j = 0; j< 3; j++){
+    for (int j = 0; j < 3; j++) {
        rvi[j] = rot_deriv.rots[ncam][j][0] * pXci0 + rot_deriv.rots[ncam][j][1] * pYci0 + rot_deriv.rots[ncam][j][2] * fl_pix;
    }
    // get back to the projection plane by normalizing vector
-	float norm_z = fl_pix/rvi[2];
+    float norm_z = fl_pix / rvi[2];
    float pXci = rvi[0] * norm_z;
    float pYci = rvi[1] * norm_z;
    // Re-apply distortion
-	float rNDi =  sqrtf(pXci*pXci + pYci*pYci); // in pixels
+    float rNDi = sqrtf(pXci * pXci + pYci * pYci);  // in pixels
-	float ri =    rNDi* ri_scale; // relative to distortion radius
+    float ri = rNDi * ri_scale;                     // relative to distortion radius
    float rD2rND = 1.0;
    {
        float rri = 1.0;
 #ifdef NVRTC_BUG
 #pragma unroll
-		for (int j = 0; j < RAD_COEFF_LEN; j++){
+        for (int j = 0; j < RAD_COEFF_LEN; j++) {
            rri *= ri;
-			rD2rND +=  ((float *) &geometry_correction.distortionC)[j]*(rri - 1.0);
+            rD2rND += ((float *)&geometry_correction.distortionC)[j] * (rri - 1.0);
        }
 #else
-		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+        for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
            rri *= ri;
-			rD2rND += geometry_correction.rad_coeff[j]*(rri - 1.0);
+            rD2rND += geometry_correction.rad_coeff[j] * (rri - 1.0);
        }
 #endif
    }
@@ -579,14 +573,14 @@ extern "C" __global__ void get_tiles_offsets(
    __syncthreads();
    // Each thread re-calculate same sum
    float lines_avg = 0;
-	for (int i = 0; i < num_cams; i ++){
+    for (int i = 0; i < num_cams; i++) {
        lines_avg += pY_offsets[threadIdx.y][i];
    }
-	lines_avg *= (1.0/num_cams);
+    lines_avg *= (1.0 / num_cams);
    // used when calculating derivatives, TODO: combine calculations !
    float pY_offset = pY_offsets[threadIdx.y][ncam] - lines_avg;
 #ifdef DEBUG21
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
        printf("pXci0 = %f,  pYci0 = %f\n", pXci0, pYci0);
        printf("rvi[0] = %f,  rvi[1] = %f,  rvi[2] = %f\n", rvi[0], rvi[1], rvi[2]);
        printf("norm_z = %f,  pXci = %f,  pYci = %f\n", norm_z, pXci, pYci);
@@ -596,15 +590,15 @@ extern "C" __global__ void get_tiles_offsets(
        printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]);              // OK
        printf("lines_avg = %f,  pY_offset = %f\n", lines_avg, pY_offset);  // *
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG21
-	float drvi_daz [3]; // drvi_daz = deriv_rots[i][0].times(vi);
+    float drvi_daz[3];  // drvi_daz = deriv_rots[i][0].times(vi);
-	float drvi_dtl [3]; // drvi_dtl = deriv_rots[i][1].times(vi);
+    float drvi_dtl[3];  // drvi_dtl = deriv_rots[i][1].times(vi);
-	float drvi_drl [3]; // drvi_drl = deriv_rots[i][2].times(vi);
+    float drvi_drl[3];  // drvi_drl = deriv_rots[i][2].times(vi);
 #pragma unroll
-	for (int j = 0; j< 3; j++){
+    for (int j = 0; j < 3; j++) {
        drvi_daz[j] = rot_deriv.d_daz[ncam][j][0] * pXci0 + rot_deriv.d_daz[ncam][j][1] * pYci0 + rot_deriv.d_daz[ncam][j][2] * fl_pix;
        drvi_dtl[j] = rot_deriv.d_tilt[ncam][j][0] * pXci0 + rot_deriv.d_tilt[ncam][j][1] * pYci0 + rot_deriv.d_tilt[ncam][j][2] * fl_pix;
        drvi_drl[j] = rot_deriv.d_roll[ncam][j][0] * pXci0 + rot_deriv.d_roll[ncam][j][1] * pYci0 + rot_deriv.d_roll[ncam][j][2] * fl_pix;
@@ -618,7 +612,7 @@ extern "C" __global__ void get_tiles_offsets(
    float dpYci_droll = drvi_drl[1] * norm_z - pYci * drvi_drl[2] / rvi[2];
 #ifdef DEBUG210
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
        printf("drvi_daz[0] = %f,  drvi_daz[1] = %f,  drvi_daz[2] = %f\n", drvi_daz[0], drvi_daz[1], drvi_daz[2]);
        printf("drvi_dtl[0] = %f,  drvi_dtl[1] = %f,  drvi_dtl[2] = %f\n", drvi_dtl[0], drvi_dtl[1], drvi_dtl[2]);
        printf("drvi_drl[0] = %f,  drvi_drl[1] = %f,  drvi_drl[2] = %f\n", drvi_drl[0], drvi_drl[1], drvi_drl[2]);
@@ -627,7 +621,7 @@ extern "C" __global__ void get_tiles_offsets(
        printf("dpXci_dtilt = %f,     dpYci_dtilt = %f\n", dpXci_dtilt, dpYci_dtilt);
        printf("dpXci_droll = %f,     dpYci_droll = %f\n", dpXci_droll, dpYci_droll);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG210
    float disp_dist[4];  // only for this channel, to be copied to global gpu_tasks in the end
@@ -641,47 +635,47 @@ extern "C" __global__ void get_tiles_offsets(
                            Matrix dd1 = rots[i].times(dd0).getMatrix(0, 1,0,1).times(norm_z); // get top left 2x2 sub-matrix
     */
-	float dd1[2][2];// get top left 2x2 sub-matrix
+    float dd1[2][2];  // get top left 2x2 sub-matrix
-	dd1[0][0] = (-rot_deriv.rots[ncam][0][0]*rXY[0] -rot_deriv.rots[ncam][0][1]*rXY[1])*norm_z;
+    dd1[0][0] = (-rot_deriv.rots[ncam][0][0] * rXY[0] - rot_deriv.rots[ncam][0][1] * rXY[1]) * norm_z;
-	dd1[0][1] = ( rot_deriv.rots[ncam][0][0]*rXY[1] -rot_deriv.rots[ncam][0][1]*rXY[0])*norm_z;
+    dd1[0][1] = (rot_deriv.rots[ncam][0][0] * rXY[1] - rot_deriv.rots[ncam][0][1] * rXY[0]) * norm_z;
-	dd1[1][0] = (-rot_deriv.rots[ncam][1][0]*rXY[0] -rot_deriv.rots[ncam][1][1]*rXY[1])*norm_z;
+    dd1[1][0] = (-rot_deriv.rots[ncam][1][0] * rXY[0] - rot_deriv.rots[ncam][1][1] * rXY[1]) * norm_z;
-	dd1[1][1] = ( rot_deriv.rots[ncam][1][0]*rXY[1] -rot_deriv.rots[ncam][1][1]*rXY[0])*norm_z;
+    dd1[1][1] = (rot_deriv.rots[ncam][1][0] * rXY[1] - rot_deriv.rots[ncam][1][1] * rXY[0]) * norm_z;
 #ifdef DEBUG210
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
-		printf("dd1[0][0] = %f,  dd1[0][1] = %f\n",dd1[0][0],dd1[0][1]);
+        printf("dd1[0][0] = %f,  dd1[0][1] = %f\n", dd1[0][0], dd1[0][1]);
-		printf("dd1[1][0] = %f,  dd1[1][1] = %f\n",dd1[1][0],dd1[1][1]);
+        printf("dd1[1][0] = %f,  dd1[1][1] = %f\n", dd1[1][0], dd1[1][1]);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();  // __syncwarp();
 #endif                // DEBUG210
    // now first column of 2x2 dd1 - x, y components of derivatives by disparity, second column - derivatives by ortho to disparity (~Y in 2d correlation)
    // unity vector in the direction of radius
-	float c_dist = pXci/rNDi;
+    float c_dist = pXci / rNDi;
-	float s_dist = pYci/rNDi;
+    float s_dist = pYci / rNDi;
    //#undef NVRTC_BUG
    float drD2rND_dri = 0.0;
    {
        float rri = 1.0;
 #ifdef NVRTC_BUG
 #pragma unroll
-		for (int j = 0; j < RAD_COEFF_LEN; j++){
+        for (int j = 0; j < RAD_COEFF_LEN; j++) {
-			drD2rND_dri += ((float *) &geometry_correction.distortionC)[j] * (j+1) * rri;
+            drD2rND_dri += ((float *)&geometry_correction.distortionC)[j] * (j + 1) * rri;
            rri *= ri;
        }
 #else
 #pragma unroll
-		for (int j = 0; j < sizeof(geometry_correction.rad_coeff)/sizeof(float); j++){
+        for (int j = 0; j < sizeof(geometry_correction.rad_coeff) / sizeof(float); j++) {
-			drD2rND_dri += geometry_correction.rad_coeff[j] * (j+1) * rri;
+            drD2rND_dri += geometry_correction.rad_coeff[j] * (j + 1) * rri;
            rri *= ri;
        }
 #endif
    }
-	float scale_distort00 = rD2rND + ri* drD2rND_dri;
+    float scale_distort00 = rD2rND + ri * drD2rND_dri;
    float scale_distort11 = rD2rND;
    float scale_distortXrot2Xdd1[2][2];
-	scale_distortXrot2Xdd1[0][0] = ( c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
+    scale_distortXrot2Xdd1[0][0] = (c_dist * dd1[0][0] + s_dist * dd1[1][0]) * scale_distort00;
-	scale_distortXrot2Xdd1[0][1] = ( c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
+    scale_distortXrot2Xdd1[0][1] = (c_dist * dd1[0][1] + s_dist * dd1[1][1]) * scale_distort00;
    scale_distortXrot2Xdd1[1][0] = (-s_dist * dd1[0][0] + c_dist * dd1[1][0]) * scale_distort11;
    scale_distortXrot2Xdd1[1][1] = (-s_dist * dd1[0][1] + c_dist * dd1[1][1]) * scale_distort11;
@@ -691,21 +685,21 @@ extern "C" __global__ void get_tiles_offsets(
    disp_dist[3] = s_dist * scale_distortXrot2Xdd1[0][1] + c_dist * scale_distortXrot2Xdd1[1][1];
 #ifdef DEBUG210
-	if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+    if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
-		printf("scale_distortXrot2Xdd1[0][0] = %f,  scale_distortXrot2Xdd1[0][1] = %f\n",scale_distortXrot2Xdd1[0][0],scale_distortXrot2Xdd1[0][1]);
+        printf("scale_distortXrot2Xdd1[0][0] = %f,  scale_distortXrot2Xdd1[0][1] = %f\n", scale_distortXrot2Xdd1[0][0], scale_distortXrot2Xdd1[0][1]);
-		printf("scale_distortXrot2Xdd1[1][0] = %f,  scale_distortXrot2Xdd1[1][1] = %f\n",scale_distortXrot2Xdd1[1][0],scale_distortXrot2Xdd1[1][1]);
+        printf("scale_distortXrot2Xdd1[1][0] = %f,  scale_distortXrot2Xdd1[1][1] = %f\n", scale_distortXrot2Xdd1[1][0], scale_distortXrot2Xdd1[1][1]);
        printf("disp_dist[0] = %f\n", disp_dist[0]);
        printf("disp_dist[1] = %f\n", disp_dist[1]);
        printf("disp_dist[2] = %f\n", disp_dist[2]);
        printf("disp_dist[3] = %f\n", disp_dist[3]);
    }
-	__syncthreads();// __syncwarp();
+    __syncthreads();                                                                                       // __syncwarp();
 #endif                                                                                                     // DEBUG210
-///	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
+                                                                                                           ///	gpu_tasks[task_num].disp_dist[ncam][0] = disp_dist[0];
-///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
+                                                                                                           ///	gpu_tasks[task_num].disp_dist[ncam][1] = disp_dist[1];
-///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
+                                                                                                           ///	gpu_tasks[task_num].disp_dist[ncam][2] = disp_dist[2];
-///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
+                                                                                                           ///	gpu_tasks[task_num].disp_dist[ncam][3] = disp_dist[3];
-	float * disp_dist_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + num_cams* 2 + ncam * 4; //  ncam = threadIdx.x, so each thread will have different offset
+    float *disp_dist_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + num_cams * 2 + ncam * 4;  //  ncam = threadIdx.x, so each thread will have different offset
    *(disp_dist_p++) = disp_dist[0];                                                                       // global memory
    *(disp_dist_p++) = disp_dist[1];
    *(disp_dist_p++) = disp_dist[2];
@@ -715,7 +709,7 @@ extern "C" __global__ void get_tiles_offsets(
    //	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
    //	float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..19 geometry_correction.imu_move
    // ERS linear does not yet use per-port rotations, probably not needed
-	if (imu_exists){
+    if (imu_exists) {
        float ers_x =
            dpXci_dtilt * extrinsic_corr.imu_rot[0] +
            dpXci_dazimuth * extrinsic_corr.imu_rot[1] +
@@ -726,114 +720,121 @@ extern "C" __global__ void get_tiles_offsets(
            dpYci_droll * extrinsic_corr.imu_rot[2];
 #ifdef DEBUG21
-		if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+        if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
            printf("ers_x = %f,  ers_y = %f\n", ers_x, ers_y);
        }
-		__syncthreads();// __syncwarp();
+        __syncthreads();                   // __syncwarp();
 #endif                                     // DEBUG21
-		if (disparity >= MIN_DISPARITY){ // all threads together
+        if (disparity >= MIN_DISPARITY) {  // all threads together
            float k = SCENE_UNITS_SCALE * geometry_correction.disparityRadius;
            float wdisparity = disparity;
-			float dwdisp_dz = (k * geometry_correction.focalLength / (0.001*geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
+            float dwdisp_dz = (k * geometry_correction.focalLength / (0.001 * geometry_correction.pixelSize)) / (xyz[2] * xyz[2]);
            dpXci_pYci_imu_lin[0][0] = -wdisparity / k;            // dpx/ dworld_X
            dpXci_pYci_imu_lin[1][1] = wdisparity / k;             // dpy/ dworld_Y
            dpXci_pYci_imu_lin[0][2] = (xyz[0] / k) * dwdisp_dz;   // dpx/ dworld_Z
-////			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
+                                                                   ////			dpXci_pYci_imu_lin[1][2] =  (xyz[1] / k) * dwdisp_dz; // dpy/ dworld_Z
            dpXci_pYci_imu_lin[1][2] = -(xyz[1] / k) * dwdisp_dz;  // dpy/ dworld_Z
            ers_x += dpXci_pYci_imu_lin[0][0] * extrinsic_corr.imu_move[0] +
                     dpXci_pYci_imu_lin[0][2] * extrinsic_corr.imu_move[2];
            ers_y += dpXci_pYci_imu_lin[1][1] * extrinsic_corr.imu_move[1] +
                     dpXci_pYci_imu_lin[1][2] * extrinsic_corr.imu_move[2];
-			float delta_t = (pY_offset/ (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time; // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
+            float delta_t = (pY_offset / (1.0 - geometry_correction.line_time * ers_y)) * geometry_correction.line_time;  // positive for top cameras, negative - for bottom //disp_dist[2]=dd2.get(1, 0)
            pXY[0] += delta_t * ers_x * rD2rND;  // added correction to pixel X
            pXY[1] += delta_t * ers_y * rD2rND;  // added correction to pixel Y
 #ifdef DEBUG21
-			if ((ncam == DBG_CAM)  && (task_num == DBG_TILE)){
+            if ((ncam == DBG_CAM) && (task_num == DBG_TILE)) {
                printf("k = %f,  wdisparity = %f,  dwdisp_dz = %f\n", k, wdisparity, dwdisp_dz);
-				printf("dpXci_pYci_imu_lin[0][0] = %f,  dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0],dpXci_pYci_imu_lin[0][2]);
+                printf("dpXci_pYci_imu_lin[0][0] = %f,  dpXci_pYci_imu_lin[0][2] = %f\n", dpXci_pYci_imu_lin[0][0], dpXci_pYci_imu_lin[0][2]);
-				printf("dpXci_pYci_imu_lin[1][1] = %f,  dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1],dpXci_pYci_imu_lin[1][2]);
+                printf("dpXci_pYci_imu_lin[1][1] = %f,  dpXci_pYci_imu_lin[1][2] = %f\n", dpXci_pYci_imu_lin[1][1], dpXci_pYci_imu_lin[1][2]);
                printf("delta_t = %f,  ers_x = %f,  ers_y = %f\n", delta_t, ers_x, ers_y);
                printf("pXY[0] = %f,  pXY[1] = %f\n", pXY[0], pXY[1]);  // OK
            }
-			__syncthreads();// __syncwarp();
+            __syncthreads();  // __syncwarp();
 #endif                        // DEBUG21
        }
    }
    // copy results to global memory pXY,  disp_dist (already copied)
-//	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
+    //	gpu_tasks[task_num].xy[ncam][0] = pXY[0];
-//	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
+    //	gpu_tasks[task_num].xy[ncam][1] = pXY[1];
-//	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+    //	float * tile_xy_p = gpu_ftasks +  task_size * task_num + 3 + num_cams * 4 + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
    // .xy goes right after the 3 common fields (task, txy and target_disparity)
-	float * tile_xy_p = gpu_ftasks +  task_size * task_num + tp_task_xy_offset + ncam * 2; //  ncam = threadIdx.x, so each thread will have different offset
+    float *tile_xy_p = gpu_ftasks + task_size * task_num + tp_task_xy_offset + ncam * 2;  //  ncam = threadIdx.x, so each thread will have different offset
    *(tile_xy_p++) = pXY[0];                                                              // global memory
    *(tile_xy_p++) = pXY[1];                                                              // global memory
 }
 extern "C" __global__ void calcReverseDistortionTable(
-		struct gc * geometry_correction,
+    struct gc *geometry_correction,
-		float * rByRDist)
+    float *rByRDist) {
-{
+    // int num_threads = NUM_CAMS *  blockDim.z  *  blockDim.y * blockDim.x; // 36
-	//int num_threads = NUM_CAMS *  blockDim.z  *  blockDim.y * blockDim.x; // 36
    int indx = ((blockIdx.x * blockDim.z + threadIdx.z) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
-//	double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
+    //	double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
-//	double delta=1E-4; //rByRDist error = 0.000072
+    //	double delta=1E-4; //rByRDist error = 0.000072
-	double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
+    double delta = 1E-10;  // 12; // 10; // -8; 0.730000 ms
-	double minDerivative=0.01;
+    double minDerivative = 0.01;
-	int numIterations=1000;
+    int numIterations = 1000;
-	double drDistDr=1.0;
+    double drDistDr = 1.0;
-	double d=1.0
+    double d = 1.0 - geometry_correction->distortionA8 - geometry_correction->distortionA7 - geometry_correction->distortionA6 - geometry_correction->distortionA5 - geometry_correction->distortionA - geometry_correction->distortionB - geometry_correction->distortionC;
-			-geometry_correction -> distortionA8
+    double rPrev = 0.0;
-			-geometry_correction -> distortionA7
-			-geometry_correction -> distortionA6
-			-geometry_correction -> distortionA5
-			-geometry_correction -> distortionA
-			-geometry_correction -> distortionB
-			-geometry_correction -> distortionC;
-	double rPrev=0.0;
    int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
-	for (int p = 0; p < num_points; p ++){
+    for (int p = 0; p < num_points; p++) {
-		int i = indx * num_points +p;
+        int i = indx * num_points + p;
-		if (i >= RBYRDIST_LEN){
+        if (i >= RBYRDIST_LEN) {
            return;
        }
-		if (i == 0){
+        if (i == 0) {
-			rByRDist[0]= (float) 1.0/d;
+            rByRDist[0] = (float)1.0 / d;
            continue;
        }
        double rDist = RBYRDIST_STEP * i;
        double r = (p == 0) ? rDist : rPrev;
-		for (int iteration=0;iteration<numIterations;iteration++){
+        for (int iteration = 0; iteration < numIterations; iteration++) {
-			double k=(((((((
+            double k = (((((((
-					geometry_correction -> distortionA8) * r +
+                                 geometry_correction->distortionA8) *
-					geometry_correction -> distortionA7) * r +
+                                 r +
-					geometry_correction -> distortionA6) * r +
+                             geometry_correction->distortionA7) *
-					geometry_correction -> distortionA5) * r +
+                                r +
-					geometry_correction -> distortionA) * r +
+                            geometry_correction->distortionA6) *
-					geometry_correction -> distortionB) * r +
+                               r +
-					geometry_correction -> distortionC) * r + d;
+                           geometry_correction->distortionA5) *
-			drDistDr=(((((((
+                              r +
-					8 * geometry_correction -> distortionA8) * r +
+                          geometry_correction->distortionA) *
-					7 * geometry_correction -> distortionA7) * r +
+                             r +
-					6 * geometry_correction -> distortionA6) * r +
+                         geometry_correction->distortionB) *
-					5 * geometry_correction -> distortionA5) * r +
+                            r +
-					4 * geometry_correction -> distortionA) * r +
+                        geometry_correction->distortionC) *
-					3 * geometry_correction -> distortionB) * r+
+                           r +
-					2 * geometry_correction -> distortionC) * r+d;
+                       d;
-			if (drDistDr<minDerivative) { // folds backwards !
+            drDistDr = (((((((
+                                 8 * geometry_correction->distortionA8) *
+                                 r +
+                             7 * geometry_correction->distortionA7) *
+                                r +
+                            6 * geometry_correction->distortionA6) *
+                               r +
+                           5 * geometry_correction->distortionA5) *
+                              r +
+                          4 * geometry_correction->distortionA) *
+                             r +
+                         3 * geometry_correction->distortionB) *
+                            r +
+                        2 * geometry_correction->distortionC) *
+                           r +
+                       d;
+            if (drDistDr < minDerivative) {  // folds backwards !
                return;                      // too high distortion
            }
-			double rD=r*k;
+            double rD = r * k;
-			if (fabs(rD-rDist)<delta){
+            if (fabs(rD - rDist) < delta) {
                break;
            }
-			r+=(rDist-rD)/drDistDr;
+            r += (rDist - rD) / drDistDr;
        }
-		rPrev=r;
+        rPrev = r;
-		rByRDist[i]= (float) r/rDist;
+        rByRDist[i] = (float)r / rDist;
    }
 }
@@ -843,14 +844,14 @@ extern "C" __global__ void calcReverseDistortionTable(
 * @return corresponding non-distorted radius
 */
 inline __device__ float getRByRDist(float rDist,
-		float rByRDist [RBYRDIST_LEN]) //shared memory
+                                    float rByRDist[RBYRDIST_LEN])  // shared memory
 {
    if (rDist < 0) {
        return 0.0f;  // normally should not happen
    }
-	float findex = rDist/RBYRDIST_STEP;
+    float findex = rDist / RBYRDIST_STEP;
-	int index= (int) floorf(findex);
+    int index = (int)floorf(findex);
-	if (index < 0){
+    if (index < 0) {
        index = 0;
    }
    if (index > (RBYRDIST_LEN - 3)) {
@@ -858,95 +859,107 @@ inline __device__ float getRByRDist(float rDist,
    }
    float mu = fmaxf(findex - index, 0.0f);
    float mu2 = mu * mu;
-	float y0 = (index > 0)? rByRDist[index-1] : ( 2 * rByRDist[index] - rByRDist[index+1]);
+    float y0 = (index > 0) ? rByRDist[index - 1] : (2 * rByRDist[index] - rByRDist[index + 1]);
    // use Catmull-Rom
-	float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index+1] + 0.5 * rByRDist[index+2];
+    float a0 = -0.5 * y0 + 1.5 * rByRDist[index] - 1.5 * rByRDist[index + 1] + 0.5 * rByRDist[index + 2];
-	float a1 =        y0 - 2.5 * rByRDist[index] + 2   * rByRDist[index+1] - 0.5 * rByRDist[index+2];
+    float a1 = y0 - 2.5 * rByRDist[index] + 2 * rByRDist[index + 1] - 0.5 * rByRDist[index + 2];
-	float a2 = -0.5 * y0                              + 0.5 * rByRDist[index+1];
+    float a2 = -0.5 * y0 + 0.5 * rByRDist[index + 1];
    float a3 = rByRDist[index];
-	float result= a0*mu*mu2+a1*mu2+a2*mu+a3;
+    float result = a0 * mu * mu2 + a1 * mu2 + a2 * mu + a3;
    return result;
 }
-__device__ void printGeometryCorrection(struct gc * g, int num_cams){
+__device__ void printGeometryCorrection(struct gc *g, int num_cams) {
 #ifndef JCUDA
    printf("\nGeometry Correction\n------------------\n");
-	printf("%22s: %f\n","pixelCorrectionWidth",  g->pixelCorrectionWidth);
+    printf("%22s: %f\n", "pixelCorrectionWidth", g->pixelCorrectionWidth);
-	printf("%22s: %f\n","pixelCorrectionHeight", g->pixelCorrectionHeight);
+    printf("%22s: %f\n", "pixelCorrectionHeight", g->pixelCorrectionHeight);
-	printf("%22s: %f\n","line_time",             g->line_time);
+    printf("%22s: %f\n", "line_time", g->line_time);
-	printf("%22s: %f\n","focalLength", g->focalLength);
+    printf("%22s: %f\n", "focalLength", g->focalLength);
-	printf("%22s: %f\n","pixelSize",   g->pixelSize);
+    printf("%22s: %f\n", "pixelSize", g->pixelSize);
-	printf("%22s: %f\n","distortionRadius",g->distortionRadius);
+    printf("%22s: %f\n", "distortionRadius", g->distortionRadius);
-	printf("%22s: %f\n","distortionC", g->distortionC);
+    printf("%22s: %f\n", "distortionC", g->distortionC);
-	printf("%22s: %f\n","distortionB", g->distortionB);
+    printf("%22s: %f\n", "distortionB", g->distortionB);
-	printf("%22s: %f\n","distortionA", g->distortionA);
+    printf("%22s: %f\n", "distortionA", g->distortionA);
-	printf("%22s: %f\n","distortionA5",g->distortionA5);
+    printf("%22s: %f\n", "distortionA5", g->distortionA5);
-	printf("%22s: %f\n","distortionA6",g->distortionA6);
+    printf("%22s: %f\n", "distortionA6", g->distortionA6);
-	printf("%22s: %f\n","distortionA7",g->distortionA7);
+    printf("%22s: %f\n", "distortionA7", g->distortionA7);
-	printf("%22s: %f\n","distortionA8",g->distortionA8);
+    printf("%22s: %f\n", "distortionA8", g->distortionA8);
-	printf("%22s: %f\n","elevation",   g->elevation);
+    printf("%22s: %f\n", "elevation", g->elevation);
-	printf("%22s: %f\n","heading",     g->heading);
+    printf("%22s: %f\n", "heading", g->heading);
-//	printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
+    //	printf("%22s: %f, %f, %f, %f \n","forward", g->forward[0], g->forward[1], g->forward[2], g->forward[3]);
-//	printf("%22s: %f, %f, %f, %f \n","right",   g->right[0],   g->right[1],   g->right[2],   g->right[3]);
+    //	printf("%22s: %f, %f, %f, %f \n","right",   g->right[0],   g->right[1],   g->right[2],   g->right[3]);
-//	printf("%22s: %f, %f, %f, %f \n","height",  g->height[0],  g->height[1],  g->height[2],  g->height[3]);
+    //	printf("%22s: %f, %f, %f, %f \n","height",  g->height[0],  g->height[1],  g->height[2],  g->height[3]);
-//	printf("%22s: %f, %f, %f, %f \n","roll",    g->roll[0],    g->roll[1],    g->roll[2],    g->roll[3]);
+    //	printf("%22s: %f, %f, %f, %f \n","roll",    g->roll[0],    g->roll[1],    g->roll[2],    g->roll[3]);
-//	printf("%22s: %f, %f \n",        "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
+    //	printf("%22s: %f, %f \n",        "pXY0[0]", g->pXY0[0][0], g->pXY0[0][1]);
-//	printf("%22s: %f, %f \n",        "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
+    //	printf("%22s: %f, %f \n",        "pXY0[1]", g->pXY0[1][0], g->pXY0[1][1]);
-//	printf("%22s: %f, %f \n",        "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
+    //	printf("%22s: %f, %f \n",        "pXY0[2]", g->pXY0[2][0], g->pXY0[2][1]);
-//	printf("%22s: %f, %f \n",        "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
+    //	printf("%22s: %f, %f \n",        "pXY0[3]", g->pXY0[3][0], g->pXY0[3][1]);
-	printf("%22s:","forward"); for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]); printf("\n");
+    printf("%22s:", "forward");
-	printf("%22s:","right");   for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right  [ncam]); printf("\n");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->forward[ncam]);
-	printf("%22s:","height");  for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height [ncam]); printf("\n");
+    printf("\n");
-	printf("%22s:","roll");    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll   [ncam]); printf("\n");
+    printf("%22s:", "right");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->right[ncam]);
+    printf("\n");
+    printf("%22s:", "height");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->height[ncam]);
+    printf("\n");
+    printf("%22s:", "roll");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->roll[ncam]);
+    printf("\n");
    for (int ncam = 0; ncam < num_cams; ncam++) {
-		printf("%19s%2d]: %f, %f \n", "pXY0[",ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
+        printf("%19s%2d]: %f, %f \n", "pXY0[", ncam, g->pXY0[ncam][0], g->pXY0[ncam][1]);
    }
-	printf("%22s: %f\n","common_right",   g->common_right);
+    printf("%22s: %f\n", "common_right", g->common_right);
-	printf("%22s: %f\n","common_forward", g->common_forward);
+    printf("%22s: %f\n", "common_forward", g->common_forward);
-	printf("%22s: %f\n","common_height",  g->common_height);
+    printf("%22s: %f\n", "common_height", g->common_height);
-	printf("%22s: %f\n","common_roll",    g->common_roll);
+    printf("%22s: %f\n", "common_roll", g->common_roll);
-//	printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
+    //	printf("%22s: x=%f, y=%f\n","rXY[0]", g->rXY[0][0], g->rXY[0][1]);
-//	printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
+    //	printf("%22s: x=%f, y=%f\n","rXY[1]", g->rXY[1][0], g->rXY[1][1]);
-//	printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
+    //	printf("%22s: x=%f, y=%f\n","rXY[2]", g->rXY[2][0], g->rXY[2][1]);
-//	printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
+    //	printf("%22s: x=%f, y=%f\n","rXY[3]", g->rXY[3][0], g->rXY[3][1]);
    for (int ncam = 0; ncam < num_cams; ncam++) {
        printf("%19s%2d]: %f, %f \n", "rXY[", ncam, g->rXY[ncam][0], g->rXY[ncam][1]);
    }
-	printf("%22s: %f\n","cameraRadius",    g->cameraRadius);
+    printf("%22s: %f\n", "cameraRadius", g->cameraRadius);
-	printf("%22s: %f\n","disparityRadius", g->disparityRadius);
+    printf("%22s: %f\n", "disparityRadius", g->disparityRadius);
-//	printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
+    //	printf("%22s: %f, %f, %f, %f \n","woi_tops", g->woi_tops[0], g->woi_tops[1], g->woi_tops[2], g->woi_tops[3]);
-	printf("%22s:","woi_tops");    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]); printf("\n");
+    printf("%22s:", "woi_tops");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", g->woi_tops[ncam]);
+    printf("\n");
-#endif //ifndef JCUDA
+#endif  // ifndef JCUDA
 }
-__device__ void printExtrinsicCorrection(corr_vector * cv, int num_cams)
+__device__ void printExtrinsicCorrection(corr_vector *cv, int num_cams) {
-{
 #ifndef JCUDA
    printf("\nExtrinsic Correction Vector\n---------------------------\n");
-//	printf("%22s: %f, %f, %f\n",     "tilt",    cv->tilt[0],    cv->tilt[1],    cv->tilt[2]);
+    //	printf("%22s: %f, %f, %f\n",     "tilt",    cv->tilt[0],    cv->tilt[1],    cv->tilt[2]);
-//	printf("%22s: %f, %f, %f\n",     "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
+    //	printf("%22s: %f, %f, %f\n",     "azimuth", cv->azimuth[0], cv->azimuth[1], cv->azimuth[2]);
-//	printf("%22s: %f, %f, %f, %f\n", "roll",    cv->roll[0],    cv->roll[1],    cv->roll[2],      cv->roll[3]);
+    //	printf("%22s: %f, %f, %f, %f\n", "roll",    cv->roll[0],    cv->roll[1],    cv->roll[2],      cv->roll[3]);
-//	printf("%22s: %f, %f, %f\n",     "zoom",    cv->zoom[0],    cv->zoom[1],    cv->zoom[2]);
+    //	printf("%22s: %f, %f, %f\n",     "zoom",    cv->zoom[0],    cv->zoom[1],    cv->zoom[2]);
-	printf("%22s:","tilt");    for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->tilt[ncam]);    printf("\n");
-	printf("%22s:","azimuth"); for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->azimuth[ncam]); printf("\n");
-	printf("%22s:","roll");    for (int ncam = 0; ncam <  num_cams;    ncam++) printf(" %f,", cv->roll[ncam]);    printf("\n");
-	printf("%22s:","zoom");    for (int ncam = 0; ncam < (num_cams-1); ncam++) printf(" %f,", cv->zoom[ncam]);    printf("\n");
+    printf("%22s:", "tilt");
+    for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->tilt[ncam]);
+    printf("\n");
+    printf("%22s:", "azimuth");
+    for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->azimuth[ncam]);
+    printf("\n");
+    printf("%22s:", "roll");
+    for (int ncam = 0; ncam < num_cams; ncam++) printf(" %f,", cv->roll[ncam]);
+    printf("\n");
+    printf("%22s:", "zoom");
+    for (int ncam = 0; ncam < (num_cams - 1); ncam++) printf(" %f,", cv->zoom[ncam]);
+    printf("\n");
    printf("%22s: %f(t), %f(a), %f(r)\n", "imu_rot", cv->imu_rot[0], cv->imu_rot[1], cv->imu_rot[2]);
    printf("%22s: %f(x), %f(y), %f(z)\n", "imu_move", cv->imu_move[0], cv->imu_move[1], cv->imu_move[2]);
-#endif //ifndef JCUDA
+#endif  // ifndef JCUDA
 }
--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -41,18 +41,16 @@
 #include "tp_defines.h"
 #endif
 #define NVRTC_BUG 1
 #ifndef M_PI
 #define M_PI 3.14159265358979323846 /* pi */
 #endif
 #ifndef offsetof
 #define offsetof(st, m) \
-    ((size_t)&(((st *)0)->m))
+    ((size_t) & (((st *)0)->m))
 //#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER)
 #endif
 #define SCENE_UNITS_SCALE 0.001  // meters from mm
 #define MIN_DISPARITY 0.01       // minimal disparity to try to convert to world coordinates
 struct tp_task {
@@ -68,37 +66,37 @@ struct tp_task {
    float disp_dist[NUM_CAMS][4];  // calculated with getPortsCoordinates()
 };
-#define get_task_size(x) (sizeof(struct tp_task)/sizeof(float) - 6 * (NUM_CAMS - x))
+#define get_task_size(x) (sizeof(struct tp_task) / sizeof(float) - 6 * (NUM_CAMS - x))
 #define tp_task_xy_offset 5
 #define tp_task_centerXY_offset 3
-struct corr_vector{
+struct corr_vector {
-	float tilt    [NUM_CAMS-1]; // 0..2
+    float tilt[NUM_CAMS - 1];     // 0..2
-	float azimuth [NUM_CAMS-1]; // 3..5
+    float azimuth[NUM_CAMS - 1];  // 3..5
-	float roll    [NUM_CAMS];   // 6..9
+    float roll[NUM_CAMS];         // 6..9
-	float zoom    [NUM_CAMS-1]; // 10..12
+    float zoom[NUM_CAMS - 1];     // 10..12
    // for ERS correction:
-	float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
+    float imu_rot[3];   // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
    float imu_move[3];  // dx/dt, dy/dt, dz/dt 16..19
 };
 #ifdef NVRTC_BUG
-struct trot_deriv{
+struct trot_deriv {
-	float rots    [NUM_CAMS][3][3];
+    float rots[NUM_CAMS][3][3];
-	float d_daz   [NUM_CAMS][3][3];
+    float d_daz[NUM_CAMS][3][3];
-	float d_tilt  [NUM_CAMS][3][3];
+    float d_tilt[NUM_CAMS][3][3];
-	float d_roll  [NUM_CAMS][3][3];
+    float d_roll[NUM_CAMS][3][3];
-	float d_zoom  [NUM_CAMS][3][3];
+    float d_zoom[NUM_CAMS][3][3];
 };
 #else
-union trot_deriv{
+union trot_deriv {
    struct {
-		float rots    [NUM_CAMS][3][3];
+        float rots[NUM_CAMS][3][3];
-		float d_daz   [NUM_CAMS][3][3];
+        float d_daz[NUM_CAMS][3][3];
-		float d_tilt  [NUM_CAMS][3][3];
+        float d_tilt[NUM_CAMS][3][3];
-		float d_roll  [NUM_CAMS][3][3];
+        float d_roll[NUM_CAMS][3][3];
-		float d_zoom  [NUM_CAMS][3][3];
+        float d_zoom[NUM_CAMS][3][3];
    };
-	float matrices [5][NUM_CAMS][3][3];
+    float matrices[5][NUM_CAMS][3][3];
 };
 #endif
@@ -116,72 +114,68 @@ struct gc {
            float distortionC;   // r^2
            float distortionB;   // r^3
            float distortionA;   // r^4 (normalized to focal length or to sensor half width?)
-			float distortionA5;     //r^5 (normalized to focal length or to sensor half width?)
+            float distortionA5;  // r^5 (normalized to focal length or to sensor half width?)
-			float distortionA6;     //r^6 (normalized to focal length or to sensor half width?)
+            float distortionA6;  // r^6 (normalized to focal length or to sensor half width?)
-			float distortionA7;     //r^7 (normalized to focal length or to sensor half width?)
+            float distortionA7;  // r^7 (normalized to focal length or to sensor half width?)
-			float distortionA8;     //r^8 (normalized to focal length or to sensor half width?)
+            float distortionA8;  // r^8 (normalized to focal length or to sensor half width?)
 #ifndef NVRTC_BUG
        };
-		float rad_coeff [7];
+        float rad_coeff[7];
    };
 #endif
    // parameters, common for all sensors
    float elevation;  // degrees, up - positive;
    float heading;    // degrees, CW (from top) - positive
-	float forward    [NUM_CAMS];
+    float forward[NUM_CAMS];
-	float right      [NUM_CAMS];
+    float right[NUM_CAMS];
-	float height     [NUM_CAMS];
+    float height[NUM_CAMS];
-	float roll       [NUM_CAMS];    // degrees, CW (to target) - positive
+    float roll[NUM_CAMS];  // degrees, CW (to target) - positive
-	float pXY0       [NUM_CAMS][2];
+    float pXY0[NUM_CAMS][2];
    float common_right;        // mm right, camera center
    float common_forward;      // mm forward (to target), camera center
    float common_height;       // mm up, camera center
    float common_roll;         // degrees CW (to target) camera as a whole
-//	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
+                               //	float [][] XYZ_he;     // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
-//	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
+                               //	float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading,  common_roll)
-	float rXY        [NUM_CAMS][2]; // XY pairs of the in a normal plane, relative to disparityRadius
+    float rXY[NUM_CAMS][2];    // XY pairs of the in a normal plane, relative to disparityRadius
-//	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
+                               //	float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
-// only used for the multi-quad systems
+                               // only used for the multi-quad systems
    float cameraRadius;        // =0; // average distance from the "mass center" of the sensors to the sensors
    float disparityRadius;     // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
-	float woi_tops   [NUM_CAMS]; // used to calculate scanline timing
+    float woi_tops[NUM_CAMS];  // used to calculate scanline timing
 };
 #define RAD_COEFF_LEN 7
 extern "C" __global__ void get_tiles_offsets(
    int uniform_grid,  //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
-//		struct tp_task     * gpu_tasks,
+    //		struct tp_task     * gpu_tasks,
-		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+    float *gpu_ftasks,  // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
    int num_tiles,      // number of tiles in task
-		struct gc          * gpu_geometry_correction,
+    struct gc *gpu_geometry_correction,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+    float *gpu_rByRDist,  // length should match RBYRDIST_LEN
-		trot_deriv   * gpu_rot_deriv);
+    trot_deriv *gpu_rot_deriv);
 extern "C" __global__ void calculate_tiles_offsets(
    int uniform_grid,  //==0: use provided centers (as for interscene) , !=0 calculate uniform grid
    int num_cams,
-		float              * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+    float *gpu_ftasks,  // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task     * gpu_tasks,
+                        //		struct tp_task     * gpu_tasks,
    int num_tiles,      // number of tiles in task
-		struct gc          * gpu_geometry_correction,
+    struct gc *gpu_geometry_correction,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
+    float *gpu_rByRDist,  // length should match RBYRDIST_LEN
-		trot_deriv   * gpu_rot_deriv);
+    trot_deriv *gpu_rot_deriv);
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
    int num_cams,
-		struct corr_vector * gpu_correction_vector,
+    struct corr_vector *gpu_correction_vector,
-		trot_deriv   * gpu_rot_deriv);
+    trot_deriv *gpu_rot_deriv);
 #define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3)  // fixed blockDim
 // Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calcReverseDistortionTable(
-		struct gc * geometry_correction,
+    struct gc *geometry_correction,
-		float * rByRDist);
+    float *rByRDist);
--- a/src/test_tp.cu
+++ b/src/test_tp.cu
--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -77,7 +77,7 @@
 #define RBYRDIST_LEN 5001                     // for doubles 10001 - floats   // length of rByRDist to allocate shared memory
 #define RBYRDIST_STEP 0.0004                  // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
-#define TILES_PER_BLOCK_GEOM          (32/NUM_CAMS)   // each tile has NUM_CAMS threads
+#define TILES_PER_BLOCK_GEOM (32 / NUM_CAMS)  // each tile has NUM_CAMS threads
 #define DEBUG_ANY 1
@@ -87,13 +87,13 @@
 //#define DBG_TILE_X     40
 //#define DBG_TILE_Y     80
 #if TEST_LWIR
-	#define DBG_TILE_X    50 // 52 // 32 // 162 // 151 // 161 // 49
+#define DBG_TILE_X 50  // 52 // 32 // 162 // 151 // 161 // 49
-	#define DBG_TILE_Y    19 //  5 // 36 // 88 // 121 // 69  // 111 // 66
+#define DBG_TILE_Y 19  //  5 // 36 // 88 // 121 // 69  // 111 // 66
-	#define DBG_TILE    (DBG_TILE_Y * 80 + DBG_TILE_X)
+#define DBG_TILE (DBG_TILE_Y * 80 + DBG_TILE_X)
 #else
-	#define DBG_TILE_X     114 // 32 // 162 // 151 // 161 // 49
+#define DBG_TILE_X 114  // 32 // 162 // 151 // 161 // 49
-	#define DBG_TILE_Y     51  // 52  // 88 // 121 // 69  // 111 // 66
+#define DBG_TILE_Y 51   // 52  // 88 // 121 // 69  // 111 // 66
-	#define DBG_TILE    (DBG_TILE_Y * 324 + DBG_TILE_X)
+#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
 #endif
 #undef DBG_MARK_DBG_TILE
 //#undef DBG_TILE
@@ -101,8 +101,7 @@
 //#undef HAS_PRINTF
 #define HAS_PRINTF
+// 7
-//7
 //#define DEBUG1 1
 //#define DEBUG2 1
 //#define DEBUG3 1
@@ -118,7 +117,7 @@
 #define DEBUG9 1
 */
 //#define DEBUG8A 1 // generate_RBGA_host
-//textures
+// textures
 //#define DEBUG10 1
 //#define DEBUG11 1
 //#define DEBUG12 1
@@ -127,7 +126,6 @@
 // geom
 //#define DEBUG20 1
 #if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
 //#define DEBUG20 1 // Geometry Correction
 //#define DEBUG21 1 // Geometry Correction
@@ -140,6 +138,4 @@
 #endif  //#ifdef 	DEBUG_ANY
 #endif  //#ifndef JCUDA