Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
T
tile_processor_gpu
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Elphel
tile_processor_gpu
Commits
3c3ebfa9
Commit
3c3ebfa9
authored
Apr 07, 2020
by
Andrey Filippov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
preparing for separate compilation in java
parent
0253bab4
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
1347 additions
and
2485 deletions
+1347
-2485
TileProcessor.cuh
src/TileProcessor.cuh
+211
-689
dtt8x8.cu
src/dtt8x8.cu
+1136
-950
dtt8x8.cuh
src/dtt8x8.cuh
+0
-846
No files found.
src/TileProcessor.cuh
View file @
3c3ebfa9
...
@@ -37,9 +37,8 @@
...
@@ -37,9 +37,8 @@
*/
*/
// Avoiding includes in jcuda, all source files will be merged
// Avoiding includes in jcuda, all source files will be merged
#ifndef JCUDA
#pragma once
#pragma once
#i
nclude "dtt8x8.cuh"
#i
fndef JCUDA
#define THREADSX (DTT_SIZE)
#define THREADSX (DTT_SIZE)
#define NUM_CAMS 4
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_PAIRS 6
...
@@ -93,10 +92,9 @@
...
@@ -93,10 +92,9 @@
#define DEBUG_OOB1 1
#define DEBUG_OOB1 1
#endif //#ifndef JCUDA
#endif //#ifndef JCUDA
#include "dtt8x8.h"
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
//#define IMCLT14
//#define IMCLT14
//#define NOICLT 1
//#define NOICLT 1
//#define TEST_IMCLT
//#define TEST_IMCLT
...
@@ -148,25 +146,10 @@
...
@@ -148,25 +146,10 @@
// Make TILESYA >= TILESX and a multiple of 4
// Make TILESYA >= TILESX and a multiple of 4
#define TILESYA ((TILESY +3) & (~3))
#define TILESYA ((TILESY +3) & (~3))
// increase row length by 1 so vertical passes will use different ports
// increase row length by 1 so vertical passes will use different ports
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
//#define DTT_SIZE22 (DTT_SIZE2 + 2)
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
#define MCLT_UNION_LEN (DTT_SIZE2 * (DTT_SIZE2 + 2))
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
// Use CORR_OUT_RAD for the correlation output
// Use CORR_OUT_RAD for the correlation output
#define BAYER_RED 0
#define BAYER_BLUE 1
#define BAYER_GREEN 2
// assuming GR/BG as now
#define BAYER_RED_ROW 0
#define BAYER_RED_COL 1
//#define BAYER_BLUE_ROW (1 - BAYER_RED_ROW)
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
//#define DBG_TILE_X 40
//#define DBG_TILE_X 40
//#define DBG_TILE_Y 80
//#define DBG_TILE_Y 80
#define DBG_TILE_X 161 // 49
#define DBG_TILE_X 161 // 49
...
@@ -312,11 +295,14 @@ def set_imclt_sa(stride=9):
...
@@ -312,11 +295,14 @@ def set_imclt_sa(stride=9):
print('0x%02x,'%(d), end="")
print('0x%02x,'%(d), end="")
print('0x%2x};'%(sa8s[-1]))
print('0x%2x};'%(sa8s[-1]))
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import numpy as np # math
import numpy as np # math
def printAlphaFade(transform_size):
def printAlphaFade(transform_size):
ts2 = 2 * transform_size
ts2 = 2 * transform_size
ts2m1 = ts2-1
ts2m1 = ts2-1
alphaFade = np.zeros(shape=(16,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaFade = np.zeros(shape=(9,ts2*ts2), dtype=float) # double [][] alphaFade = new double[16][ts2*ts2];
alphaIndex = np.zeros(shape=(16,), dtype=int)
fade1d = np.zeros(shape=(16,), dtype=float) # double [] fade1d = new double [ts2];
fade1d = np.zeros(shape=(16,), dtype=float) # double [] fade1d = new double [ts2];
for i in range (ts2):
for i in range (ts2):
fade1d[i] = 0.5 * (1.0 - np.cos(np.pi * (i +0.5) /ts2))
fade1d[i] = 0.5 * (1.0 - np.cos(np.pi * (i +0.5) /ts2))
...
@@ -326,27 +312,49 @@ def printAlphaFade(transform_size):
...
@@ -326,27 +312,49 @@ def printAlphaFade(transform_size):
for m in range (16):
for m in range (16):
# if m == 0:
# if m == 0:
# alphaFade[m][indx] = 0
# alphaFade[m][indx] = 0
elif m == 1: # 0
if m == 1: # 0
alphaFade[m][indx] = fade1d[ts2m1 - i]
alphaIndex[m] = 1
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - i]
elif m == 2:
elif m == 2:
alphaFade[m][indx] = fade1d[j]
alphaIndex[m] = 2
alphaFade[alphaIndex[m]][indx] = fade1d[j]
elif m == 4:
elif m == 4:
alphaFade[m][indx] = fade1d[i]
alphaIndex[m] = 3
alphaFade[alphaIndex[m]][indx] = fade1d[i]
elif m == 8:
elif m == 8:
alphaFade[m][indx] = fade1d[ts2m1 - j]
alphaIndex[m] = 4
alphaFade[alphaIndex[m]][indx] = fade1d[ts2m1 - j]
elif m == 3:
elif m == 3:
alphaFade[m][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
alphaIndex[m] = 5
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - i],fade1d[j])[j > ts2m1 - i]
elif m == 6:
elif m == 6:
alphaFade[m][indx] = (fade1d[i],fade1d[j])[j > i]
alphaIndex[m] = 6
alphaFade[alphaIndex[m]][indx] = (fade1d[i],fade1d[j])[j > i]
elif m == 9:
elif m == 9:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
alphaIndex[m] = 7
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[ts2m1 - i])[j > i]
elif m == 12:
elif m == 12:
alphaFade[m][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
alphaIndex[m] = 8
alphaFade[alphaIndex[m]][indx] = (fade1d[ts2m1 - j],fade1d[i])[i > ts2m1 - j]
else:
else:
alphaFade[m][indx] = 1.0
alphaIndex[m] = 0
alphaFade[alphaIndex[m]][indx] = 1.0
floats_in_line=8
floats_in_line=8
print("__constant__
float alphaFade[16][%d] = {"%(ts2*ts2)
)
print("__constant__
int alphaIndex[16] = {"
)
for m in range (16):
for m in range (16):
if ((m % floats_in_line) == 0):
print("\n ",end="")
else:
print(" ",end="")
print("%d"%(alphaIndex[m]), end ="")
if (m < (16-1)):
print(",",end="")
print("};")
print("__constant__ float alphaFade[9][%d] = {"%(ts2*ts2))
for m in range (9):
for i in range (ts2 * ts2):
for i in range (ts2 * ts2):
if ((i % floats_in_line) == 0):
if ((i % floats_in_line) == 0):
print(" ",end="")
print(" ",end="")
...
@@ -362,20 +370,45 @@ def printAlphaFade(transform_size):
...
@@ -362,20 +370,45 @@ def printAlphaFade(transform_size):
print(",")
print(",")
else:
else:
print(", ",end="")
print(", ",end="")
if (m ==
15
):
if (m ==
(9-1)
):
print("};")
print("};")
else:
else:
print(",")
print(",")
printAlphaFade(8)
"""
__constant__ float idct_signs[4][4][4] ={
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{-1, 1, 1, 1},
{-1, 1, 1, 1},
{-1, 1, 1, 1}
},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{-1,-1,-1, 1},
{-1,-1,-1, 1},
{-1,-1,-1, 1}
},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{ 1,-1,-1,-1},
{-1, 1, 1, 1}
},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{-1,-1,-1, 1}
}};
*/
*/
__constant__ float HWINDOW[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f,
__constant__ float HWINDOW[] = {0.098017f, 0.290285f, 0.471397f, 0.634393f,
0.773010f, 0.881921f, 0.956940f, 0.995185f};
0.773010f, 0.881921f, 0.956940f, 0.995185f};
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
__constant__ float HWINDOW_SQ[] = {0.009607f, 0.084265f, 0.222215f, 0.402455f,
__constant__ float HWINDOW_SQ[] = {0.009607f, 0.084265f, 0.222215f, 0.402455f,
0.597545f, 0.777785f, 0.915735f, 0.990393f};
0.597545f, 0.777785f, 0.915735f, 0.990393f};
...
@@ -392,9 +425,12 @@ __constant__ int fold_inc[]= {0x02feee12, 0x021eeef2};
...
@@ -392,9 +425,12 @@ __constant__ int fold_inc[]= {0x02feee12, 0x021eeef2};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx[16] = {0x24,0x2c,0x34,0x3c,0x3c,0x34,0x2c,0x24,0x1c,0x22,0x21,0x20,0x20,0x21,0x22,0x23};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
#ifdef BBBB
__constant__ float HWINDOW2[] = {0.049009f, 0.145142f, 0.235698f, 0.317197f,
0.386505f, 0.440961f, 0.478470f, 0.497592f};
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x27,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
// Hope that if 2 outer indices are known at compile time there will be no integer multiplications
__constant__ float idct_signs[4][4][4] ={
__constant__ float idct_signs[4][4][4] ={
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
...
@@ -418,6 +454,7 @@ __constant__ float idct_signs[4][4][4] ={
...
@@ -418,6 +454,7 @@ __constant__ float idct_signs[4][4][4] ={
{ 1, 1, 1,-1},
{ 1, 1, 1,-1},
{-1,-1,-1, 1}
{-1,-1,-1, 1}
}};
}};
#endif
// LPF for sigma 0.9 each color (modify through cudaMemcpyToSymbol() or similar in Driver API
// LPF for sigma 0.9 each color (modify through cudaMemcpyToSymbol() or similar in Driver API
//#ifndef NOICLT
//#ifndef NOICLT
__constant__ float lpf_data[4][64]={
__constant__ float lpf_data[4][64]={
...
@@ -489,39 +526,40 @@ __constant__ int pairs[6][2]={
...
@@ -489,39 +526,40 @@ __constant__ int pairs[6][2]={
{0, 3},
{0, 3},
{2, 1}};
{2, 1}};
__constant__ float alphaFade[16][256] = {
__constant__ int alphaIndex[16] = {0, 1, 2, 5, 3, 0, 6, 0, 4, 7, 0, 0, 8, 0, 0, 0};
{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
__constant__ float alphaFade[9][256] = {
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
...
@@ -586,38 +624,6 @@ __constant__ float alphaFade[16][256] = {
...
@@ -586,38 +624,6 @@ __constant__ float alphaFade[16][256] = {
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
{0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
{0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f, 0.002408f,
0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f,
0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f, 0.021530f,
...
@@ -650,102 +656,6 @@ __constant__ float alphaFade[16][256] = {
...
@@ -650,102 +656,6 @@ __constant__ float alphaFade[16][256] = {
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
...
@@ -780,228 +690,132 @@ __constant__ float alphaFade[16][256] = {
...
@@ -780,228 +690,132 @@ __constant__ float alphaFade[16][256] = {
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.9
97592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961
f,
0.9
78470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592
f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.9
97592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505
f,
0.9
40961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592
f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197
f,
0.
886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698
f,
0.
817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.64514
2f,
0.
735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.99759
2f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.002408f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.021530f, 0.021530f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.059039f, 0.059039f, 0.059039f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.113495f, 0.113495f, 0.113495f, 0.113495f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009
f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.64514
2f,
0.
549009f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.99759
2f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
0.
997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698
f,
0.
645142f, 0.645142f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592
f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.735698f, 0.735698f, 0.735698f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.940961f, 0.978470f, 0.997592f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.978470f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.997592f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f},
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
{0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.002408f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.021530f, 0.021530f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.059039f, 0.059039f, 0.059039f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.113495f, 0.113495f, 0.113495f, 0.113495f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.182803f, 0.182803f, 0.182803f, 0.182803f, 0.182803f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f, 0.264302f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f, 0.354858f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f, 0.450991f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f, 0.549009f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f, 0.645142f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f, 0.735698f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f, 0.817197f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f, 0.886505f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f, 0.940961f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f, 0.978470f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f},
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f,
{1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f, 0.997592f}};
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f}};
//#endif
//#endif
__device__ void convertCorrectTile(
__device__ void convertCorrectTile(
...
@@ -1065,16 +879,10 @@ __device__ void resetCorrelation(
...
@@ -1065,16 +879,10 @@ __device__ void resetCorrelation(
__device__ void normalizeTileAmplitude(
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero); // fat zero is absolute, scale it outside
float fat_zero); // fat zero is absolute, scale it outside
__device__ void corrUnfoldTile(
int corr_radius,
//__device__ void imclt( // for 16 threads implemented, used // why is it twice?
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
//__device__ void imclt( // implemented, used // why is it twice?
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
// float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
// float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
// float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt( // for 16 threads implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt8threads(// for 8 threads
__device__ void imclt8threads(// for 8 threads
int do_acc, // 1 - add to previous value, 0 - overwrite
int do_acc, // 1 - add to previous value, 0 - overwrite
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
...
@@ -1356,14 +1164,8 @@ __global__ void correlate2D(
...
@@ -1356,14 +1164,8 @@ __global__ void correlate2D(
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
#endif
#endif
dttii_2d(clt_corr);
/*
/*
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
// change to 16-32 threads?? in next iteration
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
for (int q = 0; q < 4; q++){
for (int q = 0; q < 4; q++){
...
@@ -1371,15 +1173,6 @@ Java code:
...
@@ -1371,15 +1173,6 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
}
__syncthreads();
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER VERTICAL (HORIZONTAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// hor pass, corresponding to vert pass in Java
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
for (int q = 0; q < 4; q++){
...
@@ -1387,6 +1180,9 @@ Java code:
...
@@ -1387,6 +1180,9 @@ Java code:
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
}
__syncthreads();
__syncthreads();
*/
#ifdef DBG_TILE
#ifdef DBG_TILE
#ifdef DEBUG6
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 4)){
...
@@ -2498,7 +2294,8 @@ __global__ void textures_accumulate(
...
@@ -2498,7 +2294,8 @@ __global__ void textures_accumulate(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif // DEBUG12
#endif // DEBUG12
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
int alpha_mode = alphaIndex[tile_code];
if (!alpha_mode){ // only multiply if needed, alpha_mode == 0 - keep as is.
for (int pass = 0; pass < 8; pass ++) {
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
int row = pass * 2 + (threadIdx.y >> 1);
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
...
@@ -2509,12 +2306,12 @@ __global__ void textures_accumulate(
...
@@ -2509,12 +2306,12 @@ __global__ void textures_accumulate(
if (colors == 3){
if (colors == 3){
#pragma unroll
#pragma unroll
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
tile_c
ode][gi]; // reduce [tile_code] by LUT
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
alpha_m
ode][gi]; // reduce [tile_code] by LUT
}
}
} else { // assuming colors = 1
} else { // assuming colors = 1
#pragma unroll
#pragma unroll
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
tile_c
ode][gi]; // reduce [tile_code] by LUT
*(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21)) *= alphaFade[
alpha_m
ode][gi]; // reduce [tile_code] by LUT
}
}
}
}
}
}
...
@@ -2940,72 +2737,6 @@ __device__ void normalizeTileAmplitude(
...
@@ -2940,72 +2737,6 @@ __device__ void normalizeTileAmplitude(
clt_tile_j3 ++; // =DTT_SIZE1;
clt_tile_j3 ++; // =DTT_SIZE1;
}
}
}
}
/*
Converted from DttRad2.java:443
public double [] corr_unfold_tile(
double [][] qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
int transform_size
)
*/
/**
 * Unfold the four DTT quadrants of a correlation tile (pixel domain, after the
 * inverse transform) into a single centered square correlation result.
 * Converted from DttRad2.java:443 corr_unfold_tile().
 *
 * Thread mapping: thread i (threadIdx.x) fills output rows (center + i) and
 * (center - i); threads with threadIdx.x > corr_radius return immediately.
 * Only quadrants 0/1 contribute in the first loop (complete for i == 0);
 * quadrant 2/3 terms for i > 0 are added after the early return below.
 * NOTE(review): assumes this runs with at least (corr_radius + 1) threads in x
 * and that no cross-thread synchronization is needed (each thread writes
 * disjoint rows) - verify against callers.
 *
 * @param corr_radius radius of the output square: output is
 *                    (2*corr_radius+1) x (2*corr_radius+1), 15x15 max
 * @param qdata0      [4][DTT_SIZE][DTT_SIZE1] four quadrants of the clt data,
 *                    rows extended to DTT_SIZE1 to optimize shared memory ports
 * @param rslt        output array [2*corr_radius+1][2*corr_radius+1]
 */
__device__ void corrUnfoldTile(
		int corr_radius,
		float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
		float* rslt)   // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
	int size2r1 = 2 * corr_radius + 1; // output side length (15 for radius 7)
	int crp1 = corr_radius + 1;        // 8 for radius 7
///	const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
	int rslt_base_index = size2r1 * crp1 - crp1; // linear offset of the center pixel
	// the four quadrants are stored consecutively, each DTT_SIZE rows of DTT_SIZE1
	float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
	float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
	float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
	int i = threadIdx.x;
	if (i > corr_radius) {
		return; // not needed, only use inner threads
	}
//	printf("\ncorrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
	float corr_pixscale = 0.25f; // each output pixel combines up to 4 quadrant terms
	int i_transform_size = i * DTT_SIZE1;                   // used to address source rows which are 9 long
	int im1_transform_size = i_transform_size - DTT_SIZE1;  // negative for i = 0, use only after divergence
///	int rslt_row_offs = i * DTT_SIZE2M1;
	int rslt_row_offs = i * size2r1;
	int rslt_base_index_p = rslt_base_index + rslt_row_offs; // start of row center + i
	int rslt_base_index_m = rslt_base_index - rslt_row_offs; // start of row center - i
	rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
	rslt[rslt_base_index_m] = rslt[rslt_base_index_p];                  // nop for i=0; incomplete, will only be used for thread i=0
///	for (int j = 1; j < DTT_SIZE; j++) {
	for (int j = 1; j <= corr_radius; j++) {
		int rslt_base_index_pp = rslt_base_index_p + j;
		int rslt_base_index_pm = rslt_base_index_p - j;
		// quadrant 0/1 (sum / difference) terms for columns center +/- j
		rslt[rslt_base_index_pp] = corr_pixscale * (
				qdata0[i_transform_size + j] +
				qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
		rslt[rslt_base_index_pm] = corr_pixscale * (
				qdata0[i_transform_size + j] +
				-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
	}
	if (i == 0) {
		return; // center row complete: quadrants 2/3 contribute nothing for i == 0
	}
///	im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
	// add quadrant 2 (and below quadrant 3) terms, using source row i-1
	float d = corr_pixscale * qdata2[im1_transform_size];
	rslt[rslt_base_index_p] += d;
	rslt[rslt_base_index_m] -= d;
	for (int j = 1; j <= corr_radius; j++) {
		int rslt_base_index_pp = rslt_base_index_p + j;
		int rslt_base_index_pm = rslt_base_index_p - j;
		int rslt_base_index_mp = rslt_base_index_m + j;
		int rslt_base_index_mm = rslt_base_index_m - j;
		float d2 = corr_pixscale * qdata2[im1_transform_size + j];
		float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
		// rslt[rslt_base_index_pp], rslt[rslt_base_index_pm] were partially calculated in the loop common with i=0;
		// the mirrored row (center - i) is derived from them before they are completed in place
		rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
		rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
		rslt[rslt_base_index_pp] += d2 + d3;
		rslt[rslt_base_index_pm] += d2 - d3;
	}
}
__device__ void debug_print_lpf(
__device__ void debug_print_lpf(
float * lpf_tile)
float * lpf_tile)
...
@@ -3362,28 +3093,6 @@ __device__ void convertCorrectTile(
...
@@ -3362,28 +3093,6 @@ __device__ void convertCorrectTile(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
/*
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
// float *dtt_buf = ((float *) clt_tile[0]) + threadIdx.x;
// float *dtt_buf1 = ((float *) clt_tile[2]) + threadIdx.x;
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
dtt_buf = clt_tile + (DTT_SIZE1 * DTT_SIZE) + threadIdx.x; // ((float *) clt_tile[1]) + threadIdx.x;
dtt_buf1 = dtt_buf + (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[3]) + threadIdx.x;
(*dtt_buf) += (*dtt_buf1);
dtt_buf += (4 * DTT_SIZE1);
dtt_buf1 += (4 * DTT_SIZE1);
(*dtt_buf) += (*dtt_buf1);
__syncthreads();// __syncwarp();
}
*/
if (color == BAYER_GREEN) {
if (color == BAYER_GREEN) {
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
float *dtt_buf = clt_tile + threadIdx.x;
float *dtt_buf = clt_tile + threadIdx.x;
...
@@ -3404,21 +3113,16 @@ __device__ void convertCorrectTile(
...
@@ -3404,21 +3113,16 @@ __device__ void convertCorrectTile(
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
dttiv_color_2d(
clt_tile,
color);
/*
dctiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile + __umul24(threadIdx.x,DTT_SIZE1), // [0][threadIdx.x], // pointer to start of row
#else
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
#endif
1); //int inc);
1); //int inc);
if (color == BAYER_GREEN){
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
dstiv_nodiverg( // all colors
#ifdef USE_UMUL24
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
clt_tile + __umul24(threadIdx.x + DTT_SIZE, DTT_SIZE1), // clt_tile[1][threadIdx.x], // pointer to start of row
#else
clt_tile + DTT_SIZE1 * (threadIdx.x + DTT_SIZE), // clt_tile[1][threadIdx.x], // pointer to start of row
#endif
1); //int inc);
1); //int inc);
}
}
...
@@ -3435,12 +3139,12 @@ __device__ void convertCorrectTile(
...
@@ -3435,12 +3139,12 @@ __device__ void convertCorrectTile(
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
DTT_SIZE1); // int inc,
if (color == BAYER_GREEN){
if (color == BAYER_GREEN){
// dstiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
DTT_SIZE1); // int inc,
DTT_SIZE1); // int inc,
}
}
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
*/
#ifdef DEBUG2
#ifdef DEBUG2
if ((threadIdx.x) == 0){
if ((threadIdx.x) == 0){
...
@@ -3692,6 +3396,7 @@ __global__ void test_imclt(
...
@@ -3692,6 +3396,7 @@ __global__ void test_imclt(
}
}
#endif // NOICLT1
#endif // NOICLT1
#ifdef BBBB
//
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
...
@@ -3721,23 +3426,19 @@ __device__ void imclt(
...
@@ -3721,23 +3426,19 @@ __device__ void imclt(
// perform horizontal dct-iv on quadrants 0 and 1
// perform horizontal dct-iv on quadrants 0 and 1
dctiv_nodiverg(
dctiv_nodiverg(
// clt_tile + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 1
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
1);
1);
// perform horizontal dst-iv on quadrants 2 and 3
// perform horizontal dst-iv on quadrants 2 and 3
dstiv_nodiverg( // all colors
dstiv_nodiverg( // all colors
// clt_tile2 + DTT_SIZE1 * (thr012 + DTT_SIZE * thr3), // pointer to start of row for quadrants 2 and 3
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
1);
1);
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
// perform vertical dct-iv on quadrants 0 and 2
// perform vertical dct-iv on quadrants 0 and 2
dctiv_nodiverg(
dctiv_nodiverg(
// clt_tile + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 2
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
clt_tile + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
DTT_SIZE1);
DTT_SIZE1);
// perform vertical dst-iv on quadrants 1 and 3
// perform vertical dst-iv on quadrants 1 and 3
dstiv_nodiverg(
dstiv_nodiverg(
// clt_tile1 + thr012 + (DTT_SIZE1 * 2*DTT_SIZE) * thr3, // pointer to start of row for quadrants 1 and 3
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
DTT_SIZE1);
DTT_SIZE1);
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
...
@@ -3833,188 +3534,9 @@ __device__ void imclt(
...
@@ -3833,188 +3534,9 @@ __device__ void imclt(
__syncthreads();// __syncwarp();
__syncthreads();// __syncwarp();
#endif
#endif
}
}
//#endif
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
/**
 * Inverse MCLT of one tile using 8 threads (threadIdx.x = 0..7), each thread
 * processing two of the 16 output columns. Performs the 2D inverse DTT-IV on
 * the four 8x8 quadrants (horizontal DCT-IV on quadrants 0/1, DST-IV on 2/3;
 * vertical DCT-IV on 0/2, DST-IV on 1/3), then unfolds them through the
 * half window (HWINDOW2) and the per-quadrant sign LUT (idct_signs) into a
 * 16x16 pixel tile, either accumulating into or overwriting mclt_tile.
 *
 * @param do_acc    1 - add to previous value in mclt_tile, 0 - overwrite
 * @param clt_tile  [4][DTT_SIZE][DTT_SIZE1] transform-domain tile; rows
 *                  padded to DTT_SIZE1 (+1) to alternate shared column ports;
 *                  transformed in place
 * @param mclt_tile [2*DTT_SIZE][DTT_SIZE21] output 16x16 pixel tile,
 *                  rows padded (+1) to alternate column ports [16][17]
 * @param debug     non-zero enables DEBUG7 printouts from thread (0,0)
 */
__device__ void imclt8threads(
		int do_acc,     // 1 - add to previous value, 0 - overwrite
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile,  //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
		int debug)
{
	// leftovers from the 16-thread variant, kept for reference:
//	int thr3 =    threadIdx.x >> 3;
//	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
//	int thr012 =  threadIdx.x & 7;
//	int column4 = threadIdx.x >> 2;
//	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
	// quadrants are stored consecutively, each DTT_SIZE rows of DTT_SIZE1 floats
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE);
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE);
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE);
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// perform horizontal dct-iv on quadrants 0 and 1 (each thread transforms one column, stride DTT_SIZE1)
	dctiv_nodiverg( // quadrant 0
			clt_tile + threadIdx.x,                              // pointer to start of row for quadrant 0
			DTT_SIZE1);
	dctiv_nodiverg( // quadrant 1
			clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			DTT_SIZE1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // quadrant 2
			clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			DTT_SIZE1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
	// perform vertical dct-iv on quadrants 0 and 2 (each thread transforms one row, stride 1)
	dctiv_nodiverg( // quadrant 0
			clt_tile + DTT_SIZE1 * threadIdx.x,                              // pointer to start of row for quadrant 0
			1);
	dctiv_nodiverg( // quadrant 2
			clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg( // quadrant 1
			clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// re-using 16-thread code (thr3 was bit 3 of threadIdx.x): loop makes each
	// of the 8 threads handle output columns (threadIdx.x) and (threadIdx.x + 8)
	for (int thr3 = 0; thr3 < 2; thr3++){
		int thr3m =   (thr3 << 3);
		int column =  threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
		int thr012 =  threadIdx.x & 7;     // == threadIdx.x
		int column4 = column >> 2;         // (threadIdx.x >> 2) | (thr3 << 1) ; // different !
		int wcolumn = (thr3m - thr3) ^ thr012; // 0..7,7..0 - mirrored window index
		float hw = HWINDOW2[wcolumn];          // horizontal half-window weight for this column
		int clt_offset = imclt_indx9[column];  // index in each of the 4 iclt quadrants, accounting for stride=9
		float * rslt = mclt_tile + column;
#ifdef DEBUG7
		if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
			printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
					thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
			debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
		}
		__syncthreads();// __syncwarp();
#endif
#endif // NOTE(review): this second #endif appears unbalanced (only one #ifdef DEBUG7 opened above) - possibly a diff/scrape artifact; verify against the repository source
		// output rows 0..3: combine the 4 quadrants with quadrant-0-row signs, advancing the source row
#pragma unroll
		for (int i = 0; i < 4; i++){
			float val = *rslt;
			float w = HWINDOW2[i] * hw; // combined vertical * horizontal window weight
			float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i < 3){
				clt_offset +=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
//			val =__fmaf_rd(w,d0,val); // w*d0 + val
//			*rslt = val;
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 4..7: sign row 1, source row retreats each iteration
#pragma unroll
		for (int i = 4; i < 8; i++){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
//			if (i < 7){
			clt_offset -=  DTT_SIZE1;
//			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 8..11: sign row 2, mirrored window index (i descends 7..4)
#pragma unroll
		for (int i = 7; i >= 4; i--){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 4){
				clt_offset -=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
		// output rows 12..15: sign row 3, window index descends 3..0, source row advances again
#pragma unroll
		for (int i = 3; i >= 0; i--){
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 0){
				clt_offset +=  DTT_SIZE1;
			}
//			*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
	}
#ifdef DEBUG7
	__syncthreads();// __syncwarp();
	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
		if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)){
			printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
			debug_print_mclt(
					mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
					-1);
		}
		__syncthreads();// __syncwarp();
	}
	__syncthreads();// __syncwarp();
#endif
}
__device__ void debayer_shot(
__device__ void debayer_shot(
const int rb_mode, // 0 - green, 1 - r/b
const int rb_mode, // 0 - green, 1 - r/b
...
...
src/dtt8x8.cu
View file @
3c3ebfa9
/**
/**
**
**
** dtt8x8.cu
- CPU test code to run GPU tile processor
** dtt8x8.cu
h
**
**
** Copyright (C) 2018 Elphel, Inc.
** Copyright (C) 2018 Elphel, Inc.
**
**
** -----------------------------------------------------------------------------**
** -----------------------------------------------------------------------------**
**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** dtt8x8.cu
h
is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
** (at your option) any later version.
...
@@ -30,1069 +30,1255 @@
...
@@ -30,1069 +30,1255 @@
** -----------------------------------------------------------------------------**
** -----------------------------------------------------------------------------**
*/
*/
#include <stdio.h>
/**
#include <stdlib.h>
**************************************************************************
#include <math.h>
* \file dtt8x8.cuh
#include <cuda_runtime.h>
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
#include <helper_cuda.h>
* in GPU
#include <helper_functions.h>
* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* software.
*
* This file is compatible with both runtime and driver API, runtime is used for development
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#include "dtt8x8.h"
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
Python code to generate constant coefficients:
def dct_constants():
COSPI_1_8_SQRT2 = math.cos(math.pi/8)*math.sqrt(2.0)
COSPI_3_8_SQRT2 = math.cos(3*math.pi/8)*math.sqrt(2.0)
SQRT_2 = math.sqrt(2.0)
SQRT1_2 = 1/math.sqrt(2.0)
SQRT1_8 = 1/math.sqrt(8.0)
CN = [[math.cos((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
SN = [[math.sin((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
print("__constant__ float COSPI_1_8_SQRT2 = %ff;"%(COSPI_1_8_SQRT2))
print("__constant__ float COSPI_3_8_SQRT2 = %ff;"%(COSPI_3_8_SQRT2))
print("__constant__ float SQRT_2 = %ff;"% (SQRT_2))
print("__constant__ float SQRT1_2 = %ff;"% (SQRT1_2))
print("__constant__ float SQRT1_8 = %ff;"% (SQRT1_8))
print("__constant__ float COSN1[] = {%ff,%ff};"% (CN[0][0],CN[0][1]))
print("__constant__ float COSN2[] = {%ff,%ff,%ff,%ff};"% (CN[1][0],CN[1][1],CN[1][2],CN[1][3]))
print("__constant__ float SINN1[] = {%ff,%ff};"% (SN[0][0],SN[0][1]))
print("__constant__ float SINN2[] = {%ff,%ff,%ff,%ff};"% (SN[1][0],SN[1][1],SN[1][2],SN[1][3]))
*/
// Values produced by the Python snippet above (dct_constants())
__constant__ float COSPI_1_8_SQRT2 = 1.306563f; // cos(pi/8)*sqrt(2)
__constant__ float COSPI_3_8_SQRT2 = 0.541196f; // cos(3*pi/8)*sqrt(2)
__constant__ float SQRT_2 = 1.414214f;
__constant__ float SQRT1_2 = 0.707107f; // 1/sqrt(2)
__constant__ float SQRT1_8 = 0.353553f; // 1/sqrt(8)
__constant__ float COSN1[] = {0.980785f,0.831470f};                     // cos((2k+1)*pi/16), k = 0..1
__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f}; // cos((2k+1)*pi/32), k = 0..3
__constant__ float SINN1[] = {0.195090f,0.555570f};                     // sin((2k+1)*pi/16), k = 0..1
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f}; // sin((2k+1)*pi/32), k = 0..3
// Source offset inside each 8x9 (DTT_SIZE x DTT_SIZE1, stride 9) quadrant for the
// 16 output columns of the inverse CLT unfold: all in row 4 (0x24 = 4*9), columns
// mirrored 4,5,6,7,7,6,5,4 then 3,2,1,0,0,1,2,3.
__constant__ int imclt_indx9[16] = {0x28,0x29,0x2a,0x2b,0x2b,0x2a,0x29,0x28,0x1f,0x26,0x25,0x24,0x24,0x25,0x26,0x27};
// Per-quadrant sign LUT for the inverse CLT unfold; each element applies to a
// 4x4 group of the 16x16 output (indexed by row group [0..3] and column4).
__constant__ float idct_signs[4][4][4] ={
		{ // quadrant 0, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1,-1,-1,-1},
				{-1, 1, 1, 1},
				{-1, 1, 1, 1},
				{-1, 1, 1, 1}
		},{ // quadrant 1, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1, 1, 1,-1},
				{-1,-1,-1, 1},
				{-1,-1,-1, 1},
				{-1,-1,-1, 1}
		},{ // quadrant 2, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1,-1,-1,-1},
				{ 1,-1,-1,-1},
				{ 1,-1,-1,-1},
				{-1, 1, 1, 1}
		},{ // quadrant 3, each elements corresponds to 4x4 pixel output, covering altogether 16x16
				{ 1, 1, 1,-1},
				{ 1, 1, 1,-1},
				{ 1, 1, 1,-1},
				{-1,-1,-1, 1}
		}};
// Half-amplitude analysis window: HWINDOW2[i] = 0.5*sin((2*i+1)*pi/32) (== SINN2 extended, scaled by 0.5)
__constant__ float HWINDOW2[] =   {0.049009f, 0.145142f, 0.235698f, 0.317197f,
		0.386505f, 0.440961f, 0.478470f, 0.497592f};
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
#include "dtt8x8.cuh"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual conversions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
* \param src_stride [IN] - Source image stride
* \param mode [IN] - DTT mode:
* 0 - horizontal DCT-IV followed by vertical DCT-IV
* 1 - horizontal DST-IV followed by vertical DCT-IV
* 2 - horizontal DCT-IV followed by vertical DST-IV
* 3 - horizontal DST-IV followed by vertical DST-IV
* 4 - horizontal DCT-II followed by vertical DCT-II
* 5 - horizontal DST-II followed by vertical DCT-II
* 6 - horizontal DCT-II followed by vertical DST-II
* 7 - horizontal DST-II followed by vertical DST-II
*
* \return None
*/
float * copyalloc_kernel_gpu(float * kernel_host,
extern "C"
int size, // size in floats
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
int full_size)
{
{
float *kernel_gpu;
int dtt_mode0 = dtt_mode & 1;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
int dtt_mode1 = (dtt_mode >>1) & 1;
checkCudaErrors(cudaMemcpy( // segfault
kernel_gpu,
kernel_host,
size * sizeof(float),
cudaMemcpyHostToDevice));
return kernel_gpu;
}
float * copyalloc_kernel_gpu(float * kernel_host,
__shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
int size)
{
return copyalloc_kernel_gpu(kernel_host,
size, // size in floats
size);
}
int OffsThreadInRow = threadIdx.y * DTT_SIZE + threadIdx.x;
int OffsThreadInCol = threadIdx.z * DTT_SIZE;
src += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
dst += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
float *bl_ptr = block + OffsThreadInCol * DTTTEST_BLK_STRIDE + OffsThreadInRow;
#pragma unroll
float * alloccopy_from_gpu(
for (unsigned int i = 0; i < DTT_SIZE; i++)
float * gpu_data,
bl_ptr[i * DTTTEST_BLK_STRIDE] = src[i * src_stride];
float * cpu_data, // if null, will allocate
int size)
__syncthreads();
{
// horizontal pass
if (!cpu_data) {
if (dtt_mode > 3) {
cpu_data = (float *)malloc(size*sizeof(float));
dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
} else {
dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
}
}
checkCudaErrors(cudaMemcpy( // segfault
cpu_data,
gpu_data,
size * sizeof(float),
cudaMemcpyDeviceToHost));
return cpu_data;
__syncthreads();
// vertical pass
if (dtt_mode > 3) {
dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
} else {
dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
}
__syncthreads();
for (unsigned int i = 0; i < DTT_SIZE; i++)
dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
}
float * alloc_kernel_gpu(int size) // size in floats
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
{
float *kernel_gpu;
float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
return kernel_gpu;
}
float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]);
float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]);
int size) // number of entries (cameras)
float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
{
float ** gpu_pointer_to_gpu_pointers;
checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
checkCudaErrors(cudaMemcpy(
gpu_pointer_to_gpu_pointers,
gpu_pointer,
size * sizeof(float*),
cudaMemcpyHostToDevice));
return gpu_pointer_to_gpu_pointers;
}
float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]);
float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
float * copyalloc_image_gpu(float * image_host,
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
size_t* dstride, // in floats !
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
checkCudaErrors(cudaMemcpy2D(
image_gpu,
*dstride, // * sizeof(float),
image_host,
width * sizeof(float), // make in 16*n?
width * sizeof(float),
height,
cudaMemcpyHostToDevice));
return image_gpu;
}
float * alloc_image_gpu(size_t* dstride, // in bytes!!
float ua00= u00 + u03;
int width,
float ua10= u00 - u03;
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
return image_gpu;
}
int readFloatsFromFile(float * data, // allocated array
float ua01= u01 + u02;
const char * path) // file path
float ua11= u01 - u02;
{
std::ifstream input(path, std::ios::binary );
float v00= ua00 + ua01;
// copies all data into buffer
float v02= ua00 - ua01;
std::vector<char> buffer((
std::istreambuf_iterator<char>(input)),
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
(std::istreambuf_iterator<char>()));
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
std::copy( buffer.begin(), buffer.end(), (char *) data);
return 0;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
}
int writeFloatsToFile(float * data, // allocated array
float ub00= u10 + u13;
int size, // length in elements
float ub10= u10 - u13;
const char * path) // file path
{
float ub01= u11 + u12;
float ub11= u11 - u12;
// std::ifstream input(path, std::ios::binary );
float vb00= ub00 + ub01;
std::ofstream ofile(path, std::ios::binary);
float vb01= ub00 - ub01;
ofile.write((char *) data, size * sizeof(float));
return 0;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
y[0] = SQRT_2 * v00; // w0[0];
y[1] = v01 - vb11; // w1[0];
// j == 1
y[2] = v01 + vb11; // w0[1];
y[3] = v02 + vb01; // w1[1];
// j == 2
y[4] = v02 - vb01; // w0[2];
y[5] = v03 - vb10; // w1[2]; - same as y[3]
// j == 3
y[6] = v03 + vb10; // w0[3];
y[7] = SQRT_2 * vb00; // w1[3];
}
}
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
__device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
void set_clt_lpf(
float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
float * lpf, // size*size array to be filled out
float sigma,
const int dct_size)
{
{
int dct_len = dct_size * dct_size;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (sigma == 0.0f) {
if (dst_not_dct) { // DSTIV
lpf[0] = 1.0f;
u00= ( COSN2[0] * x7 + SINN2[0] * x0);
for (int i = 1; i < dct_len; i++){
u10= (-SINN2[3] * x4 + COSN2[3] * x3);
lpf[i] = 0.0;
u01= ( COSN2[1] * x6 + SINN2[1] * x1);
u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
u02= ( COSN2[2] * x5 + SINN2[2] * x2);
u12= (-SINN2[1] * x6 + COSN2[1] * x1);
u03= ( COSN2[3] * x4 + SINN2[3] * x3);
u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
} else { // DCTIV
u00= ( COSN2[0] * x0 + SINN2[0] * x7);
u10= (-SINN2[3] * x3 + COSN2[3] * x4);
u01= ( COSN2[1] * x1 + SINN2[1] * x6);
u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
u02= ( COSN2[2] * x2 + SINN2[2] * x5);
u12= (-SINN2[1] * x1 + COSN2[1] * x6);
u03= ( COSN2[3] * x3 + SINN2[3] * x4);
u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
}
}
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
float ua00= u00 + u03;
float ua10= u00 - u03;
float ua01= u01 + u02;
float ua11= u01 - u02;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
// j == 1
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
// j == 2
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
// j == 3
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
if (dst_not_dct) { // DSTIV
*y1 = (vb11 - v01) * SQRT1_8; // w1[0];
*y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
*y5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = -vb00 * 0.5f; // w1[3];
} else {
} else {
for (int i = 0; i < dct_size; i++){
*y1 = (v01 - vb11) * SQRT1_8; // w1[0];
for (int j = 0; j < dct_size; j++){
*y3 = (v02 + vb01) * SQRT1_8; // w1[1];
lpf[i*dct_size+j] = exp(-(i*i+j*j)/(2*sigma));
*y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = vb00 * 0.5f; // w1[3];
}
}
}
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
}
// normalize
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
double sum = 0;
for (int i = 0; i < dct_size; i++){
float w00= u00 + u03;
for (int j = 0; j < dct_size; j++){
float w10= u00 - u03;
double d = lpf[i*dct_size+j];
d*=cos(M_PI*i/(2*dct_size))*cos(M_PI*j/(2*dct_size));
float w01= (u01 + u02);
if (i > 0) d*= 2.0;
float w11= (u01 - u02);
if (j > 0) d*= 2.0;
sum +=d;
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
}
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
}
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
for (int i = 0; i< dct_len; i++){
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
lpf[i] /= sum;
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
}
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
} else {
*x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
/**
float w01= (u01 + u02);
**************************************************************************
float w11= (u01 - u02);
* Program entry point
*
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
* \param argc [IN] - Number of command-line arguments
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
* \param argv [IN] - Array of command-line arguments
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
*
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
* \return Status code
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
*/
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
int main(int argc, char **argv)
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
{
//
float *x1 = x0 + inc;
// Sample initialization
float *x2 = x1 + inc;
//
float *x3 = x2 + inc;
printf("%s Starting...\n\n", argv[0]);
float *x4 = x3 + inc;
printf("sizeof(float*)=%d\n",(int)sizeof(float*));
float *x5 = x4 + inc;
float *x6 = x5 + inc;
//initialize CUDA
float *x7 = x6 + inc;
findCudaDevice(argc, (const char **)argv);
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTIV
// CLT testing
u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
const char* kernel_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
const char* kernel_offs_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
} else { // DCTIV
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
const char* image_files[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
const char* ports_offs_xy_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
}
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
float port_offsets[NUM_CAMS][2] = {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
int keep_texture_weights = 1; // try with 0 also
int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
/*
float ua00= u00 + u03;
#define IMG_WIDTH 2592
float ua10= u00 - u03;
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_STEP 16
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNEL_OFFSETS 8
#define TILESX 324
#define TILESY 242
*/
/*
struct tp_task {
long task;
short ty;
short tx;
float xy[NUM_CAMS][2];
} ;
*/
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
// int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1)
;
float ua01= u01 + u02
;
int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1)
;
float ua11= u01 - u02
;
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float)
);
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13
);
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
float ub00= u10 + u13;
int corr_indices [NUM_PAIRS*TILESX*TILESY];
float ub10= u10 - u13;
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float ub01= u11 + u12;
float * gpu_kernels_h [NUM_CAMS];
float ub11= u11 - u12;
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS]; // never used
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
float vb00= ub00 + ub01;
int * gpu_corr_indices;
float vb01= ub00 - ub01;
float * gpu_textures;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float * gpu_textures_rbga;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_num_texture_tiles;
*x0 = v00 * 0.5f; // w0[0];
float * gpu_port_offsets;
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
int num_corrs;
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
int num_textures;
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
int num_ports = NUM_CAMS;
if (dst_not_dct) { // DSTIV
// GPU pointers to GPU pointers to memory
*x1 = (vb11 - v01) * SQRT1_8; // w1[0];
float ** gpu_kernels; // [NUM_CAMS];
*x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
*x5 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
float ** gpu_images; // [NUM_CAMS];
*x7 = -vb00 * 0.5f; // w1[3];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
set_clt_lpf (
lpf_rbg[ncol], // float * lpf, // size*size array to be filled out
lpf_sigmas[ncol], // float sigma,
8); // int dct_size)
gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
} else {
} else {
gpu_lpf_h[ncol] = NULL;
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
}
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
}
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
readFloatsFromFile(
{
host_kern_buf, // float * data, // allocated array
float sgn = 1 - 2* dst_not_dct;
kernel_file[ncam]); // char * path) // file path
float *y0 = x;
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);
float *y1 = y0 + inc;
float *y2 = y1 + inc;
readFloatsFromFile(
float *y3 = y2 + inc;
host_kern_buf, // float * data, // allocated array
float *y4 = y3 + inc;
kernel_offs_file[ncam]); // char * path) // file path
float *y5 = y4 + inc;
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
float *y6 = y5 + inc;
host_kern_buf,
float *y7 = y6 + inc;
KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
float *x0 = x + dst_not_dct * 7 * inc;
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
// negate inc, replace
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE)) ;
inc *= sgn;
// allocate result images (3x height to accommodate 3 colors
float *x1 = x0 + inc;
float *x2 = x1 + inc;
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
float *x3 = x2 + inc;
//host array of pointers to GPU arrays
float *x4 = x3 + inc;
#ifndef NOICLT
float *x5 = x4 + inc;
gpu_corr_images_h[ncam] = alloc_image_gpu(
float *x6 = x5 + inc;
&dstride_rslt, // size_t* dstride, // in bytes!!
float *x7 = x6 + inc;
IMG_WIDTH + DTT_SIZE, // int width,
float u00, u01, u02, u03, u10, u11, u12, u13;
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
#endif
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
gpu_corrs = alloc_image_gpu(
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
NUM_PAIRS * TILESX * TILESY); // int height);
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
readFloatsFromFile(
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
host_kern_buf, // float * data, // allocated array
image_files[ncam]); // char * path) // file path
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
float ua00= u00 + u03;
&dstride, // size_t* dstride,
float ua10= u00 - u03;
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
float ua01= u01 + u02;
}
float ua11= u01 - u02;
//#define DBG_TILE (174*324 +118)
float v00= ua00 + ua01;
float v02= ua00 - ua01;
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
float ub01= u11 + u12;
float ub11= u11 - u12;
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
*y0 = v00 * 0.5f; // w0[0];
*y2 = (v01 + vb11) * SQRT1_8; // w0[1];
*y4 = (v02 - vb01) * SQRT1_8; // w0[2];
*y6 = (v03 + vb10) * SQRT1_8; // w0[3];
*y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
*y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
*y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*y7 = sgn * vb00 * 0.5f; // w1[3];
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void dctiv_nodiverg(float * x0, int inc)
readFloatsFromFile(
{
(float *) &tile_coords_h[ncam],
float *x1 = x0 + inc;
ports_offs_xy_file[ncam]); // char * path) // file path
float *x2 = x1 + inc;
}
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
// build TP task that processes all tiles in linescan order
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
for (int ty = 0; ty < TILESY; ty++){
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
#ifdef DBG0
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
task_data[t].task = 1;
task_data[t].txy = ((DBG_TILE + t) - 324* ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324)) << 16;
int nt = task_data[t].ty * TILESX + task_data[t].tx;
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
float ua00= u00 + u03;
task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
float ua10= u00 - u03;
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
float ua01= u01 + u02;
float ua11= u01 - u02;
// segfault in the next
float v00= ua00 + ua01;
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
float v02= ua00 - ua01;
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
// build texture_indices
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
int cm = task_data[nt].task & TASK_TEXTURE_BITS;
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2)
;
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11
;
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float ub00= u10 + u13;
float ub10= u10 - u13;
// int keep_texture_weights = 1; // try with 0 also
float ub01= u11 + u12;
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
float ub11= u11 - u12;
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
*x0 = v00 * 0.5f; // w0[0];
int rgba_height = (TILESY+1) * DTT_SIZE;
*x2 = (v01 + vb11) * SQRT1_8; // w0[1];
int rbga_slices = texture_colors + 1; // 4/1
*x4 = (v02 - vb01) * SQRT1_8; // w0[2];
*x6 = (v03 + vb10) * SQRT1_8; // w0[3];
*x1 = (v01 - vb11) * SQRT1_8; // w1[0];
*x3 = (v02 + vb01) * SQRT1_8; // w1[1];
*x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
*x7 = vb00 * 0.5f; // w1[3];
}
gpu_textures_rbga = alloc_image_gpu(
inline __device__ void dstiv_nodiverg(float * x, int inc)
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
{
rgba_width, // int width (floats),
float *x0 = x + 7 * inc;
rgba_height * rbga_slices); // int height);
// negate inc, replace
inc = -inc;
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
gpu_images = copyalloc_pointers_gpu (gpu_images_h, NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
//create and start CUDA timer
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
float ua00= u00 + u03;
float ua10= u00 - u03;
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
float ua01= u01 + u02;
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
float ua11= u01 - u02;
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
float v00= ua00 + ua01;
const int numIterations = 1; //0;
float v02= ua00 - ua01;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
// _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
getLastCudaError("Kernel execution failed");
float ub00= u10 + u13;
checkCudaErrors(cudaDeviceSynchronize());
float ub10= u10 - u13;
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
printf("Run time =%f ms\n", avgTime);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path
#endif
}
#endif
#ifdef TEST_IMCLT
float ub01= u11 + u12;
{
float ub11= u11 - u12;
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
float vb00= ub00 + ub01;
float vb01= ub00 - ub01;
#ifndef NOICLT
float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
// testing imclt
float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
for (int v_offs = 0; v_offs < 1; v_offs++){ // temporarily for debugging
for (int h_offs = 0; h_offs < 1; h_offs++){ // temporarily for debugging
#else
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
#endif
int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
color, // int color,
v_offs, // int v_offset,
h_offs, // int h_offset,
dstride_rslt/sizeof(float)); //const size_t dstride); // in floats (pixels)
}
}
}
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerIMCLT)
;
*x7 = v00 * 0.5f; // w0[0]
;
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations
;
*x5 = (v01 + vb11) * SQRT1_8; // w0[1]
;
sdkDeleteTimer(&timerIMCLT)
;
*x3 = (v02 - vb01) * SQRT1_8; // w0[2]
;
printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT)
;
*x1 = (v03 + vb10) * SQRT1_8; // w0[3]
;
int rslt_img_size = NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
*x6 = (vb11 - v01) * SQRT1_8; // w1[0];
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
*x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
*x2 = (vb10 - v03) * SQRT1_8; // w1[2]; - same as y[3]
*x0 = -vb00 * 0.5f; // w1[3];
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
checkCudaErrors(cudaMemcpy2D( // segfault
{
cpu_corr_image,
float u00= (x[0] + x[7]);
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
float u10= (x[0] - x[7]);
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
float u01= (x[1] + x[6]);
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
float u11= (x[1] - x[6]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
#endif
}
free(cpu_corr_image
);
float u02= (x[2] + x[5]
);
#endif
float u12= (x[2] - x[5]);
float u03= (x[3] + x[4]);
float u13= (x[3] - x[4]);
#ifndef NOCORR
// _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
// cudaProfilerStart();
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
float w00= u00 + u03;
correlate2D<<<grid_corr,threads_corr>>>(
float w10= u00 - u03;
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float w01= (u01 + u02);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
float w11= (u01 - u02);
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
float v00= w00 + w01;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float v02= w00 - w01;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
checkCudaErrors(cudaMemcpy2D(
// _dctii_nrecurs2(u00, u01, &v00, &v01);
cpu_corr,
float z00= w20 + w21;
(corr_size * corr_size) * sizeof(float),
float z01= w20 - w21;
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
// _dctii_nrecurs2(u10, u11, &v10, &v11);
printf("Writing phase correlation data to %s\n", result_corr_file);
float z10= w30 + w31;
writeFloatsToFile(
float z11= w30 - w31;
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
float v10 = SQRT_2 * z00;
result_corr_file); // const char * path) // file path
float v11 = z01 - z11;
#endif
free(cpu_corr);
float v12 = z01 + z11;
#endif // ifndef NOCORR
float v13 = SQRT_2 * z10;
y[0] = v00;
// -----------------
y[1] = v10;
#ifndef NOTEXTURES
y[2] = v01;
// cudaProfilerStart();
y[3] = v11;
// testing textures
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
y[4] = v02;
dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
y[5] = v12;
printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
y[6] = v03;
StopWatchInterface *timerTEXTURE = 0;
y[7] = v13;
sdkCreateTimer(&timerTEXTURE);
}
for (int i = i0; i < numIterations; i++)
inline __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
{
if (i == 0)
_dctii_nrecurs8(x, y);
{
#pragma unroll
checkCudaErrors(cudaDeviceSynchronize());
for (int i = 0; i < 8 ; i++) {
sdkResetTimer(&timerTEXTURE);
y[i] *= SQRT1_8;
sdkStartTimer(&timerTEXTURE);
}
}
}
// Channel0 weight = 0.294118
__device__ void dct_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
// Channel1 weight = 0.117647
{
// Channel2 weight = 0.588235
_dctiv_nrecurs8(x, y);
textures_accumulate<<<grid_texture,threads_texture>>> (
#pragma unroll
// 0, // int border_tile, // if 1 - watch for border
for (int i = 0; i < 8 ; i++) {
(int *) 0, // int * woi, // x, y, width,height
y[i] *= SQRT1_8;
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // const size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
}
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
{
float xr[8];
int rslt_texture_size = num_textures * tile_texture_size;
#pragma unroll
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
for (int i=0; i < 8;i++){
xr[i] = x[7 - i];
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
}
printf("%10.4f", *(cpu_textures + texture_offset + i)
);
_dctiv_nrecurs8(xr, y
);
if (((i + 1) % 16) == 0){
#pragma unroll
printf("\n");
for (int i=0; i < 8;i+=2){
} else {
y[i] *= SQRT1_8;
printf(" ")
;
y[i+1] *= -SQRT1_8
;
}
}
}
//=========================== 2D functions ===============
__device__ void corrUnfoldTile(
int corr_radius,
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
int size2r1 = 2 * corr_radius + 1; // 15
int crp1 = corr_radius + 1; //8
/// const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE; // offset of the center
int rslt_base_index = size2r1 * crp1 - crp1; // offset of the center
float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
if (i > corr_radius) {
return; // not needed, only use inner
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
// printf("\corrUnfoldTile() corr_radius=%d, i=%d\n",corr_radius,i);
#endif // DEBUG9
float corr_pixscale = 0.25f;
#endif
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
free(cpu_textures);
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
#endif // ifndef NOTEXTURES
/// int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_row_offs = i * size2r1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
#define GEN_TEXTURE_LIST
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
#ifdef GEN_TEXTURE_LIST
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
dim3 grid_list (1,1,1);
/// for (int j = 1; j < DTT_SIZE; j++) {
printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
for (int j = 1; j <= corr_radius; j++) {
printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
int rslt_base_index_pp = rslt_base_index_p + j;
StopWatchInterface *timerTEXTURELIST = 0;
int rslt_base_index_pm = rslt_base_index_p - j;
sdkCreateTimer(&timerTEXTURELIST);
rslt[rslt_base_index_pp] = corr_pixscale * (
for (int i = i0; i < numIterations; i++)
qdata0[i_transform_size + j] +
{
qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
if (i == 0)
rslt[rslt_base_index_pm] = corr_pixscale * (
{
qdata0[i_transform_size + j] +
checkCudaErrors(cudaDeviceSynchronize());
-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
sdkResetTimer(&timerTEXTURELIST);
sdkStartTimer(&timerTEXTURELIST);
}
}
if (i == 0) {
prepare_texture_list<<<grid_list,threads_list>>> (
return;
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY); // int height); // <= TILESY, use for faster processing of LWIR images
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
/// cudaProfilerStop();
/// im1_transform_size = i_transform_size - DTT_SIZE1; // already is calculated
sdkStopTimer(&timerTEXTURELIST);
float d = corr_pixscale * qdata2[im1_transform_size];
float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
rslt[rslt_base_index_p] += d;
sdkDeleteTimer(&timerTEXTURELIST);
rslt[rslt_base_index_m] -= d;
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
for (int j = 1; j <= corr_radius; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int cpu_num_texture_tiles[8];
int rslt_base_index_pm = rslt_base_index_p - j;
checkCudaErrors(cudaMemcpy(
int rslt_base_index_mp = rslt_base_index_m + j;
cpu_woi,
int rslt_base_index_mm = rslt_base_index_m - j;
gpu_woi,
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
4 * sizeof(float),
float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
cudaMemcpyDeviceToHost));
//rslt[rslt_base_index_mp], rslt[rslt_base_index_mp] are partially calculated in the cycle common with i=0
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
checkCudaErrors(cudaMemcpy(
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
cpu_num_texture_tiles,
rslt[rslt_base_index_pp] += d2 + d3;
gpu_num_texture_tiles,
rslt[rslt_base_index_pm] += d2 - d3;
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
tp_task_size
);
for (int q = 0; q < 4; q++) {
checkCudaErrors(cudaMemcpy(
texture_indices + q * TILESX * (TILESYA >> 2),
gpu_texture_indices + q * TILESX * (TILESYA >> 2),
cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
cudaMemcpyDeviceToHost));
}
}
for (int q = 0; q < 4; q++) {
}
printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
__device__ void dttii_2d(
(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
float * clt_corr) // shared memory, [4][DTT_SIZE1][DTT_SIZE]
(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
{
(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
// change to 16-32 threads?? in next iteration
(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
// vert pass (hor pass in Java, before transpose. Here transposed, no transform needed)
(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
for (int q = 0; q < 4; q++){
(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
int is_sin = (q >> 1) & 1;
(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
}
#endif //GEN_TEXTURE_LIST
__syncthreads();
// hor pass, corresponding to vert pass in Java
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, tread is row
}
__syncthreads();
}
#ifndef NOTEXTURE_RGBA
__device__ void dttiv_color_2d(
dim3 threads_rgba(1, 1, 1);
float * clt_tile,
dim3 grid_rgba(1,1,1);
int color)
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
{
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
dctiv_nodiverg( // all colors
StopWatchInterface *timerRGBA = 0;
clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
sdkCreateTimer(&timerRGBA);
1); //int inc);
if (color == BAYER_GREEN){
dstiv_nodiverg( // all colors
clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
1); //int inc);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
}
__syncthreads();// __syncwarp();
generate_RBGA<<<grid_rgba,threads_rgba>>> (
#ifdef DEBUG222
// Parameters to generate texture tasks
if ((threadIdx.x) == 0){
gpu_tasks, // struct tp_task * gpu_tasks,
printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
tp_task_size, // int num_tiles, // number of tiles in task list
debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differes much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
}
sdkStopTimer(&timerRGBA);
__syncthreads();// __syncwarp();
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use larger array (4 pixels each size, switch to cudaMemcpy2DFromArray()
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#endif
#ifdef DEBUG11
dctiv_nodiverg( // all colors
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
for (int chn = 0; chn < rbga_slices; chn++){
DTT_SIZE1); // int inc,
printf("\nchn = %d\n", chn);
if (color == BAYER_GREEN){
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
dctiv_nodiverg( // all colors
clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
for (int i = 0; i < 8; i++){
DTT_SIZE1); // int inc,
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
}
#endif // DEBUG11
__syncthreads();// __syncwarp();
free(cpu_textures_rgba);
}
#endif // ifndef NOTEXTURES
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
// Inverse CLT (iMCLT) of one tile: 16-thread version.
// Takes the 4 8x8 DTT quadrants of one CLT tile, applies the inverse 2-D
// transform in place in clt_tile (DTT-IV row pass, then column pass), and
// unfolds the four 8x8 results into a 16x16 pixel tile through the separable
// half-window (HWINDOW2) and the idct_signs sign tables, accumulating into
// mclt_tile with read-modify-write. The destination must be zeroed before the
// first of the 4 overlapping passes over the frame (see comment above).
//
// Thread layout: 16 threads, threadIdx.x = 0..15, one output column each.
//
// clt_tile  - in/out: 4 quadrants, [4][DTT_SIZE][DTT_SIZE1] (row stride 9,
//             +1 padding to alternate shared-memory column ports); clobbered
//             by the in-place inverse transform.
// mclt_tile - accumulator tile, row stride DTT_SIZE21 (16x16 payload).
__device__ void imclt(
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile ) //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
{
	int thr3 =    threadIdx.x >> 3;  // 0 for output columns 0..7, 1 for columns 8..15
	int column =  threadIdx.x; // modify to use 2*8 threads, if needed.
	int thr012 =  threadIdx.x & 7;   // column index within the 8-wide half
	int column4 = threadIdx.x >> 2;  // 0..3 - index into the idct_signs tables
	// int wcolumn =column ^ (7 * thr3); //0..7,7,..0
	// int wcolumn = ((thr3 << 3) -1) ^ thr3; //0..7,7,..0
	int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0 - window index, mirrored for the right half
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE); // quadrant 1
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG3
	if ((threadIdx.x) == 0){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Row pass: each of the 16 threads transforms one 8-element row with
	// stride 1; thr3 selects which pair of quadrants this thread works on.
	// NOTE(review): the per-call comments below disagree with the headline
	// comments about which quadrants are touched (2*DTT_SIZE*thr3 selects row
	// blocks 0 and 2 of the stacked quadrants) - verify against the Java
	// reference implementation.
	// perform horizontal dct-iv on quadrants 0 and 1
	dctiv_nodiverg(
			clt_tile + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 0 and 2
			1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // all colors
			clt_tile1 + DTT_SIZE1 * (thr012 + 2*DTT_SIZE * thr3), // pointer to start of row for quadrants 1 and 3
			1);
	__syncthreads();// __syncwarp();
	// Column pass over the intermediate results (stride DTT_SIZE1 = one row).
	// perform vertical dct-iv on quadrants 0 and 2
	dctiv_nodiverg(
			clt_tile +  thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 0 and 1
			DTT_SIZE1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg(
			clt_tile2 + thr012 + (DTT_SIZE1 * DTT_SIZE) * thr3, // pointer to start of row for quadrants 2 and 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG3
	if ((threadIdx.x) == 0){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Unfold: combine the 4 transformed quadrants with per-quadrant signs
	// (idct_signs[quadrant][row-group][column4]) weighted by the separable
	// window w = HWINDOW2[i] * HWINDOW2[wcolumn], walking down this thread's
	// output column in 4 groups of 4 rows. clt_offset zig-zags through the
	// mirrored source rows (+= / -= DTT_SIZE1 per group).
	float hw = HWINDOW2[wcolumn];
	int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
	float * rslt = mclt_tile + column;
#pragma unroll
	for (int i = 0; i < 4; i++){          // output rows 0..3
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i < 3){
			clt_offset +=  DTT_SIZE1;
		}
		// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		// NOTE(review): __fmaf_rd rounds toward -inf; __fmaf_rn (or fmaf)
		// would be the round-to-nearest choice - confirm this is intentional.
		val = __fmaf_rd(w,d0,val); // w*d0 + val
		*rslt = val;
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 4; i < 8; i++){          // output rows 4..7
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		// if (i < 7){  // decrement unconditionally - the next group starts one row back anyway
		clt_offset -=  DTT_SIZE1;
		// }
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 7; i >= 4; i--){         // output rows 8..11 (mirrored window index)
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i > 4){
			clt_offset -=  DTT_SIZE1;
		}
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#pragma unroll
	for (int i = 3; i >= 0; i--){         // output rows 12..15 (mirrored window index)
		float val = *rslt;
		float w = HWINDOW2[i] * hw;
		float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
		float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
		float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
		float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
		d0+=d1;
		d2+=d3;
		d0+= d2;
		if (i > 0){
			clt_offset +=  DTT_SIZE1;
		}
		*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
		rslt += DTT_SIZE21;
	}
#ifdef DEBUG3
	__syncthreads();// __syncwarp();
	if ((threadIdx.x) == 0){
		printf("\nMCLT Tiles after IMCLT\n");
		debug_print_mclt(mclt_tile, -1); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
}
// Uses 8 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds to the 16x16
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
// Inverse CLT (iMCLT) of one tile: 8-thread version.
// Same math as imclt() above, but each of the 8 threads produces two output
// columns: the left/right halves (thr3 = 0 and 1) are handled sequentially
// in a loop instead of by 16 threads, and the result can either overwrite
// or accumulate into mclt_tile.
//
// do_acc    - 1: add the unfolded result to the existing mclt_tile contents
//             (read-modify-write), 0: overwrite. Must be uniform across the
//             block (all threads take the same branch of the ternary).
// clt_tile  - in/out: 4 quadrants, [4][DTT_SIZE][DTT_SIZE1] (row stride 9);
//             clobbered by the in-place inverse transform.
// mclt_tile - output tile, row stride DTT_SIZE21.
// debug     - nonzero enables the DEBUG7 printouts (thread (0,0) only).
__device__ void imclt8threads(
		int     do_acc,     // 1 - add to previous value, 0 - overwrite
		float * clt_tile,   //  [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
		float * mclt_tile,  //  [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
		int debug)
{
	// Leftovers from the 16-thread version (thr3 etc. are now computed per
	// half inside the unfold loop below):
	// int thr3 = threadIdx.x >> 3;
	// int column = threadIdx.x; // modify to use 2*8 threads, if needed.
	// int thr012 = threadIdx.x & 7;
	// int column4 = threadIdx.x >> 2;
	// int wcolumn = ((thr3 << 3) - thr3) ^ thr012; //0..7,7,..0
	float * clt_tile1 = clt_tile +  (DTT_SIZE1 * DTT_SIZE); // quadrant 1
	float * clt_tile2 = clt_tile1 + (DTT_SIZE1 * DTT_SIZE); // quadrant 2
	float * clt_tile3 = clt_tile2 + (DTT_SIZE1 * DTT_SIZE); // quadrant 3
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles before IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// First inverse-DTT pass: each of the 8 threads handles one line of every
	// quadrant (one dctiv/dstiv call per quadrant).
	// NOTE(review): the horizontal/vertical labels here are transposed
	// relative to imclt() above (the "horizontal" calls use stride DTT_SIZE1,
	// i.e. step across rows) - verify the intended orientation.
	// perform horizontal dct-iv on quadrants 0 and 1
	dctiv_nodiverg( // quadrant 0
			clt_tile + threadIdx.x,                              // pointer to start of row for quadrant 0
			DTT_SIZE1);
	dctiv_nodiverg( // quadrant 1
			clt_tile + threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			DTT_SIZE1);
	// perform horizontal dst-iv on quadrants 2 and 3
	dstiv_nodiverg( // quadrant 2
			clt_tile + threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			DTT_SIZE1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			DTT_SIZE1);
	__syncthreads();// __syncwarp();
	// Second inverse-DTT pass, along the other axis (stride 1 within a row).
	// perform vertical dct-iv on quadrants 0 and 2
	dctiv_nodiverg( // quadrant 0
			clt_tile + DTT_SIZE1 * threadIdx.x,                              // pointer to start of row for quadrant 0
			1);
	dctiv_nodiverg( // quadrant 2
			clt_tile + DTT_SIZE1 * threadIdx.x + (2 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 2
			1);
	// perform vertical dst-iv on quadrants 1 and 3
	dstiv_nodiverg( // quadrant 1
			clt_tile + DTT_SIZE1 * threadIdx.x + (1 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 1
			1);
	dstiv_nodiverg( // quadrant 3
			clt_tile + DTT_SIZE1 * threadIdx.x + (3 * DTT_SIZE * DTT_SIZE1), // pointer to start of row for quadrant 3
			1);
	__syncthreads();// __syncwarp();
#ifdef DEBUG7
	if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
		printf("\nDTT Tiles after IDTT\n");
		debug_print_clt_scaled(clt_tile, -1, 0xf, 0.25); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	// Unfold the 4 quadrants into the 16x16 tile, re-using the 16-thread code
	// (thr3 was bit 3 of threadIdx.x there); each thread now does both output
	// columns column = threadIdx.x and threadIdx.x + 8 sequentially.
	for (int thr3 = 0; thr3 < 2; thr3++){
		int thr3m = (thr3 << 3);
		int column =  threadIdx.x + thr3m; // modify to use 2*8 threads, if needed.
		int thr012 =  threadIdx.x & 7;     // == threadIdx.x
		int column4 = column >> 2;         //  (threadIdx.x >> 2) | (thr3 << 1) ; // different !
		int wcolumn = (thr3m - thr3) ^ thr012; //0..7,7,..0 - window index, mirrored for the right half
		float hw = HWINDOW2[wcolumn];
		int clt_offset = imclt_indx9[column]; // index in each of the 4 iclt quadrants, accounting for stride=9
		float * rslt = mclt_tile + column;
#ifdef DEBUG7
		if (debug && (threadIdx.x == 0) && (threadIdx.y == 0)){
			printf("\nUnrolling: thr3=%d, thr3m=%d, column=%d, thr012=%d, column4=%d, wcolumn=%d, hw=%f, clt_offset=%d\n",
					thr3, thr3m, column, thr012, column4, wcolumn, hw, clt_offset);
			debug_print_clt1(clt_tile, -1, 0xf); // only 1 quadrant for R,B and 2 - for G
		}
		__syncthreads();// __syncwarp();
#endif
		// 4 groups of 4 output rows; idct_signs[quadrant][row-group][column4]
		// provides the per-quadrant signs, w = HWINDOW2[i] * HWINDOW2[wcolumn]
		// is the separable window; clt_offset zig-zags through mirrored rows.
#pragma unroll
		for (int i = 0; i < 4; i++){          // output rows 0..3
			float val = *rslt;
			// facc
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][0][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][0][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][0][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][0][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i < 3){
				clt_offset +=  DTT_SIZE1;
			}
			// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			// val =__fmaf_rd(w,d0,val); // w*d0 + val
			// *rslt = val;
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 4; i < 8; i++){          // output rows 4..7
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][1][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][1][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][1][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][1][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			// if (i < 7){  // decrement unconditionally - the next group starts one row back anyway
			clt_offset -=  DTT_SIZE1;
			// }
			// *rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 7; i >= 4; i--){         // output rows 8..11 (mirrored window index)
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][2][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][2][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][2][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][2][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 4){
				clt_offset -=  DTT_SIZE1;
			}
			//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
#pragma unroll
		for (int i = 3; i >= 0; i--){         // output rows 12..15 (mirrored window index)
			float val = *rslt;
			float w = HWINDOW2[i] * hw;
			float d0 = idct_signs[0][3][column4] * (*(clt_tile +  clt_offset));
			float d1 = idct_signs[1][3][column4] * (*(clt_tile1 + clt_offset));
			float d2 = idct_signs[2][3][column4] * (*(clt_tile2 + clt_offset));
			float d3 = idct_signs[3][3][column4] * (*(clt_tile3 + clt_offset));
			d0+=d1;
			d2+=d3;
			d0+= d2;
			if (i > 0){
				clt_offset +=  DTT_SIZE1;
			}
			//*rslt = __fmaf_rd(w,d0,val); // w*d0 + val
			*rslt = do_acc? __fmaf_rd(w,d0,val) : w * d0; // w*d0 + val; do_acc - common for all threads
			rslt += DTT_SIZE21;
		}
	}
#ifdef DEBUG7
	__syncthreads();// __syncwarp();
	for (int ccam = 0; ccam < NUM_CAMS; ccam++) {
		if (debug && (threadIdx.x == 0) && (threadIdx.y == ccam)){
			printf("\nMCLT Tiles after IMCLT, cam=%d\n", threadIdx.y);
			debug_print_mclt(
					mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
					-1);
		}
		__syncthreads();// __syncwarp();
	}
	__syncthreads();// __syncwarp();
#endif
}
#ifdef SAVE_CLT
//#endif
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
}
checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
exit(0);
}
src/dtt8x8.cuh
deleted
100644 → 0
View file @
0253bab4
/**
**
** dtt8x8.cuh
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cuh is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file dtt8x8.cuh
* \brief DCT-II, DST-II, DCT-IV and DST-IV for Complex Lapped Transform of 16x16 (stride 8)
* in GPU
* This file contains building blocks for the 16x16 stride 8 Complex Lapped Transform (CLT)
* implementation. DTT-IV are used for forward and inverse 2D CLT, DTT-II - to convert correlation
* results from the frequency to pixel domain. DTT-III (inverse of DTT-II) is not implemented
* here it is used to convert convolution kernels and LPF to the frequency domain - done in
* software.
*
* This file is compatible with both the runtime and driver APIs; the runtime API is used
* for development with Nvidia Nsight, the driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
//#define CUDART_INF_F __int_as_float(0x7f800000)
/*
Python code to generate constant coefficients:
def dct_constants():
COSPI_1_8_SQRT2 = math.cos(math.pi/8)*math.sqrt(2.0)
COSPI_3_8_SQRT2 = math.cos(3*math.pi/8)*math.sqrt(2.0)
SQRT_2 = math.sqrt(2.0)
SQRT1_2 = 1/math.sqrt(2.0)
SQRT1_8 = 1/math.sqrt(8.0)
CN = [[math.cos((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
SN = [[math.sin((2*k+1)*(math.pi/(8*(2 << t)))) for k in range (2 << t)] for t in range (2)]
print("__constant__ float COSPI_1_8_SQRT2 = %ff;"%(COSPI_1_8_SQRT2))
print("__constant__ float COSPI_3_8_SQRT2 = %ff;"%(COSPI_3_8_SQRT2))
print("__constant__ float SQRT_2 = %ff;"% (SQRT_2))
print("__constant__ float SQRT1_2 = %ff;"% (SQRT1_2))
print("__constant__ float SQRT1_8 = %ff;"% (SQRT1_8))
print("__constant__ float COSN1[] = {%ff,%ff};"% (CN[0][0],CN[0][1]))
print("__constant__ float COSN2[] = {%ff,%ff,%ff,%ff};"% (CN[1][0],CN[1][1],CN[1][2],CN[1][3]))
print("__constant__ float SINN1[] = {%ff,%ff};"% (SN[0][0],SN[0][1]))
print("__constant__ float SINN2[] = {%ff,%ff,%ff,%ff};"% (SN[1][0],SN[1][1],SN[1][2],SN[1][3]))
*/
// Precomputed transform constants (values generated by the Python snippet in the
// comment above). Stored in constant memory: all threads of a warp read the same
// element, so accesses are broadcast.
__constant__ float COSPI_1_8_SQRT2 = 1.306563f; // cos(pi/8)*sqrt(2)
__constant__ float COSPI_3_8_SQRT2 = 0.541196f; // cos(3*pi/8)*sqrt(2)
__constant__ float SQRT_2 = 1.414214f;          // sqrt(2)
__constant__ float SQRT1_2 = 0.707107f;         // 1/sqrt(2)
__constant__ float SQRT1_8 = 0.353553f;         // 1/sqrt(8) - orthonormal scale of the 8-point DTT
__constant__ float COSN1[] = {0.980785f,0.831470f};                     // cos((2k+1)*pi/16), k = 0..1
__constant__ float COSN2[] = {0.995185f,0.956940f,0.881921f,0.773010f}; // cos((2k+1)*pi/32), k = 0..3
__constant__ float SINN1[] = {0.195090f,0.555570f};                     // sin((2k+1)*pi/16), k = 0..1
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f}; // sin((2k+1)*pi/32), k = 0..3
// Forward declarations of the 8-point DTT building blocks defined below.
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale by y[0] (y[7]) by 1/sqrt[0]
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used
inline __device__ void dctiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dstiv_nodiverg (float * x0, int inc); // used in TP
inline __device__ void dct_ii8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dct_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void dst_iv8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctii_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
inline __device__ void _dctiv_nrecurs8 ( float x[8], float y[8]); // x,y point to 8-element arrays each // not used
/**
**************************************************************************
* Converts 2D image (in the GPU memory) using 8x8 DTT 8x8 tiles.
* Mostly for testing and profiling individual conversions
*
* \param dst [OUT] - Coefficients as 8x8 tiles
* \param src [IN] - Source image of floats
* \param src_stride [IN] - Source image stride
* \param mode [IN] - DTT mode:
* 0 - horizontal DCT-IV followed by vertical DCT-IV
* 1 - horizontal DST-IV followed by vertical DCT-IV
* 2 - horizontal DCT-IV followed by vertical DST-IV
* 3 - horizontal DST-IV followed by vertical DST-IV
* 4 - horizontal DCT-II followed by vertical DCT-II
* 5 - horizontal DST-II followed by vertical DCT-II
* 6 - horizontal DCT-II followed by vertical DST-II
* 7 - horizontal DST-II followed by vertical DST-II
*
* \return None
*/
// Applies a separable 8x8 DTT (mode-selected DCT/DST, types II or IV) to a
// DTTTEST_BLOCK_WIDTH x DTTTEST_BLOCK_HEIGHT tile of the source image.
//
// dst        - output image (coefficients), assumed to use the SAME stride as src
//              (only one stride parameter exists)
// src        - input image of floats
// src_stride - row stride in floats for both src and dst
// dtt_mode   - bit 0 selects DST (1) vs DCT (0) for the horizontal pass,
//              bit 1 - for the vertical pass; modes 0..3 use DTT-IV, 4..7 use DTT-II
//
// NOTE(review): indexing assumes blockDim == (DTT_SIZE,
// DTTTEST_BLOCK_WIDTH/DTT_SIZE, DTTTEST_BLOCK_HEIGHT/DTT_SIZE) - confirm
// against the host launcher.
extern "C"
__global__ void GPU_DTT24_DRV(float *dst, float *src, int src_stride, int dtt_mode)
{
    int dtt_mode0 = dtt_mode & 1;        // horizontal pass: DST if set, else DCT
    int dtt_mode1 = (dtt_mode >> 1) & 1; // vertical pass:   DST if set, else DCT
    // Shared tile; rows padded by 1 (DTTTEST_BLK_STRIDE) to avoid bank conflicts
    // on column (vertical-pass) accesses.
    __shared__ float block[DTTTEST_BLOCK_HEIGHT * DTTTEST_BLK_STRIDE];
    int OffsThreadInRow = threadIdx.y * DTT_SIZE + threadIdx.x; // column within the tile
    int OffsThreadInCol = threadIdx.z * DTT_SIZE;               // top row of this thread's 8-row strip
    src += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
    dst += ((blockIdx.y * DTTTEST_BLOCK_HEIGHT + OffsThreadInCol) * src_stride) + blockIdx.x * DTTTEST_BLOCK_WIDTH + OffsThreadInRow;
    float *bl_ptr = block + OffsThreadInCol * DTTTEST_BLK_STRIDE + OffsThreadInRow;
    // Load 8 rows of this thread's column into shared memory (coalesced in x).
#pragma unroll
    for (unsigned int i = 0; i < DTT_SIZE; i++)
        bl_ptr[i * DTTTEST_BLK_STRIDE] = src[i * src_stride];
    __syncthreads();
    // horizontal pass: each thread transforms one shared-memory row in place
    if (dtt_mode > 3) {
        dttii_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    } else {
        dttiv_shared_mem (block + (OffsThreadInCol + threadIdx.x) * DTTTEST_BLK_STRIDE + OffsThreadInRow - threadIdx.x, 1, dtt_mode0);
    }
    __syncthreads();
    // vertical pass: each thread transforms one shared-memory column in place
    if (dtt_mode > 3) {
        dttii_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    } else {
        dttiv_shared_mem (bl_ptr, DTTTEST_BLK_STRIDE, dtt_mode1);
    }
    __syncthreads();
    // Store results; #pragma unroll added for consistency with the load loop above.
#pragma unroll
    for (unsigned int i = 0; i < DTT_SIZE; i++)
        dst[i * src_stride] = bl_ptr[i * DTTTEST_BLK_STRIDE];
}
// Unscaled 8-point DCT-IV butterfly: stage 1 rotates mirrored input pairs
// (x[k], x[7-k]) by the COSN2/SINN2 angles, then two 4-point DCT-II-style
// butterflies produce the even/odd output halves. Output is NOT orthonormally
// scaled - callers (e.g. dct_iv8()) multiply by SQRT1_8.
inline __device__ void _dctiv_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    // Stage 1: input rotations by (2k+1)*pi/32 (see COSN2/SINN2 tables).
    float u00= ( COSN2[0] * x[0] + SINN2[0] * x[7]);
    float u10= (-SINN2[3] * x[3] + COSN2[3] * x[4]);
    float u01= ( COSN2[1] * x[1] + SINN2[1] * x[6]);
    float u11= -(-SINN2[2] * x[2] + COSN2[2] * x[5]);
    float u02= ( COSN2[2] * x[2] + SINN2[2] * x[5]);
    float u12= (-SINN2[1] * x[1] + COSN2[1] * x[6]);
    float u03= ( COSN2[3] * x[3] + SINN2[3] * x[4]);
    float u13= -(-SINN2[0] * x[0] + COSN2[0] * x[7]);
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Interleave the two 4-point results into the 8 outputs.
    y[0] = SQRT_2 * v00; // w0[0];
    y[1] = v01 - vb11;   // w1[0];
    // j == 1
    y[2] = v01 + vb11; // w0[1];
    y[3] = v02 + vb01; // w1[1];
    // j == 2
    y[4] = v02 - vb01; // w0[2];
    y[5] = v03 - vb10; // w1[2]; - same as y[3]
    // j == 3
    y[6] = v03 + vb10;   // w0[3];
    y[7] = SQRT_2 * vb00; // w1[3];
}
// Orthonormal 8-point DTT-IV on scalar inputs x0..x7, writing results through
// output pointers y0..y7. dst_not_dct selects DST-IV (non-zero) or DCT-IV (0);
// the DST branch reads the inputs in reversed order and flips the signs of the
// odd outputs. Results are scaled by SQRT1_8 (0.5f for the first/last terms,
// which absorb an extra sqrt(2)).
inline __device__ void _dttiv(float x0, float x1,float x2, float x3,float x4, float x5,float x6, float x7,
        float *y0, float *y1, float *y2, float *y3, float *y4, float *y5, float *y6, float *y7, int dst_not_dct)
{
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs; DST-IV uses the reversed input order.
    if (dst_not_dct) { // DSTIV
        u00= ( COSN2[0] * x7 + SINN2[0] * x0);
        u10= (-SINN2[3] * x4 + COSN2[3] * x3);
        u01= ( COSN2[1] * x6 + SINN2[1] * x1);
        u11= -(-SINN2[2] * x5 + COSN2[2] * x2);
        u02= ( COSN2[2] * x5 + SINN2[2] * x2);
        u12= (-SINN2[1] * x6 + COSN2[1] * x1);
        u03= ( COSN2[3] * x4 + SINN2[3] * x3);
        u13= -(-SINN2[0] * x7 + COSN2[0] * x0);
    } else { // DCTIV
        u00= ( COSN2[0] * x0 + SINN2[0] * x7);
        u10= (-SINN2[3] * x3 + COSN2[3] * x4);
        u01= ( COSN2[1] * x1 + SINN2[1] * x6);
        u11= -(-SINN2[2] * x2 + COSN2[2] * x5);
        u02= ( COSN2[2] * x2 + SINN2[2] * x5);
        u12= (-SINN2[1] * x1 + COSN2[1] * x6);
        u03= ( COSN2[3] * x3 + SINN2[3] * x4);
        u13= -(-SINN2[0] * x0 + COSN2[0] * x7);
    }
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs: common to DCT-IV and DST-IV.
    *y0 = v00 * 0.5f; // w0[0];
    // j == 1
    *y2 = (v01 + vb11) * SQRT1_8; // w0[1];
    // j == 2
    *y4 = (v02 - vb01) * SQRT1_8; // w0[2];
    // j == 3
    *y6 = (v03 + vb10) * SQRT1_8; // w0[3];
    // Odd outputs: DST-IV flips their signs relative to DCT-IV.
    if (dst_not_dct) { // DSTIV
        *y1 = (vb11 - v01) * SQRT1_8;  // w1[0];
        *y3 = -(v02 + vb01) * SQRT1_8; // w1[1];
        *y5 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
        *y7 = -vb00 * 0.5f;            // w1[3];
    } else {
        *y1 = (v01 - vb11) * SQRT1_8; // w1[0];
        *y3 = (v02 + vb01) * SQRT1_8; // w1[1];
        *y5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
        *y7 = vb00 * 0.5f;            // w1[3];
    }
}
// In-place orthonormal 8-point DTT-II over 8 samples starting at x0, spaced by
// inc floats (inc == 1 for a row, inc == row stride for a column of a shared
// memory tile). dst_not_dct selects DST-II (non-zero, odd inputs negated and
// output order reversed) or DCT-II (0). Used by GPU_DTT24_DRV for modes 4..7.
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: sums/differences of mirrored pairs (x[k], x[7-k]).
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00= ( (*x0) - (*x7));
        u10= ( (*x0) + (*x7));
        u01= (-(*x1) + (*x6));
        u11= (-(*x1) - (*x6));
        u02= ( (*x2) - (*x5));
        u12= ( (*x2) + (*x5));
        u03= (-(*x3) + (*x4));
        u13= (-(*x3) - (*x4));
    } else { // DCTII
        u00= ( (*x0) + (*x7));
        u10= ( (*x0) - (*x7));
        u01= ( (*x1) + (*x6));
        u11= ( (*x1) - (*x6));
        u02= ( (*x2) + (*x5));
        u12= ( (*x2) - (*x5));
        u03= ( (*x3) + (*x4));
        u13= ( (*x3) - (*x4));
    }
    // _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    // Write results back in place; DST-II stores them in reversed order.
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
    } else {
        *x0 = (w00 + w01) * SQRT1_8; // v00 * SQRT1_8
        *x1 = (w20 + w21) * 0.5f;    // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31)* 0.5f;     // v13 * SQRT1_8; z10 * 0.5f
    }
}
// Same as dttii_shared_mem() (in-place 8-point DTT-II at stride inc), except the
// DC-like coefficient (x0 for DCT-II, x7 for DST-II) is scaled by 0.5f instead
// of SQRT1_8 - i.e. it is NOT divided by the extra sqrt(2) of the orthonormal
// form (see the "no 1/sqrt(2)!" markers below).
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: sums/differences of mirrored pairs (x[k], x[7-k]).
    if (dst_not_dct) { // DSTII
        // invert odd input samples
        u00= ( (*x0) - (*x7));
        u10= ( (*x0) + (*x7));
        u01= (-(*x1) + (*x6));
        u11= (-(*x1) - (*x6));
        u02= ( (*x2) - (*x5));
        u12= ( (*x2) + (*x5));
        u03= (-(*x3) + (*x4));
        u13= (-(*x3) - (*x4));
    } else { // DCTII
        u00= ( (*x0) + (*x7));
        u10= ( (*x0) - (*x7));
        u01= ( (*x1) + (*x6));
        u11= ( (*x1) - (*x6));
        u02= ( (*x2) + (*x5));
        u12= ( (*x2) - (*x5));
        u03= ( (*x3) + (*x4));
        u13= ( (*x3) - (*x4));
    }
    // _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    float v11 = w20 - w21 - w30 + w31;
    float v12 = w20 - w21 + w30 - w31;
    // Write results back in place; DST-II stores them in reversed order.
    if (dst_not_dct) { // DSTII
        // Invert output sequence
        *x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
        *x1 = v03 * SQRT1_8;
        *x2 = v12 * SQRT1_8;
        *x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x4 = v11 * SQRT1_8;
        *x5 = v01 * SQRT1_8;
        *x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
        *x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
    } else {
        *x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
        *x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
        *x2 = v01 * SQRT1_8;
        *x3 = v11 * SQRT1_8;
        *x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
        *x5 = v12 * SQRT1_8;
        *x6 = v03 * SQRT1_8;
        *x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
    }
}
// In-place orthonormal 8-point DTT-IV over 8 samples starting at x0, spaced by
// inc floats. dst_not_dct selects DST-IV (non-zero, reversed input order and
// negated odd outputs) or DCT-IV (0). Used by GPU_DTT24_DRV for modes 0..3.
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs; DST-IV reads inputs reversed.
    if (dst_not_dct) { // DSTIV
        u00= ( COSN2[0] * (*x7) + SINN2[0] * (*x0));
        u10= (-SINN2[3] * (*x4) + COSN2[3] * (*x3));
        u01= ( COSN2[1] * (*x6) + SINN2[1] * (*x1));
        u11= -(-SINN2[2] * (*x5) + COSN2[2] * (*x2));
        u02= ( COSN2[2] * (*x5) + SINN2[2] * (*x2));
        u12= (-SINN2[1] * (*x6) + COSN2[1] * (*x1));
        u03= ( COSN2[3] * (*x4) + SINN2[3] * (*x3));
        u13= -(-SINN2[0] * (*x7) + COSN2[0] * (*x0));
    } else { // DCTIV
        u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
        u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
        u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
        u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
        u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
        u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
        u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
        u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    }
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs: common to DCT-IV and DST-IV.
    *x0 = v00 * 0.5f;             // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8; // w0[3];
    // Odd outputs: DST-IV flips their signs relative to DCT-IV.
    if (dst_not_dct) { // DSTIV
        *x1 = (vb11 - v01) * SQRT1_8;  // w1[0];
        *x3 = -(v02 + vb01) * SQRT1_8; // w1[1];
        *x5 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
        *x7 = -vb00 * 0.5f;            // w1[3];
    } else {
        *x1 = (v01 - vb11) * SQRT1_8; // w1[0];
        *x3 = (v02 + vb01) * SQRT1_8; // w1[1];
        *x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
        *x7 = vb00 * 0.5f;            // w1[3];
    }
}
// Branch-free in-place 8-point DTT-IV at stride inc. Instead of branching on
// dst_not_dct (which would diverge within a warp when lanes process different
// transform types), it encodes the DST-IV input reversal in the read pointers
// (x0 starts at the last sample, inc negated when dst_not_dct != 0) and the
// odd-output sign flip in sgn = +/-1. Results written through y0..y7.
inline __device__ void dttiv_nodiverg(float * x, int inc, int dst_not_dct)
{
    float sgn = 1 - 2* dst_not_dct; // +1 for DCT-IV, -1 for DST-IV
    // Output pointers: always in forward order.
    float *y0 = x;
    float *y1 = y0 + inc;
    float *y2 = y1 + inc;
    float *y3 = y2 + inc;
    float *y4 = y3 + inc;
    float *y5 = y4 + inc;
    float *y6 = y5 + inc;
    float *y7 = y6 + inc;
    // Input pointers: reversed order for DST-IV (start at element 7, step -inc).
    float *x0 = x + dst_not_dct * 7 * inc;
    // negate inc, replace
    inc *= sgn;
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Even outputs unchanged; odd outputs multiplied by sgn (DST-IV sign flip).
    *y0 = v00 * 0.5f;             // w0[0];
    *y2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *y4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *y6 = (v03 + vb10) * SQRT1_8; // w0[3];
    *y1 = sgn * (v01 - vb11) * SQRT1_8; // w1[0];
    *y3 = sgn * (v02 + vb01) * SQRT1_8; // w1[1];
    *y5 = sgn * (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
    *y7 = sgn * vb00 * 0.5f;            // w1[3];
}
// In-place orthonormal 8-point DCT-IV over 8 samples starting at x0, spaced by
// inc floats. DCT-only specialization (no dst_not_dct branch, hence no warp
// divergence); the DST counterpart is dstiv_nodiverg(). Used by the tile
// processor ("TP" in the forward declarations).
inline __device__ void dctiv_nodiverg(float * x0, int inc)
{
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Write results back in place, orthonormally scaled.
    *x0 = v00 * 0.5f;             // w0[0];
    *x2 = (v01 + vb11) * SQRT1_8; // w0[1];
    *x4 = (v02 - vb01) * SQRT1_8; // w0[2];
    *x6 = (v03 + vb10) * SQRT1_8; // w0[3];
    *x1 = (v01 - vb11) * SQRT1_8; // w1[0];
    *x3 = (v02 + vb01) * SQRT1_8; // w1[1];
    *x5 = (v03 - vb10) * SQRT1_8; // w1[2]; - same as y[3]
    *x7 = vb00 * 0.5f;            // w1[3];
}
// In-place orthonormal 8-point DST-IV over 8 samples starting at x, spaced by
// inc floats. DST-only specialization (no branch, no warp divergence): the
// DST-IV input reversal is implemented by reading from the last sample with a
// negated stride, and the output sign/order flips are hard-coded in the final
// stores. Counterpart of dctiv_nodiverg(); used by the tile processor.
inline __device__ void dstiv_nodiverg(float * x, int inc)
{
    // Read pointers: reversed order (start at element 7, step -inc).
    float *x0 = x + 7 * inc;
    // negate inc, replace
    inc = -inc;
    float *x1 = x0 + inc;
    float *x2 = x1 + inc;
    float *x3 = x2 + inc;
    float *x4 = x3 + inc;
    float *x5 = x4 + inc;
    float *x6 = x5 + inc;
    float *x7 = x6 + inc;
    float u00, u01, u02, u03, u10, u11, u12, u13;
    // Stage 1: rotations of mirrored input pairs by COSN2/SINN2 angles.
    u00= ( COSN2[0] * (*x0) + SINN2[0] * (*x7));
    u10= (-SINN2[3] * (*x3) + COSN2[3] * (*x4));
    u01= ( COSN2[1] * (*x1) + SINN2[1] * (*x6));
    u11= -(-SINN2[2] * (*x2) + COSN2[2] * (*x5));
    u02= ( COSN2[2] * (*x2) + SINN2[2] * (*x5));
    u12= (-SINN2[1] * (*x1) + COSN2[1] * (*x6));
    u03= ( COSN2[3] * (*x3) + SINN2[3] * (*x4));
    u13= -(-SINN2[0] * (*x0) + COSN2[0] * (*x7));
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float ua00= u00 + u03;
    float ua10= u00 - u03;
    float ua01= u01 + u02;
    float ua11= u01 - u02;
    float v00= ua00 + ua01;
    float v02= ua00 - ua01;
    float v01= COSPI_1_8_SQRT2 * ua10 + COSPI_3_8_SQRT2 * ua11;
    float v03= COSPI_3_8_SQRT2 * ua10 - COSPI_1_8_SQRT2 * ua11;
    // _dctii_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float ub00= u10 + u13;
    float ub10= u10 - u13;
    float ub01= u11 + u12;
    float ub11= u11 - u12;
    float vb00= ub00 + ub01;
    float vb01= ub00 - ub01;
    float vb10= COSPI_1_8_SQRT2*ub10 + COSPI_3_8_SQRT2*ub11;
    float vb11= COSPI_3_8_SQRT2*ub10 - COSPI_1_8_SQRT2*ub11;
    // Write back: note the x-pointers still run backwards, so *x7 is the first
    // output location; odd coefficients carry the DST-IV sign flips.
    *x7 = v00 * 0.5f;              // w0[0];
    *x5 = (v01 + vb11) * SQRT1_8;  // w0[1];
    *x3 = (v02 - vb01) * SQRT1_8;  // w0[2];
    *x1 = (v03 + vb10) * SQRT1_8;  // w0[3];
    *x6 = (vb11 - v01) * SQRT1_8;  // w1[0];
    *x4 = -(v02 + vb01) * SQRT1_8; // w1[1];
    *x2 = (vb10 - v03) * SQRT1_8;  // w1[2]; - same as y[3]
    *x0 = -vb00 * 0.5f;            // w1[3];
}
// Unscaled 8-point DCT-II butterfly: stage 1 forms sums/differences of mirrored
// input pairs, then recurses into a 4-point DCT-II (even outputs) and a 4-point
// DCT-IV-style section (odd outputs). Output is NOT orthonormally scaled -
// callers (e.g. dct_ii8()) multiply by SQRT1_8.
inline __device__ void _dctii_nrecurs8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    // Stage 1: sums (u0x, even path) and differences (u1x, odd path) of (x[k], x[7-k]).
    float u00= (x[0] + x[7]);
    float u10= (x[0] - x[7]);
    float u01= (x[1] + x[6]);
    float u11= (x[1] - x[6]);
    float u02= (x[2] + x[5]);
    float u12= (x[2] - x[5]);
    float u03= (x[3] + x[4]);
    float u13= (x[3] - x[4]);
    // _dctii_nrecurs4(u00, u01, u02, u03, &v00, &v01, &v02, &v03);
    float w00= u00 + u03;
    float w10= u00 - u03;
    float w01= (u01 + u02);
    float w11= (u01 - u02);
    float v00= w00 + w01;
    float v02= w00 - w01;
    float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
    float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
    // _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
    float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
    float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
    float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
    float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
    // _dctii_nrecurs2(u00, u01, &v00, &v01);
    float z00= w20 + w21;
    float z01= w20 - w21;
    // _dctii_nrecurs2(u10, u11, &v10, &v11);
    float z10= w30 + w31;
    float z11= w30 - w31;
    float v10 = SQRT_2 * z00;
    float v11 = z01 - z11;
    float v12 = z01 + z11;
    float v13 = SQRT_2 * z10;
    // Interleave even (v0x) and odd (v1x) results into the output array.
    y[0] = v00;
    y[1] = v10;
    y[2] = v01;
    y[3] = v11;
    y[4] = v02;
    y[5] = v12;
    y[6] = v03;
    y[7] = v13;
}
inline __device__ void dct_ii8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
_dctii_nrecurs8(x, y);
#pragma unroll
for (int i = 0; i < 8 ; i++) {
y[i] *= SQRT1_8;
}
}
// Orthonormal 8-point DCT-IV: runs the unscaled recursive kernel, then applies
// the 1/sqrt(8) normalization to every coefficient.
// x - input samples (8 floats), y - output coefficients (8 floats).
inline __device__ void dct_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    _dctiv_nrecurs8(x, y);
#pragma unroll
    for (int k = 8; k > 0; k--) {
        y[k - 1] = y[k - 1] * SQRT1_8;
    }
}
// Orthonormal 8-point DST-IV, computed via the DCT-IV identity:
// reverse the input order, run the unscaled DCT-IV kernel, then normalize by
// 1/sqrt(8) while negating every odd-index coefficient.
// x - input samples (8 floats), y - output coefficients (8 floats).
inline __device__ void dst_iv8( float x[8], float y[8]) // x,y point to 8-element arrays each
{
    float rev[8]; // input in reversed order
#pragma unroll
    for (int k = 7; k >= 0; k--) {
        rev[7 - k] = x[k];
    }
    _dctiv_nrecurs8(rev, y);
#pragma unroll
    for (int k = 0; k < 8; k++) {
        y[k] *= (k & 1) ? -SQRT1_8 : SQRT1_8;
    }
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment