Commit 524c2e2f authored by Andrey Filippov

started geometry correction

parent f15b9b74
......@@ -92,8 +92,11 @@ public class GPUTileProcessor {
static String GPU_RESOURCE_DIR = "kernels";
static String [] GPU_KERNEL_FILES = {"dtt8x8.cuh","TileProcessor.cuh"};
// "*" - generated defines, first index - separately compiled unit
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu"},{"*","dtt8x8.h","TileProcessor.cuh"}};
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","TileProcessor.cuh"}};
/* static String [][] GPU_SRC_FILES = {
{"*","dtt8x8.h","dtt8x8.cu"},
{"*","dtt8x8.h","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
*/
static String [][] GPU_SRC_FILES = {{"*","dtt8x8.h","dtt8x8.cu","geometry_correction.h","TileProcessor.h","TileProcessor.cuh"}};
// static String [][] GPU_SRC_FILES = {{"*","dtt8x8.cuh","TileProcessor.cuh"}};
static String GPU_CONVERT_CORRECT_TILES_NAME = "convert_correct_tiles"; // name in C code
static String GPU_IMCLT_RBG_NAME = "imclt_rbg"; // name in C code
......
package com.elphel.imagej.tileprocessor;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Properties;
......@@ -105,10 +112,109 @@ public class GeometryCorrection {
public RigOffset rigOffset = null;
public int [] woi_tops; // used to calculate scanline timing
public float [] toFloatArray() { // for GPU comparison
return new float[] {
(float) focalLength, // =FOCAL_LENGTH;
(float) pixelSize, // = PIXEL_SIZE; //um
(float) distortionRadius, // = DISTORTION_RADIUS; // mm - half width of the sensor
(float) distortionA8, //r^8 (normalized to focal length or to sensor half width?)
(float) distortionA7, //r^7 (normalized to focal length or to sensor half width?)
(float) distortionA6, //r^6 (normalized to focal length or to sensor half width?)
(float) distortionA5, //r^5 (normalized to focal length or to sensor half width?)
(float) distortionA, // r^4 (normalized to focal length or to sensor half width?)
(float) distortionB, // r^3
(float) distortionC, // r^2
// parameters, common for all sensors
(float) elevation, // degrees, up - positive;
(float) heading, // degrees, CW (from top) - positive
(float) forward[0], (float) forward[1], (float) forward[2], (float) forward[3], // [NUM_CAMS];
(float) right[0], (float) right[1], (float) right[2], (float) right[3], // [NUM_CAMS];
(float) height[0], (float) height[1], (float) height[2], (float) height[3], // [NUM_CAMS];
(float) roll[0], (float) roll[1], (float) roll[2], (float) roll[3], // [NUM_CAMS]; // degrees, CW (to target) - positive
(float) common_right, // mm right, camera center
(float) common_forward, // mm forward (to target), camera center
(float) common_height, // mm up, camera center
(float) common_roll, // degrees CW (to target) camera as a whole
// (float) [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// (float) [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
(float) rXY[0][0], (float) rXY[0][1], // [NUM_CAMS][2]; // XY pairs of the cameras in a normal plane, relative to disparityRadius
(float) rXY[1][0], (float) rXY[1][1],
(float) rXY[2][0], (float) rXY[2][1],
(float) rXY[3][0], (float) rXY[3][1],
// (float) [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
(float) cameraRadius, // average distance from the "mass center" of the sensors to the sensors
(float) disparityRadius //=150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
}
public int [] getWOITops() {// not used in lwir
return woi_tops;
}
public double [] getRByRDist() {
return this.rByRDist;
}
public double getStepR() {
return this.stepR;
}
// save files for GPU comparison
public void saveFloatsGPU(String file_prefix) throws IOException {
// Save GeometryCorrection global data
int sizeof_float = 4;
{
String gc_path = file_prefix+".geometry_correction";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
float [] fgc = toFloatArray();
ByteBuffer bb = ByteBuffer.allocate(fgc.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < fgc.length; i++) {
bb.putFloat(fgc[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
{
String gc_path = file_prefix+".correction_vector";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
float [] fcv = getCorrVector().toFloatArray();
ByteBuffer bb = ByteBuffer.allocate(fcv.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < fcv.length; i++) {
bb.putFloat(fcv[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
//double [] getRByRDist()
{
String gc_path = file_prefix+".rbyrdist";
FileOutputStream fos = new FileOutputStream(gc_path);
DataOutputStream dos = new DataOutputStream(fos);
WritableByteChannel channel = Channels.newChannel(dos);
double [] rByRDist = getRByRDist();
ByteBuffer bb = ByteBuffer.allocate(rByRDist.length * sizeof_float);
bb.order(ByteOrder.LITTLE_ENDIAN);
bb.clear();
for (int i = 0; i < rByRDist.length; i++) {
bb.putFloat((float) rByRDist[i]);
}
bb.flip();
channel.write(bb);
dos.close();
}
}
public int [] getSensorWH() {
int [] wh = {this.pixelCorrectionWidth, this.pixelCorrectionHeight};
......@@ -1333,6 +1439,17 @@ public class GeometryCorrection {
return new CorrVector(vector);
}
public float [] toFloatArray() {
if (vector == null) {
return null;
}
float [] fvector = new float [vector.length];
for (int i = 0; i < vector.length; i++) {
fvector[i] = (float) vector[i];
}
return fvector;
}
public double [] toArray() // USED in lwir
{
return vector;
......@@ -2725,9 +2842,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double pXci_dbg = vi.get(0, 0) * norm_z_dbg;
double pYci_dbg = vi.get(1, 0) * norm_z_dbg;
// Re-apply distortion
double rNDi = Math.sqrt(pXci*pXci + pYci*pYci); // in pixels
// Rdist/R = A8*R^7+A7*R^6+A6*R^5+A5*R^4+A*R^3+B*R^2+C*R+(1-A8-A7-A6-A5-A-B-C)
......@@ -2745,13 +2859,14 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
rD2rND += rad_coeff[j]*(rri - 1.0); // Fixed
}
/*
double rD2rND_dbg = 1.0;
double rri_dbg = 1.0;
for (int j = 0; j < rad_coeff.length; j++){
rri_dbg *= ri_dbg;
rD2rND_dbg += rad_coeff[j]*(rri_dbg - 1.0); // Fixed
}
*/
// Get port pixel coordinates by scaling the 2d vector with Rdistorted/Dnondistorted coefficient)
......@@ -2842,7 +2957,6 @@ matrix([[-0.125, -0.125, 0.125, 0.125, -0.125, 0.125, -0. , -0. , -0.
double ers_Yci = delta_t* (dpYci_dtilt * imu[0] + dpYci_dazimuth * imu[1] + dpYci_droll * imu[2]);
if (xyz != null) {
double k = SCENE_UNITS_SCALE * this.disparityRadius;
// double wdisparity = -(k * this.focalLength / (0.001*this.pixelSize)) / xyz[2];
double wdisparity = disparity;
double dwdisp_dz = (k * this.focalLength / (0.001*this.pixelSize)) / (xyz[2] * xyz[2]);
dpXci_pYci_imu_lin[0][0] = -wdisparity / k; // dpx/ dworld_X
......
......@@ -2548,40 +2548,6 @@ public class ImageDtt {
{ 0.5, 0.5}};
final int transform_len = transform_size * transform_size;
/*
final double [] filter_direct= new double[transform_len];
if (corr_sigma == 0) {
filter_direct[0] = 1.0;
for (int i= 1; i<filter_direct.length;i++) filter_direct[i] =0;
} else {
for (int i = 0; i < transform_size; i++){
for (int j = 0; j < transform_size; j++){
filter_direct[i*transform_size+j] = Math.exp(-(i*i+j*j)/(2*corr_sigma)); // FIXME: should be sigma*sigma !
}
}
}
// normalize
double sum = 0;
for (int i = 0; i < transform_size; i++){
for (int j = 0; j < transform_size; j++){
double d = filter_direct[i*transform_size+j];
d*=Math.cos(Math.PI*i/(2*transform_size))*Math.cos(Math.PI*j/(2*transform_size));
if (i > 0) d*= 2.0;
if (j > 0) d*= 2.0;
sum +=d;
}
}
for (int i = 0; i<filter_direct.length; i++){
filter_direct[i] /= sum;
}
DttRad2 dtt = new DttRad2(transform_size);
final double [] filter= dtt.dttt_iiie(filter_direct);
for (int i=0; i < filter.length;i++) filter[i] *= 2*transform_size;
*/
final double [] filter = doubleGetCltLpfFd(corr_sigma);
// prepare disparity maps and weights
......
......@@ -1502,10 +1502,16 @@ public class TwoQuadCLT {
true);
} catch (IOException e) {
System.out.println("Failed to save flattened kernels tp "+kernel_dir);
// TODO Auto-generated catch block
e.printStackTrace();
} // boolean transpose);
try {
quadCLT_main.getGeometryCorrection().saveFloatsGPU(kernel_dir +"main");
} catch (IOException e) {
System.out.println("Failed to save geometry correction data to "+kernel_dir);
e.printStackTrace();
}
if (debugLevel < -1000) {
return null;
}
......
......@@ -41,6 +41,8 @@
#ifndef JCUDA
#include "tp_defines.h"
#include "dtt8x8.h"
#include "geometry_correction.h"
#include "TileProcessor.h"
#endif // #ifndef JCUDA
#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
......@@ -106,11 +108,12 @@
#define DBG_TILE_Y 111 // 66
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#undef DBG_MARK_DBG_TILE 1
#undef DBG_MARK_DBG_TILE
//56494
// struct tp_task
//#define TASK_SIZE 12
#if 0
struct tp_task {
int task;
union {
......@@ -119,6 +122,7 @@ struct tp_task {
};
float xy[NUM_CAMS][2];
};
#endif
struct CltExtra{
float data_x; // kernel data is relative to this displacement X (0.5 pixel increments)
float data_y; // kernel data is relative to this displacement Y (0.5 pixel increments)
......@@ -826,6 +830,7 @@ __device__ void imclt_plane( // not implemented, not used
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
#if 0
extern "C"
__global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
......@@ -892,6 +897,7 @@ __global__ void imclt_rbg(
int h_offset,
const size_t dstride); // in floats (pixels)
//===========================
#endif
extern "C"
__global__ void correlate2D(
......
/**
**
** TileProcessor.h
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** TileProcessor.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file TileProcessor.h
* \brief header file for the Tile Processor for frequency domain
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
extern "C" __global__ void clear_texture_list(
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
int width, // <= TILESX, use for faster processing of LWIR images
int height); // <= TILESY, use for faster processing of LWIR images
extern "C" __global__ void mark_texture_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices); // packed tile + bits (now only (1 << 7)
extern "C" __global__ void mark_texture_neighbor_tiles(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * woi); // x,y,width,height of the woi
extern "C" __global__ void gen_texture_list(
struct tp_task * gpu_tasks,
int num_tiles, // number of tiles in task list
int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
int * num_texture_tiles, // number of texture tiles to process
int * woi); // x,y,width,height of the woi
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void textures_accumulate(
// int border_tile, // if 1 - watch for border
int * woi, // x, y, width,height
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t num_texture_tiles, // number of texture tiles to process
int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
float * gpu_port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
int colors, // number of colors (3/1)
int is_lwir, // do not perform shot correction
float min_shot, // 10.0
float scale_shot, // 3.0
float diff_sigma, // pixel value/pixel change
float diff_threshold, // pixel value/pixel change
float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
float weight0, // scale for R
float weight1, // scale for B
float weight2, // scale for G
int dust_remove, // Do not reduce average weight when only one image differs much from the average
int keep_weights, // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
// combining both non-overlap and overlap (each calculated if pointer is not null )
size_t texture_rbg_stride, // in floats
float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
size_t texture_stride, // in floats (now 256*4 = 1024)
float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
extern "C" __global__ void imclt_rbg(
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, 3 * HEIGHT
int apply_lpf,
int mono, // defines lpf filter
int color, // defines location of clt data
int v_offset,
int h_offset,
const size_t dstride); // in floats (pixels)
/**
**
** geometry_correction.h
**
** Copyright (C) 2020 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** geometry_correction.h is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
/**
**************************************************************************
* \file geometry_correction.h
* \brief header file for geometry correction - per-tile/per camera calculation of the tile offset
*/
#pragma once
#ifndef NUM_CAMS
#include "tp_defines.h"
#endif
struct tp_task {
int task;
union {
int txy;
unsigned short sxy[2];
};
float xy[NUM_CAMS][2];
};
struct corr_vector{
float tilt [NUM_CAMS-1]; // 0..2
float azimuth [NUM_CAMS-1]; // 3..5
float roll [NUM_CAMS]; // 6..9
float zoom [NUM_CAMS-1]; // 10..12
// for ERS correction:
float imu_rot [3]; // d_tilt/dt (rad/s), d_az/dt, d_roll/dt 13..15
float imu_move[3]; // dx/dt, dy/dt, dz/dt 16..18
};
struct gc {
float focalLength; // =FOCAL_LENGTH;
float pixelSize; // = PIXEL_SIZE; //um
float distortionRadius; // = DISTORTION_RADIUS; // mm - half width of the sensor
float distortionA8; //r^8 (normalized to focal length or to sensor half width?)
float distortionA7; //r^7 (normalized to focal length or to sensor half width?)
float distortionA6; //r^6 (normalized to focal length or to sensor half width?)
float distortionA5; //r^5 (normalized to focal length or to sensor half width?)
float distortionA; // r^4 (normalized to focal length or to sensor half width?)
float distortionB; // r^3
float distortionC; // r^2
// parameters, common for all sensors
float elevation; // degrees, up - positive;
float heading; // degrees, CW (from top) - positive
float forward [NUM_CAMS];
float right [NUM_CAMS];
float height [NUM_CAMS];
float roll [NUM_CAMS]; // degrees, CW (to target) - positive
float common_right; // mm right, camera center
float common_forward; // mm forward (to target), camera center
float common_height; // mm up, camera center
float common_roll; // degrees CW (to target) camera as a whole
// float [][] XYZ_he; // all cameras coordinates transformed to eliminate heading and elevation (rolls preserved)
// float [][] XYZ_her = null; // XYZ of the lenses in a corrected CCS (adjusted for to elevation, heading, common_roll)
float rXY [NUM_CAMS][2]; // XY pairs of the cameras in a normal plane, relative to disparityRadius
// float [][] rXY_ideal = {{-0.5, -0.5}, {0.5,-0.5}, {-0.5, 0.5}, {0.5,0.5}};
// only used for the multi-quad systems
float cameraRadius; // =0; // average distance from the "mass center" of the sensors to the sensors
float disparityRadius; // =150.0; // distance between cameras to normalize disparity units to. sqrt(2)*disparityRadius for quad
};
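struct gc above is intended to be filled from the 42-float little-endian dump written by GeometryCorrection.toFloatArray()/saveFloatsGPU() on the Java side, and struct corr_vector from the 19-float CorrVector dump. A minimal compile-time sanity check of that assumption, added here only as an illustration (not part of this commit); it assumes C++11 static_assert is available, NUM_CAMS == 4 and rXY holding XY pairs:
#if (NUM_CAMS == 4)
// 10 lens/distortion floats + 2 (elevation, heading) + 4*NUM_CAMS + 4 common_* + 2*NUM_CAMS (rXY) + 2 = 42
static_assert(sizeof(struct gc)          == 42 * sizeof(float), "struct gc must match GeometryCorrection.toFloatArray()");
// tilt(3) + azimuth(3) + roll(4) + zoom(3) + imu_rot(3) + imu_move(3) = 19
static_assert(sizeof(struct corr_vector) == 19 * sizeof(float), "struct corr_vector must match CorrVector.toFloatArray()");
#endif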
/**
**
** dtt8x8.cu - CPU test code to run GPU tile processor
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
float * copyalloc_kernel_gpu(float * kernel_host,
int size, // size in floats
int full_size)
{
float *kernel_gpu;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
checkCudaErrors(cudaMemcpy( // segfault
kernel_gpu,
kernel_host,
size * sizeof(float),
cudaMemcpyHostToDevice));
return kernel_gpu;
}
float * copyalloc_kernel_gpu(float * kernel_host,
int size)
{
return copyalloc_kernel_gpu(kernel_host,
size, // size in floats
size);
}
float * alloccopy_from_gpu(
float * gpu_data,
float * cpu_data, // if null, will allocate
int size)
{
if (!cpu_data) {
cpu_data = (float *)malloc(size*sizeof(float));
}
checkCudaErrors(cudaMemcpy( // segfault
cpu_data,
gpu_data,
size * sizeof(float),
cudaMemcpyDeviceToHost));
return cpu_data;
}
float * alloc_kernel_gpu(int size) // size in floats
{
float *kernel_gpu;
checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
return kernel_gpu;
}
float ** copyalloc_pointers_gpu(float ** gpu_pointer,
int size) // number of entries (cameras)
{
float ** gpu_pointer_to_gpu_pointers;
checkCudaErrors(cudaMalloc((void **)&gpu_pointer_to_gpu_pointers, size * sizeof(float*)));
checkCudaErrors(cudaMemcpy(
gpu_pointer_to_gpu_pointers,
gpu_pointer,
size * sizeof(float*),
cudaMemcpyHostToDevice));
return gpu_pointer_to_gpu_pointers;
}
float * copyalloc_image_gpu(float * image_host,
size_t* dstride, // in floats !
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
checkCudaErrors(cudaMemcpy2D(
image_gpu,
*dstride, // * sizeof(float),
image_host,
width * sizeof(float), // make in 16*n?
width * sizeof(float),
height,
cudaMemcpyHostToDevice));
return image_gpu;
}
float * alloc_image_gpu(size_t* dstride, // in bytes!!
int width,
int height)
{
float *image_gpu;
checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
return image_gpu;
}
int readFloatsFromFile(float * data, // allocated array
const char * path) // file path
{
std::ifstream input(path, std::ios::binary );
// copies all data into buffer
std::vector<char> buffer((
std::istreambuf_iterator<char>(input)),
(std::istreambuf_iterator<char>()));
std::copy( buffer.begin(), buffer.end(), (char *) data);
return 0;
}
int writeFloatsToFile(float * data, // allocated array
int size, // length in elements
const char * path) // file path
{
// std::ifstream input(path, std::ios::binary );
std::ofstream ofile(path, std::ios::binary);
ofile.write((char *) data, size * sizeof(float));
return 0;
}
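The readFloatsFromFile() helper above can also pull in the little-endian float dumps that GeometryCorrection.saveFloatsGPU() now writes next to the kernels. A hedged sketch (not part of this commit; the file names follow the other test inputs, but the exact paths and this helper are assumptions):
// Load the geometry data saved by the Java side into the host structures
// declared in geometry_correction.h (sizes must match the Java float dumps).
void read_geometry_correction_files(
        struct gc          * geometry_correction, // filled from <prefix>.geometry_correction
        struct corr_vector * correction_vector)   // filled from <prefix>.correction_vector
{
    readFloatsFromFile((float *) geometry_correction,
            "/data_ssd/git/tile_processor_gpu/clt/main.geometry_correction");
    readFloatsFromFile((float *) correction_vector,
            "/data_ssd/git/tile_processor_gpu/clt/main.correction_vector");
}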
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
void set_clt_lpf(
float * lpf, // size*size array to be filled out
float sigma,
const int dct_size)
{
int dct_len = dct_size * dct_size;
if (sigma == 0.0f) {
lpf[0] = 1.0f;
for (int i = 1; i < dct_len; i++){
lpf[i] = 0.0;
}
} else {
for (int i = 0; i < dct_size; i++){
for (int j = 0; j < dct_size; j++){
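// NOTE: mirrors the Java-side filter_direct code (now commented out in ImageDtt.java),
// which carries a FIXME that the Gaussian denominator should use sigma squared (2*sigma*sigma).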
lpf[i*dct_size+j] = exp(-(i*i+j*j)/(2*sigma));
}
}
// normalize
double sum = 0;
for (int i = 0; i < dct_size; i++){
for (int j = 0; j < dct_size; j++){
double d = lpf[i*dct_size+j];
d*=cos(M_PI*i/(2*dct_size))*cos(M_PI*j/(2*dct_size));
if (i > 0) d*= 2.0;
if (j > 0) d*= 2.0;
sum +=d;
}
}
for (int i = 0; i< dct_len; i++){
lpf[i] /= sum;
}
}
}
/**
**************************************************************************
* Program entry point
*
* \param argc [IN] - Number of command-line arguments
* \param argv [IN] - Array of command-line arguments
*
* \return Status code
*/
int main(int argc, char **argv)
{
//
// Sample initialization
//
printf("%s Starting...\n\n", argv[0]);
printf("sizeof(float*)=%d\n",(int)sizeof(float*));
//initialize CUDA
findCudaDevice(argc, (const char **)argv);
// CLT testing
const char* kernel_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
const char* kernel_offs_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
const char* image_files[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
const char* ports_offs_xy_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // R, B, G
float port_offsets[NUM_CAMS][2] = {// used only in textures to scale differences
{-0.5, -0.5},
{ 0.5, -0.5},
{-0.5, 0.5},
{ 0.5, 0.5}};
int keep_texture_weights = 1; // try with 0 also
int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
/*
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_STEP 16
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define KERNEL_OFFSETS 8
#define TILESX 324
#define TILESY 242
*/
/*
struct tp_task {
long task;
short ty;
short tx;
float xy[NUM_CAMS][2];
} ;
*/
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
// int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// int texture_indices [TILESX*TILESY];
int texture_indices [TILESX*TILESYA];
int cpu_woi [4];
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
struct CltExtra * gpu_kernel_offsets_h [NUM_CAMS];
float * gpu_images_h [NUM_CAMS];
float tile_coords_h [NUM_CAMS][TILESX * TILESY][2];
float * gpu_clt_h [NUM_CAMS];
float * gpu_lpf_h [NUM_COLORS]; // never used
#ifndef NOICLT
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
int * gpu_corr_indices;
float * gpu_textures;
float * gpu_textures_rbga;
int * gpu_texture_indices;
int * gpu_woi;
int * gpu_num_texture_tiles;
float * gpu_port_offsets;
int num_corrs;
int num_textures;
int num_ports = NUM_CAMS;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
float ** gpu_images; // [NUM_CAMS];
float ** gpu_clt; // [NUM_CAMS];
float ** gpu_lpf; // [NUM_CAMS]; // never referenced
// GPU pointers to GPU memory
// float * gpu_tasks;
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
size_t dstride_textures; // in bytes ! for one rgba/ya 16x16 tile
size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile
float lpf_rbg[3][64]; // not used
for (int ncol = 0; ncol < 3; ncol++) {
if (lpf_sigmas[ncol] > 0.0) {
set_clt_lpf (
lpf_rbg[ncol], // float * lpf, // size*size array to be filled out
lpf_sigmas[ncol], // float sigma,
8); // int dct_size)
gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
} else {
gpu_lpf_h[ncol] = NULL;
}
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
kernel_file[ncam]); // char * path) // file path
gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
kernel_offs_file[ncam]); // char * path) // file path
gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
host_kern_buf,
KERN_TILES * (sizeof( struct CltExtra)/sizeof(float)));
// will get results back
gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE)) ;
// allocate result images (3x height to accommodate 3 colors)
// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
//host array of pointers to GPU arrays
#ifndef NOICLT
gpu_corr_images_h[ncam] = alloc_image_gpu(
&dstride_rslt, // size_t* dstride, // in bytes!!
IMG_WIDTH + DTT_SIZE, // int width,
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
#endif
}
// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
host_kern_buf, // float * data, // allocated array
image_files[ncam]); // char * path) // file path
gpu_images_h[ncam] = copyalloc_image_gpu(
host_kern_buf, // float * image_host,
&dstride, // size_t* dstride,
IMG_WIDTH, // int width,
IMG_HEIGHT); // int height);
}
//#define DBG_TILE (174*324 +118)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
(float *) &tile_coords_h[ncam],
ports_offs_xy_file[ncam]); // char * path) // file path
}
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
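The loop above packs each tile's X index into the low 16 bits of tp_task.txy and its Y index into the high 16 bits; on a little-endian host the sxy[] union members expose the same two halves. Hypothetical helpers, shown only to make that convention explicit (they are not used by the test code):
auto pack_txy = [](int tx, int ty) -> int { return tx + (ty << 16); }; // matches the loop above
auto txy_tx   = [](int txy)        -> int { return txy & 0xffff; };    // low 16 bits - tile X
auto txy_ty   = [](int txy)        -> int { return (txy >> 16) & 0xffff; }; // high 16 bits - tile Y
(void) pack_txy; (void) txy_tx; (void) txy_ty; // illustration only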
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
task_data[t].task = 1;
task_data[t].txy = ((DBG_TILE + t) - 324 * ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324) << 16); // shift only the Y half
int nt = task_data[t].sxy[1] * TILESX + task_data[t].sxy[0]; // ty * TILESX + tx
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu(
(float * ) corr_indices,
num_corrs,
NUM_PAIRS * TILESX * TILESY);
// build texture_indices
num_textures = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
int cm = task_data[nt].task & TASK_TEXTURE_BITS;
if (cm){
texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
}
}
// num_textures now has the total number of textures
// copy corr_indices to gpu
// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
gpu_texture_indices = (int *) copyalloc_kernel_gpu(
(float * ) texture_indices,
num_textures,
TILESX * TILESYA); // number of rows - multiple of 4
// just allocate
checkCudaErrors(cudaMalloc((void **)&gpu_woi, 4 * sizeof(float)));
checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 8 * sizeof(float))); // for each subsequence - number of non-border,
// number of border tiles
// copy port indices to gpu
gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
// int keep_texture_weights = 1; // try with 0 also
// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
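// e.g. with the defaults above (texture_colors = 3, keep_texture_weights = 1, NUM_CAMS = 4):
// (3 + 1 + (4 + 3 + 1)) * 256 = 12 * 256 = 3072 floats per texture tile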
gpu_textures = alloc_image_gpu(
&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
tile_texture_size, // int width (floats),
TILESX * TILESY); // int height);
int rgba_width = (TILESX+1) * DTT_SIZE;
int rgba_height = (TILESY+1) * DTT_SIZE;
int rbga_slices = texture_colors + 1; // 4/1
gpu_textures_rbga = alloc_image_gpu(
&dstride_textures_rbga, // in bytes ! for one rgba/ya 16x16 tile
rgba_width, // int width (floats),
rgba_height * rbga_slices); // int height);
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu ((float **) gpu_kernel_offsets_h, NUM_CAMS);
gpu_images = copyalloc_pointers_gpu (gpu_images_h, NUM_CAMS);
gpu_clt = copyalloc_pointers_gpu (gpu_clt_h, NUM_CAMS);
// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
//create and start CUDA timer
StopWatchInterface *timerTP = 0;
sdkCreateTimer(&timerTP);
dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
printf("grid_tp= (%d, %d, %d)\n",grid_tp.x, grid_tp.y, grid_tp.z);
#ifdef DBG_TILE
const int numIterations = 1; //0;
const int i0 = 0; // -1;
#else
const int numIterations = 10; // 0; //0;
const int i0 = -1; // 0; // -1;
#endif
cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
/// cudaProfilerStart();
float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTP);
sdkStartTimer(&timerTP);
}
convert_correct_tiles<<<grid_tp,threads_tp>>>(
fgpu_kernel_offsets, // struct CltExtra ** gpu_kernel_offsets,
gpu_kernels, // float ** gpu_kernels,
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
dstride/sizeof(float), // size_t dstride, // for gpu_images
tp_task_size, // int num_tiles) // number of tiles in task
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("%d\n",i);
}
// checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&timerTP);
float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
sdkDeleteTimer(&timerTP);
printf("Run time =%f ms\n", avgTime);
#ifdef SAVE_CLT
int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
float * cpu_clt = (float *)malloc(rslt_size*sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy( // segfault
cpu_clt,
gpu_clt_h[ncam],
rslt_size * sizeof(float),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
writeFloatsToFile(cpu_clt, // float * data, // allocated array
rslt_size, // int size, // length in elements
ports_clt_file[ncam]); // const char * path) // file path
#endif
}
#endif
#ifdef TEST_IMCLT
{
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
dim3 grid_imclt(1,1,1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
test_imclt<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
ncam); // int ncam); // just for debug print
}
getLastCudaError("Kernel execution failed");
checkCudaErrors(cudaDeviceSynchronize());
printf("test_imclt() DONE\n");
}
#endif
#ifndef NOICLT
// testing imclt
dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
StopWatchInterface *timerIMCLT = 0;
sdkCreateTimer(&timerIMCLT);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerIMCLT);
sdkStartTimer(&timerIMCLT);
}
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
for (int v_offs = 0; v_offs < 1; v_offs++){ // temporarily for debugging
for (int h_offs = 0; h_offs < 1; h_offs++){ // temporarily for debugging
#else
for (int v_offs = 0; v_offs < 2; v_offs++){
for (int h_offs = 0; h_offs < 2; h_offs++){
#endif
int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
int tiles_in_pass = tilesy_half * tilesx_half;
dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
imclt_rbg<<<grid_imclt,threads_imclt>>>(
gpu_clt_h[ncam], // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
1, // int apply_lpf,
0, // int mono, // defines lpf filter
color, // int color, // defines location of clt data
v_offs, // int v_offset,
h_offs, // int h_offset,
dstride_rslt/sizeof(float)); //const size_t dstride); // in floats (pixels)
}
}
}
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerIMCLT);
float avgTimeIMCLT = (float)sdkGetTimerValue(&timerIMCLT) / (float)numIterations;
sdkDeleteTimer(&timerIMCLT);
printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT);
int rslt_img_size = NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
float * cpu_corr_image = (float *)malloc(rslt_img_size * sizeof(float));
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaMemcpy2D( // segfault
cpu_corr_image,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
gpu_corr_images_h[ncam],
dstride_rslt,
(IMG_WIDTH + DTT_SIZE) * sizeof(float),
3* (IMG_HEIGHT + DTT_SIZE),
cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
writeFloatsToFile( // will have margins
cpu_corr_image, // float * data, // allocated array
rslt_img_size, // int size, // length in elements
result_rbg_file[ncam]); // const char * path) // file path
#endif
}
free(cpu_corr_image);
#endif
#ifndef NOCORR
// cudaProfilerStart();
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
CORR_OUT_RAD, // int corr_radius, // radius of the output correlation (7 for 15x15)
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_file);
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
#endif
free(cpu_corr);
#endif // ifndef NOCORR
// -----------------
#ifndef NOTEXTURES
// cudaProfilerStart();
// testing textures
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
StopWatchInterface *timerTEXTURE = 0;
sdkCreateTimer(&timerTEXTURE);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURE);
sdkStartTimer(&timerTEXTURE);
}
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
textures_accumulate<<<grid_texture,threads_texture>>> (
// 0, // int border_tile, // if 1 - watch for border
(int *) 0, // int * woi, // x, y, width,height
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
// combining both non-overlap and overlap (each calculated if pointer is not null )
0, // const size_t texture_rbg_stride, // in floats
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURE);
printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);
int rslt_texture_size = num_textures * tile_texture_size;
float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures,
tile_texture_size * sizeof(float),
gpu_textures,
dstride_textures,
tile_texture_size * sizeof(float),
num_textures,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing phase texture data to %s\n", result_textures_file);
writeFloatsToFile(
cpu_textures, // float * data, // allocated array
rslt_texture_size, // int size, // length in elements
result_textures_file); // const char * path) // file path
//DBG_TILE
#ifdef DEBUG10
int texture_offset = DBG_TILE * tile_texture_size;
int chn = 0;
for (int i = 0; i < tile_texture_size; i++){
if ((i % 256) == 0){
printf("\nchn = %d\n", chn++);
}
printf("%10.4f", *(cpu_textures + texture_offset + i));
if (((i + 1) % 16) == 0){
printf("\n");
} else {
printf(" ");
}
}
// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
#endif // DEBUG10
#endif
free(cpu_textures);
#endif // ifndef NOTEXTURES
#define GEN_TEXTURE_LIST
#ifdef GEN_TEXTURE_LIST
dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
dim3 grid_list (1,1,1);
printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
StopWatchInterface *timerTEXTURELIST = 0;
sdkCreateTimer(&timerTEXTURELIST);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerTEXTURELIST);
sdkStartTimer(&timerTEXTURELIST);
}
prepare_texture_list<<<grid_list,threads_list>>> (
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY); // int height); // <= TILESY, use for faster processing of LWIR images
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURELIST);
float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
sdkDeleteTimer(&timerTEXTURELIST);
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
int cpu_num_texture_tiles[8];
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
checkCudaErrors(cudaMemcpy(
cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
cudaMemcpyDeviceToHost));
printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
tp_task_size
);
for (int q = 0; q < 4; q++) {
checkCudaErrors(cudaMemcpy(
texture_indices + q * TILESX * (TILESYA >> 2),
gpu_texture_indices + q * TILESX * (TILESYA >> 2),
cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
cudaMemcpyDeviceToHost));
}
for (int q = 0; q < 4; q++) {
printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
}
#endif //GEN_TEXTURE_LIST
#ifndef NOTEXTURE_RGBA
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
generate_RBGA<<<grid_rgba,threads_rgba>>> (
// Parameters to generate texture tasks
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
// temporarily use larger array (4 pixels on each side), switch to cudaMemcpy2DFromArray()
int rgba_woi_width = (cpu_woi[2] + 1) * DTT_SIZE;
int rgba_woi_height = (cpu_woi[3] + 1)* DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURE_RGBA
#ifdef SAVE_CLT
free(cpu_clt);
#endif
free (host_kern_buf);
// TODO: move somewhere when all is done
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
checkCudaErrors(cudaFree(gpu_images_h[ncam]));
checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
}
checkCudaErrors(cudaFree(gpu_tasks));
checkCudaErrors(cudaFree(gpu_kernels));
checkCudaErrors(cudaFree(gpu_kernel_offsets));
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
checkCudaErrors(cudaFree(gpu_texture_indices));
checkCudaErrors(cudaFree(gpu_port_offsets));
checkCudaErrors(cudaFree(gpu_textures));
checkCudaErrors(cudaFree(gpu_textures_rbga));
checkCudaErrors(cudaFree(gpu_woi));
checkCudaErrors(cudaFree(gpu_num_texture_tiles));
exit(0);
}