Elphel / imagej-elphel / Commits / 2f464dff

Commit 2f464dff, authored Apr 13, 2020 by Andrey Filippov

    updated GPU code

Parent: 3c033192

Showing 2 changed files with 46 additions and 1217 deletions:
    src/main/resources/kernels/TileProcessor.cuh    +46  -118
    src/main/resources/kernels/test_tp.cu           +0   -1099
src/main/resources/kernels/TileProcessor.cuh (view file @ 2f464dff)
...
@@ -1268,8 +1268,6 @@ __global__ void generate_RBGA(
#endif
/* */
		textures_accumulate <<<grid_texture, threads_texture>>> (
			// get rid of border tile
			// border_tile,      // int border_tile,           // if 1 - watch for border
			woi,                 // int * woi,                 // x, y, width, height
			gpu_clt,             // float ** gpu_clt,          // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
			ntt,                 // size_t num_texture_tiles,  // number of texture tiles to process
...
@@ -2418,30 +2416,26 @@ __global__ void imclt_rbg(
	// save result (back)
	float * rbg_p = rbg_top;
	mclt_tile = mclt_top;
	if ((tileX == 0) && (tileY == 0)){
#pragma unroll
		for (int i = 0; i < DTT_SIZE2; i++){
			*rbg_p = 100.0f; // just testing
			mclt_tile += DTT_SIZE21;
			rbg_p += dstride; // DTT_SIZE2; // FIXME
		}
#ifdef DBG_MARK_DBG_TILE
	} else if ((tileX == DBG_TILE_X) && (tileY == DBG_TILE_Y)){
#pragma unroll
		for (int i = 0; i < DTT_SIZE2; i++){
			*rbg_p = (*mclt_tile) * 2.0; // just testing
			mclt_tile += DTT_SIZE21;
			rbg_p += dstride; // DTT_SIZE2; // FIXME
		}
#endif // #ifdef DBG_MARK_DBG_TILE
	} else {
#pragma unroll
		for (int i = 0; i < DTT_SIZE2; i++){
			*rbg_p = *mclt_tile;
			mclt_tile += DTT_SIZE21;
			rbg_p += dstride; // DTT_SIZE2; // FIXME
		}
	}
...
@@ -2796,29 +2790,30 @@ __device__ void convertCorrectTile(
		float window_vert_cos [2 * DTT_SIZE])
{
	// TODO: pass these values instead of constants to handle EO/LWIR
	int max_px =   IMG_WIDTH - 1;  // odd
	int max_py =   IMG_HEIGHT - 1; // odd
	int max_pxm1 = max_px - 1;     // even
	int max_pym1 = max_py - 1;     // even
	int max_kernel_hor =  KERNELS_HOR - 1;
	int max_kernel_vert = KERNELS_VERT - 1;
	int   ktileX, ktileY;
	int   kernel_index; // common for all colors
	float kdx, kdy;
	if (threadIdx.x == 0){
		ktileX = min(max_kernel_hor,  max(0, ((int) lrintf(centerX * (1.0 / KERNELS_STEP) + 1))));
		ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0 / KERNELS_STEP) + 1))));
		kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP - 1)); // difference in pixels
		kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP - 1)); // difference in pixels
#ifdef USE_UMUL24
		kernel_index = __umul24((ktileX + __umul24(ktileY, KERNELS_HOR)), NUM_COLORS);
#else
		kernel_index = (ktileX + ktileY * KERNELS_HOR) * NUM_COLORS;
#endif
	}
////	__syncthreads();// __syncwarp();
	// broadcast kernel_index
	kernel_index = __shfl_sync(
			0xffffffff,        // unsigned mask,
			kernel_index,      // T var,
			0,                 // int srcLane,
			THREADS_PER_TILE); // int width=warpSize);
////	__syncthreads();// __syncwarp(); // is it needed?
	kdx = __shfl_sync(
			0xffffffff,        // unsigned mask,
			kdx,               // T var,
...
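In the fragment above, only thread 0 of each tile computes kernel_index, kdx and kdy; __shfl_sync then broadcasts them to the remaining lanes, with the width argument splitting the warp into independent THREADS_PER_TILE-wide groups. A minimal, self-contained sketch of that idiom (names and values here are illustrative, not from this file):

#include <cstdio>
#include <cuda_runtime.h>

#define THREADS_PER_TILE_DEMO 8 // stand-in for THREADS_PER_TILE

__global__ void broadcast_demo(int *out)
{
    int v = 0;
    if (threadIdx.x == 0) {
        v = 42; // only the very first lane computes a value
    }
    // each 8-lane group receives the value held by its own lane 0
    v = __shfl_sync(0xffffffff, v, 0, THREADS_PER_TILE_DEMO);
    out[threadIdx.x] = v;
}

int main()
{
    int *d_out, h_out[32];
    cudaMalloc(&d_out, sizeof(h_out));
    broadcast_demo<<<1, 32>>>(d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    printf("lane 5 sees %d, lane 13 sees %d\n", h_out[5], h_out[13]); // 42, 0
    cudaFree(d_out);
    return 0;
}

Compared with __syncthreads(), the shuffle keeps the exchange inside the warp, which is why the commented-out barriers above could be dropped.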
@@ -2834,12 +2829,7 @@ __device__ void convertCorrectTile(
	float px, py;
	// copy kernel
	int kernel_full_index = kernel_index + color;
#ifdef USE_UMUL24
	float * kernel_src = gpu_kernels + __umul24(kernel_full_index, (DTT_SIZE * DTT_SIZE * 4));
#else
	float * kernel_src = gpu_kernels + kernel_full_index * (DTT_SIZE * DTT_SIZE * 4);
#endif
	float * kernelp = clt_kernels;
	kernel_src += threadIdx.x; // lsb;
...
@@ -2857,12 +2847,6 @@ __device__ void convertCorrectTile(
	px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy); // fractional left corner
	int itlx = (int) floorf(px + 0.5f);
	if (itlx < 0){
		itlx &= 1; // for color - extend by pairs
	}
	if (itlx >= (IMG_WIDTH - DTT_SIZE)){
		itlx = (itlx & 1) + (IMG_WIDTH - DTT_SIZE - 2); // for color - extend by pairs
	}
	int_topleft[0] = itlx;
	float shift_hor = itlx - px;
	residual_shift[0] = shift_hor;
...
@@ -2898,15 +2882,9 @@ __device__ void convertCorrectTile(
	py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy); // fractional top corner
	int itly = (int) floorf(py + 0.5f);
	if (itly < 0){
		itly &= 1; // for color - extend by pairs
	}
	if (itly >= (IMG_HEIGHT - DTT_SIZE)){
		itly = (itly & 1) + (IMG_HEIGHT - DTT_SIZE - 2); // for color - extend by pairs
	}
	int_topleft[1] = itly;
#ifdef DEBUG_OOB1
	if ((int_topleft[0] < 0) || (int_topleft[1] < 0) ||
			(int_topleft[0] >= (IMG_WIDTH - DTT_SIZE)) || (int_topleft[1] >= IMG_HEIGHT - DTT_SIZE)){
		printf("Source data OOB, left=%d, top=%d\n", int_topleft[0], int_topleft[1]);
		printf("\n");
...
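Both clamping branches above extend an out-of-range tile origin by Bayer pairs: keeping the low bit of the coordinate preserves the 2x2 mosaic phase while folding the access back inside the frame. A small host-side sketch of the same rule (H and N are illustrative stand-ins for IMG_HEIGHT and DTT_SIZE, not values from this file):

#include <cstdio>

// Clamp a tile origin into [0, H - N) while preserving Bayer (2x2) phase:
// an out-of-range coordinate keeps its low bit, so even stays even and
// odd stays odd after folding.
static int clamp_bayer(int c, int H, int N)
{
    if (c < 0)
        return c & 1;                 // -> 0 or 1, parity kept
    if (c >= H - N)
        return (c & 1) + (H - N - 2); // last even origin, parity kept
    return c;
}

int main()
{
    // e.g. H = 1936, N = 8: valid origins are 0..1927
    printf("%d %d %d\n",
           clamp_bayer(-3, 1936, 8),   // -> 1    (odd stays odd)
           clamp_bayer(1930, 1936, 8), // -> 1926 (even stays even)
           clamp_bayer(100, 1936, 8)); // -> 100  (in range, unchanged)
    return 0;
}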
@@ -2964,68 +2942,47 @@ __device__ void convertCorrectTile(
	int row_tl = int_topleft[1];
	// for red, blue and green, pass 0
	int local_col = ((col_tl & 1) ^ (BAYER_RED_COL ^ color0 ^ color1 ^ gpass)) + (threadIdx.x << 1); // green red row: invert column from red
	// int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ gpass); // use red row
	// int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ color0 ^ color1 ^ gpass); // use red row
	int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ color0 ^ gpass); // use red row
	float hwind_cos = window_hor_cos[local_col];
	float hwind_sin = window_hor_sin[local_col]; // **** only used for green
	int dtt_offset =     fold_indx2[local_row][local_col];
	int dtt_offset_inc = fold_inc[local_row];
#ifdef USE_UMUL24
	float *dct_buf = clt_tile + __umul24( gpass << 1,      DTT_SIZE * DTT_SIZE1);
	float *dst_buf = clt_tile + __umul24((gpass << 1) + 1, DTT_SIZE * DTT_SIZE1); // **** only used for green
#else
	float *dct_buf = clt_tile + ((gpass << 1) *       (DTT_SIZE * DTT_SIZE1));
	float *dst_buf = clt_tile + (((gpass << 1) + 1) * (DTT_SIZE * DTT_SIZE1)); // **** only used for green
#endif

	if ((col_tl >= 0) && ((col_tl < (IMG_WIDTH - DTT_SIZE * 2))) && (row_tl >= 0) && ((row_tl < (IMG_HEIGHT - DTT_SIZE * 2)))) {
#ifdef USE_UMUL24
		float *image_p = gpu_images + __umul24(row_tl + local_row, dstride) + col_tl + local_col;
#else
		float *image_p = gpu_images + dstride * (row_tl + local_row) + col_tl + local_col;
#endif
#pragma unroll
		for (int i = 0; i < 8; i++) {
			// float d = (*image_p) * window_vert_cos[local_row]; //warp illegal address (0,2,1)
			float d = (*image_p);
			d *= window_vert_cos[local_row]; //warp illegal address (0,2,1)
			int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
			dct_buf[dtt_offset1] = d * hwind_cos;
			dst_buf[dtt_offset1] = d * hwind_sin; // **** only used for green
			dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
			dtt_offset_inc >>= 4;
			local_row += 2;
			image_p += dstride2;
		}
	} else { // handling border tiles (slower)
		int eff_col =  (min(IMG_HEIGHT / 2 - 1, max(0, col_tl >> 1)) << 1) + (col_tl & 1);
		int row_lsb =  row_tl & 1;
		int row_pair = row_tl >> 1;
#ifdef USE_UMUL24
		float *image_p = gpu_images + __umul24(local_row, dstride) + (eff_col + local_col);
#else
		float *image_p = gpu_images + dstride * local_row + (eff_col + local_col);
#endif
#pragma unroll
		for (int i = 0; i < 8; i++) {
			int eff_row = (min(IMG_WIDTH / 2 - 1, max(0, row_pair + i)) << 1) + row_lsb;
#ifdef USE_UMUL24
			float d = image_p[__umul24(eff_row, dstride)] * window_vert_cos[local_row];
#else
			float d = image_p[dstride * eff_row] * window_vert_cos[local_row];
#endif
			int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
			dct_buf[dtt_offset1] = d * hwind_cos;
			dst_buf[dtt_offset1] = d * hwind_sin; // **** only used for green
			dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
			dtt_offset_inc >>= 4;
			local_row += 2;
		}
	}

	int col_src = col_tl + local_col;
	if (col_src < 0) {
		col_src &= 1; // same Bayer
	} else if (col_src > max_px){
		col_src = (col_src & 1) + max_pxm1;
	}
	int row_src = row_tl + local_row;
	int row_use = row_src;
	if (row_use < 0) {
		row_use &= 1; // same Bayer
	} else if (row_use > max_py){
		row_use = (row_use & 1) + max_pym1;
	}
	// float *image_p = gpu_images + dstride * (row_tl + local_row) + col_tl + local_col;
	float *image_p = gpu_images + dstride * row_use + col_src;
#pragma unroll
	for (int i = 0; i < 8; i++) {
		float d = (*image_p);
		d *= window_vert_cos[local_row]; //warp illegal address (0,2,1)
		int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
		dct_buf[dtt_offset1] = d * hwind_cos;
		dst_buf[dtt_offset1] = d * hwind_sin; // **** only used for green
		dtt_offset = (dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
		dtt_offset_inc >>= 4;
		local_row += 2;
		row_src += 2;
		if ((row_src >= 0) && (row_src <= max_pym1)){
			image_p += dstride2;
		}
	}
	}
	__syncthreads(); // __syncwarp();
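Reading the folding loops above, fold_inc appears to pack eight per-step row increments as 4-bit nibbles into one int: each iteration adds (nibble << 3) to a 6-bit offset indexing an 8x8 table, then shifts to the next nibble. A standalone sketch of that walk, with made-up values in place of the fold_indx2/fold_inc tables:

#include <cstdio>

int main()
{
    unsigned offset = 5;       // stand-in for fold_indx2[row][col]
    unsigned inc = 0x22222222; // stand-in for fold_inc[row]: step 2 rows each time
    for (int i = 0; i < 8; i++) {
        printf("step %d: offset = %2u (row %u, col %u)\n",
               i, offset, offset >> 3, offset & 7);
        offset = (offset + ((inc & 0xf) << 3)) & 0x3f; // advance by nibble rows, wrap in 64
        inc >>= 4;                                     // next 4-bit increment
    }
    return 0;
}

The extra dtt_offset >> 3 term in the real code then re-spreads the 8-wide offset onto the 9-float (DTT_SIZE1) shared-memory rows.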
#ifdef DEBUG2
...
@@ -3058,35 +3015,6 @@ __device__ void convertCorrectTile(
	dttiv_color_2d(
			clt_tile,
			color);
/*
	dctiv_nodiverg( // all colors
		clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
		1); //int inc);
	if (color == BAYER_GREEN){
		dstiv_nodiverg( // all colors
			clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
			1); //int inc);
	}
	__syncthreads();// __syncwarp();
#ifdef DEBUG2
	if ((threadIdx.x) == 0){
		printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
		debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
	}
	__syncthreads();// __syncwarp();
#endif
	dctiv_nodiverg( // all colors
		clt_tile + threadIdx.x, // &clt_tile[0][0][threadIdx.x], // pointer to start of column
		DTT_SIZE1); // int inc,
	if (color == BAYER_GREEN){
		dctiv_nodiverg( // all colors
			clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
			DTT_SIZE1); // int inc,
	}
	__syncthreads();// __syncwarp();
*/
#ifdef DEBUG2
	if ((threadIdx.x) == 0){
...
src/main/resources/kernels/test_tp.cu (deleted, 100644 → 0; view file @ 3c033192)
/**
**
** dtt8x8.cu - CPU test code to run GPU tile processor
**
** Copyright (C) 2018 Elphel, Inc.
**
** -----------------------------------------------------------------------------**
**
** dtt8x8.cu is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program. If not, see <http://www.gnu.org/licenses/>.
**
** Additional permission under GNU GPL version 3 section 7
**
** If you modify this Program, or any covered work, by linking or
** combining it with NVIDIA Corporation's CUDA libraries from the
** NVIDIA CUDA Toolkit (or a modified version of those libraries),
** containing parts covered by the terms of NVIDIA CUDA Toolkit
** EULA, the licensors of this Program grant you additional
** permission to convey the resulting work.
** -----------------------------------------------------------------------------**
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <helper_functions.h>
// for reading binary files
#include <fstream>
#include <iterator>
#include <vector>
//#include "dtt8x8.cuh"
#include "dtt8x8.h"
#include "TileProcessor.cuh"
///#include "cuda_profiler_api.h"
//#include "cudaProfiler.h"
float * copyalloc_kernel_gpu(float * kernel_host,
		int size,      // size in floats
		int full_size)
{
	float *kernel_gpu;
	checkCudaErrors(cudaMalloc((void **) &kernel_gpu, full_size * sizeof(float)));
	checkCudaErrors(cudaMemcpy( // segfault
			kernel_gpu,
			kernel_host,
			size * sizeof(float),
			cudaMemcpyHostToDevice));
	return kernel_gpu;
}

float * copyalloc_kernel_gpu(float * kernel_host,
		int size) // size in floats
{
	return copyalloc_kernel_gpu(kernel_host,
			size, // size in floats
			size);
}

float * alloccopy_from_gpu(
		float * gpu_data,
		float * cpu_data, // if null, will allocate
		int size)
{
	if (!cpu_data) {
		cpu_data = (float *) malloc(size * sizeof(float));
	}
	checkCudaErrors(cudaMemcpy( // segfault
			cpu_data,
			gpu_data,
			size * sizeof(float),
			cudaMemcpyDeviceToHost));
	return cpu_data;
}

float * alloc_kernel_gpu(int size) // size in floats
{
	float *kernel_gpu;
	checkCudaErrors(cudaMalloc((void **) &kernel_gpu, size * sizeof(float)));
	return kernel_gpu;
}

float ** copyalloc_pointers_gpu(float ** gpu_pointer,
		int size) // number of entries (cameras)
{
	float **gpu_pointer_to_gpu_pointers;
	checkCudaErrors(cudaMalloc((void **) &gpu_pointer_to_gpu_pointers, size * sizeof(float *)));
	checkCudaErrors(cudaMemcpy(
			gpu_pointer_to_gpu_pointers,
			gpu_pointer,
			size * sizeof(float *),
			cudaMemcpyHostToDevice));
	return gpu_pointer_to_gpu_pointers;
}
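copyalloc_pointers_gpu builds the two-level indirection the kernels rely on: a device-resident table whose entries are themselves device pointers (one per camera), so a kernel can take a single float ** argument. A minimal sketch of the same pattern (NCAM and the kernel are illustrative, not from this file):

#include <cstdio>
#include <cuda_runtime.h>

#define NCAM 4 // stand-in for NUM_CAMS

__global__ void first_elements(float **bufs, float *out)
{
    // each thread dereferences its camera's buffer through the device table
    out[threadIdx.x] = bufs[threadIdx.x][0];
}

int main()
{
    float *h_table[NCAM]; // host array of device pointers
    for (int n = 0; n < NCAM; n++) {
        cudaMalloc(&h_table[n], 16 * sizeof(float));
        float v = 100.0f + n;
        cudaMemcpy(h_table[n], &v, sizeof(float), cudaMemcpyHostToDevice);
    }
    float **d_table; // device copy of the table itself
    cudaMalloc(&d_table, NCAM * sizeof(float *));
    cudaMemcpy(d_table, h_table, NCAM * sizeof(float *), cudaMemcpyHostToDevice);

    float *d_out, h_out[NCAM];
    cudaMalloc(&d_out, NCAM * sizeof(float));
    first_elements<<<1, NCAM>>>(d_table, d_out);
    cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
    for (int n = 0; n < NCAM; n++)
        printf("cam %d: %f\n", n, h_out[n]); // 100, 101, 102, 103
    return 0;
}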
float * copyalloc_image_gpu(float * image_host,
		size_t * dstride, // in bytes!!
		int width,
		int height)
{
	float *image_gpu;
	checkCudaErrors(cudaMallocPitch((void **) &image_gpu, dstride, width * sizeof(float), height));
	checkCudaErrors(cudaMemcpy2D(
			image_gpu,
			*dstride, // * sizeof(float),
			image_host,
			width * sizeof(float), // make in 16*n?
			width * sizeof(float),
			height,
			cudaMemcpyHostToDevice));
	return image_gpu;
}

float * alloc_image_gpu(size_t * dstride, // in bytes!!
		int width,
		int height)
{
	float *image_gpu;
	checkCudaErrors(cudaMallocPitch((void **) &image_gpu, dstride, width * sizeof(float), height));
	return image_gpu;
}
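Both helpers return the pitch from cudaMallocPitch in bytes; note that the kernel launches later in this file divide it by sizeof(float) before passing it down, so device code can step whole floats. A small sketch of that convention (dimensions are illustrative):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void fill_rows(float *img, size_t stride_floats, int width)
{
    float *row = img + blockIdx.x * stride_floats; // row start, counted in floats
    for (int x = threadIdx.x; x < width; x += blockDim.x)
        row[x] = (float) blockIdx.x;
}

int main()
{
    const int width = 100, height = 8;
    float *d_img;
    size_t pitch_bytes; // like dstride above
    cudaMallocPitch((void **) &d_img, &pitch_bytes, width * sizeof(float), height);
    fill_rows<<<height, 32>>>(d_img, pitch_bytes / sizeof(float), width);

    float h_row[width];
    // cudaMemcpy2D also takes the pitch in bytes
    cudaMemcpy2D(h_row, width * sizeof(float),
                 d_img + 3 * (pitch_bytes / sizeof(float)), pitch_bytes,
                 width * sizeof(float), 1, cudaMemcpyDeviceToHost);
    printf("row 3, col 0: %f\n", h_row[0]); // 3.0
    cudaFree(d_img);
    return 0;
}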
int readFloatsFromFile(float * data,      // allocated array
		const char * path)                // file path
{
	std::ifstream input(path, std::ios::binary);
	// copies all data into buffer
	std::vector<char> buffer((std::istreambuf_iterator<char>(input)),
			(std::istreambuf_iterator<char>()));
	std::copy(buffer.begin(), buffer.end(), (char *) data);
	return 0;
}

int writeFloatsToFile(float * data,       // allocated array
		int size,                         // length in elements
		const char * path)                // file path
{
	// std::ifstream input(path, std::ios::binary );
	std::ofstream ofile(path, std::ios::binary);
	ofile.write((char *) data, size * sizeof(float));
	return 0;
}
// Prepare low pass filter (64 long) to be applied to each quadrant of the CLT data
void set_clt_lpf(
		float * lpf, // size*size array to be filled out
		float sigma,
		const int dct_size)
{
	int dct_len = dct_size * dct_size;
	if (sigma == 0.0f) {
		lpf[0] = 1.0f;
		for (int i = 1; i < dct_len; i++) {
			lpf[i] = 0.0;
		}
	} else {
		for (int i = 0; i < dct_size; i++) {
			for (int j = 0; j < dct_size; j++) {
				lpf[i * dct_size + j] = exp(-(i * i + j * j) / (2 * sigma));
			}
		}
		// normalize
		double sum = 0;
		for (int i = 0; i < dct_size; i++) {
			for (int j = 0; j < dct_size; j++) {
				double d = lpf[i * dct_size + j];
				d *= cos(M_PI * i / (2 * dct_size)) * cos(M_PI * j / (2 * dct_size));
				if (i > 0) d *= 2.0;
				if (j > 0) d *= 2.0;
				sum += d;
			}
		}
		for (int i = 0; i < dct_len; i++) {
			lpf[i] /= sum;
		}
	}
}
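A note on the normalization: with N = dct_size and w_{ij} = exp(-(i^2 + j^2) / (2\sigma)), the loop computes

    s = \sum_{i=0}^{N-1} \sum_{j=0}^{N-1} c_i\, c_j\, w_{ij} \cos\frac{\pi i}{2N} \cos\frac{\pi j}{2N}, \qquad c_0 = 1,\; c_{k>0} = 2,

and divides every coefficient by s. This reads as evaluating the inverse 2-D DCT-II of the window at pixel (0,0) — i.e. scaling so the spatial response of the low-pass filter is 1 at that sample. Note also that the exponent divides by 2*sigma rather than 2*sigma^2, so sigma here acts as a variance-like parameter rather than a standard deviation.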
/**
**************************************************************************
* Program entry point
*
* \param argc [IN] - Number of command-line arguments
* \param argv [IN] - Array of command-line arguments
*
* \return Status code
*/
int main(int argc, char **argv)
{
	//
	// Sample initialization
	//
	printf("%s Starting...\n\n", argv[0]);
	printf("sizeof(float*)=%d\n", (int) sizeof(float *));

	//initialize CUDA
	findCudaDevice(argc, (const char **) argv);

	// CLT testing
	const char *kernel_file[] = {
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
	const char *kernel_offs_file[] = {
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
	const char *image_files[] = {
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
	const char *ports_offs_xy_file[] = {
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
	const char *ports_clt_file[] = { // never referenced
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
	const char *result_rbg_file[] = {
			"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
	const char *result_corr_file          = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
	const char *result_textures_file      = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
	const char *result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba"; // not yet used

	float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
	float port_offsets[NUM_CAMS][2] = { // used only in textures to scale differences
			{-0.5, -0.5},
			{ 0.5, -0.5},
			{-0.5,  0.5},
			{ 0.5,  0.5}};
	int keep_texture_weights = 1; // try with 0 also
	int texture_colors = 3;       // result will be 3+1 RGBA (for mono - 2)

/*
#define IMG_WIDTH    2592
#define IMG_HEIGHT   1936
#define NUM_CAMS        4
#define NUM_COLORS      3
#define KERNELS_STEP   16
#define KERNELS_HOR   164
#define KERNELS_VERT  123
#define KERNEL_OFFSETS  8
#define TILESX        324
#define TILESY        242
*/
/*
struct tp_task {
	long task;
	short ty;
	short tx;
	float xy[NUM_CAMS][2];
};
*/
	int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
	int KERN_SIZE = KERN_TILES * 4 * 64;

	// int CORR_SIZE = (2 * DTT_SIZE - 1) * (2 * DTT_SIZE - 1);
	int CORR_SIZE = (2 * CORR_OUT_RAD + 1) * (2 * CORR_OUT_RAD + 1);

	float *host_kern_buf = (float *) malloc(KERN_SIZE * sizeof(float));

	struct tp_task task_data[TILESX * TILESY]; // maximal length - each tile
	int corr_indices [NUM_PAIRS * TILESX * TILESY];
	// int texture_indices [TILESX*TILESY];
	int texture_indices [TILESX * TILESYA];
	int cpu_woi [4];

	// host array of pointers to GPU memory
	float           *gpu_kernels_h        [NUM_CAMS];
	struct CltExtra *gpu_kernel_offsets_h [NUM_CAMS];
	float           *gpu_images_h         [NUM_CAMS];
	float            tile_coords_h        [NUM_CAMS][TILESX * TILESY][2];
	float           *gpu_clt_h            [NUM_CAMS];
	float           *gpu_lpf_h            [NUM_COLORS]; // never used
#ifndef NOICLT
	float           *gpu_corr_images_h    [NUM_CAMS];
#endif

	float *gpu_corrs;
	int   *gpu_corr_indices;
	float *gpu_textures;
	float *gpu_textures_rbga;
	int   *gpu_texture_indices;
	int   *gpu_woi;
	int   *gpu_num_texture_tiles;
	float *gpu_port_offsets;
	int num_corrs;
	int num_textures;
	int num_ports = NUM_CAMS;

	// GPU pointers to GPU pointers to memory
	float           **gpu_kernels;        // [NUM_CAMS];
	struct CltExtra **gpu_kernel_offsets; // [NUM_CAMS];
	float           **gpu_images;         // [NUM_CAMS];
	float           **gpu_clt;            // [NUM_CAMS];
	float           **gpu_lpf;            // [NUM_CAMS]; // never referenced

	// GPU pointers to GPU memory
	// float * gpu_tasks;
	struct tp_task *gpu_tasks;
	size_t dstride;               // in bytes !
	size_t dstride_rslt;          // in bytes !
	size_t dstride_corr;          // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
	size_t dstride_textures;      // in bytes ! for one rgba/ya 16x16 tile
	size_t dstride_textures_rbga; // in bytes ! for one rgba/ya 16x16 tile

	float lpf_rbg[3][64]; // not used
	for (int ncol = 0; ncol < 3; ncol++) {
		if (lpf_sigmas[ncol] > 0.0) {
			set_clt_lpf(
					lpf_rbg[ncol],    // float * lpf, // size*size array to be filled out
					lpf_sigmas[ncol], // float sigma,
					8);               // int dct_size)
			gpu_lpf_h[ncol] = copyalloc_kernel_gpu(lpf_rbg[ncol], 64);
		} else {
			gpu_lpf_h[ncol] = NULL;
		}
	}

	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		readFloatsFromFile(
				host_kern_buf,      // float * data, // allocated array
				kernel_file[ncam]); // char * path)  // file path
		gpu_kernels_h[ncam] = copyalloc_kernel_gpu(host_kern_buf, KERN_SIZE);

		readFloatsFromFile(
				host_kern_buf,           // float * data, // allocated array
				kernel_offs_file[ncam]); // char * path)  // file path
		gpu_kernel_offsets_h[ncam] = (struct CltExtra *) copyalloc_kernel_gpu(
				host_kern_buf,
				KERN_TILES * (sizeof(struct CltExtra) / sizeof(float)));

		// will get results back
		gpu_clt_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
		printf("Allocating GPU memory, 0x%x floats\n", (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE));

		// allocate result images (3x height to accommodate 3 colors)
		// Image is extended by 4 pixels each side to avoid checking (mclt tiles extend by 4)
		// host array of pointers to GPU arrays
#ifndef NOICLT
		gpu_corr_images_h[ncam] = alloc_image_gpu(
				&dstride_rslt,                // size_t* dstride, // in bytes!!
				IMG_WIDTH + DTT_SIZE,         // int width,
				3 * (IMG_HEIGHT + DTT_SIZE)); // int height);
#endif
	}

	// allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
	gpu_corrs = alloc_image_gpu(
			&dstride_corr,                // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
			CORR_SIZE,                    // int width,
			NUM_PAIRS * TILESX * TILESY); // int height);

	// read channel images (assuming host_kern_buf size > image size, reusing it)
	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		readFloatsFromFile(
				host_kern_buf,      // float * data, // allocated array
				image_files[ncam]); // char * path)  // file path
		gpu_images_h[ncam] = copyalloc_image_gpu(
				host_kern_buf, // float * image_host,
				&dstride,      // size_t* dstride,
				IMG_WIDTH,     // int width,
				IMG_HEIGHT);   // int height);
	}

	//#define DBG_TILE (174*324 +118)
	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		readFloatsFromFile(
				(float *) &tile_coords_h[ncam],
				ports_offs_xy_file[ncam]); // char * path) // file path
	}
	// build TP task that processes all tiles in linescan order
	for (int ty = 0; ty < TILESY; ty++) {
		for (int tx = 0; tx < TILESX; tx++) {
			int nt = ty * TILESX + tx;
			task_data[nt].task = 0xf | (((1 << NUM_PAIRS) - 1) << TASK_CORR_BITS);
			task_data[nt].txy = tx + (ty << 16);
			for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
				task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
				task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
			}
		}
	}
	int tp_task_size = sizeof(task_data) / sizeof(struct tp_task);
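Each task packs the tile position into a single int, x in the low 16 bits and y in the high 16 (txy = tx + (ty << 16)). A tiny pack/unpack sketch of that convention, using the debug-tile coordinates from the commented-out DBG_TILE definition above:

#include <cstdio>

static inline int pack_txy(int tx, int ty) { return tx + (ty << 16); }
static inline int txy_x(int txy)           { return txy & 0xffff; }
static inline int txy_y(int txy)           { return txy >> 16; }

int main()
{
    int txy = pack_txy(118, 174); // the tile at x=118, y=174 (DBG_TILE = 174*324 + 118)
    printf("tx=%d ty=%d\n", txy_x(txy), txy_y(txy)); // tx=118 ty=174
    return 0;
}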
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
	for (int t = 0; t < NUM_TEST_TILES; t++) {
		task_data[t].task = 1;
		task_data[t].txy = ((DBG_TILE + t) - 324 * ((DBG_TILE + t) / 324)) + (((DBG_TILE + t) / 324) << 16);
		int nt = task_data[t].ty * TILESX + task_data[t].tx;
		for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
			task_data[t].xy[ncam][0] = tile_coords_h[ncam][nt][0];
			task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
		}
	}
	tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif

	// segfault in the next
	gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu(
			(float *) &task_data,
			tp_task_size * (sizeof(struct tp_task) / sizeof(float)));
	// build corr_indices
	num_corrs = 0;
	for (int ty = 0; ty < TILESY; ty++) {
		for (int tx = 0; tx < TILESX; tx++) {
			int nt = ty * TILESX + tx;
			int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS) - 1);
			if (cm) {
				for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
					corr_indices[num_corrs++] = (nt << CORR_NTILE_SHIFT) | b;
				}
			}
		}
	}
	// num_corrs now has the total number of correlations
	// copy corr_indices to gpu
	// gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
	gpu_corr_indices = (int *) copyalloc_kernel_gpu(
			(float *) corr_indices,
			num_corrs,
			NUM_PAIRS * TILESX * TILESY);

	// build texture_indices
	num_textures = 0;
	for (int ty = 0; ty < TILESY; ty++) {
		for (int tx = 0; tx < TILESX; tx++) {
			int nt = ty * TILESX + tx;
			// int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
			int cm = task_data[nt].task & TASK_TEXTURE_BITS;
			if (cm) {
				texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
			}
		}
	}
	// num_textures now has the total number of textures
	// copy texture_indices to gpu
	// gpu_texture_indices = (int *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
	gpu_texture_indices = (int *) copyalloc_kernel_gpu(
			(float *) texture_indices,
			num_textures,
			TILESX * TILESYA); // number of rows - multiple of 4
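Both lists pack the linear tile number into the upper bits of each entry and per-tile flags into the lower bits; the debug dump near the end of this file decodes entries with >> 8, which suggests CORR_NTILE_SHIFT is 8 in the headers. A decoding sketch under that assumption:

#include <cstdio>

#define CORR_NTILE_SHIFT_DEMO 8 // assumed value of CORR_NTILE_SHIFT
#define TILESX_DEMO 324         // stand-in for TILESX

int main()
{
    int nt = 174 * TILESX_DEMO + 118;              // linear tile number
    int entry = (nt << CORR_NTILE_SHIFT_DEMO) | 3; // correlation pair 3 of that tile
    int tile = entry >> CORR_NTILE_SHIFT_DEMO;
    printf("pair=%d tileY=%d tileX=%d\n",
           entry & ((1 << CORR_NTILE_SHIFT_DEMO) - 1),
           tile / TILESX_DEMO, tile % TILESX_DEMO); // pair=3 tileY=174 tileX=118
    return 0;
}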
	// just allocate
	checkCudaErrors(cudaMalloc((void **) &gpu_woi,               4 * sizeof(float)));
	checkCudaErrors(cudaMalloc((void **) &gpu_num_texture_tiles, 8 * sizeof(float)));
	// for each subsequence - number of non-border, number of border tiles

	// copy port indices to gpu
	gpu_port_offsets = (float *) copyalloc_kernel_gpu((float *) port_offsets, num_ports * 2);

	// int keep_texture_weights = 1; // try with 0 also
	// int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
	// double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
	int tile_texture_size = (texture_colors + 1 + (keep_texture_weights ? (NUM_CAMS + texture_colors + 1) : 0)) * 256;

	gpu_textures = alloc_image_gpu(
			&dstride_textures, // in bytes ! for one rgba/ya 16x16 tile
			tile_texture_size, // int width (floats),
			TILESX * TILESY);  // int height);

	int rgba_width  = (TILESX + 1) * DTT_SIZE;
	int rgba_height = (TILESY + 1) * DTT_SIZE;
	int rbga_slices = texture_colors + 1; // 4/1

	gpu_textures_rbga = alloc_image_gpu(
			&dstride_textures_rbga,     // in bytes ! for one rgba/ya 16x16 tile
			rgba_width,                 // int width (floats),
			rgba_height * rbga_slices); // int height);

	// Now copy arrays of per-camera pointers to GPU memory to the GPU itself
	gpu_kernels        = copyalloc_pointers_gpu(gpu_kernels_h, NUM_CAMS);
	gpu_kernel_offsets = (struct CltExtra **) copyalloc_pointers_gpu((float **) gpu_kernel_offsets_h, NUM_CAMS);
	gpu_images         = copyalloc_pointers_gpu(gpu_images_h, NUM_CAMS);
	gpu_clt            = copyalloc_pointers_gpu(gpu_clt_h, NUM_CAMS);
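A worked example of the texture-row size, for the defaults set earlier (texture_colors = 3, keep_texture_weights = 1, NUM_CAMS = 4):

    tile_texture_size = (3 + 1 + (4 + 3 + 1)) * 256 = 12 * 256 = 3072 floats per tile

i.e. R, G, B and alpha planes, then — apparently mirroring the commented-out Java line above — per-camera weights plus a second color+alpha set, each plane being one 16x16 tile (256 floats).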
	// gpu_corr_images = copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);

	//create and start CUDA timer
	StopWatchInterface *timerTP = 0;
	sdkCreateTimer(&timerTP);

	dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
	dim3 grid_tp((tp_task_size + TILES_PER_BLOCK - 1) / TILES_PER_BLOCK, 1);
	printf("threads_tp=(%d, %d, %d)\n", threads_tp.x, threads_tp.y, threads_tp.z);
	printf("grid_tp=   (%d, %d, %d)\n", grid_tp.x, grid_tp.y, grid_tp.z);

#ifdef DBG_TILE
	const int numIterations = 1; //0;
	const int i0 = 0;  // -1;
#else
	const int numIterations = 10; // 0; //0;
	const int i0 = -1; // 0; // -1;
#endif

	cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
///	cudaProfilerStart();
	float **fgpu_kernel_offsets = (float **) gpu_kernel_offsets; // [NUM_CAMS];
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerTP);
			sdkStartTimer(&timerTP);
		}
		convert_correct_tiles<<<grid_tp, threads_tp>>>(
				fgpu_kernel_offsets,     // struct CltExtra ** gpu_kernel_offsets,
				gpu_kernels,             // float ** gpu_kernels,
				gpu_images,              // float ** gpu_images,
				gpu_tasks,               // struct tp_task * gpu_tasks,
				gpu_clt,                 // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				dstride / sizeof(float), // size_t dstride, // for gpu_images
				tp_task_size,            // int num_tiles) // number of tiles in task
				0);                      // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
		getLastCudaError("Kernel execution failed");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("%d\n", i);
	}
	// checkCudaErrors(cudaDeviceSynchronize());
	sdkStopTimer(&timerTP);
	float avgTime = (float) sdkGetTimerValue(&timerTP) / (float) numIterations;
	sdkDeleteTimer(&timerTP);
	printf("Run time =%f ms\n", avgTime);
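All the benchmark loops in this file share one pattern: iteration starts at i0 = -1 so the first launch warms up the GPU (module load, caches, clocks), and the timer is reset only when i reaches 0, excluding that warm-up from the average. The same pattern with CUDA events instead of the helper-library StopWatch (illustrative, not from this file):

#include <cstdio>
#include <cuda_runtime.h>

__global__ void work() { } // placeholder kernel

int main()
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    const int numIterations = 10;
    for (int i = -1; i < numIterations; i++) {
        if (i == 0) {               // exclude the warm-up launch from timing
            cudaDeviceSynchronize();
            cudaEventRecord(start);
        }
        work<<<1, 32>>>();
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("average = %f ms\n", ms / numIterations);
    return 0;
}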
#ifdef SAVE_CLT
	int rslt_size = (TILESY * TILESX * NUM_COLORS * 4 * DTT_SIZE * DTT_SIZE);
	float *cpu_clt = (float *) malloc(rslt_size * sizeof(float));
	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		checkCudaErrors(cudaMemcpy( // segfault
				cpu_clt,
				gpu_clt_h[ncam],
				rslt_size * sizeof(float),
				cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
		printf("Writing CLT data to %s\n", ports_clt_file[ncam]);
		writeFloatsToFile(
				cpu_clt,               // float * data, // allocated array
				rslt_size,             // int size, // length in elements
				ports_clt_file[ncam]); // const char * path) // file path
#endif
	}
#endif

#ifdef TEST_IMCLT
	{
		// testing imclt
		dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
		dim3 grid_imclt(1, 1, 1);
		printf("threads_imclt=(%d, %d, %d)\n", threads_imclt.x, threads_imclt.y, threads_imclt.z);
		printf("grid_imclt=   (%d, %d, %d)\n", grid_imclt.x, grid_imclt.y, grid_imclt.z);
		for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
			test_imclt<<<grid_imclt, threads_imclt>>>(
					gpu_clt_h[ncam], // ncam]); // // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
					ncam);           // int ncam); // just for debug print
		}
		getLastCudaError("Kernel execution failed");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test_imclt() DONE\n");
	}
#endif
#ifndef NOICLT
	// testing imclt
	dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
	printf("threads_imclt=(%d, %d, %d)\n", threads_imclt.x, threads_imclt.y, threads_imclt.z);
	StopWatchInterface *timerIMCLT = 0;
	sdkCreateTimer(&timerIMCLT);

	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerIMCLT);
			sdkStartTimer(&timerIMCLT);
		}
		for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
			for (int color = 0; color < NUM_COLORS; color++) {
#ifdef IMCLT14
				for (int v_offs = 0; v_offs < 1; v_offs++) {     // temporarily for debugging
					for (int h_offs = 0; h_offs < 1; h_offs++) { // temporarily for debugging
#else
				for (int v_offs = 0; v_offs < 2; v_offs++) {
					for (int h_offs = 0; h_offs < 2; h_offs++) {
#endif
						int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
						int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
						int tiles_in_pass = tilesy_half * tilesx_half;
						dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK - 1) / IMCLT_TILES_PER_BLOCK, 1, 1);
						// printf("grid_imclt= (%d, %d, %d)\n",grid_imclt.x, grid_imclt.y, grid_imclt.z);
						imclt_rbg<<<grid_imclt, threads_imclt>>>(
								gpu_clt_h[ncam],         // float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
								gpu_corr_images_h[ncam], // float * gpu_rbg, // WIDTH, 3 * HEIGHT
								1,                       // int apply_lpf,
								0,                       // int mono, // defines lpf filter
								color,                   // int color, // defines location of clt data
								v_offs,                  // int v_offset,
								h_offs,                  // int h_offset,
								dstride_rslt / sizeof(float)); // const size_t dstride); // in floats (pixels)
					}
				}
			}
		}
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	}
	sdkStopTimer(&timerIMCLT);
	float avgTimeIMCLT = (float) sdkGetTimerValue(&timerIMCLT) / (float) numIterations;
	sdkDeleteTimer(&timerIMCLT);
	printf("Average IMCLT run time =%f ms\n", avgTimeIMCLT);

	int rslt_img_size = NUM_COLORS * (IMG_HEIGHT + DTT_SIZE) * (IMG_WIDTH + DTT_SIZE);
	float *cpu_corr_image = (float *) malloc(rslt_img_size * sizeof(float));
	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		checkCudaErrors(cudaMemcpy2D( // segfault
				cpu_corr_image,
				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
				gpu_corr_images_h[ncam],
				dstride_rslt,
				(IMG_WIDTH + DTT_SIZE) * sizeof(float),
				3 * (IMG_HEIGHT + DTT_SIZE),
				cudaMemcpyDeviceToHost));
#ifndef DBG_TILE
		printf("Writing RBG data to %s\n", result_rbg_file[ncam]);
		writeFloatsToFile( // will have margins
				cpu_corr_image,         // float * data, // allocated array
				rslt_img_size,          // int size, // length in elements
				result_rbg_file[ncam]); // const char * path) // file path
#endif
	}
	free(cpu_corr_image);
#endif
#ifndef NOCORR
//	cudaProfilerStart();
	// testing corr
	dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
	printf("threads_corr=(%d, %d, %d)\n", threads_corr.x, threads_corr.y, threads_corr.z);
	StopWatchInterface *timerCORR = 0;
	sdkCreateTimer(&timerCORR);

	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerCORR);
			sdkStartTimer(&timerCORR);
		}
		dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK - 1) / CORR_TILES_PER_BLOCK, 1, 1);
		correlate2D<<<grid_corr, threads_corr>>>(
				gpu_clt,          // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				3,                // int colors, // number of colors (3/1)
				0.25,             // float scale0, // scale for R
				0.25,             // float scale1, // scale for B
				0.5,              // float scale2, // scale for G
				30.0,             // float fat_zero, // here - absolute
				num_corrs,        // size_t num_corr_tiles, // number of correlation tiles to process
				gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
				dstride_corr / sizeof(float), // const size_t corr_stride, // in floats
				CORR_OUT_RAD,     // int corr_radius, // radius of the output correlation (7 for 15x15)
				gpu_corrs);       // float * gpu_corrs); // correlation output data
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	}
	sdkStopTimer(&timerCORR);
	float avgTimeCORR = (float) sdkGetTimerValue(&timerCORR) / (float) numIterations;
	sdkDeleteTimer(&timerCORR);
	printf("Average CORR run time =%f ms\n", avgTimeCORR);

	int corr_size = 2 * CORR_OUT_RAD + 1;
	int rslt_corr_size = num_corrs * corr_size * corr_size;
	float *cpu_corr = (float *) malloc(rslt_corr_size * sizeof(float));
	checkCudaErrors(cudaMemcpy2D(
			cpu_corr,
			(corr_size * corr_size) * sizeof(float),
			gpu_corrs,
			dstride_corr,
			(corr_size * corr_size) * sizeof(float),
			num_corrs,
			cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
	printf("Writing phase correlation data to %s\n", result_corr_file);
	writeFloatsToFile(
			cpu_corr,          // float * data, // allocated array
			rslt_corr_size,    // int size, // length in elements
			result_corr_file); // const char * path) // file path
#endif
	free(cpu_corr);
#endif // ifndef NOCORR
// -----------------
#ifndef NOTEXTURES
//	cudaProfilerStart();
	// testing textures
	dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
	dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK - 1) / TEXTURE_TILES_PER_BLOCK, 1, 1);
	printf("threads_texture=(%d, %d, %d)\n", threads_texture.x, threads_texture.y, threads_texture.z);
	printf("grid_texture=(%d, %d, %d)\n", grid_texture.x, grid_texture.y, grid_texture.z);
	StopWatchInterface *timerTEXTURE = 0;
	sdkCreateTimer(&timerTEXTURE);

	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerTEXTURE);
			sdkStartTimer(&timerTEXTURE);
		}
		// Channel0 weight = 0.294118
		// Channel1 weight = 0.117647
		// Channel2 weight = 0.588235
		textures_accumulate<<<grid_texture, threads_texture>>>(
				// 0,                 // int border_tile, // if 1 - watch for border
				(int *) 0,            // int * woi, // x, y, width,height
				gpu_clt,              // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				num_textures,         // size_t num_texture_tiles, // number of texture tiles to process
				gpu_texture_indices,  // int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
				gpu_port_offsets,     // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
				texture_colors,       // int colors, // number of colors (3/1)
				(texture_colors == 1), // int is_lwir, // do not perform shot correction
				10.0,                 // float min_shot, // 10.0
				3.0,                  // float scale_shot, // 3.0
				1.5f,                 // float diff_sigma, // pixel value/pixel change
				10.0f,                // float diff_threshold, // pixel value/pixel change
				3.0,                  // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
				0.294118,             // float weight0, // scale for R
				0.117647,             // float weight1, // scale for B
				0.588235,             // float weight2, // scale for G
				1,                    // int dust_remove, // do not reduce average weight when only one image differs much from the average
				keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
				// combining both non-overlap and overlap (each calculated if pointer is not null)
				0,                    // const size_t texture_rbg_stride, // in floats
				(float *) 0,          // float * gpu_texture_rbg, // (number of colors + 1 + ?)*16*16 rgba texture tiles
				dstride_textures / sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
				gpu_textures);        // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	}
///	cudaProfilerStop();
	sdkStopTimer(&timerTEXTURE);
	float avgTimeTEXTURES = (float) sdkGetTimerValue(&timerTEXTURE) / (float) numIterations;
	sdkDeleteTimer(&timerTEXTURE);
	printf("Average Texture run time =%f ms\n", avgTimeTEXTURES);

	int rslt_texture_size = num_textures * tile_texture_size;
	float *cpu_textures = (float *) malloc(rslt_texture_size * sizeof(float));
	checkCudaErrors(cudaMemcpy2D(
			cpu_textures,
			tile_texture_size * sizeof(float),
			gpu_textures,
			dstride_textures,
			tile_texture_size * sizeof(float),
			num_textures,
			cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
	printf("Writing phase texture data to %s\n", result_textures_file);
	writeFloatsToFile(
			cpu_textures,          // float * data, // allocated array
			rslt_texture_size,     // int size, // length in elements
			result_textures_file); // const char * path) // file path
	//DBG_TILE
#ifdef DEBUG10
	int texture_offset = DBG_TILE * tile_texture_size;
	int chn = 0;
	for (int i = 0; i < tile_texture_size; i++) {
		if ((i % 256) == 0) {
			printf("\nchn = %d\n", chn++);
		}
		printf("%10.4f", *(cpu_textures + texture_offset + i));
		if (((i + 1) % 16) == 0) {
			printf("\n");
		} else {
			printf(" ");
		}
	}
	// int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
#endif // DEBUG10
#endif
	free(cpu_textures);
#endif // ifndef NOTEXTURES
#define GEN_TEXTURE_LIST
#ifdef GEN_TEXTURE_LIST
	dim3 threads_list(1, 1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
	dim3 grid_list(1, 1, 1);
	printf("threads_list=(%d, %d, %d)\n", threads_list.x, threads_list.y, threads_list.z);
	printf("grid_list=(%d, %d, %d)\n", grid_list.x, grid_list.y, grid_list.z);
	StopWatchInterface *timerTEXTURELIST = 0;
	sdkCreateTimer(&timerTEXTURELIST);
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerTEXTURELIST);
			sdkStartTimer(&timerTEXTURELIST);
		}
		prepare_texture_list<<<grid_list, threads_list>>>(
				gpu_tasks,             // struct tp_task * gpu_tasks,
				tp_task_size,          // int num_tiles, // number of tiles in task list
				gpu_texture_indices,   // int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
				gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
				gpu_woi,               // int * woi, // x,y,width,height of the woi
				TILESX,                // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
				TILESY);               // int height); // <= TILESY, use for faster processing of LWIR images
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	}
///	cudaProfilerStop();
	sdkStopTimer(&timerTEXTURELIST);
	float avgTimeTEXTURESLIST = (float) sdkGetTimerValue(&timerTEXTURELIST) / (float) numIterations;
	sdkDeleteTimer(&timerTEXTURELIST);
	printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);

	int cpu_num_texture_tiles[8];
	checkCudaErrors(cudaMemcpy(
			cpu_woi,
			gpu_woi,
			4 * sizeof(float),
			cudaMemcpyDeviceToHost));
	printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
	checkCudaErrors(cudaMemcpy(
			cpu_num_texture_tiles,
			gpu_num_texture_tiles,
			8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
			                   // 1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
			cudaMemcpyDeviceToHost));
	printf("cpu_num_texture_tiles=(%d(%d), %d(%d), %d(%d), %d(%d) -> %d tp_task_size=%d)\n",
			cpu_num_texture_tiles[0], cpu_num_texture_tiles[1],
			cpu_num_texture_tiles[2], cpu_num_texture_tiles[3],
			cpu_num_texture_tiles[4], cpu_num_texture_tiles[5],
			cpu_num_texture_tiles[6], cpu_num_texture_tiles[7],
			cpu_num_texture_tiles[0] + cpu_num_texture_tiles[1] +
			cpu_num_texture_tiles[2] + cpu_num_texture_tiles[3] +
			cpu_num_texture_tiles[4] + cpu_num_texture_tiles[5] +
			cpu_num_texture_tiles[6] + cpu_num_texture_tiles[7],
			tp_task_size);
	for (int q = 0; q < 4; q++) {
		checkCudaErrors(cudaMemcpy(
				texture_indices     + q * TILESX * (TILESYA >> 2),
				gpu_texture_indices + q * TILESX * (TILESYA >> 2),
				cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
				cudaMemcpyDeviceToHost));
	}
	for (int q = 0; q < 4; q++) {
		printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x\n", q,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
				(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
	}
#endif //GEN_TEXTURE_LIST

#ifndef NOTEXTURE_RGBA
	dim3 threads_rgba(1, 1, 1);
	dim3 grid_rgba(1, 1, 1);
	printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x, threads_rgba.y, threads_rgba.z);
	printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x, grid_rgba.y, grid_rgba.z);
	StopWatchInterface *timerRGBA = 0;
	sdkCreateTimer(&timerRGBA);
	for (int i = i0; i < numIterations; i++) {
		if (i == 0) {
			checkCudaErrors(cudaDeviceSynchronize());
			sdkResetTimer(&timerRGBA);
			sdkStartTimer(&timerRGBA);
		}
		generate_RBGA<<<grid_rgba, threads_rgba>>>(
				// Parameters to generate texture tasks
				gpu_tasks,             // struct tp_task * gpu_tasks,
				tp_task_size,          // int num_tiles, // number of tiles in task list
				// declare arrays in device code?
				gpu_texture_indices,   // int * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
				gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
				gpu_woi,               // int * woi, // x,y,width,height of the woi
				TILESX,                // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
				TILESY,                // int height); // <= TILESY, use for faster processing of LWIR images
				// Parameters for the texture generation
				gpu_clt,               // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
				gpu_port_offsets,      // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
				texture_colors,        // int colors, // number of colors (3/1)
				(texture_colors == 1), // int is_lwir, // do not perform shot correction
				10.0,                  // float min_shot, // 10.0
				3.0,                   // float scale_shot, // 3.0
				1.5f,                  // float diff_sigma, // pixel value/pixel change
				10.0f,                 // float diff_threshold, // pixel value/pixel change
				3.0,                   // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
				0.294118,              // float weight0, // scale for R
				0.117647,              // float weight1, // scale for B
				0.588235,              // float weight2, // scale for G
				1,                     // int dust_remove, // do not reduce average weight when only one image differs much from the average
				0,                     // int keep_weights, // return channel weights after A in RGBA
				dstride_textures_rbga / sizeof(float), // const size_t texture_rbga_stride, // in floats
				gpu_textures_rbga);    // float * gpu_texture_tiles) // (number of colors + 1 + ?)*16*16 rgba texture tiles
		getLastCudaError("Kernel failure");
		checkCudaErrors(cudaDeviceSynchronize());
		printf("test pass: %d\n", i);
	}
	sdkStopTimer(&timerRGBA);
	float avgTimeRGBA = (float) sdkGetTimerValue(&timerRGBA) / (float) numIterations;
	sdkDeleteTimer(&timerRGBA);
	printf("Average Texture run time =%f ms\n", avgTimeRGBA);

	checkCudaErrors(cudaMemcpy(
			cpu_woi,
			gpu_woi,
			4 * sizeof(float),
			cudaMemcpyDeviceToHost));
	printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);

	// temporarily use larger array (4 pixels each side), switch to cudaMemcpy2DFromArray()
	int rgba_woi_width  = (cpu_woi[2] + 1) * DTT_SIZE;
	int rgba_woi_height = (cpu_woi[3] + 1) * DTT_SIZE;
	int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
	float *cpu_textures_rgba = (float *) malloc(rslt_rgba_size * sizeof(float));
	checkCudaErrors(cudaMemcpy2D(
			cpu_textures_rgba,
			rgba_width * sizeof(float),
			gpu_textures_rbga,
			dstride_textures_rbga,
			rgba_width * sizeof(float),
			rgba_height * rbga_slices,
			cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
	printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
	writeFloatsToFile(
			cpu_textures_rgba,          // float * data, // allocated array
			rslt_rgba_size,             // int size, // length in elements
			result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DEBUG11
	int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
	for (int chn = 0; chn < rbga_slices; chn++) {
		printf("\nchn = %d\n", chn);
		int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
		for (int i = 0; i < 8; i++) {
			for (int j = 0; j < 8; j++) {
				printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
			}
			printf("\n");
		}
	}
#endif // DEBUG11
	free(cpu_textures_rgba);
#endif // ifndef NOTEXTURE_RGBA

#ifdef SAVE_CLT
	free(cpu_clt);
#endif
	free(host_kern_buf);
	// TODO: move somewhere when all is done
	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
		checkCudaErrors(cudaFree(gpu_kernels_h[ncam]));
		checkCudaErrors(cudaFree(gpu_kernel_offsets_h[ncam]));
		checkCudaErrors(cudaFree(gpu_images_h[ncam]));
		checkCudaErrors(cudaFree(gpu_clt_h[ncam]));
#ifndef NOICLT
		checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
#endif
	}
	checkCudaErrors(cudaFree(gpu_tasks));
	checkCudaErrors(cudaFree(gpu_kernels));
	checkCudaErrors(cudaFree(gpu_kernel_offsets));
	checkCudaErrors(cudaFree(gpu_images));
	checkCudaErrors(cudaFree(gpu_clt));
	// checkCudaErrors(cudaFree(gpu_corr_images));
	checkCudaErrors(cudaFree(gpu_corrs));
	checkCudaErrors(cudaFree(gpu_corr_indices));
	checkCudaErrors(cudaFree(gpu_texture_indices));
	checkCudaErrors(cudaFree(gpu_port_offsets));
	checkCudaErrors(cudaFree(gpu_textures));
	checkCudaErrors(cudaFree(gpu_textures_rbga));
	checkCudaErrors(cudaFree(gpu_woi));
	checkCudaErrors(cudaFree(gpu_num_texture_tiles));

	exit(0);
}