Commit f20a0723 authored by Andrey Filippov

implemented and tested 2D phase correlation

parent 6b065851
......@@ -41,17 +41,24 @@
#pragma once
#include "dtt8x8.cuh"
#define THREADSX (DTT_SIZE)
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define IMG_WIDTH 2592
#define IMG_HEIGHT 1936
#define KERNELS_HOR 164
#define KERNELS_VERT 123
#define NUM_CAMS 4
#define NUM_PAIRS 6
#define NUM_COLORS 3
#define KERNELS_LSTEP 4
#define THREADS_PER_TILE 8
#define TILES_PER_BLOCK 4
#define CORR_THREADS_PER_TILE 8
#define CORR_TILES_PER_BLOCK 4
#define IMCLT_THREADS_PER_TILE 16
#define IMCLT_TILES_PER_BLOCK 4
#define CORR_PAIR_SHIFT 8 // 8 lower bits - number of a pair, other bits tile number
#define TASK_CORR_BITS 4
#define CORR_OUT_RAD 7
#endif
//#define IMCLT14
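// Sketch (not part of this commit): the packing implied by CORR_PAIR_SHIFT.
// The low CORR_PAIR_SHIFT bits of a gpu_corr_indices entry hold the pair
// number, the remaining bits hold the tile number. Helper names below are
// hypothetical, for illustration only.
static inline int pack_corr_index (int tile, int pair) { return (tile << CORR_PAIR_SHIFT) | pair; }
static inline int unpack_corr_pair (int indx)          { return indx & ((1 << CORR_PAIR_SHIFT) - 1); }
static inline int unpack_corr_tile (int indx)          { return indx >> CORR_PAIR_SHIFT; }
// Example: tile 1000, pair 3 packs to (1000 << 8) | 3 == 256003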
......@@ -106,6 +113,11 @@
#define DTT_SIZE1 (DTT_SIZE + 1)
#define DTT_SIZE2 (2 * DTT_SIZE)
#define DTT_SIZE21 (DTT_SIZE2 + 1)
#define DTT_SIZE4 (4 * DTT_SIZE)
#define DTT_SIZE2M1 (DTT_SIZE2 - 1)
// Use CORR_OUT_RAD for the correlation output
#define BAYER_RED 0
#define BAYER_BLUE 1
......@@ -117,15 +129,16 @@
//#define BAYER_BLUE_COL (1 - BAYER_RED_COL)
#define DBG_TILE_X 174
#define DBG_TILE_Y 118
#define DBG_TILE_X 40
#define DBG_TILE_Y 80
//#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
#define DBG_TILE (DBG_TILE_Y * 324 + DBG_TILE_X)
//#define DEBUG1 1
//#define DEBUG2 1
//#define DEBUG3 1
//#define DEBUG4 1
//#define DEBUG5 1
#define DEBUG6 1
//56494
// struct tp_task
//#define TASK_SIZE 12
......@@ -311,6 +324,24 @@ __constant__ float lpf_data[3][64]={
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
}};
__constant__ float lpf_corr[64]={ // modify if needed
1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
0.87041007f, 0.75761368f, 0.57398049f, 0.37851747f, 0.21734206f, 0.10895863f, 0.04888546f, 0.02374977f,
0.65943687f, 0.57398049f, 0.43485698f, 0.28677101f, 0.16466189f, 0.08254883f, 0.03703642f, 0.01799322f,
0.43487258f, 0.37851747f, 0.28677101f, 0.18911416f, 0.10858801f, 0.05443770f, 0.02442406f, 0.01186582f,
0.24970076f, 0.21734206f, 0.16466189f, 0.10858801f, 0.06235047f, 0.03125774f, 0.01402412f, 0.00681327f,
0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
};
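// Camera pairs used for 2D phase correlation. Assuming the usual quad-camera
// numbering (0: top-left, 1: top-right, 2: bottom-left, 3: bottom-right),
// these are the two horizontal pairs, the two vertical pairs, and the two
// diagonals (layout assumption, not stated in this commit).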
__constant__ int pairs[6][2]={
{0, 1},
{2, 3},
{0, 2},
{1, 3},
{0, 3},
{2, 1}};
//#endif
__device__ void convertCorrectTile(
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
......@@ -333,43 +364,306 @@ __device__ void convertCorrectTile(
float window_hor_sin [2*DTT_SIZE],
float window_vert_cos [2*DTT_SIZE]);
__device__ void debug_print_lpf(
float * lpf_tile);
__device__ void debug_print_clt1(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color,
int mask);
__device__ void debug_print_mclt(
float * mclt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color);
__device__ void debug_print_corr_15x15(
float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
const int color);
// Fractional pixel shift (phase rotation), horizontal. In-place.
__device__ void shiftTileHor(
__device__ void shiftTileHor( // implemented, used
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float residual_shift );
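// (Sketch of the math, not from this commit) In the CLT frequency domain a
// fractional shift dx becomes a phase rotation: for column k the cosine/sine
// quadrant pairs are mixed by an angle proportional to dx*(k + 0.5)/DTT_SIZE,
// so sub-pixel shifts need no pixel-domain resampling.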
// Fractional pixel shift (phase rotation), vertical. In-place.
__device__ void shiftTileVert(
__device__ void shiftTileVert( // implemented, used
float *clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float residual_shift );
__device__ void convolveTiles(
__device__ void convolveTiles( // implemented, used
float* clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* kernel); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the CLT kernel (DTT3 converted)
__device__ void imclt(
__device__ void correlateAccumulateTiles(
float scale, // scale correlation
float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
float* corr_tile); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__device__ void resetCorrelation(
float* corr_tile); // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero); // fat zero is absolute, scale it outside
__device__ void corrUnfoldTile(
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt); // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__device__ void imclt( // implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt(
__device__ void imclt( // implemented, used // why is it twice?
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
float * mclt_tile ); // [2* DTT_SIZE][DTT_SIZE1+ DTT_SIZE], // +1 to alternate column ports[16][17]
__device__ void imclt_plane(
__device__ void imclt_plane( // not implemented, not used
int color,
float * gpu_clt, // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
float * gpu_rbg, // WIDTH, HEIGHT
const size_t dstride); // in floats (pixels)
extern "C"
__global__ void convert_correct_tiles(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
__global__ void correlate2D(
float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// int tilesX, // make it variable
int colors, // number of colors (3/1)
float scale0, // scale for R
float scale1, // scale for B
float scale2, // scale for G
float fat_zero, // here - absolute
size_t num_corr_tiles, // number of correlation tiles to process
int * gpu_corr_indices, // packed tile+pair
const size_t corr_stride, // in floats
float * gpu_corrs) // correlation output data
{
/// int thr3 = threadIdx.x >> 3; // now zero?
/// int column = threadIdx.x; // modify to use 2 * 8 threads, if needed.
float scales[3] = {scale0, scale1, scale2};
int corr_in_block = threadIdx.y;
int corr_num = blockIdx.x * CORR_TILES_PER_BLOCK + corr_in_block;
if (corr_num >= num_corr_tiles){
return; // nothing to do
}
// get number of pair and number of tile
#define ALLTILES 1
#ifdef ALLTILES
int corr_pair = corr_num % NUM_PAIRS;
int tile_num = corr_num / NUM_PAIRS;
#else
int corr_pair = gpu_corr_indices[corr_num];
int tile_num = corr_pair >> CORR_PAIR_SHIFT;
#endif
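// With ALLTILES defined, every (tile, pair) combination is processed in
// linescan order and gpu_corr_indices is ignored; otherwise each packed
// entry of gpu_corr_indices supplies both the tile and the pair number.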
corr_pair &= ((1 << CORR_PAIR_SHIFT) - 1);
if (corr_pair >= NUM_PAIRS){
return; // BUG - should not happen
}
int cam1 = pairs[corr_pair][0]; // number of the first camera in a pair
int cam2 = pairs[corr_pair][1]; // number of the second camera in a pair
__syncthreads();// __syncwarp();
__shared__ float clt_tiles1 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_tiles2 [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float clt_corrs [CORR_TILES_PER_BLOCK][4][DTT_SIZE][DTT_SIZE1];
__shared__ float mlt_corrs [CORR_TILES_PER_BLOCK][DTT_SIZE2M1][DTT_SIZE2M1]; // result correlation
// set clt_corr to all zeros
float * clt_corr = ((float *) clt_corrs) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1); // top left quadrant0
float * mclt_corr = ((float *) mlt_corrs) + corr_in_block * (DTT_SIZE2M1*DTT_SIZE2M1);
resetCorrelation(clt_corr);
for (int color = 0; color < colors; color++){
// copy clt (frequency domain data)
float * clt_tile1 = ((float *) clt_tiles1) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
float * clt_tile2 = ((float *) clt_tiles2) + corr_in_block * (4 * DTT_SIZE * DTT_SIZE1);
int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
float * gpu_tile1 = ((float *) gpu_clt[cam1]) + offs;
float * gpu_tile2 = ((float *) gpu_clt[cam2]) + offs;
float * clt_tile1i = clt_tile1 + threadIdx.x;
float * clt_tile2i = clt_tile2 + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE4; i++){ // copy 32 rows (4 quadrants of 8 rows)
*clt_tile1i= *gpu_tile1;
*clt_tile2i= *gpu_tile2;
clt_tile1i += DTT_SIZE1;
clt_tile2i += DTT_SIZE1;
gpu_tile1 += DTT_SIZE;
gpu_tile2 += DTT_SIZE;
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA1\n",tile_num, corr_pair,color);
debug_print_clt1(clt_tile1, color, 0xf); //
printf("\ncorrelate2D tile = %d, pair=%d, color = %d CAMERA22\n",tile_num, corr_pair,color);
debug_print_clt1(clt_tile2, color, 0xf); //
}
__syncthreads();// __syncwarp();
#endif
#endif
// each thread should get the same pointers here, offsets are inside
correlateAccumulateTiles(
scales[color], // float scale, // scale correlation
clt_tile1, // float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
clt_tile2, // float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
clt_corr); // float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D, color = %d CORRELATION\n", color);
debug_print_clt1(clt_corr, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
normalizeTileAmplitude(
clt_corr, // float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
fat_zero); // float fat_zero ) // fat zero is absolute, scale it outside
// Low Pass Filter from constant area (is it possible to replace?)
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION NORMALIZED, fat_zero=%f\n",fat_zero);
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D LPF\n");
debug_print_lpf(lpf_corr);
}
__syncthreads();// __syncwarp();
#endif
#endif
float *clt = clt_corr + threadIdx.x;
#pragma unroll
for (int q = 0; q < 4; q++){
float *lpf = lpf_corr + threadIdx.x;
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
(*clt) *= (*lpf);
clt += DTT_SIZE1;
lpf += DTT_SIZE;
}
}
__syncthreads();// __syncwarp();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D CORRELATION LPF-ed\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// now new part - need to transform with DCT-II and make 15x15
/*
// quadrant 0 dct_ii hor, dct_ii vert,
// quadrant 1 dct_ii hor, dst_ii vert,
// quadrant 2 dst_ii hor, dct_ii vert,
// quadrant 3 dst_ii hor, dst_ii vert,
Java code:
for (int quadrant = 0; quadrant < 4; quadrant++){
int mode = ((quadrant << 1) & 2) | ((quadrant >> 1) & 1); // transpose
tcorr[first_col][quadrant] = dtt.dttt_iie(tcorr[first_col][quadrant], mode, transform_size);
}
*/
// change to 16-32 threads?? in next iteration
// hor pass
for (int q = 0; q < 4; q++){
int is_sin = (q >> 1) & 1;
// int is_sin = q & 1;
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
dttii_shared_mem_nonortho(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER VERTICAL (HORIZONTAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
// vert pass
for (int q = 0; q < 4; q++){
int is_sin = q & 1;
// int is_sin = (q >> 1) & 1;
// dttii_shared_mem(clt_corr + q * (DTT_SIZE1 * DTT_SIZE) + threadIdx.x , DTT_SIZE1, is_sin); // vertical pass, thread is column
// dttii_shared_mem(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
dttii_shared_mem_nonortho(clt_corr + (q * DTT_SIZE + threadIdx.x) * DTT_SIZE1 , 1, is_sin); // horizontal pass, thread is row
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D AFTER HOSIZONTAL (VERTICAL) PASS\n");
debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
corrUnfoldTile(
(float *) clt_corr, // float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
(float *) mclt_corr); // float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after UNFOLD\n");
debug_print_corr_15x15(mclt_corr, -1);
}
__syncthreads();// __syncwarp();
#endif
#endif
// copy 15x15 tile to main memory
int corr_tile_offset = corr_stride * corr_num;
float *mem_corr = gpu_corrs + corr_tile_offset;
//CORR_THREADS_PER_TILE
// int offs = threadIdx.x;
#pragma unroll
for (int offs = threadIdx.x; offs < DTT_SIZE2M1*DTT_SIZE2M1; offs+=CORR_THREADS_PER_TILE){ // variable number of cycles per thread
mem_corr[offs] = mclt_corr[offs];
}
__syncthreads();
#ifdef DBG_TILE
#ifdef DEBUG6
if ((tile_num == DBG_TILE) && (corr_pair == 0) && (threadIdx.x == 0)){
printf("\ncorrelate2D after copy to main memory\n");
// debug_print_clt1(clt_corr, -1, 0xf);
}
__syncthreads();// __syncwarp();
#endif
#endif
}
extern "C"
__global__ void convert_correct_tiles(
// struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
float ** gpu_kernel_offsets, // [NUM_CAMS],
float ** gpu_kernels, // [NUM_CAMS],
float ** gpu_images, // [NUM_CAMS],
struct tp_task * gpu_tasks,
float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
size_t dstride, // in floats (pixels)
int num_tiles, // number of tiles in task
int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
{
// struct CltExtra* gpu_kernel_offsets = (struct CltExtra*) vgpu_kernel_offsets;
dim3 t = threadIdx;
int tile_in_block = threadIdx.y;
int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
......@@ -379,8 +673,6 @@ __global__ void convert_correct_tiles(
__shared__ struct tp_task tt [TILES_PER_BLOCK];
// Copy task data to shared memory
tt[tile_in_block].task = gpu_task -> task;
// tt[tile_in_block].tx = gpu_task -> tx;
// tt[tile_in_block].ty = gpu_task -> ty;
tt[tile_in_block].txy = gpu_task -> txy;
int thread0 = threadIdx.x & 1;
int thread12 = threadIdx.x >>1;
......@@ -426,7 +718,6 @@ __global__ void convert_correct_tiles(
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
// tt[tile_in_block].tx | (tt[tile_in_block].ty <<16), // const int txy,
tt[tile_in_block].txy, // const int txy,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE])
......@@ -556,6 +847,191 @@ __device__ void convolveTiles(
}
}
__device__ void correlateAccumulateTiles(
float scale, // scale correlation
float* clt_tile1, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 1, rows extended to optimize shared ports
float* clt_tile2, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data 2, rows extended to optimize shared ports
float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
{
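// Each CLT tile holds 4 DTT quadrants (CC, SC, CS, SS) forming a 2D
// half-complex spectrum. The k = 0..3 sign pattern below computes, per
// frequency sample, the product of one tile with the complex conjugate
// of the other (the phase-correlation numerator), scaled and accumulated
// into corr_tile. (Descriptive note added here; conjugation order assumed.)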
int joffs = threadIdx.x * DTT_SIZE1;
float * clt_tile2_j; // = clt_tile2 + joffs; // ==&clt_tile2[0][j][0]
float * clt_tile1_j0 = clt_tile1 + joffs; // ==&clt_tile[0][j][0]
float * clt_tile1_j1 = clt_tile1_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * clt_tile1_j2 = clt_tile1_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * clt_tile1_j3 = clt_tile1_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
float * corr_tile_j0 = corr_tile + joffs; // ==&clt_tile[0][j][0]
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
//#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
// k=0
clt_tile2_j = clt_tile2 + joffs + i;
float clt2 = *(clt_tile2_j);
float r0 = *(clt_tile1_j0) * clt2;
float r1 = -*(clt_tile1_j1) * clt2;
float r2 = -*(clt_tile1_j2) * clt2;
float r3 = *(clt_tile1_j3) * clt2;
// k = 1
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j1) * clt2;
r1 += *(clt_tile1_j0) * clt2;
r2 -= *(clt_tile1_j3) * clt2;
r3 -= *(clt_tile1_j2) * clt2;
// k=2
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j2) * clt2;
r1 -= *(clt_tile1_j3) * clt2;
r2 += *(clt_tile1_j0) * clt2;
r3 -= *(clt_tile1_j1) * clt2;
// k=3
clt_tile2_j += (DTT_SIZE1*DTT_SIZE);
clt2 = *(clt_tile2_j);
r0 += *(clt_tile1_j3) * clt2;
r1 += *(clt_tile1_j2) * clt2;
r2 += *(clt_tile1_j1) * clt2;
r3 += *(clt_tile1_j0) * clt2;
*(corr_tile_j0) += scale * r0;
*(corr_tile_j1) += scale * r1;
*(corr_tile_j2) += scale * r2;
*(corr_tile_j3) += scale * r3;
clt_tile1_j0 ++;
clt_tile1_j1 ++;
clt_tile1_j2 ++;
clt_tile1_j3 ++;
corr_tile_j0 ++;
corr_tile_j1 ++;
corr_tile_j2 ++;
corr_tile_j3 ++;
}
}
__device__ void resetCorrelation(
float* corr_tile) // [4][DTT_SIZE][DTT_SIZE1]) // 4 quadrants of the correlation result
{
int joffs = threadIdx.x * DTT_SIZE1;
float * corr_tile_j0 = corr_tile + joffs; // k = 0
float * corr_tile_j1 = corr_tile_j0 + (DTT_SIZE1*DTT_SIZE); // k = 1
float * corr_tile_j2 = corr_tile_j1 + (DTT_SIZE1*DTT_SIZE); // k = 2
float * corr_tile_j3 = corr_tile_j2 + (DTT_SIZE1*DTT_SIZE); // k = 3
//#pragma unroll
for (int i = 0; i < DTT_SIZE; i++){
*(corr_tile_j0) = 0;
*(corr_tile_j1) = 0;
*(corr_tile_j2) = 0;
*(corr_tile_j3) = 0;
corr_tile_j0 ++;
corr_tile_j1 ++;
corr_tile_j2 ++;
corr_tile_j3 ++;
}
}
__device__ void normalizeTileAmplitude(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float fat_zero ) // fat zero is absolute, scale it outside
{
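// Phase correlation normalization: each frequency sample is divided by its
// own magnitude, regularized by fat_zero:
// scale = 1/sqrt(fat_zero^2 + |sample|^2),
// so near-zero spectral components are attenuated instead of amplified.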
int joffs = threadIdx.x * DTT_SIZE1;
float * clt_tile_j0 = clt_tile + joffs; // ==&clt_tile[0][j][0]
float * clt_tile_j1 = clt_tile_j0 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[1][j][0]
float * clt_tile_j2 = clt_tile_j1 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[2][j][0]
float * clt_tile_j3 = clt_tile_j2 + (DTT_SIZE1*DTT_SIZE); // ==&clt_tile[3][j][0]
#pragma unroll
for (int i = 0; i < DTT_SIZE; i++) {
float s2 = fat_zero * fat_zero +
*(clt_tile_j0) * *(clt_tile_j0) +
*(clt_tile_j1) * *(clt_tile_j1) +
*(clt_tile_j2) * *(clt_tile_j2) +
*(clt_tile_j3) * *(clt_tile_j3);
float scale = rsqrtf(s2); // 1.0/sqrt(s2)
*(clt_tile_j0) *= scale;
*(clt_tile_j1) *= scale;
*(clt_tile_j2) *= scale;
*(clt_tile_j3) *= scale;
clt_tile_j0 ++; // =DTT_SIZE1;
clt_tile_j1 ++; // =DTT_SIZE1;
clt_tile_j2 ++; // =DTT_SIZE1;
clt_tile_j3 ++; // =DTT_SIZE1;
}
}
/*
Converted from DttRad2.java:443
public double [] corr_unfold_tile(
double [][] qdata, // [4][transform_size*transform_size] data after DCT2 (pixel domain)
int transform_size
)
*/
__device__ void corrUnfoldTile(
float* qdata0, // [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
float* rslt) // [DTT_SIZE2M1][DTT_SIZE2M1]) // 15x15
{
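// After the DCT-II/DST-II passes the 4 quadrants hold the even/even,
// even/odd, odd/even and odd/odd symmetric parts of the correlation.
// Unfolding combines them with +/- signs into the full 15x15
// (DTT_SIZE2M1 x DTT_SIZE2M1) pixel-domain tile centered at (7,7);
// thread i fills row (7+i) and mirrors into row (7-i).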
const int rslt_base_index = DTT_SIZE2M1 * (DTT_SIZE) - DTT_SIZE;
float * qdata1 = qdata0 + (DTT_SIZE * DTT_SIZE1);
float * qdata2 = qdata1 + (DTT_SIZE * DTT_SIZE1);
float * qdata3 = qdata2 + (DTT_SIZE * DTT_SIZE1);
int i = threadIdx.x;
float corr_pixscale = 0.25f;
int i_transform_size = i * DTT_SIZE1; // used to address source rows which are 9 long
int im1_transform_size = i_transform_size - DTT_SIZE1; // negative for i = 0, use only after divergence
int rslt_row_offs = i * DTT_SIZE2M1;
int rslt_base_index_p = rslt_base_index + rslt_row_offs; // i * DTT_SIZE2M1;
int rslt_base_index_m = rslt_base_index - rslt_row_offs; // i * DTT_SIZE2M1;
rslt[rslt_base_index_p] = corr_pixscale * qdata0[i_transform_size]; // incomplete, will only be used for thread i=0
rslt[rslt_base_index_m] = rslt[rslt_base_index_p]; // nop for i=0 incomplete, will only be used for thread i=0
for (int j = 1; j < DTT_SIZE; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
/// int rslt_base_index_mp = rslt_base_index_m + j;
/// int rslt_base_index_mm = rslt_base_index_m - j;
rslt[rslt_base_index_pp] = corr_pixscale * (
qdata0[i_transform_size + j] +
qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
rslt[rslt_base_index_pm] = corr_pixscale * (
qdata0[i_transform_size + j] +
-qdata1[i_transform_size + j -1]); // incomplete, will only be used for thread i=0
}
if (i == 0) {
return;
}
/// int im1 = i-1;
im1_transform_size = i_transform_size - DTT_SIZE1;
float d = corr_pixscale * qdata2[im1_transform_size];
rslt[rslt_base_index_p] += d;
rslt[rslt_base_index_m] -= d;
for (int j = 1; j < DTT_SIZE; j++) {
int rslt_base_index_pp = rslt_base_index_p + j;
int rslt_base_index_pm = rslt_base_index_p - j;
int rslt_base_index_mp = rslt_base_index_m + j;
int rslt_base_index_mm = rslt_base_index_m - j;
float d2 = corr_pixscale * qdata2[im1_transform_size + j];
float d3 = corr_pixscale * qdata3[im1_transform_size + j -1];
//rslt[rslt_base_index_pp], rslt[rslt_base_index_pm] are partially calculated in the cycle common with i=0
rslt[rslt_base_index_mp] = rslt[rslt_base_index_pp] - d2 - d3;
rslt[rslt_base_index_mm] = rslt[rslt_base_index_pm] - d2 + d3;
rslt[rslt_base_index_pp] += d2 + d3;
rslt[rslt_base_index_pm] += d2 - d3;
}
}
__device__ void debug_print_lpf(
float * lpf_tile)
{
for (int dbg_row = 0; dbg_row < DTT_SIZE; dbg_row++){
for (int dbg_col = 0; dbg_col < DTT_SIZE; dbg_col++){
printf ("%10.5f ", lpf_tile[dbg_row * DTT_SIZE + dbg_col]);
}
printf("\n");
}
}
__device__ void debug_print_clt1(
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
const int color,
......@@ -591,6 +1067,23 @@ __device__ void debug_print_mclt(
printf("\n");
}
__device__ void debug_print_corr_15x15(
float * mclt_tile, //DTT_SIZE2M1 x DTT_SIZE2M1
const int color)
{
if (color >= 0) printf("----------- Color = %d -----------\n",color);
for (int dbg_row = 0; dbg_row < DTT_SIZE2M1; dbg_row++){
for (int dbg_col = 0; dbg_col < DTT_SIZE2M1; dbg_col++){
printf ("%10.5f ", mclt_tile[dbg_row * DTT_SIZE2M1 + dbg_col]);
}
printf("\n");
}
printf("\n");
}
__device__ void convertCorrectTile(
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
float * gpu_kernels, // [tileY][tileX][color]
......@@ -1361,7 +1854,7 @@ __device__ void imclt_plane(
//
// Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Shuld be zeroed before the
// adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the
// first pass
//__constant__ int imclt_indx9[16] = {0x28,0x31,0x3a,0x43,0x43,0x3a,0x31,0x28,0x1f,0x16,0x0d,0x04,0x04,0x0d,0x16,0x1f};
__device__ void imclt(
......
......@@ -212,39 +212,40 @@ int main(int argc, char **argv)
// CLT testing
const char* kernel_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2_transposed.kernel",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3_transposed.kernel"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel"};
const char* kernel_offs_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3_transposed.kernel_offsets"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2_transposed.kernel_offsets",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3_transposed.kernel_offsets"};
const char* image_files[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.bayer",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.bayer"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.bayer",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.bayer"};
const char* ports_offs_xy_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.portsxy",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.portsxy"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
const char* ports_clt_file[] = { // never referenced
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.clt",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.clt"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.clt",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.clt"};
const char* result_rbg_file[] = {
"/data_ssd/git/cuda_dtt8x8/clt/main_chn0.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn1.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn2.rbg",
"/data_ssd/git/cuda_dtt8x8/clt/main_chn3.rbg"};
"/data_ssd/git/tile_processor_gpu/clt/main_chn0.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // R, B, G
......@@ -272,10 +273,12 @@ int main(int argc, char **argv)
int KERN_TILES = KERNELS_HOR * KERNELS_VERT * NUM_COLORS;
int KERN_SIZE = KERN_TILES * 4 * 64;
int CORR_SIZE = (2 * DTT_SIZE -1) * (2 * DTT_SIZE -1);
float * host_kern_buf = (float *)malloc(KERN_SIZE * sizeof(float));
struct tp_task task_data [TILESX*TILESY]; // maximal length - each tile
int corr_indices [NUM_PAIRS*TILESX*TILESY];
// host array of pointers to GPU memory
float * gpu_kernels_h [NUM_CAMS];
......@@ -288,6 +291,10 @@ int main(int argc, char **argv)
float * gpu_corr_images_h [NUM_CAMS];
#endif
float * gpu_corrs;
// float * gpu_corr_indices;
int * gpu_corr_indices;
int num_corrs;
// GPU pointers to GPU pointers to memory
float ** gpu_kernels; // [NUM_CAMS];
struct CltExtra ** gpu_kernel_offsets; // [NUM_CAMS];
......@@ -300,6 +307,8 @@ int main(int argc, char **argv)
struct tp_task * gpu_tasks;
size_t dstride; // in bytes !
size_t dstride_rslt; // in bytes !
size_t dstride_corr; // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
float lpf_rbg[3][64];
for (int ncol = 0; ncol < 3; ncol++) {
......@@ -339,9 +348,12 @@ int main(int argc, char **argv)
IMG_WIDTH + DTT_SIZE, // int width,
3*(IMG_HEIGHT + DTT_SIZE)); // int height);
#endif
}
// allocate one 2D correlation output tile (15x15 floats) per row; the number of rows is the number of tiles times the number of pairs
gpu_corrs = alloc_image_gpu(
&dstride_corr, // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
CORR_SIZE, // int width,
NUM_PAIRS * TILESX * TILESY); // int height);
// read channel images (assuming host_kern_buf size > image size, reusing it)
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
readFloatsFromFile(
......@@ -361,8 +373,24 @@ int main(int argc, char **argv)
ports_offs_xy_file[ncam]); // char * path) // file path
}
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 0xf | (((1 << NUM_PAIRS)-1) << TASK_CORR_BITS);
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG_TILE
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
for (int t = 0; t < NUM_TEST_TILES; t++) {
......@@ -375,28 +403,34 @@ int main(int argc, char **argv)
task_data[t].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
int tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#else
// build TP task that processes all tiles in linescan order
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
task_data[nt].task = 1;
task_data[nt].txy = tx + (ty << 16);
for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
task_data[nt].xy[ncam][0] = tile_coords_h[ncam][nt][0];
task_data[nt].xy[ncam][1] = tile_coords_h[ncam][nt][1];
}
}
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#endif
#endif
// segfault in the next
gpu_tasks = (struct tp_task *) copyalloc_kernel_gpu((float * ) &task_data, tp_task_size * (sizeof(struct tp_task)/sizeof(float)));
// build corr_indices
num_corrs = 0;
for (int ty = 0; ty < TILESY; ty++){
for (int tx = 0; tx < TILESX; tx++){
int nt = ty * TILESX + tx;
int cm = (task_data[nt].task >> TASK_CORR_BITS) & ((1 << NUM_PAIRS)-1);
if (cm){
for (int b = 0; b < NUM_PAIRS; b++) if ((cm & (1 << b)) != 0) {
corr_indices[num_corrs++] = (nt << CORR_PAIR_SHIFT) | b;
}
}
}
}
// num_corrs now has the total number of correlations
// copy corr_indices to gpu
// gpu_corr_indices = (float *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
gpu_corr_indices = (int *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
// will need to pass num_corrs too
// Now copy arrays of per-camera pointers to GPU memory to GPU itself
gpu_kernels = copyalloc_pointers_gpu (gpu_kernels_h, NUM_CAMS);
......@@ -440,9 +474,11 @@ int main(int argc, char **argv)
gpu_images, // float ** gpu_images,
gpu_tasks, // struct tp_task * gpu_tasks,
gpu_clt, // float ** gpu_clt, // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
// gpu_corrs, // float * gpu_corrs, // [][15x15] - padded
dstride/sizeof(float), // size_t dstride, // for gpu_images
// dstride_corr/sizeof(float), //size_t dstride_corr, // in floats: padded correlation size
tp_task_size, // int num_tiles) // number of tiles in task
7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
0); // 7); // 0); // 7); // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
getLastCudaError("Kernel execution failed");
......@@ -584,6 +620,82 @@ int main(int argc, char **argv)
free(cpu_corr_image);
#endif
#ifndef NOCORR
// testing corr
dim3 threads_corr(CORR_THREADS_PER_TILE, CORR_TILES_PER_BLOCK, 1);
printf("threads_corr=(%d, %d, %d)\n",threads_corr.x,threads_corr.y,threads_corr.z);
StopWatchInterface *timerCORR = 0;
sdkCreateTimer(&timerCORR);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerCORR);
sdkStartTimer(&timerCORR);
}
dim3 grid_corr((num_corrs + CORR_TILES_PER_BLOCK-1) / CORR_TILES_PER_BLOCK,1,1);
correlate2D<<<grid_corr,threads_corr>>>(
gpu_clt, // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
3, // int colors, // number of colors (3/1)
0.25, // float scale0, // scale for R
0.25, // float scale1, // scale for B
0.5, // float scale2, // scale for G
30.0, // float fat_zero, // here - absolute
num_corrs, // size_t num_corr_tiles, // number of correlation tiles to process
gpu_corr_indices, // int * gpu_corr_indices, // packed tile+pair
dstride_corr/sizeof(float), // const size_t corr_stride, // in floats
gpu_corrs); // float * gpu_corrs); // correlation output data
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerCORR);
float avgTimeCORR = (float)sdkGetTimerValue(&timerCORR) / (float)numIterations;
sdkDeleteTimer(&timerCORR);
printf("Average CORR run time =%f ms\n", avgTimeCORR);
int corr_size = 2 * CORR_OUT_RAD + 1;
int rslt_corr_size = num_corrs * corr_size * corr_size;
float * cpu_corr = (float *)malloc(rslt_corr_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_corr,
(corr_size * corr_size) * sizeof(float),
gpu_corrs,
dstride_corr,
(corr_size * corr_size) * sizeof(float),
num_corrs,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_CORR
printf("Writing phase correlation data to %s\n", result_corr_file);
writeFloatsToFile(
cpu_corr, // float * data, // allocated array
rslt_corr_size, // int size, // length in elements
result_corr_file); // const char * path) // file path
#endif
free(cpu_corr);
#endif // ifndef NOCORR
#ifdef SAVE_CLT
free(cpu_clt);
#endif
......@@ -605,5 +717,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaFree(gpu_images));
checkCudaErrors(cudaFree(gpu_clt));
// checkCudaErrors(cudaFree(gpu_corr_images));
checkCudaErrors(cudaFree(gpu_corrs));
checkCudaErrors(cudaFree(gpu_corr_indices));
exit(0);
}
......@@ -84,6 +84,7 @@ __constant__ float SINN1[] = {0.195090f,0.555570f};
__constant__ float SINN2[] = {0.098017f,0.290285f,0.471397f,0.634393f};
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct); // does not scale y[0] (y[7]) by 1/sqrt(2)
inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct); // used in GPU_DTT24_DRV
inline __device__ void dttiv_nodiverg (float * x, int inc, int dst_not_dct); // not used
......@@ -376,6 +377,88 @@ inline __device__ void dttii_shared_mem(float * x0, int inc, int dst_not_dct)
}
}
inline __device__ void dttii_shared_mem_nonortho(float * x0, int inc, int dst_not_dct)
{
float *x1 = x0 + inc;
float *x2 = x1 + inc;
float *x3 = x2 + inc;
float *x4 = x3 + inc;
float *x5 = x4 + inc;
float *x6 = x5 + inc;
float *x7 = x6 + inc;
float u00, u01, u02, u03, u10, u11, u12, u13;
if (dst_not_dct) { // DSTII
// invert odd input samples
u00= ( (*x0) - (*x7));
u10= ( (*x0) + (*x7));
u01= (-(*x1) + (*x6));
u11= (-(*x1) - (*x6));
u02= ( (*x2) - (*x5));
u12= ( (*x2) + (*x5));
u03= (-(*x3) + (*x4));
u13= (-(*x3) - (*x4));
} else { // DCTII
u00= ( (*x0) + (*x7));
u10= ( (*x0) - (*x7));
u01= ( (*x1) + (*x6));
u11= ( (*x1) - (*x6));
u02= ( (*x2) + (*x5));
u12= ( (*x2) - (*x5));
u03= ( (*x3) + (*x4));
u13= ( (*x3) - (*x4));
}
// _dctii_nrecurs4(u00,u01, u02, u03, &v00, &v01, &v02, &v03);
float w00= u00 + u03;
float w10= u00 - u03;
float w01= (u01 + u02);
float w11= (u01 - u02);
float v01= COSPI_1_8_SQRT2 * w10 + COSPI_3_8_SQRT2 * w11;
float v03= COSPI_3_8_SQRT2 * w10 - COSPI_1_8_SQRT2 * w11;
// _dctiv_nrecurs4(u10, u11, u12, u13, &v10, &v11, &v12, &v13);
float w20= ( COSN1[0] * u10 + SINN1[0] * u13);
float w30= (-SINN1[1] * u11 + COSN1[1] * u12);
float w21= ( COSN1[1] * u11 + SINN1[1] * u12);
float w31= -(-SINN1[0] * u10 + COSN1[0] * u13);
float v11 = w20 - w21 - w30 + w31;
float v12 = w20 - w21 + w30 - w31;
if (dst_not_dct) { // DSTII
// Invert output sequence
*x0 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
*x1 = v03 * SQRT1_8;
*x2 = v12 * SQRT1_8;
*x3 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x4 = v11 * SQRT1_8;
*x5 = v01 * SQRT1_8;
*x6 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x7 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
} else {
*x0 = (w00 + w01) * 0.5f; // SQRT1_8; // v00 * SQRT1_8 //*** no 1/sqrt(2)!
*x1 = (w20 + w21) * 0.5f; // v10 * SQRT1_8; z00 * 0.5f;
*x2 = v01 * SQRT1_8;
*x3 = v11 * SQRT1_8;
*x4 = (w00 - w01) * SQRT1_8; // v02 * SQRT1_8
*x5 = v12 * SQRT1_8;
*x6 = v03 * SQRT1_8;
*x7 = (w30 + w31)* 0.5f; // v13 * SQRT1_8; z10 * 0.5f
}
}
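// Usage in correlate2D (see above): called with inc == DTT_SIZE1 for the
// vertical pass (one thread per column) and inc == 1 for the horizontal
// pass (one thread per row); dst_not_dct selects DST-II for the quadrants
// with odd symmetry along the corresponding axis.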
inline __device__ void dttiv_shared_mem(float * x0, int inc, int dst_not_dct)
{
......