changing direct conversion to CDP, handling sparse tasks

3c30a0bc · Andrey Filippov · f134cfa4 · 3c30a0bc · 3c30a0bc · 3c30a0bc
Commit 3c30a0bc authored Apr 16, 2020 by Andrey Filippov
5 changed files
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -104,6 +104,8 @@ GPU run time =523.451927ms, (direct conversion: 24.080189999999998ms, imclt: 17.
 #define KERNELS_STEP  (1 << KERNELS_LSTEP)
 #define TILESX        (IMG_WIDTH / DTT_SIZE)
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
+#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
 // Make TILESYA >= TILESX and a multiple of 4
 #define TILESYA       ((TILESY +3) & (~3))
 // increase row length by 1 so vertical passes will use different ports
@@ -1541,10 +1543,132 @@ __global__ void gen_texture_list(
 #endif //#ifdef USE_CDP
+//#define CONVERT_DIRECT_INDEXING_THREADS_LOG2 5
+//#define CONVERT_DIRECT_INDEXING_THREADS (1 << CONVERT_DIRECT_INDEXING_THREADS_LOG2) // 32
+//#define CONVERT_DIRECT_NUM_CHUNKS  ((TILESY*TILESX+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
+//#define CONVERT_DIRECT_NUM_CHUNKS2 ((CONVERT_DIRECT_NUM_CHUNKS+CONVERT_DIRECT_INDEXING_THREADS-1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2)
+//__global__ int num_active_tiles;
+//__global__ int active_tiles        [TILESY*TILESX]; // indices of tiles in gpu_tasks that have non-zero correlations and/or textures
+//__device__ int num_acive_per_chunk [CONVERT_DIRECT_NUM_CHUNKS+1];
+//__device__ int num_acive_per_chunk2[CONVERT_DIRECT_NUM_CHUNKS2+1];
+// not maintaining order of the tiles to be processed
+extern "C" __global__ void index_direct(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
+		int *              num_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
+{
+	int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	if (num_tile >= num_tiles){
+		return;
+	}
+	if (gpu_tasks[num_tile].task != 0) {
+		active_tiles[atomicAdd(num_active_tiles, 1)] = num_tile;
+	}
+}
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			int *              pnum_active_tiles)  //  indices to gpu_tasks
+{
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+	 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+		 *pnum_active_tiles = 0;
+		 index_direct<<<blocks0,threads0>>>(
+				 gpu_tasks,           // struct tp_task   * gpu_tasks,
+				 num_tiles,           //int                num_tiles,          // number of tiles in task
+				 gpu_active_tiles,    //int *              active_tiles,      // pointer to the calculated number of non-zero tiles
+				 pnum_active_tiles);  //int *              pnum_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
+		 cudaDeviceSynchronize();
+		 // now call actual convert_correct_tiles
+		 dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
+		 dim3 grid_tp((*pnum_active_tiles + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
+		 convert_correct_tiles<<<grid_tp,threads_tp>>>(
+				 gpu_kernel_offsets, // float           ** gpu_kernel_offsets, // [NUM_CAMS],
+				 gpu_kernels,        // float           ** gpu_kernels,        // [NUM_CAMS],
+				 gpu_images,         // float           ** gpu_images,         // [NUM_CAMS],
+				 gpu_tasks,          // struct tp_task   * gpu_tasks,          // array of tasks
+				 gpu_active_tiles,   // int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+				 *pnum_active_tiles, // int                num_active_tiles,   // number of tiles in task
+				 gpu_clt,            // float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				 dstride,            // size_t             dstride,            // in floats (pixels)
+				 lpf_mask,           // int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+				 woi_width,          // int                woi_width,          // varaible to swict between EO and LWIR
+				 woi_height,         // int                woi_height,         // varaible to swict between EO and LWIR
+				 kernels_hor,        // int                kernels_hor,        // varaible to swict between EO and LWIR
+				 kernels_vert);      // int                kernels_vert);      // varaible to swict between EO and LWIR
+	 }
+}
+#if 0 // trying to keep the same order
+extern "C" __global__ void index_direct_init(
+		int                num_chunks,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	int chunk_index = blockIdx.x * blockDim.x + threadIdx.x;
+	if (chunk_index <= num_chunks){
+		tmp->num_acive_per_chunk[chunk_index] = 0;
+	}
+}
+extern "C" __global__ void index_direct(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	__shared__ int num_active;
+	if (threadIdx.x == 0) {
+		num_active = 0;
+	}
+	 __syncthreads();
+	 int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	 if (num_tile < num_tiles) {
+		 if (gpu_tasks[num_tile].task){
+			 atomicAdd(&num_active, 1);
+		 }
+	 }
+	 __syncthreads();
+	 tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
+}
-extern "C"
+extern "C" __global__ void build_index_direct(
-__global__ void convert_correct_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,          // number of tiles in task
+		struct  convert_direct_tmp* tmp)
+{
+	__shared__ int num_active;
+	if (threadIdx.x == 0) {
+		num_active = 0;
+	}
+	 __syncthreads();
+	 int num_tile = blockIdx.x * blockDim.x + threadIdx.x;
+	 if (num_tile < num_tiles) {
+		 if (gpu_tasks[num_tile].task){
+			 atomicAdd(&num_active, 1);
+		 }
+	 }
+	 __syncthreads();
+	 tmp -> num_acive_per_chunk[(num_tile >> CONVERT_DIRECT_INDEXING_THREADS_LOG2) + 1] = num_active; // skip [0]
+}
+/**
+ * Top level to call other kernel with CDP
+ */
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
 //		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
 			float           ** gpu_kernel_offsets, // [NUM_CAMS],
 			float           ** gpu_kernels,        // [NUM_CAMS],
@@ -1553,6 +1677,85 @@ __global__ void convert_correct_tiles(
 			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 			size_t             dstride,            // in floats (pixels)
 			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              num_active_tiles,
+			struct  convert_direct_tmp* tmp) // temporary storage - avoiding static data for future overlap of kernel execution
+{
+	 int num_chunks = (num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 int num_chunk_blocks = (num_chunks + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+	 dim3 blocks0 (num_chunk_blocks,1, 1);
+	 __shared__ int superchunks[CONVERT_DIRECT_INDEXING_THREADS + 1];
+	 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+		 index_direct_init<<<blocks0,threads0>>>(num_chunks, tmp); // zero num_acive_per_chunk[]
+		 cudaDeviceSynchronize(); // not needed yet, just for testing
+		 index_direct<<<blocks0,threads0>>>(
+				 gpu_tasks,  // struct tp_task   * gpu_tasks,
+				 num_tiles, // int                num_tiles)
+				 tmp);
+		 cudaDeviceSynchronize(); // not needed yet, just for testing
+		 // single-threaded - make cumulative
+//		 tmp-> num_acive_per_chunk2[0] = 0;
+		 superchunks[0] = 0;
+	 }
+	 __syncthreads();
+	 // calculate cumulative in 3 steps
+	 //1. num_acive_per_chunk2 with each element being a sum of CONVERT_DIRECT_INDEXING_THREADS (32) elements of num_acive_per_chunk
+	 int num_passes = (num_chunk_blocks + CONVERT_DIRECT_INDEXING_THREADS - 1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2;
+	 for (int pass = 0; pass < num_passes; pass++){
+		 int num_cluster2 = (pass << CONVERT_DIRECT_INDEXING_THREADS_LOG2) + threadIdx.x + 1; // skip 0
+		 if (num_cluster2 <= num_chunk_blocks){
+			 superchunks[threadIdx.x+1] = superchunks[0];
+			 int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 if (indx <= num_chunks) {
+					 superchunks[threadIdx.x+1] += tmp -> num_acive_per_chunk[indx++];
+				 }
+			 }
+		 }
+		 __syncthreads();
+		 // make superchunks cumulative (single-threaded
+		 if (threadIdx.x == 0) { // of CONVERT_DIRECT_INDEXING_THREADS
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 superchunks[i + 1] += superchunks[i];
+			 }
+		 }
+		 __syncthreads();
+		 // now update tmp -> num_acive_per_chunk[] by adding them together and adding the initial value
+		 if (num_cluster2 <= num_chunk_blocks){
+			 int indx = num_cluster2 << CONVERT_DIRECT_INDEXING_THREADS_LOG2 + 1;
+			 tmp -> num_acive_per_chunk[indx] += superchunks[threadIdx.x];
+			 for (int i = 0; i < CONVERT_DIRECT_INDEXING_THREADS; i++){
+				 int prev = tmp -> num_acive_per_chunk[indx++];
+				 if (indx <= num_chunks) {
+					 tmp -> num_acive_per_chunk[indx] += prev;
+				 }
+			 }
+		 }
+		 __syncthreads();
+	 }
+}
+#endif
+extern "C" __global__ void convert_correct_tiles(
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+			int                num_active_tiles,   // number of tiles in task
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+//			int                num_tiles,          // number of tiles in task
 			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
 			int                woi_width,
 			int                woi_height,
@@ -1561,8 +1764,14 @@ __global__ void convert_correct_tiles(
 {
 	dim3 t = threadIdx;
 	int tile_in_block = threadIdx.y;
-	int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
+//	int task_num = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
-	if (task_num >= num_tiles) return; // nothing to do
+//	if (task_num >= num_tiles) return; // nothing to do
+	int task_indx = blockIdx.x * TILES_PER_BLOCK + tile_in_block;
+	if (task_indx >=  num_active_tiles){
+		return; // nothing to do
+	}
+	int task_num = gpu_active_tiles[task_indx];
 	struct tp_task  * gpu_task = &gpu_tasks[task_num];
 	if (!gpu_task->task)       return; // NOP tile
 	__shared__ struct tp_task tt [TILES_PER_BLOCK];
@@ -1626,8 +1835,7 @@ __global__ void convert_correct_tiles(
 					woi_height,                      // int                woi_height,
 					kernels_hor,                     // int                kernels_hor,
 					kernels_vert); //int                kernels_vert)
+    		 __syncthreads();
-    		 __syncthreads();// __syncwarp();
    	}
    }
 }
@@ -1989,7 +2197,7 @@ __global__ void textures_accumulate(
 		int tileX = tile_num - tileY * TILESX;
 		int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
 		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
-		int height = *(woi + 3) << DTT_SIZE_LOG2;
+///		int height = *(woi + 3) << DTT_SIZE_LOG2;
 #ifdef DEBUG12
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -41,21 +41,44 @@
 #include "tp_defines.h"
 #endif
+extern "C" __global__ void index_direct(
-extern "C"
-__global__ void convert_correct_tiles(
-		float           ** gpu_kernel_offsets, // [NUM_CAMS],
-		float           ** gpu_kernels,        // [NUM_CAMS],
-		float           ** gpu_images,         // [NUM_CAMS],
 		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
-		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+		int *              active_tiles,      // pointer to the calculated number of non-zero tiles
-		int                woi_width,
+		int *              num_active_tiles);  //  indices to gpu_tasks  // should be initialized to zero
-		int                woi_height,
-		int                kernels_hor,
+extern "C" __global__ void convert_direct( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
-		int                kernels_vert);
+//		struct CltExtra ** gpu_kernel_offsets, // [NUM_CAMS], // changed for jcuda to avoid struct parameters
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert,
+			int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			int *              pnum_active_tiles);  //  indices to gpu_tasks
+extern "C" __global__ void convert_correct_tiles(
+			float           ** gpu_kernel_offsets, // [NUM_CAMS],
+			float           ** gpu_kernels,        // [NUM_CAMS],
+			float           ** gpu_images,         // [NUM_CAMS],
+			struct tp_task   * gpu_tasks,
+			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
+			int                num_active_tiles,   // number of tiles in task
+			float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+			size_t             dstride,            // in floats (pixels)
+//			int                num_tiles,          // number of tiles in task
+			int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
+			int                woi_width,
+			int                woi_height,
+			int                kernels_hor,
+			int                kernels_vert);
 extern "C" __global__ void clear_texture_list(

--- a/src/geometry_correction.cu
+++ b/src/geometry_correction.cu
@@ -62,6 +62,8 @@ __device__ void printExtrinsicCorrection(corr_vector * cv);
 inline __device__ float getRByRDist(float rDist,
 		float rByRDist [RBYRDIST_LEN]); //shared memory
 __constant__ float ROTS_TEMPLATE[7][3][3][3] = {//  ...{cos,sin,const}...
 		{ // azimuth
 				{{ 1, 0,0},{0, 0,0},{ 0,-1,0}},
@@ -116,201 +118,6 @@ __constant__ int mm_seq [3][3][3]={
 				{-1,-1,-1} // do nothing
 		}};
-#if 0
-__device__ float rot_matrices       [NUM_CAMS][3][3];
-//__device__ float rot_deriv_matrices [NUM_CAMS][4][3][3]; // /d_azimuth, /d_tilt, /d_roll, /d_zoom)
-// threads (3,3,4)
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector)
-{
-	__shared__ float zoom    [NUM_CAMS];
-	__shared__ float sincos  [NUM_CAMS][3][2];    // {az,tilt,roll, d_az, d_tilt, d_roll, d_az}{cos,sin}
-	__shared__ float matrices[NUM_CAMS][4][3][3]; // [7] - extra
-	float angle;
-	int ncam = threadIdx.z;
-	int nangle1 = threadIdx.x + threadIdx.y * blockDim.x; // * >> 1;
-	int nangle =  nangle1 >> 1;
-	int is_sin = nangle1 & 1;
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0)  && ( threadIdx.y == 0)  && ( threadIdx.z == 0)){
-		printf("\nget_tiles_offsets() threadIdx.x = %d, blockIdx.x= %d\n", (int)threadIdx.x, (int) blockIdx.x);
-		printExtrinsicCorrection(gpu_correction_vector);
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	if (nangle < 4){ // this part only for 1-st 3
-		float* gangles =
-				(nangle ==0)?gpu_correction_vector->azimuth:(
-						(nangle ==1)?gpu_correction_vector->tilt:(
-								(nangle ==2)?gpu_correction_vector->roll:
-										gpu_correction_vector->zoom));
-		if ((ncam < (NUM_CAMS -1)) || (nangle == 2)){ // for rolls - all 4
-			angle = *(gangles + ncam);
-		} else {
-			angle = 0.0f;
-#pragma	unroll
-			for (int n = 0; n < (NUM_CAMS-1); n++){
-				angle -= *(gangles + n);
-			}
-		}
-		if (!is_sin){
-			angle += M_PI/2;
-		}
-		if (nangle < 3) {
-			sincos[ncam][nangle][is_sin]=sinf(angle);
-		} else if (is_sin){
-			zoom[ncam] = angle;
-		}
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	if (nangle == 3) {
-		sincos[ncam][2][is_sin] *= (1.0 + zoom[ncam]); // modify roll
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\na    Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f, zoom = %f\n", n, sincos[n][0][0], sincos[n][0][1], zoom[n]);
-			printf("a    Tilt matrix for camera %d, sincos[0] = %f, sincos[0] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			printf("a    Roll matrix for camera %d, sincos[0] = %f, sincos[2] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-	// now 3x3
-	for (int axis = 0; axis < 3; axis++) {
-		matrices[ncam][axis][threadIdx.y][threadIdx.x] =
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][0] * sincos[ncam][axis][0]+ // cos
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][1] * sincos[ncam][axis][1]+ // sin
-				ROTS_TEMPLATE[axis][threadIdx.y][threadIdx.x][2];                         // const
-	}
-	__syncthreads();
-#ifdef DEBUG20a
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n1-Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-			printf("1-Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-			printf("1-Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-    // tilt * az ->
-	// multiply matrices[ncam][1] * matrices[ncam][0] -> matrices[ncam][3]
-	matrices[ncam][3][threadIdx.y][threadIdx.x] =
-			matrices[ncam][1][threadIdx.y][0] * matrices[ncam][0][0][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][1] * matrices[ncam][0][1][threadIdx.x]+
-			matrices[ncam][1][threadIdx.y][2] * matrices[ncam][0][2][threadIdx.x];
-	// multiply matrices[ncam][2] * matrices[ncam][3] -> rot_matrices[ncam]
-	__syncthreads();
-	rot_matrices[ncam][threadIdx.y][threadIdx.x] =
-			matrices[ncam][2][threadIdx.y][0] * matrices[ncam][3][0][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][1] * matrices[ncam][3][1][threadIdx.x]+
-			matrices[ncam][2][threadIdx.y][2] * matrices[ncam][3][2][threadIdx.x];
-	__syncthreads();
-#ifdef DEBUG20
-	if ((threadIdx.x == 0) && (threadIdx.y == 0) && (threadIdx.z == 0)){
-		for (int n = 0; n < NUM_CAMS; n++){
-			printf("\n2 - Azimuth matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][0][0], sincos[n][0][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][0][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Tilt matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][1][0], sincos[n][1][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][1][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Roll/Zoom matrix for camera %d, sincos[0] = %f, sincos[1] = %f\n", n, sincos[n][2][0], sincos[n][2][1]);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", matrices[n][2][i][j]);
-				}
-				printf("\n");
-			}
-			printf("2 - Rotation matrix for camera %d\n", n);
-			for (int i = 0; i < 3; i++){
-				for (int j = 0; j < 3; j++){
-					printf("%9.6f, ", rot_matrices[n][i][j]);
-				}
-				printf("\n");
-			}
-		}
-	}
-	__syncthreads();// __syncwarp();
-#endif // DEBUG20
-}
-#endif
 __constant__ int offset_rots =     0;                   //0
 __constant__ int offset_derivs =   1;                   // 1..4 // should be next
 __constant__ int offset_matrices = 5;   // 5..11
@@ -890,8 +697,69 @@ extern "C" __global__ void get_tiles_offsets(
 }
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist)
+{
+	//int num_threads = NUM_CAMS *  blockDim.z  *  blockDim.y * blockDim.x; // 36
+	int indx =  ((blockIdx.x * blockDim.z + threadIdx.z) *  blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+//	double delta=1E-20; // 12; // 10; // -8; 215.983994 ms
+//	double delta=1E-4; //rByRDist error = 0.000072
+	double delta=1E-10; // 12; // 10; // -8; 0.730000 ms
+	double minDerivative=0.01;
+	int numIterations=1000;
+	double drDistDr=1.0;
+	double d=1.0
+			-geometry_correction -> distortionA8
+			-geometry_correction -> distortionA7
+			-geometry_correction -> distortionA6
+			-geometry_correction -> distortionA5
+			-geometry_correction -> distortionA
+			-geometry_correction -> distortionB
+			-geometry_correction -> distortionC;
+	double rPrev=0.0;
+	int num_points = (RBYRDIST_LEN + CALC_REVERSE_TABLE_BLOCK_THREADS - 1) / CALC_REVERSE_TABLE_BLOCK_THREADS;
+	for (int p = 0; p < num_points; p ++){
+		int i = indx * num_points +p;
+		if (i >= RBYRDIST_LEN){
+			return;
+		}
+		if (i == 0){
+			rByRDist[0]= (float) 1.0/d;
+			break;
+		}
+		double rDist = RBYRDIST_STEP * i;
+		double r = (p == 0) ? rDist : rPrev;
+		for (int iteration=0;iteration<numIterations;iteration++){
+			double k=(((((((
+					geometry_correction -> distortionA8) * r +
+					geometry_correction -> distortionA7) * r +
+					geometry_correction -> distortionA6) * r +
+					geometry_correction -> distortionA5) * r +
+					geometry_correction -> distortionA) * r +
+					geometry_correction -> distortionB) * r +
+					geometry_correction -> distortionC) * r + d;
+			drDistDr=(((((((
+					8 * geometry_correction -> distortionA8) * r +
+					7 * geometry_correction -> distortionA7) * r +
+					6 * geometry_correction -> distortionA6) * r +
+					5 * geometry_correction -> distortionA5) * r +
+					4 * geometry_correction -> distortionA) * r +
+					3 * geometry_correction -> distortionB) * r+
+					2 * geometry_correction -> distortionC) * r+d;
+			if (drDistDr<minDerivative) { // folds backwards !
+				return; // too high distortion
+			}
+			double rD=r*k;
+			if (fabs(rD-rDist)<delta){
+				break;
+			}
+			r+=(rDist-rD)/drDistDr;
+		}
+		rPrev=r;
+		rByRDist[i]= (float) r/rDist;
+	}
+}
 /**
 * Calculate non-distorted radius from distorted using table approximation

--- a/src/geometry_correction.h
+++ b/src/geometry_correction.h
@@ -148,14 +148,15 @@ extern "C" __global__ void get_tiles_offsets(
 		float *              gpu_rByRDist, // length should match RBYRDIST_LEN
 		trot_deriv   * gpu_rot_deriv);
-#if 0
-// uses 3 threadIdx.x, 3 - threadIdx.y, 4 - threadIdx.z
-extern "C" __global__ void calc_rot_matrices(
-		struct corr_vector * gpu_correction_vector);
-#endif
 // uses NUM_CAMS blocks, (3,3,3) threads
 extern "C" __global__ void calc_rot_deriv(
 		struct corr_vector * gpu_correction_vector,
 		trot_deriv   * gpu_rot_deriv);
+#define CALC_REVERSE_TABLE_BLOCK_THREADS (NUM_CAMS * 3 * 3 * 3) // fixed blockDim
+// Use same blocks/threads as with calc_rot_deriv() - NUM_CAMS blocks, (3,3,3) threads
+extern "C" __global__ void calcReverseDistortionTable(
+		struct gc * geometry_correction,
+		float * rByRDist);
--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -278,6 +278,7 @@ int main(int argc, char **argv)
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.portsxy",
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.portsxy"};
+#ifndef DBG_TILE
    const char* ports_clt_file[] = { // never referenced
    		"/data_ssd/git/tile_processor_gpu/clt/main_chn0.clt",
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.clt",
@@ -288,6 +289,7 @@ int main(int argc, char **argv)
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn1.rbg",
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
 			"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
+#endif
    const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
    const char* result_textures_file =       "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
    const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
@@ -297,7 +299,7 @@ int main(int argc, char **argv)
    const char* geometry_correction_file = "/data_ssd/git/tile_processor_gpu/clt/main.geometry_correction";
    // not yet used
-    float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
+///    float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
    float port_offsets[NUM_CAMS][2] =  {// used only in textures to scale differences
 			{-0.5, -0.5},
@@ -332,7 +334,7 @@ int main(int argc, char **argv)
    float            * gpu_images_h         [NUM_CAMS];
    float              tile_coords_h        [NUM_CAMS][TILESX * TILESY][2];
    float            * gpu_clt_h            [NUM_CAMS];
-    float            * gpu_lpf_h            [NUM_COLORS]; // never used
+///    float            * gpu_lpf_h            [NUM_COLORS]; // never used
    float            * gpu_corr_images_h    [NUM_CAMS];
    float            * gpu_corrs;
@@ -353,11 +355,16 @@ int main(int argc, char **argv)
    float           ** gpu_images; //            [NUM_CAMS];
    float           ** gpu_clt;    //            [NUM_CAMS];
    float           ** gpu_corr_images; //       [NUM_CAMS];
-    float           ** gpu_lpf;    //            [NUM_CAMS]; // never referenced
+///    float           ** gpu_lpf;    //            [NUM_CAMS]; // never referenced
    // GPU pointers to GPU memory
 //    float * gpu_tasks;
    struct tp_task  * gpu_tasks;
+    int *             gpu_active_tiles;
+    int *             gpu_num_active;
+    checkCudaErrors (cudaMalloc((void **)&gpu_active_tiles, TILESX * TILESY * sizeof(int)));
+    checkCudaErrors (cudaMalloc((void **)&gpu_num_active,                     sizeof(int)));
    size_t  dstride;          // in bytes !
    size_t  dstride_rslt;     // in bytes !
    size_t  dstride_corr;     // in bytes ! for one 2d phase correlation (padded 15x15x4 bytes)
@@ -399,7 +406,7 @@ int main(int argc, char **argv)
 			rByRDist_length);
    checkCudaErrors(cudaMalloc((void **)&gpu_rot_deriv, sizeof(trot_deriv)));
+/*
    float lpf_rbg[3][64]; // not used
    for (int ncol = 0; ncol < 3; ncol++) {
    	if (lpf_sigmas[ncol] > 0.0) {
@@ -412,7 +419,7 @@ int main(int argc, char **argv)
    		gpu_lpf_h[ncol] = NULL;
    	}
    }
+*/
    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
@@ -476,6 +483,9 @@ int main(int argc, char **argv)
    }
    int tp_task_size =  sizeof(task_data)/sizeof(struct tp_task);
+    int num_active_tiles; // will be calculated by convert_direct
 #ifdef DBG0
@@ -578,7 +588,6 @@ int main(int argc, char **argv)
    gpu_clt =            copyalloc_pointers_gpu (gpu_clt_h,         NUM_CAMS);
    gpu_corr_images =    copyalloc_pointers_gpu (gpu_corr_images_h, NUM_CAMS);
 #ifdef DBG_TILE
    const int numIterations = 1; //0;
    const int i0 =  0; // -1;
@@ -587,6 +596,8 @@ int main(int argc, char **argv)
    const int i0 = -1; // 0; // -1;
 #endif
 #define TEST_ROT_MATRICES
 #ifdef  TEST_ROT_MATRICES
 //    dim3 threads_rot(3,3,NUM_CAMS);
@@ -607,9 +618,6 @@ int main(int argc, char **argv)
    		sdkStartTimer(&timerROT_MATRICES);
    	}
-//    	calc_rot_matrices<<<grid_rot,threads_rot>>> (
-//    			gpu_correction_vector);   // 		struct corr_vector * gpu_correction_vector,
    	calc_rot_deriv<<<grid_rot,threads_rot>>> (
    			gpu_correction_vector ,           // 		struct corr_vector * gpu_correction_vector,
    			gpu_rot_deriv);                  // union trot_deriv   * gpu_rot_deriv);
@@ -660,6 +668,67 @@ int main(int argc, char **argv)
 #endif // TEST_ROT_MATRICES
+#define TEST_REVERSE_DISTORTIONS
+#ifdef  TEST_REVERSE_DISTORTIONS
+    dim3 threads_rd(3,3,3);
+    dim3 grid_rd   (NUM_CAMS, 1, 1);
+    printf("REVERSE DISTORTIONS: threads_list=(%d, %d, %d)\n",threads_rd.x,threads_rd.y,threads_rd.z);
+    printf("REVERSE DISTORTIONS: grid_list=(%d, %d, %d)\n",grid_rd.x,grid_rd.y,grid_rd.z);
+    StopWatchInterface *timerREVERSE_DISTORTIONS = 0;
+    sdkCreateTimer(&timerREVERSE_DISTORTIONS);
+    for (int i = i0; i < numIterations; i++)
+    {
+    	if (i == 0)
+    	{
+    		checkCudaErrors(cudaDeviceSynchronize());
+    		sdkResetTimer(&timerREVERSE_DISTORTIONS);
+    		sdkStartTimer(&timerREVERSE_DISTORTIONS);
+    	}
+    	calcReverseDistortionTable<<<grid_rd,threads_rd>>>(
+    			gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
+				gpu_rByRDist);
+    	getLastCudaError("Kernel failure");
+    	checkCudaErrors(cudaDeviceSynchronize());
+    	printf("test pass: %d\n",i);
+    }
+    ///	cudaProfilerStop();
+    sdkStopTimer(&timerREVERSE_DISTORTIONS);
+    float avgTimeREVERSE_DISTORTIONS = (float)sdkGetTimerValue(&timerREVERSE_DISTORTIONS) / (float)numIterations;
+    sdkDeleteTimer(&timerREVERSE_DISTORTIONS);
+    printf("Average calcReverseDistortionTable  run time =%f ms\n",  avgTimeREVERSE_DISTORTIONS);
+    float * rByRDist_gen = (float *) malloc(RBYRDIST_LEN * sizeof(float));
+	checkCudaErrors(cudaMemcpy(
+			rByRDist_gen,
+			gpu_rByRDist,
+			RBYRDIST_LEN * sizeof(float),
+			cudaMemcpyDeviceToHost));
+	float max_err = 0;
+	for (int i = 0; i < RBYRDIST_LEN; i++){
+		float err = abs(rByRDist_gen[i] - rByRDist[i]);
+		if (err > max_err){
+			max_err = err;
+		}
+#ifdef VERBOSE
+///		printf ("%5d: %8.6f %8.6f %f %f\n", i, rByRDist[i], rByRDist_gen[i] , err, max_err);
+#endif // #ifdef VERBOSE
+	}
+	printf("Maximal rByRDist error = %f\n",max_err);
+	free (rByRDist_gen);
+#if 0
+    // temporarily restore
+    checkCudaErrors(cudaMemcpy(
+    		gpu_rByRDist,
+			rByRDist,
+			RBYRDIST_LEN * sizeof(float),
+            cudaMemcpyHostToDevice));
+#endif // #if 1
+#endif // TEST_REVERSE_DISTORTIONS
@@ -680,14 +749,13 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerGEOM);
    		sdkStartTimer(&timerGEOM);
    	}
    	get_tiles_offsets<<<grid_geom,threads_geom>>> (
-    			gpu_tasks,                // struct tp_task   * gpu_tasks,
+    			gpu_tasks,               // struct tp_task     * gpu_tasks,
-				tp_task_size,             // int                num_tiles,          // number of tiles in task list
+				tp_task_size,            // int                  num_tiles,          // number of tiles in task list
-				gpu_geometry_correction, // 		struct gc          * gpu_geometry_correction,
+				gpu_geometry_correction, //	struct gc          * gpu_geometry_correction,
-				gpu_correction_vector,   // 		struct corr_vector * gpu_correction_vector,
+				gpu_correction_vector,   //	struct corr_vector * gpu_correction_vector,
-				gpu_rByRDist, // 		float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
+				gpu_rByRDist,            //	float *              gpu_rByRDist)      // length should match RBYRDIST_LEN
-				gpu_rot_deriv); // union trot_deriv   * gpu_rot_deriv);
+				gpu_rot_deriv);          // union trot_deriv   * gpu_rot_deriv);
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -708,9 +776,10 @@ int main(int argc, char **argv)
 			gpu_tasks,
 			tp_task_size * sizeof(struct tp_task),
 			cudaMemcpyDeviceToHost));
+#if 0 // for manual browsing
 	struct tp_task * old_task = &task_data [DBG_TILE];
 	struct tp_task * new_task = &task_data1[DBG_TILE];
+#endif
    printf("old_task txy = 0x%x\n",  task_data [DBG_TILE].txy);
    printf("new_task txy = 0x%x\n",  task_data1[DBG_TILE].txy);
@@ -740,15 +809,20 @@ int main(int argc, char **argv)
    StopWatchInterface *timerTP = 0;
    sdkCreateTimer(&timerTP);
+#if 0
    dim3 threads_tp(THREADSX, TILES_PER_BLOCK, 1);
    dim3 grid_tp((tp_task_size + TILES_PER_BLOCK -1 )/TILES_PER_BLOCK, 1);
+#else
+    dim3 threads_tp(1, 1, 1);
+    dim3 grid_tp(1, 1, 1);
+#endif
    printf("threads_tp=(%d, %d, %d)\n",threads_tp.x,threads_tp.y,threads_tp.z);
    printf("grid_tp=   (%d, %d, %d)\n",grid_tp.x,   grid_tp.y,   grid_tp.z);
-    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
+//    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
+//    cudaFuncSetCacheConfig(convert_correct_tiles, cudaFuncCachePreferShared);
    ///    cudaProfilerStart();
    float ** fgpu_kernel_offsets = (float **) gpu_kernel_offsets; //    [NUM_CAMS];
@@ -760,7 +834,22 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerTP);
    		sdkStartTimer(&timerTP);
    	}
+    	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+    			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
+				gpu_kernels,           // float           ** gpu_kernels,
+				gpu_images,            // float           ** gpu_images,
+				gpu_tasks,             // struct tp_task   * gpu_tasks,
+				gpu_clt,               // float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+				dstride/sizeof(float), // size_t             dstride, // for gpu_images
+				tp_task_size,          // int                num_tiles) // number of tiles in task
+				0,                     // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
+				IMG_WIDTH,             // int                woi_width,
+				IMG_HEIGHT,            // int                woi_height,
+				KERNELS_HOR,           // int                kernels_hor,
+				KERNELS_VERT,          // int                kernels_vert);
+				gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+    			gpu_num_active);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
+#if 0
    	convert_correct_tiles<<<grid_tp,threads_tp>>>(
    			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
 				gpu_kernels,           // float           ** gpu_kernels,
@@ -774,6 +863,9 @@ int main(int argc, char **argv)
 				IMG_HEIGHT,            // int                woi_height,
 				KERNELS_HOR,           // int                kernels_hor,
 				KERNELS_VERT);         // int                kernels_vert);
+#endif
    	getLastCudaError("Kernel execution failed");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("%d\n",i);
@@ -782,7 +874,14 @@ int main(int argc, char **argv)
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
    sdkDeleteTimer(&timerTP);
-    printf("Run time =%f ms\n",  avgTime);
+	checkCudaErrors(cudaMemcpy(
+			&num_active_tiles,
+			gpu_num_active,
+			sizeof(int), // 8 sequences (0,2,4,6 - non-border, growing up;
+			//1,3,5,7 - border, growing down from the end of the corresponding non-border buffers
+			cudaMemcpyDeviceToHost));
+    printf("Run time =%f ms, num active tiles = %d\n",  avgTime, num_active_tiles);
 #ifdef SAVE_CLT
@@ -836,8 +935,6 @@ int main(int argc, char **argv)
    		sdkResetTimer(&timerIMCLT);
    		sdkStartTimer(&timerIMCLT);
    	}
-#define CDP1
-#ifdef CDP1
        dim3 threads_imclt_all(1, 1, 1);
 		dim3 grid_imclt_all(1, 1, 1);
        printf("threads_imclt_all=(%d, %d, %d)\n",threads_imclt_all.x,threads_imclt_all.y,threads_imclt_all.z);
@@ -850,34 +947,6 @@ int main(int argc, char **argv)
 				TILESX,                      // int               woi_twidth,
 				TILESY,                      // int               woi_theight,
 				dstride_rslt/sizeof(float)); // const size_t      dstride);            // in floats (pixels)
-#else
-        dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
-        printf("threads_imclt=(%d, %d, %d)\n",threads_imclt.x,threads_imclt.y,threads_imclt.z);
-    	for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
-    		for (int color = 0; color < NUM_COLORS; color++) {
-    			for (int v_offs = 0; v_offs < 2; v_offs++){
-    				for (int h_offs = 0; h_offs < 2; h_offs++){
-    					int tilesy_half = (TILESY + (v_offs ^ 1)) >> 1;
-    					int tilesx_half = (TILESX + (h_offs ^ 1)) >> 1;
-    					int tiles_in_pass = tilesy_half * tilesx_half;
-    					dim3 grid_imclt((tiles_in_pass + IMCLT_TILES_PER_BLOCK-1) / IMCLT_TILES_PER_BLOCK,1,1);
-    					//    				printf("grid_imclt=   (%d, %d, %d)\n",grid_imclt.x,   grid_imclt.y,   grid_imclt.z);
-    					imclt_rbg<<<grid_imclt,threads_imclt>>>(
-    							gpu_clt_h[ncam],         // float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-								gpu_corr_images_h[ncam], // float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
-								1,                       // int               apply_lpf,
-								NUM_COLORS,              // int               colors,               // defines lpf filter
-								color,                   // int               color,              // defines location of clt data
-								v_offs,                  // int               v_offset,
-								h_offs,                  // int               h_offset,
-								TILESX,                  // int               woi_twidth,
-								TILESY,                  // int               woi_theight,
-								dstride_rslt/sizeof(float));            //const size_t      dstride);            // in floats (pixels)
-    				}
-    			}
-    		}
-    	}
-#endif
    	getLastCudaError("Kernel failure");
    	checkCudaErrors(cudaDeviceSynchronize());
    	printf("test pass: %d\n",i);
@@ -1283,6 +1352,8 @@ int main(int argc, char **argv)
    	checkCudaErrors(cudaFree(gpu_corr_images_h[ncam]));
    }
 	checkCudaErrors(cudaFree(gpu_tasks));
+	checkCudaErrors(cudaFree(gpu_active_tiles));
+	checkCudaErrors(cudaFree(gpu_num_active));
 	checkCudaErrors(cudaFree(gpu_kernels));
 	checkCudaErrors(cudaFree(gpu_kernel_offsets));
 	checkCudaErrors(cudaFree(gpu_images));