Commit f3521f7d authored by Andrey Filippov

Version for rectilinear images

parent 9d222e8c
......@@ -1010,7 +1010,7 @@ extern "C" __global__ void correlate2D(
/**
* Calculate 2D phase correlation pairs from CLT representation. This is an outer kernel that calls other
* ones with CDP, this one should be configured as correlate2D<<<1,1>>>
*
* gpu_corr_indices will include sums
* @param num_cams number of cameras <= NUM_CAMS
* @param sel_pairs bit array of selected pairs, sized to hold all pairs (4 ints for 16 cameras, which have 120 pairs)
* @param gpu_clt array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
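The pair selection is a packed bit mask; below is a minimal host-side sketch of sizing and probing it. Only the 4-int/120-pair figure comes from the comment above; the LSB-first bit packing is an assumption.

```c
#include <stdio.h>

// Sizing and probing the pair-selection bit mask (packing order assumed LSB-first).
int main(void) {
    int num_cams  = 16;
    int num_pairs = num_cams * (num_cams - 1) / 2;     // C(16,2) = 120 pairs
    int sel_len   = (num_pairs + 31) / 32;             // 32 bits per int -> 4 ints
    unsigned int sel_pairs[4] = {0};
    int np = 37;                                       // some pair index to select
    sel_pairs[np >> 5] |= 1u << (np & 31);             // set the selection bit for pair np
    int selected = (sel_pairs[np >> 5] >> (np & 31)) & 1;
    printf("pairs=%d, ints=%d, pair %d selected=%d\n", num_pairs, sel_len, np, selected);
    return 0;
}
```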
......@@ -2660,7 +2660,7 @@ __global__ void index_correlate(
* Helper kernel for correlateInter2D() - generates a dense list of correlation tasks
* for inter-scene correlation. One correlation output for each selected sensor
* plus a sum of them all, so with all 16 sensors selected the output will have 17
* 2D correlations (with some being the l;ast one)
* 2D correlations (with sum being the last one)
* All pairs for the same tile will always be in the same order: increasing sensor numbers
* with sum being the last. Sum will be marked by 0xff in the LSB.
* With the quad camera each tile may generate up to 6 pairs (int array elements)
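A short sketch of decoding one gpu_corr_indices element under the conventions stated above (sensor/pair number in the low byte, 0xff marking the sum). Placing the tile number in the bits above the low byte is an assumption; the exact shift is not given in this hunk.

```c
#include <stdio.h>

// Decode one correlation-index entry (layout of the upper bits is assumed).
int main(void) {
    int entry  = (123 << 8) | 0xff;    // hypothetical entry: the sum correlation of tile 123
    int sensor = entry & 0xff;         // low byte: sensor number, or 0xff for the sum
    int tile   = entry >> 8;           // assumed: remaining bits hold the tile index
    if (sensor == 0xff) {
        printf("tile %d: sum of all selected per-sensor correlations\n", tile);
    } else {
        printf("tile %d: correlation for sensor %d\n", tile, sensor);
    }
    return 0;
}
```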
......@@ -2770,8 +2770,8 @@ extern "C" __global__ void convert_direct( // called with a single block, singl
convert_correct_tiles<<<grid_tp,threads_tp>>>(
num_cams, // int num_cams, // actual number of cameras
num_colors, // int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
gpu_kernel_offsets, // float ** gpu_kernel_offsets, // [num_cams],
gpu_kernels, // float ** gpu_kernels, // [num_cams],
((kernels_hor>0)?gpu_kernel_offsets:0), // float ** gpu_kernel_offsets, // [num_cams],
((kernels_hor>0)?gpu_kernels:0), // float ** gpu_kernels, // [num_cams],
gpu_images, // float ** gpu_images, // [num_cams],
gpu_ftasks, // float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
gpu_active_tiles, // int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
......@@ -2862,8 +2862,8 @@ extern "C" __global__ void erase_clt_tiles_inner(
__global__ void convert_correct_tiles(
int num_cams, // actual number of cameras
int num_colors, // actual number of colors: 3 for RGB, 1 for LWIR/mono
float ** gpu_kernel_offsets, // [num_cams],
float ** gpu_kernels, // [num_cams],
float ** gpu_kernel_offsets, // [num_cams], ###
float ** gpu_kernels, // [num_cams], ###
float ** gpu_images, // [num_cams],
float * gpu_ftasks, // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
int * gpu_active_tiles, // indices in gpu_tasks to non-zero tiles
......@@ -2929,38 +2929,72 @@ __global__ void convert_correct_tiles(
// process each camera, each color in series (to reduce shared memory)
for (int ncam = 0; ncam < num_cams; ncam++){
for (int color = 0; color < num_colors; color++){
convertCorrectTile(
// TODO: remove debug when done
if (gpu_kernels) {
convertCorrectTile(
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
num_colors, // int num_colors, //*
#endif
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets,
gpu_kernels[ncam], // float * gpu_kernels,
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
__syncthreads();
(struct CltExtra*)(gpu_kernel_offsets[ncam]), // struct CltExtra* gpu_kernel_offsets, ####
gpu_kernels[ncam], // float * gpu_kernels, ####
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
kernels_hor, // int kernels_hor,
kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
} else {
convertCorrectTile(
// TODO: remove debug when done
#ifdef DBG_TILE
num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0), // int num_colors, //*
#else
num_colors, // int num_colors, //*
#endif
(struct CltExtra*) 0, // struct CltExtra* gpu_kernel_offsets, ####
(float*) 0, // float * gpu_kernels, ####
gpu_images[ncam], // float * gpu_images,
gpu_clt[ncam], // float * gpu_clt,
color, // const int color,
lpf_mask, // const int lpf_mask,
tt[tile_in_block].xy[ncam][0], // const float centerX,
tt[tile_in_block].xy[ncam][1], // const float centerY,
tt[tile_in_block].txy, // const int txy,
tt[tile_in_block].scale, // const float tscale,
dstride, // size_t dstride, // in floats (pixels)
(float * )(clt_tile [tile_in_block]), // float clt_tile [TILES_PER_BLOCK][NUM_CAMS][num_colors][4][DTT_SIZE][DTT_SIZE])
(float * )(clt_kernels[tile_in_block]), // float clt_tile [num_colors][4][DTT_SIZE][DTT_SIZE],
int_topleft[tile_in_block], // int int_topleft [num_colors][2],
residual_shift[tile_in_block], // float frac_topleft [num_colors][2],
window_hor_cos[tile_in_block], // float window_hor_cos [num_colors][2*DTT_SIZE],
window_hor_sin[tile_in_block], // float window_hor_sin [num_colors][2*DTT_SIZE],
window_vert_cos[tile_in_block], // float window_vert_cos [num_colors][2*DTT_SIZE]);
window_vert_sin[tile_in_block], // float window_vert_sin [num_colors][2*DTT_SIZE]);
woi_width, // int woi_width,
woi_height, // int woi_height,
0, // kernels_hor, // int kernels_hor,
0, // kernels_vert, //int kernels_vert)
tilesx); // int tilesx);
}
__syncthreads();
}
}
}
......@@ -4394,8 +4428,8 @@ __device__ void normalizeTileAmplitude(
*/
__device__ void convertCorrectTile(
int num_colors, //*
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color]
float * gpu_kernels, // [tileY][tileX][color]
struct CltExtra * gpu_kernel_offsets, // [tileY][tileX][color] if null, will not calculate kernel-related offsets
float * gpu_kernels, // [tileY][tileX][color] // null w/o kernels
float * gpu_images,
float * gpu_clt,
const int color,
......@@ -4406,7 +4440,7 @@ __device__ void convertCorrectTile(
const float tscale,
const size_t dstride, // in floats (pixels)
float * clt_tile, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_kernels, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
float * clt_kernels, // [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports // null w/o kernels
int int_topleft [2],
float residual_shift [2],
float window_hor_cos [2*DTT_SIZE],
......@@ -4430,78 +4464,85 @@ __device__ void convertCorrectTile(
int max_py = woi_height - 1; // IMG-HEIGHT - 1; // odd
int max_pxm1 = max_px - 1; // even
int max_pym1 = max_py - 1; // even
int max_kernel_hor = kernels_hor - 1; // KERNELS_HOR -1;
int max_kernel_vert = kernels_vert - 1; // KERNELS_VERT-1;
int ktileX, ktileY;
int kernel_index; // common for all colors
float kdx, kdy;
if (threadIdx.x == 0){
// ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX * (1.0/KERNELS_STEP)+1))));
// ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0/KERNELS_STEP)+1))));
// kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// From ImageDttCPU.java: extract_correct_tile() (modified 2022/05/12):
// int kernel_pitch = width/(clt_kernels[chn_kernel][0].length - 2);
if (gpu_kernels) {
int ktileX, ktileY;
int max_kernel_hor = kernels_hor - 1; // KERNELS_HOR -1;
int max_kernel_vert = kernels_vert - 1; // KERNELS_VERT-1;
if (threadIdx.x == 0){
// ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX * (1.0/KERNELS_STEP)+1))));
// ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY * (1.0/KERNELS_STEP)+1))));
// kdx = centerX - (ktileX << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// kdy = centerY - (ktileY << KERNELS_LSTEP) + (1 << (KERNELS_LSTEP -1)); // difference in pixel
// From ImageDttCPU.java: extract_correct_tile() (modified 2022/05/12):
// int kernel_pitch = width/(clt_kernels[chn_kernel][0].length - 2);
// 1. find closest kernel
// ktileX = (int) Math.round(centerX/kernel_pitch) + 1;
// ktileY = (int) Math.round(centerY/kernel_pitch) + 1;
// if (ktileY < 0) ktileY = 0;
// else if (ktileY >= clt_kernels[chn_kernel].length) ktileY = clt_kernels[chn_kernel].length-1;
// if (ktileX < 0) ktileX = 0;
// else if (ktileX >= clt_kernels[chn_kernel][ktileY].length) ktileX = clt_kernels[chn_kernel][ktileY].length-1;
// extract center offset data stored with each kernel tile
// CltExtra ce = new CltExtra (clt_kernels[chn_kernel][ktileY][ktileX][4]);
// 2. calculate correction for center of the kernel offset
// double kdx = centerX - (ktileX -1 +0.5) * kernel_pitch; // difference in pixel
// double kdy = centerY - (ktileY -1 +0.5) * kernel_pitch;
int kernel_pitch = woi_width / (kernels_hor - 2);
ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX /kernel_pitch + 1))));
ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY /kernel_pitch + 1))));
kdx = centerX - (ktileX - 0.5) * kernel_pitch; // difference in pixel
kdy = centerY - (ktileY - 0.5) * kernel_pitch; //
kernel_index = (ktileX + ktileY * kernels_hor) * num_colors;
// ktileX = (int) Math.round(centerX/kernel_pitch) + 1;
// ktileY = (int) Math.round(centerY/kernel_pitch) + 1;
// if (ktileY < 0) ktileY = 0;
// else if (ktileY >= clt_kernels[chn_kernel].length) ktileY = clt_kernels[chn_kernel].length-1;
// if (ktileX < 0) ktileX = 0;
// else if (ktileX >= clt_kernels[chn_kernel][ktileY].length) ktileX = clt_kernels[chn_kernel][ktileY].length-1;
// extract center offset data stored with each kernel tile
// CltExtra ce = new CltExtra (clt_kernels[chn_kernel][ktileY][ktileX][4]);
// 2. calculate correction for center of the kernel offset
// double kdx = centerX - (ktileX -1 +0.5) * kernel_pitch; // difference in pixel
// double kdy = centerY - (ktileY -1 +0.5) * kernel_pitch;
int kernel_pitch = woi_width / (kernels_hor - 2);
ktileX = min(max_kernel_hor, max(0, ((int) lrintf(centerX /kernel_pitch + 1))));
ktileY = min(max_kernel_vert, max(0, ((int) lrintf(centerY /kernel_pitch + 1))));
kdx = centerX - (ktileX - 0.5) * kernel_pitch; // difference in pixel
kdy = centerY - (ktileY - 0.5) * kernel_pitch; //
kernel_index = (ktileX + ktileY * kernels_hor) * num_colors;
}
// broadcast kernel_index
kernel_index = __shfl_sync(
0xffffffff, // unsigned mask,
kernel_index, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdx = __shfl_sync(
0xffffffff, // unsigned mask,
kdx, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdy = __shfl_sync(
0xffffffff, // unsigned mask,
kdy, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
__syncthreads();// __syncwarp(); // is it needed?
}
// broadcast kernel_index
kernel_index = __shfl_sync(
0xffffffff, // unsigned mask,
kernel_index, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdx = __shfl_sync(
0xffffffff, // unsigned mask,
kdx, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
kdy = __shfl_sync(
0xffffffff, // unsigned mask,
kdy, // T var,
0, // int srcLane,
THREADS_PER_TILE); // int width=warpSize);
__syncthreads();// __syncwarp(); // is it needed?
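kernel_index, kdx and kdy are computed by thread 0 of the tile only and then broadcast to the other threads with __shfl_sync(). A minimal standalone illustration of that warp-broadcast pattern (not the project's code):

```cuda
#include <cstdio>

// Lane 0 of the warp computes a value; __shfl_sync() broadcasts it to every lane.
__global__ void broadcast_demo() {
    int value = 0;
    if (threadIdx.x == 0) {
        value = 42;                              // only lane 0 has the result so far
    }
    value = __shfl_sync(0xffffffff, value, 0);   // all lanes read lane 0's copy
    printf("thread %d sees %d\n", threadIdx.x, value);
}

int main() {
    broadcast_demo<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}
```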
float px, py;
// copy kernel
int kernel_full_index = kernel_index + color;
float * kernel_src = gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float * kernelp = clt_kernels;
kernel_src += threadIdx.x; // lsb;
kernelp += threadIdx.x; // lsb;
int kernel_full_index;
if (gpu_kernels) { // only copy kernels if they exist
// copy kernel
kernel_full_index = kernel_index + color;
float * kernel_src = gpu_kernels + kernel_full_index* (DTT_SIZE * DTT_SIZE * 4);
float * kernelp = clt_kernels;
kernel_src += threadIdx.x; // lsb;
kernelp += threadIdx.x; // lsb;
#pragma unroll
for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
// shared memory kernels use DTT_SIZE1 (same as image data)
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
for (int j = 0; j < DTT_SIZE * 4; j++){ // all 4 components, 8 rows
// shared memory kernels use DTT_SIZE1 (same as image data)
*kernelp = *kernel_src;
kernelp+=DTT_SIZE1;
kernel_src+=THREADSX;
}
}
px = centerX - DTT_SIZE; // fractional left corner (w/o kernel)
py = centerY - DTT_SIZE; // fractional top corner (w/o kernel)
if (gpu_kernel_offsets){ // skipped in the no-kernel version for rectilinear images
struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
px -= clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy ;
py -= clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy ;
}
// Calculate offsets and prepare windows (all colors):
struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy) ; // fractional left corner
// struct CltExtra * clt_extra = &gpu_kernel_offsets[kernel_full_index];
// px = centerX - DTT_SIZE - (clt_extra->data_x + clt_extra->dxc_dx * kdx + clt_extra->dxc_dy * kdy) ; // fractional left corner
int itlx = (int) floorf(px +0.5f);
int_topleft [0] = itlx;
float shift_hor = itlx - px;
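The kernel-tile selection in this hunk maps a fractional tile center onto the coarser aberration-kernel grid and keeps the residual (kdx, kdy) for the CltExtra center-offset correction above. A host-side mirror of that arithmetic, for illustration only; the 2592-pixel width and 164-column kernel grid are assumed quad-EO defaults, giving a 16-pixel kernel pitch.

```c
#include <math.h>
#include <stdio.h>

// Host-side mirror of the device-side kernel-tile selection (illustration only).
static void kernel_tile_for_center(float centerX, int woi_width, int kernels_hor,
                                   int *ktileX, float *kdx) {
    int kernel_pitch   = woi_width / (kernels_hor - 2);      // e.g. 2592 / 162 = 16
    int max_kernel_hor = kernels_hor - 1;
    int kt = (int) lrintf(centerX / kernel_pitch + 1);       // nearest kernel column (grid has a one-tile margin)
    if (kt < 0)              kt = 0;                          // clamp to the kernel grid
    if (kt > max_kernel_hor) kt = max_kernel_hor;
    *ktileX = kt;
    *kdx    = centerX - (kt - 0.5f) * kernel_pitch;           // residual offset from the kernel center
}

int main(void) {
    int ktileX;
    float kdx;
    kernel_tile_for_center(1100.0f, 2592, 164, &ktileX, &kdx); // assumed quad-EO geometry
    printf("ktileX=%d, kdx=%f\n", ktileX, kdx);                // ktileX=70, kdx=-12 for this center
    return 0;
}
```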
......@@ -4536,7 +4577,7 @@ __device__ void convertCorrectTile(
i1++;
}
py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner
// py = centerY - DTT_SIZE - (clt_extra->data_y + clt_extra->dyc_dx * kdx + clt_extra->dyc_dy * kdy) ; // fractional top corner
int itly = (int) floorf(py +0.5f);
int_topleft[1] = itly;
......@@ -4591,7 +4632,7 @@ __device__ void convertCorrectTile(
if (dbg_tile && (threadIdx.x) == 0){
printf("COLOR=%d\n",color);
printf("centerX=%f, centerY=%f\n",centerX, centerY);
printf("ktileX=%d, ktileY=%d\n", ktileX, ktileY);
// printf("ktileX=%d, ktileY=%d\n", ktileX, ktileY);
printf("kdx=%f, kdy=%f\n", kdx, kdy);
printf("int_topleft[%d][0]=%d, int_topleft[%d][1]=%d\n",i,int_topleft[0],i,int_topleft[1]);
printf("residual_shift[%d][0]=%f, residual_shift[%d][1]=%f\n",i,residual_shift[0],i,residual_shift[1]);
......@@ -4896,28 +4937,32 @@ __device__ void convertCorrectTile(
} // else { // if (is_mono) {
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x) == 0){
printf("\nDTT Tiles after before convolving, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf);
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after before convolving, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf);
}
__syncthreads();// __syncwarp();
#endif
if (gpu_kernels) {
// convolve first, then rotate to match Java and make it easier to verify
// verify it works w/o kernels
convolveTiles(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
clt_kernels); // float kernel [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
__syncthreads();// __syncwarp();
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after convolution, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // all colors, all quadrants
}
__syncthreads();// __syncwarp();
#endif
}
// convolve first, then rotate to match Java and make it easier to verify
convolveTiles(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // 4 quadrants of the clt data, rows extended to optimize shared ports
clt_kernels); // float kernel [4][DTT_SIZE][DTT_SIZE1]); // 4 quadrants of the CLT kernel (DTT3 converted)
__syncthreads();// __syncwarp();
#ifdef DEBUG2
if ((threadIdx.x) == 0){
printf("\nDTT Tiles after convolution, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // all colors, all quadrants
}
__syncthreads();// __syncwarp();
#endif
// rotate phases: first horizontal, then vertical
shiftTileHor(
clt_tile, // float clt_tile [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports
......@@ -4947,8 +4992,8 @@ __device__ void convertCorrectTile(
#ifdef DBG_TILE
#ifdef DEBUG3
if ((threadIdx.x) == 0){
#ifdef DEBUG30
if (dbg_tile && (threadIdx.x == 0)){
printf("\nDTT Tiles after vertical shift, color = %d\n",color);
debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G
printf("\nDTT All done\n");
......
......@@ -30,13 +30,17 @@
** -----------------------------------------------------------------------------**
*/
// #define NOCORR
// #define NOCORR_TD
// #define NOTEXTURES
// #define NOTEXTURE_RGBA
// all of the next 5 were disabled
#define NOCORR
#define NOCORR_TD
#define NOTEXTURES
#define NOTEXTURE_RGBA
#define NOTEXTURE_RGBAXXX
#define SAVE_CLT
#define NO_DP
#define CORR_INTER_SELF 1
/// #define CORR_INTER_SELF 1
#include <stdio.h>
......@@ -1442,7 +1446,7 @@ int main(int argc, char **argv)
0, // int lpf_mask) // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
IMG_WIDTH, // int woi_width,
IMG_HEIGHT, // int woi_height,
KERNELS_HOR, // int kernels_hor,
0, // KERNELS_HOR, // int kernels_hor,
KERNELS_VERT, // int kernels_vert);
gpu_active_tiles, // int * gpu_active_tiles, // pointer to the calculated number of non-zero tiles
gpu_num_active, //); // int * pnum_active_tiles); // indices to gpu_tasks
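Passing 0 here exercises the new no-kernel path: with kernels_hor <= 0, convert_direct() forwards null kernel pointers, and convertCorrectTile() then skips the kernel copy, the CltExtra center-offset correction and the convolution. A sketch of the lines a caller might use inside main() to switch between the two modes (use_kernels is a hypothetical flag, and the KERNELS_HOR/KERNELS_VERT macros of the test program are assumed to be in scope):

```c
int use_kernels  = 0;                              // hypothetical flag: 0 selects the rectilinear / no-kernel path
int kernels_hor  = use_kernels ? KERNELS_HOR : 0;  // kernels_hor > 0 is what gates the kernel pointers
int kernels_vert = KERNELS_VERT;                   // left unchanged; only kernels_hor is tested in convert_direct()
```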
......
......@@ -147,7 +147,7 @@
//#define DEBUG20 1 // Geometry Correction
//#define DEBUG21 1 // Geometry Correction
//#define DEBUG210 1
////#define DEBUG30 1
#define DEBUG30 1
//#define DEBUG22 1
//#define DEBUG23 1
......