Debugged monochrome direct/inverse mclt conversions

e1bf6e5c · Andrey Filippov · d26457f8 · e1bf6e5c · e1bf6e5c · e1bf6e5c
Commit e1bf6e5c authored Nov 29, 2021 by Andrey Filippov
Showing with 377 additions and 198 deletions

TileProcessor.cuh src/TileProcessor.cuh +319 -188

dtt8x8.cu src/dtt8x8.cu +46 -0

dtt8x8.h src/dtt8x8.h +2 -0

test_tp.cu src/test_tp.cu +8 -9

tp_defines.h src/tp_defines.h +2 -1

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -868,6 +868,7 @@ __device__ void convertCorrectTile(
 	    float window_hor_cos  [2*DTT_SIZE],
 	    float window_hor_sin  [2*DTT_SIZE],
 	    float window_vert_cos [2*DTT_SIZE],
+	    float window_vert_sin [2*DTT_SIZE],
 		int                woi_width,
 		int                woi_height,
 		int                kernels_hor,
@@ -2576,7 +2577,6 @@ __global__ void index_correlate(
 * @param gpu_kernels          array of per-camera pointers to array of kernels (clt representation)
 * @param gpu_images           array of per-camera pointers to Bayer images
 * @param gpu_ftasks           flattened tasks, 27 floats per tile for quad EO, 99 floats -- for LWIR16
-// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param gpu_clt              output array of per-camera aberration-corrected transform-domain image representations
 *                             [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
 * @param dstride              stride (in floats) for the input Bayer images
@@ -2596,7 +2596,6 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 		float           ** gpu_kernels,        // [num_cams],
 		float           ** gpu_images,         // [num_cams],
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//		struct tp_task   * gpu_tasks,
 		float           ** gpu_clt,            // [num_cams][TILE-SY][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
@@ -2617,7 +2616,6 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 		 index_direct<<<blocks0,threads0>>>(
 				 task_size,          // int                task_size,        // flattened task size in 4-byte floats
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				 gpu_tasks,           // struct tp_task   * gpu_tasks,
 				 num_tiles,           //int                num_tiles,          // number of tiles in task
 				 gpu_active_tiles,    //int *              active_tiles,       // pointer to the calculated number of non-zero tiles
 				 pnum_active_tiles);  //int *              pnum_active_tiles)  //  indices to gpu_tasks  // should be initialized to zero
@@ -2632,7 +2630,6 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 				 gpu_kernels,        // float           ** gpu_kernels,        // [num_cams],
 				 gpu_images,         // float           ** gpu_images,         // [num_cams],
 				 gpu_ftasks,         // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				 gpu_tasks,          // struct tp_task   * gpu_tasks,          // array of tasks
 				 gpu_active_tiles,   // int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
 				 *pnum_active_tiles, // int                num_active_tiles,   // number of tiles in task
 				 gpu_clt,            // float           ** gpu_clt,            // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
@@ -2655,7 +2652,6 @@ extern "C" __global__ void convert_direct(  // called with a single block, singl
 * @param gpu_kernel_offsets   array of per-camera pointers to array of struct CltExtra (one element per kernel)
 * @param gpu_kernels          array of per-camera pointers to array of kernels (clt representation)
 * @param gpu_images           array of per-camera pointers to Bayer images
-// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param gpu_ftasks           flattened tasks, 27 floats per tile for quad EO, 99 floats -- for LWIR16
 * @param gpu_active_tiles     pointer to the calculated list of tiles
 * @param num_active_tiles     number of active tiles
@@ -2674,7 +2670,6 @@ __global__ void convert_correct_tiles(
 			float           ** gpu_kernels,        // [num_cams],
 			float           ** gpu_images,         // [num_cams],
 			float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//			struct tp_task   * gpu_tasks,
 			int              * gpu_active_tiles,   // indices in gpu_tasks to non-zero tiles
 			int                num_active_tiles,   // number of tiles in task
 			float           ** gpu_clt,            // [num_cams][TILES-Y][TILES-X][num_colors][DTT_SIZE*DTT_SIZE]
@@ -2697,39 +2692,8 @@ __global__ void convert_correct_tiles(
 	int task_size = get_task_size(num_cams);
 	float * tp0 = gpu_ftasks + task_size * task_num;
 	if (*(int *) tp0 == 0)     return; // NOP tile
-//	struct tp_task  * gpu_task = &gpu_tasks[task_num];
 	__shared__ struct tp_task tt [TILES_PER_BLOCK];
 	// Copy task data to shared memory
-	/*
-	if (!gpu_task->task)       return; // NOP tile
-    // struct tp_task
-	tt[tile_in_block].task =          gpu_task -> task;
-	tt[tile_in_block].txy =           gpu_task -> txy;
-	int thread0 =  threadIdx.x & 1; // 0,1
-	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change
-	if (thread12 < NUM_CAMS) {
-		tt[tile_in_block].xy[thread12][thread0] = gpu_task -> xy[thread12][thread0];
-	}
-	if (NUM_CAMS > 4){ // unlikely
-#pragma unroll
-		for (int nc0 = 4; nc0 < NUM_CAMS; nc0 += 4){
-			int nc = nc0 + thread12;
-			if (nc < NUM_CAMS) {
-				tt[tile_in_block].xy[nc][thread0] = gpu_task -> xy[nc][thread0];
-			}
-		}
-	}
-	// is it the same?
-#pragma unroll
-	for (int i = 0; i < (NUM_CAMS / 4); i++){
-		int nc = (threadIdx.x >> 1) + (i << 2);
-		if (nc < NUM_CAMS) {
-			tt[tile_in_block].xy[nc][0] = gpu_task -> xy[nc][0];
-			tt[tile_in_block].xy[nc][1] = gpu_task -> xy[nc][1];
-		}
-
-	}
-	*/
 	int thread0 =  threadIdx.x & 1; // 0,1
 	int thread12 = threadIdx.x >>1; // now 0..3 (total number ==  (DTT_SIZE), will not change

@@ -2762,12 +2726,13 @@ __global__ void convert_correct_tiles(
    __shared__ float window_hor_cos  [TILES_PER_BLOCK][2*DTT_SIZE];
    __shared__ float window_hor_sin  [TILES_PER_BLOCK][2*DTT_SIZE];
    __shared__ float window_vert_cos [TILES_PER_BLOCK][2*DTT_SIZE];
+    __shared__ float window_vert_sin [TILES_PER_BLOCK][2*DTT_SIZE];

    // process each camera,l each color in series (to reduce shared memory)
    for (int ncam = 0; ncam <  num_cams; ncam++){
    	for (int color = 0; color <  num_colors; color++){
    		convertCorrectTile(
-    				num_colors, // int                   num_colors, //*
+    				num_colors + (((task_num == DBG_TILE)&& (ncam == 0)) ? 16:0),                      // int                   num_colors, //*
    				(struct CltExtra*)(gpu_kernel_offsets[ncam]),        // struct CltExtra* gpu_kernel_offsets,
 					gpu_kernels[ncam],               // float           * gpu_kernels,
 					gpu_images[ncam],                // float           * gpu_images,
@@ -2783,8 +2748,9 @@ __global__ void convert_correct_tiles(
 					int_topleft[tile_in_block],      // int   int_topleft  [num_colors][2],
 					residual_shift[tile_in_block],   // float frac_topleft [num_colors][2],
 					window_hor_cos[tile_in_block],   // float window_hor_cos  [num_colors][2*DTT_SIZE],
-					window_hor_sin[tile_in_block],   //float window_hor_sin  [num_colors][2*DTT_SIZE],
-					window_vert_cos[tile_in_block],  //float window_vert_cos [num_colors][2*DTT_SIZE]);
+					window_hor_sin[tile_in_block],   // float window_hor_sin  [num_colors][2*DTT_SIZE],
+					window_vert_cos[tile_in_block],  // float window_vert_cos [num_colors][2*DTT_SIZE]);
+					window_vert_sin[tile_in_block],  // float window_vert_sin [num_colors][2*DTT_SIZE]);
 					woi_width,                       // int                woi_width,
 					woi_height,                      // int                woi_height,
 					kernels_hor,                     // int                kernels_hor,
@@ -2807,7 +2773,6 @@ __global__ void convert_correct_tiles(
 *
 * @param num_cams             number of cameras
 * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-// * @param gpu_tasks            array of per-tile tasks (struct tp_task)
 * @param num_tiles            number of tiles int gpu_tasks array prepared for processing
 * @param gpu_texture_indices  allocated array - 1 integer per tile to process
 * @param num_texture_tiles    allocated array - 8 integers (may be reduced to 4 later)
@@ -2830,7 +2795,6 @@ __global__ void convert_correct_tiles(
 extern "C" __global__ void textures_nonoverlap(
 		int               num_cams,           // number of cameras
 		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats
-//		struct tp_task  * gpu_tasks,
 		int               num_tiles,          // number of tiles in task list
 //		int               num_tilesx,         // number of tiles in a row
 // declare arrays in device code?
@@ -3967,15 +3931,15 @@ __device__ void normalizeTileAmplitude(
 * @param int_topleft          tile left and top, declared in shared memory (just allocated) [2]
 * @param residual_shift       tile fractional pixel shift (x,y) in shared memory (just allocated) [2]
 * @param window_hor_cos       array in shared memory for window horizontal cosine [2*DTT_SIZE]
- * @param window_hor_sin       array in shared memory for window horizontal cosine [2*DTT_SIZE]
- * @param window_vert_cos      array in shared memory for window horizontal cosine [2*DTT_SIZE]
+ * @param window_hor_sin       array in shared memory for window horizontal sine   [2*DTT_SIZE]
+ * @param window_vert_cos      array in shared memory for window vertical cosine   [2*DTT_SIZE]
+ * @param window_vert_cos      array in shared memory for window vertical sine     [2*DTT_SIZE]
 * @param woi_width            image width (was constant IMG-WIDTH, now variable to use with EO+LWIR
 * @param woi_height           image height (was constant IMG-HEIGHT, now variable to use with EO+LWIR
 * @param kernels_hor          number of deconvolution kernels per image width
 * @param kernels_vert         number of deconvolution kernels per image height
 */
 __device__ void convertCorrectTile(
-//		int                   num_cams,
 		int                   num_colors, //*
 		struct CltExtra     * gpu_kernel_offsets, // [tileY][tileX][color]
 		float               * gpu_kernels,        // [tileY][tileX][color]
@@ -3994,13 +3958,17 @@ __device__ void convertCorrectTile(
 	    float window_hor_cos  [2*DTT_SIZE],
 	    float window_hor_sin  [2*DTT_SIZE],
 	    float window_vert_cos [2*DTT_SIZE],
+		float window_vert_sin [2*DTT_SIZE],
 		int                woi_width,
 		int                woi_height,
 		int                kernels_hor,
 		int                kernels_vert,
 		int                tilesx)
 {
+	int dbg_tile = (num_colors & 16) != 0;
+	num_colors &= 7;
 //	int tilesx = TILES-X;
+	int is_mono = num_colors == 1;
 	// TODO: pass these values instead of constants to handle EO/LWIR
 	int max_px =   woi_width - 1; // IMG-WIDTH  - 1; // odd
 	int max_py =   woi_height - 1; // IMG-HEIGHT - 1; // odd
@@ -4070,11 +4038,11 @@ __device__ void convertCorrectTile(
 #pragma unroll
 	for (; i < (DTT_SIZE/2); i++ ){
 		int ri = (DTT_SIZE-1) - i;
-		window_hor_cos[i] =   HWINDOW[i ]*ahc + HWINDOW[ri]*ahs;
-		window_hor_cos[i1] =  HWINDOW[ i]*ahs - HWINDOW[ri]*ahc;
-		if (color == BAYER_GREEN){
-			window_hor_sin[i] =    HWINDOW[i ]*ahc + HWINDOW[ri]*ahs; // bayer_color== 2
-			window_hor_sin[i1] =   HWINDOW[ri]*ahc - HWINDOW[ i]*ahs;
+		window_hor_cos[i] =   HWINDOW[i]*ahc + HWINDOW[ri]*ahs;
+		window_hor_cos[i1] =  HWINDOW[i]*ahs - HWINDOW[ri]*ahc;
+		if (is_mono || (color == BAYER_GREEN)){
+			window_hor_sin[i] =   HWINDOW[i]*ahc + HWINDOW[ri]*ahs; // bayer_color== 2
+			window_hor_sin[i1] = -HWINDOW[i]*ahs + HWINDOW[ri]*ahc;
 		}
 		i1++;
 	}
@@ -4082,11 +4050,11 @@ __device__ void convertCorrectTile(
 #pragma unroll
 	for (; i < DTT_SIZE; i++ ){
 		int ri = (DTT_SIZE-1) - i;
-		window_hor_cos[i] =   -HWINDOW[i ]*ahc - HWINDOW[ri]*ahs;
-		window_hor_cos[i1] =   HWINDOW[ i]*ahs - HWINDOW[ri]*ahc;
-		if (color == BAYER_GREEN){
-			window_hor_sin[i] =    HWINDOW[i ]*ahc + HWINDOW[ri]*ahs;
-			window_hor_sin[i1] =   HWINDOW[ i]*ahs - HWINDOW[ri]*ahc;
+		window_hor_cos[i] =   -HWINDOW[i]*ahc - HWINDOW[ri]*ahs;
+		window_hor_cos[i1] =   HWINDOW[i]*ahs - HWINDOW[ri]*ahc;
+		if (is_mono || (color == BAYER_GREEN)){
+			window_hor_sin[i] =    HWINDOW[i]*ahc + HWINDOW[ri]*ahs;
+			window_hor_sin[i1] =   HWINDOW[i]*ahs - HWINDOW[ri]*ahc;
 		}
 		i1++;
 	}
@@ -4120,27 +4088,52 @@ __device__ void convertCorrectTile(
 #pragma unroll
 	for (; i < DTT_SIZE/2; i++ ){
 		int ri = (DTT_SIZE-1) - i;
-		window_vert_cos[i] =    HWINDOW[i ]*avc + HWINDOW[ri]*avs;
-		window_vert_cos[i1++] = HWINDOW[ i]*avs - HWINDOW[ri]*avc;
+		window_vert_cos[i] =        HWINDOW[i]*avc + HWINDOW[ri]*avs;
+		window_vert_cos[i1] =       HWINDOW[i]*avs - HWINDOW[ri]*avc;
+		if (is_mono){
+			window_vert_sin[i] =    HWINDOW[i]*avc + HWINDOW[ri]*avs;
+			window_vert_sin[i1] =  -HWINDOW[i]*avs + HWINDOW[ri]*avc;
+		}
+		i1++;
+
 	}
 #pragma unroll
 	for (; i < DTT_SIZE; i++ ){
 		int ri = (DTT_SIZE-1) - i;
-		window_vert_cos[i] =  -(HWINDOW[i ]*avc + HWINDOW[ri]*avs);
-		window_vert_cos[i1++] = HWINDOW[ i]*avs - HWINDOW[ri]*avc;
+		window_vert_cos[i] =       -HWINDOW[i]*avc - HWINDOW[ri]*avs;
+		window_vert_cos[i1] =       HWINDOW[i]*avs - HWINDOW[ri]*avc;
+		if (is_mono){
+			window_vert_sin[i] =    HWINDOW[i]*avc + HWINDOW[ri]*avs;
+			window_vert_sin[i1] =   HWINDOW[i]*avs - HWINDOW[ri]*avc;
 		}
+		i1++;

-
-//    } //  if (color < 3) else
+	}
     __syncthreads();// __syncwarp();
-#ifdef DEBUG1
-    if ((threadIdx.x) == 0){
+#ifdef DEBUG30
+    if (dbg_tile && (threadIdx.x) == 0){
 		printf("COLOR=%d\n",color);
 		printf("centerX=%f,    centerY=%f\n",centerX, centerY);
 		printf("ktileX=%d,     ktileY=%d\n", ktileX,  ktileY);
 		printf("kdx=%f,        kdy=%f\n",    kdx, kdy);
 		printf("int_topleft[%d][0]=%d,    int_topleft[%d][1]=%d\n",i,int_topleft[0],i,int_topleft[1]);
 		printf("residual_shift[%d][0]=%f, residual_shift[%d][1]=%f\n",i,residual_shift[0],i,residual_shift[1]);
+		printf("\nwindow_hor_cos\n");
+		for (int ii = 0; ii < 2*DTT_SIZE; ii++){
+			printf("%6f, ",window_hor_cos[ii]);
+		}
+		printf("\nwindow_hor_sin\n");
+		for (int ii = 0; ii < 2*DTT_SIZE; ii++){
+			printf("%6f, ",window_hor_sin[ii]);
+		}
+		printf("\nwindow_vert_cos\n");
+		for (int ii = 0; ii < 2*DTT_SIZE; ii++){
+			printf("%6f, ",window_vert_cos[ii]);
+		}
+		printf("\nwindow_vert_sin\n");
+		for (int ii = 0; ii < 2*DTT_SIZE; ii++){
+			printf("%6f, ",window_vert_sin[ii]);
+		}
    }
     __syncthreads();// __syncwarp();
 #endif
@@ -4149,15 +4142,137 @@ __device__ void convertCorrectTile(

    // prepare, fold and write data to DTT buffers
    int dstride2 = dstride << 1; // in floats (pixels)
-    int color0 = color & 1;
-    int color1 = (color >>1) & 1;
+    if (is_mono) {
+    	// clear 4 buffer of the CLT tile
+    	float *dtt_buf =  clt_tile + threadIdx.x;
+#pragma unroll
+    	for (int i = 0; i < 4* DTT_SIZE; i++) {
+    		(*dtt_buf)  = 0.0f;
+    		dtt_buf +=    DTT_SIZE1;
+    	}
+    	__syncthreads();// __syncwarp();
+
+        for (int npass = 0; npass < 4; npass++) { // 4 passes to reuse Bayer tables
+//          for (int npass = 1; npass < 2; npass++) { // 4 passes to reuse Bayer tables
+        	int col_tl = int_topleft[0]; //  + (threadIdx.x << 1);
+        	int row_tl = int_topleft[1];
+        	// for red, blue and green, pass 0
+        	int local_col = (npass & 1)  + (threadIdx.x << 1);
+        	int local_row = (npass >> 1) & 1;
+        	float hwind_cos = window_hor_cos [local_col];
+        	float hwind_sin = window_hor_sin [local_col];
+
+        	int dtt_offset =     fold_indx2[local_row][local_col];
+        	int dtt_offset_inc = fold_inc[local_row];
+
+        	float *dcct_buf = clt_tile + (0 * (DTT_SIZE * DTT_SIZE1)); // CC buffer
+        	float *dsct_buf = clt_tile + (1 * (DTT_SIZE * DTT_SIZE1)); // SC buffer
+        	float *dcst_buf = clt_tile + (2 * (DTT_SIZE * DTT_SIZE1)); // CS buffer
+        	float *dsst_buf = clt_tile + (3 * (DTT_SIZE * DTT_SIZE1)); // SS buffer
+
+        	// replace pixels outside input window
+        	int col_src = col_tl + local_col;
+        	if (col_src < 0) {
+        		col_src = 0;
+        	} else if (col_src > max_px){
+        		col_src = max_px;
+        	}
+        	int row_src = row_tl + local_row;
+        	int row_use = row_src;
+        	if (row_use < 0) {
+        		row_use = 0;
+        	} else if (row_use > max_py){
+        		row_use = max_py;
+        	}
+//            __syncthreads();// ?
+
+    		float *image_p = gpu_images + dstride * row_use + col_src;
+    #pragma unroll
+    		for (int i = 0; i < 8; i++) {
+    			float vwind_cos = window_vert_cos[local_row];
+    			float vwind_sin = window_vert_sin[local_row];
+    			float d = (*image_p);
+    			int dtt_offset1 = dtt_offset + (dtt_offset >> 3); // converting for 9-long rows (DTT_SIZE1)
+    			dcct_buf[dtt_offset1] += d * hwind_cos * vwind_cos;
+    			dsct_buf[dtt_offset1] += d * hwind_sin * vwind_cos;
+    			dcst_buf[dtt_offset1] += d * hwind_cos * vwind_sin;
+    			dsst_buf[dtt_offset1] += d * hwind_sin * vwind_sin;
+    			dtt_offset = ( dtt_offset + ((dtt_offset_inc & 0xf) << 3)) & 0x3f;
+    			dtt_offset_inc >>= 4;
+    			local_row += 2;
+    			row_src +=2;
+    			if ((row_src >= 0) && (row_src <= max_pym1)){
+    				image_p += dstride2;
+    			}
+    		}
+        	__syncthreads();// __syncwarp();
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
+        	printf("\nFOLDing DTT Tiles,mono, npass=%d\n",npass);
+        	debug_print_clt1(clt_tile, color, 0xf);
+        }
+        __syncthreads();// __syncwarp();
+#endif
+
+        }
+//        __syncthreads();// __syncwarp();
+        // no need to clone calculated tile so each will be processed to CC, SC, CS, and SS in-place, will be done in dttiv_mono_2d(clt_tile);
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
+        	printf("\nFOLDED DTT Tiles,mono\n");
+        	debug_print_clt1(clt_tile, color, 0xf);
+        }
+        __syncthreads();// __syncwarp();
+#endif
+
+#ifdef DEBUG2
+        if ((threadIdx.x) == 0){
+        	printf("\nFOLDED DTT Tiles,mono\n");
+        	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+        }
+        __syncthreads();// __syncwarp();
+#endif
+
+        dttiv_mono_2d(clt_tile);
+
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
+        	printf("\nDTT Tiles after vertical pass (both passes), mono\n");
+        	debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G
+        }
+        __syncthreads();// __syncwarp();
+#endif
+
+    } else { // if (is_mono) {
+    	float *dtt_buf =  clt_tile + threadIdx.x;
+#pragma unroll
+    	for (int i = 0; i < 4*DTT_SIZE; i++) {
+    		(*dtt_buf)  = 0.0f;
+    		dtt_buf +=    DTT_SIZE1;
+    	}
+    	__syncthreads();// __syncwarp();
+
+
+		int colorX = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
+    	int colorY = color; // 1; // color; // 2; // 0 - OK, 1 - OK, 2 - wrong
+    	int color0 = colorX & 1;
+    	int color1 = (colorX >>1) & 1;

    	for (int gpass = 0; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
+//       	for (int gpass = 0; gpass < 1; gpass++) { // Only once for R, B, twice - for G
+//    	for (int gpass = 1; gpass < (color1 + 1); gpass++) { // Only once for R, B, twice - for G
    		int col_tl = int_topleft[0]; //  + (threadIdx.x << 1);
    		int row_tl = int_topleft[1];
    		// for red, blue and green, pass 0
    		int local_col = ((col_tl & 1) ^ (BAYER_RED_COL ^ color0 ^ color1 ^ gpass)) + (threadIdx.x << 1); // green red row: invert column from red
    		int local_row = ((row_tl & 1) ^ BAYER_RED_ROW ^ color0          ^ gpass);                            // use red row
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
+        	printf("\ngpass= %d, local_row=%d, local_col=%d\n",gpass, local_row, local_col);
+        }
+#endif
+        __syncthreads();// __syncwarp();
+
    		float hwind_cos = window_hor_cos[local_col];
    		float hwind_sin = window_hor_sin[local_col]; // **** only used for green

@@ -4166,6 +4281,7 @@ __device__ void convertCorrectTile(
    		float *dct_buf = clt_tile + ((gpass << 1) * (DTT_SIZE * DTT_SIZE1));
    		float *dst_buf = clt_tile + (((gpass << 1) + 1) * (DTT_SIZE * DTT_SIZE1));   // **** only used for green

+    		// replace pixels outside input window
    		int col_src = col_tl + local_col;
    		if (col_src < 0) {
    			col_src &= 1; // same Bayer
@@ -4179,8 +4295,9 @@ __device__ void convertCorrectTile(
    		} else if (row_use > max_py){
    			row_use = (row_use & 1) + max_pym1;
    		}
+//           __syncthreads();// worse

-//		float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
+    		//		float *image_p = gpu_images + dstride * (row_tl + local_row)+ col_tl + local_col;
    		float *image_p = gpu_images + dstride * row_use + col_src;
 #pragma unroll
    		for (int i = 0; i < 8; i++) {
@@ -4198,16 +4315,17 @@ __device__ void convertCorrectTile(
    				image_p += dstride2;
    			}
    		}
+//        	__syncthreads();// __syncwarp();
    	}
    	__syncthreads();// __syncwarp();
-#ifdef DEBUG2
-    if ((threadIdx.x == 0) && (color == BAYER_GREEN)){
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
    		printf("\nFOLDED DTT Tiles Green before reduction\n");
    		debug_print_clt1(clt_tile, color, 0xf); // all quadrants for green only
    	}
    	__syncthreads();// __syncwarp();
 #endif
-     if (color == BAYER_GREEN) {
+    	if (colorY == BAYER_GREEN) {
    		// reduce 4 green DTT buffers into 2 (so free future rotated green that were borrowed)
    		float *dtt_buf =  clt_tile + threadIdx.x;
    		float *dtt_buf1 = dtt_buf+ (2 * DTT_SIZE1 * DTT_SIZE); // ((float *) clt_tile[2]) + threadIdx.x;
@@ -4229,23 +4347,22 @@ __device__ void convertCorrectTile(
 #endif
    	dttiv_color_2d(
    			clt_tile,
-     		color);
-
-#ifdef DEBUG2
-    if ((threadIdx.x) == 0){
+				colorY);
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
    		printf("\nDTT Tiles after vertical pass (both passes), color = %d\n",color);
-    	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+        	debug_print_clt1(clt_tile, color, 0xf); // only 1 quadrant for R,B and 2 - for G
        }
        __syncthreads();// __syncwarp();
 #endif

-// Replicate DTT, so non-bayer can still use same in-place rotation code
+    	// Replicate DTT, so non-bayer can still use same in-place rotation code
    	float *src, *dst;
    	int   negate; // , dst_inc;

    	// Replicate horizontally (for R and B only):
-    if (color != BAYER_GREEN) {
-    	negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ color)) << 1); // +1/-1
+    	if (colorY != BAYER_GREEN) {
+    		negate = 1-(((int_topleft[0] & 1) ^ (BAYER_RED_COL ^ colorY)) << 1); // +1/-1
    		src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x    ];
    		dst = clt_tile + (DTT_SIZE1 * DTT_SIZE) + (threadIdx.x ^ 7); // &clt_tile[1][0][threadIdx.x ^ 7];
 #pragma unroll
@@ -4265,7 +4382,7 @@ __device__ void convertCorrectTile(

    	}
    	// replicate all colors down diagonal
-	negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (color >> 1))) << 1); // +1/-1 // 1 -
+    	negate = 1-(((int_topleft[0] & 1) ^ (int_topleft[1] & 1) ^ (BAYER_RED_COL ^ BAYER_RED_ROW ^ (colorY >> 1))) << 1); // +1/-1 // 1 -
    	// CC -> SS
    	src = clt_tile + threadIdx.x; // &clt_tile[0][0][threadIdx.x    ];
    	dst = clt_tile + (DTT_SIZE1 * (DTT_SIZE * 3 + 7)) +  (threadIdx.x ^ 7); // &clt_tile[3][7][threadIdx.x ^ 7];
@@ -4278,14 +4395,15 @@ __device__ void convertCorrectTile(
    	//SC -> CS
    	src = clt_tile + (DTT_SIZE1 * DTT_SIZE) + threadIdx.x; // &clt_tile[1][0][threadIdx.x    ];
    	dst = clt_tile + (DTT_SIZE1 * (DTT_SIZE * 2 + 7)) +  (threadIdx.x ^ 7); // &clt_tile[2][7][threadIdx.x    ];
+    	__syncthreads();// did not help
 #pragma unroll
    	for (int i = 0; i < DTT_SIZE; i++){
    		*dst = negate*(*src);
    		src += DTT_SIZE1;
    		dst -= DTT_SIZE1;
    	}
-#ifdef DEBUG2
-    	if ((threadIdx.x) == 0){
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
    		printf("\nDTT Tiles after all replicating, color = %d\n",color);
    		debug_print_clt1(clt_tile, color, 0xf);
    	}
@@ -4294,14 +4412,27 @@ __device__ void convertCorrectTile(



-#ifdef DEBUG2
-    if ((threadIdx.x) == 0){
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
    		printf("\nKernel tiles to convolve, color = %d\n",color);
    		debug_print_clt1(clt_kernels, color, 0xf); // all colors, all quadrants
    	}
    	__syncthreads();// __syncwarp();
 #endif

+    	__syncthreads();// did not help
+
+    } //  else { // if (is_mono) {
+
+#ifdef DEBUG30
+        if (dbg_tile && (threadIdx.x) == 0){
+    		printf("\nDTT Tiles after before convolving, color = %d\n",color);
+    		debug_print_clt1(clt_tile, color, 0xf);
+    	}
+    	__syncthreads();// __syncwarp();
+#endif
+
+

    // convolve first, then rotate to match Java and make it easier to verify
    convolveTiles(

--- a/src/dtt8x8.cu
+++ b/src/dtt8x8.cu
@@ -951,6 +951,7 @@ __device__ void dttiv_color_2d(
    dctiv_nodiverg( // all colors
 			clt_tile + (DTT_SIZE1 * threadIdx.x), // [0][threadIdx.x], // pointer to start of row
 			1); //int inc);
+//	__syncthreads();// worsened
    if (color == BAYER_GREEN){
        dstiv_nodiverg( // all colors
 				clt_tile + DTT_SIZE1 * threadIdx.x + DTT_SIZE1 * DTT_SIZE, // clt_tile[1][threadIdx.x], // pointer to start of row
@@ -969,6 +970,7 @@ __device__ void dttiv_color_2d(
    dctiv_nodiverg( // all colors
    		clt_tile + threadIdx.x, //  &clt_tile[0][0][threadIdx.x], // pointer to start of column
 			DTT_SIZE1); // int inc,
+//	__syncthreads();// worsened
    if (color == BAYER_GREEN){
          dctiv_nodiverg( // all colors
        		clt_tile + threadIdx.x + (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
@@ -977,6 +979,50 @@ __device__ void dttiv_color_2d(
  	 __syncthreads();// __syncwarp();
 }

+__device__ void dttiv_mono_2d(
+		float * clt_tile)
+{
+	// Copy 0-> 1
+
+    dctiv_nodiverg(
+			clt_tile + (DTT_SIZE1 * threadIdx.x) + (0 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dstiv_nodiverg(
+    		clt_tile + (DTT_SIZE1 * threadIdx.x) + (1 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dctiv_nodiverg(
+			clt_tile + (DTT_SIZE1 * threadIdx.x) + (2 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+    dstiv_nodiverg(
+    		clt_tile + (DTT_SIZE1 * threadIdx.x) + (3 * DTT_SIZE1 * DTT_SIZE),
+			1); //int inc);
+	__syncthreads();// __syncwarp();
+
+#ifdef DEBUG222
+    if ((threadIdx.x) == 0){
+        printf("\nDTT Tiles after horizontal pass, color=%d\n",color);
+    	debug_print_clt1(clt_tile, color, (color== BAYER_GREEN)?3:1); // only 1 quadrant for R,B and 2 - for G
+    }
+     __syncthreads();// __syncwarp();
+#endif
+
+ 	dctiv_nodiverg( // CC
+ 			clt_tile + threadIdx.x,
+			DTT_SIZE1); // int inc,
+ 	dctiv_nodiverg( // SC
+ 			clt_tile + threadIdx.x + 1 * (DTT_SIZE1 * DTT_SIZE),
+			DTT_SIZE1); // int inc,
+ 	dstiv_nodiverg( // CS
+ 			clt_tile + threadIdx.x + 2 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
+			DTT_SIZE1); // int inc,
+ 	dstiv_nodiverg( // SS
+ 			clt_tile + threadIdx.x + 3 * (DTT_SIZE1 * DTT_SIZE), // &clt_tile[1][0][threadIdx.x], // pointer to start of column
+			DTT_SIZE1); // int inc,
+  	 __syncthreads();// __syncwarp();
+}
+
+
+
 //
 // Uses 16 threads, gets 4*8*8 clt tiles, performs idtt-iv (swapping 1 and 2 quadrants) and then unfolds with window,
 // adding to the output 16x16 tile (to use Read-modify-write with 4 passes over the frame. Should be zeroed before the

--- a/src/dtt8x8.h
+++ b/src/dtt8x8.h
@@ -88,6 +88,8 @@ extern __device__ void dttii_2d(
 extern __device__ void dttiv_color_2d(
 		float * clt_tile,
 		int color);
+extern __device__ void dttiv_mono_2d(
+		float * clt_tile);
 extern __device__ void imclt(
 		float * clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
 		float * mclt_tile );

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -34,7 +34,7 @@
 #define NOCORR_TD
 #define NOTEXTURES
 #define NOTEXTURE_RGBA
-
+#define SAVE_CLT

 #include <stdio.h>
 #include <stdlib.h>
@@ -330,7 +330,7 @@ int main(int argc, char **argv)
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn15.portsxy"};

 //#ifndef DBG_TILE
-    /*
+#ifdef SAVE_CLT
    const char* ports_clt_file[] = { // never referenced
    		"/home/eyesis/git/tile_processor_gpu/clt/aux_chn0.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn1.clt",
@@ -348,7 +348,8 @@ int main(int argc, char **argv)
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn13.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn14.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn15.clt"};
-*/
+
+#endif
    const char* result_rbg_file[] = {
    		"/home/eyesis/git/tile_processor_gpu/clt/aux_chn0.rbg",
 			"/home/eyesis/git/tile_processor_gpu/clt/aux_chn1.rbg",
@@ -401,15 +402,13 @@ int main(int argc, char **argv)
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn1.portsxy",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn2.portsxy",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn3.portsxy"};
-
-//#ifndef DBG_TILE
-    /*
+#ifdef SAVE_CLT
    const char* ports_clt_file[] = { // never referenced
    		"/home/eyesis/git/tile_processor_gpu/clt/main_chn0.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn1.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn2.clt",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn3.clt"};
-*/
+#endif
    const char* result_rbg_file[] = {
    		"/home/eyesis/git/tile_processor_gpu/clt/main_chn0.rbg",
 			"/home/eyesis/git/tile_processor_gpu/clt/main_chn1.rbg",
@@ -1133,12 +1132,12 @@ int main(int argc, char **argv)
 				gpu_clt_h[ncam],
 				rslt_size * sizeof(float),
    			cudaMemcpyDeviceToHost));
-#ifndef DBG_TILE
+//#ifndef DBG_TILE
        printf("Writing CLT data to %s\n",  ports_clt_file[ncam]);
    	writeFloatsToFile(cpu_clt, // float *       data, // allocated array
    			rslt_size, // int           size, // length in elements
 				ports_clt_file[ncam]); // 			   const char *  path) // file path
-#endif
+//#endif
    }
 #endif


--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -140,11 +140,12 @@
 // geom
 //#define DEBUG20 1

+
 #if (DBG_TILE_X >= 0) && (DBG_TILE_Y >= 0)
 #define DEBUG20 1 // Geometry Correction
 #define DEBUG21 1 // Geometry Correction
 //#define DEBUG210 1
-
+#define DEBUG30 1
 //#define DEBUG22 1
 //#define DEBUG23 1