debugging texture generation

29147908 · Andrey Filippov · b5f12c63 · 29147908 · 29147908 · 29147908
Commit 29147908 authored Dec 07, 2021 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 615 additions and 215 deletions

TileProcessor.cuh src/TileProcessor.cuh +338 -187

test_tp.cu src/test_tp.cu +272 -26

tp_defines.h src/tp_defines.h +5 -2

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1108,7 +1108,7 @@ extern "C" __global__ void corr2D_combine_inner(
 		const size_t      corr_stride_combo,  // (in floats) stride for the output TD correlations (same as input)
 		float           * gpu_corrs_combo);    // combined correlation output (one per tile)

-extern "C" __global__ void textures_accumulate(
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int               num_cams,           // number of cameras used
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -1131,7 +1131,7 @@ extern "C" __global__ void textures_accumulate(
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
-		float           * gpu_diff_rgb_combo, //'); // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+		float           * gpu_diff_rgb_combo, //) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 		int               tilesx);

 __device__ int get_textures_shared_size( // in bytes
@@ -2455,7 +2455,7 @@ __global__ void index_direct(
 * (i.e. colors x 16 x 16 per each tile in the list ) texture tile generation
 *
 * @param num_cams         number of cameras <= NUM_CAMS
- * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+ * @param gpu_ftasks           flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 * @param num_tiles            number of tiles int gpu_tasks array prepared for processing
 * @param width                number of tiles in a row
 * @param nonoverlap_list      integer array to place the generated list
@@ -2847,7 +2847,13 @@ extern "C" __global__ void textures_nonoverlap(
 				 num_cams,     // int                num_cams,     // actual number of cameras
 				 colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 				 0);           // int *              offsets);     // in floats
-		 textures_accumulate <<<grid_texture,threads_texture, shared_size>>>(
+#ifdef DEBUG7A
+		printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, colors);
+
+		__syncthreads();
+
+#endif
+		 textures_accumulate <<<grid_texture,threads_texture,  shared_size>>>( // 65536>>>( //
 				 num_cams,                        // 	int               num_cams,           // number of cameras used
 				 (int *) 0,                       // int             * woi,                // x, y, width,height
 				 gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
@@ -2933,6 +2939,12 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 {
 	// will process exactly 4 cameras at a time in one block,
 	// so imclt is executed sequentially for each group of 4 cameras
+///	if ((threadIdx.x == 0) && (threadIdx.y == 0)){
+///		printf("DONE\n");
+///	}
+///	__syncthreads();
+///	return;
+
 	int offsets [9];
 	int shared_size = get_textures_shared_size( // in bytes
 			 num_cams,     // int                num_cams,     // actual number of cameras
@@ -2950,25 +2962,46 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		return; // nothing to do
 	}
 	int tile_num = tile_code >> CORR_NTILE_SHIFT;
-#ifdef DEBUG22
-		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\n1. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
+#ifdef DEBUG7A
+	__syncthreads();// __syncwarp();
+	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		printf("textures_accumulate: diff_sigma =     %f\n",    diff_sigma);
+		printf("textures_accumulate: diff_threshold = %f\n",diff_threshold);
+		printf("textures_accumulate: min_agree =      %f\n",     min_agree);
+		printf("textures_accumulate: weights[0] =     %f\n",weights[0]);
+		printf("textures_accumulate: weights[1] =     %f\n",weights[1]);
+		printf("textures_accumulate: weights[2] =     %f\n",weights[2]);
+		printf("textures_accumulate: dust_remove =    %d\n",dust_remove);
+		printf("textures_accumulate: keep_weights =   %d\n",keep_weights);
+	}
+#endif //DEBUG7A

+#ifdef DEBUG7A // 22
+	if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		for (int i = 0; i <9; i++){
+			printf(" offsets[%d] = 0x%x\n",i,offsets[i]);
+		}
+	}
+	__syncthreads();
+#endif	// #ifdef DEBUG22
+#ifdef DEBUG7AXX // 22
+		if ((tile_num == DBG_TILE)) { //  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n1. tile_indx=%d, tile_num=%d threadIdx.x = %d threadIdx.y =%d\n",tile_indx,tile_num,threadIdx.x,threadIdx.y);
 		}
 		__syncthreads();
 #endif	// #ifdef DEBUG22
 	extern __shared__ float all_shared[];
-	float * mclt_tiles =       &all_shared[0] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-	float * clt_tiles  =       &all_shared[1] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9  = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
-	float * mclt_debayer =     &all_shared[1] ; // [num_cams][colors][MCLT_UNION_LEN]; //  16 * 1 * 16 * 18  = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
-	float * mclt_tmps =        &all_shared[2] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
-	float * rgbaw =            &all_shared[2] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
-
-	float * port_offsets =     &all_shared[3] ; // [num_cams][2];          // 16 * 2 = 0x20 | 4*2 = 0x8
-	float * ports_rgb_shared = &all_shared[4] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
-	float * max_diff_shared =  &all_shared[5] ; // [num_cams];             // 16 = 0x10     | 4 = 0x4     | return to system memory (optionally pass null to skip calculation)
-	float * max_diff_tmp =     &all_shared[6] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
-	float * ports_rgb_tmp =    &all_shared[7] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8  = 0x60 |  [4*3][8]
+	float * mclt_tiles =       &all_shared[offsets[0]] ; // [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
+	float * clt_tiles  =       &all_shared[offsets[1]] ; // [num_cams][colors][4][DTT_SIZE][DTT_SIZE1]; // 16 * 1 * 4 * 8 * 9  = 0x1200 | 4 * 3 * 4 * 8 * 9 = 0xd80
+	float * mclt_debayer =     &all_shared[offsets[1]] ; // [num_cams][colors][MCLT_UNION_LEN]; //  16 * 1 * 16 * 18  = 0x1200 | 4 * 3 * 16 * 18 = 0xd80 | to align with clt_tiles
+	float * mclt_tmps =        &all_shared[offsets[2]] ; // [num_cams][colors][DTT_SIZE2][DTT_SIZE21]; // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
+	float * rgbaw =            &all_shared[offsets[2]] ; // [colors + 1 + num_cams + colors + 1][DTT_SIZE2][DTT_SIZE21];
+
+	float * port_offsets =     &all_shared[offsets[3]] ; // [num_cams][2];          // 16 * 2 = 0x20 | 4*2 = 0x8
+	float * ports_rgb_shared = &all_shared[offsets[4]] ; // [colors][num_cams]; // 16 * 1 = 0x10 | 4 * 3 = 0xc | return to system memory (optionally pass null to skip calculation)
+	float * max_diff_shared =  &all_shared[offsets[5]] ; // [num_cams];             // 16 = 0x10     | 4 = 0x4     | return to system memory (optionally pass null to skip calculation)
+	float * max_diff_tmp =     &all_shared[offsets[6]] ; // [num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 8 = 0x80 | 4 * 8 = 0x20 | [4][8]
+	float * ports_rgb_tmp =    &all_shared[offsets[7]] ; // [colors][num_cams][TEXTURE_THREADS_PER_TILE]; // 16 * 1 * 8 = 0x80 | 4 * 3 * 8  = 0x60 |  [4*3][8]

 //	__shared__ float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21];  // 16*1*16*17=0x1100 | 4*3*16*17=0xcc0
 //	__shared__ union {
@@ -2991,8 +3024,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)


 #ifdef DBG_TILE
-#ifdef DEBUG7
-	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+#ifdef DEBUG7AXX
+	if (tile_num == DBG_TILE){ // }  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 		printf("\ntextures_gen tile = %d\n",tile_num);
 		//    	debug_print_clt1(clt_tile1, color,  0xf); //
 		//        printf("\textures_gen tile = %d, pair=%d, color = %d   CAMERA22\n",tile_num, corr_pair,color);
@@ -3009,6 +3042,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		if (threadIdx.x < 2){ // not more than 16 sensors, not less than
 			port_offsets[camera_num * 2 + threadIdx.x] =   gpu_geometry_correction->rXY[camera_num][threadIdx.x];
 		}
+		__syncthreads();// __syncwarp(); // is it needed?

 		for (int color = 0; color < colors; color++){
 			//        int offs = (tile_num * NUM_COLORS + color) * (4 * DTT_SIZE * DTT_SIZE);
@@ -3022,9 +3056,9 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			float * clt_tile =  clt_tiles + cam_col * 2 * DTT_SIZE * DTT_SIZE21; // start of 4 * DTT_SIZE * DTT_SIZE block, no threadIdx.x here
 			float * clt_tilei = clt_tile + threadIdx.x; // threadIdx.x = 0..7 here
 			float * gpu_tile = ((float *) gpu_clt[camera_num]) +  (tile_num * colors + color) * (4 * DTT_SIZE * DTT_SIZE) + threadIdx.x;
-			float * mclt_tile = mclt_tiles +    (camera_num * colors+ color) * 2 * DTT_SIZE * DTT_SIZE21;
-			float * mclt_dst =  mclt_debayer +  (camera_num* colors + color) * MCLT_UNION_LEN; // 16 * 18
-			float * mclt_tmp =  mclt_tmps +     (camera_num* colors + color) * DTT_SIZE2 * DTT_SIZE21;
+			float * mclt_tile = mclt_tiles +    (camera_num * colors + color) * 2 * DTT_SIZE * DTT_SIZE21;
+			float * mclt_dst =  mclt_debayer +  (camera_num * colors + color) * MCLT_UNION_LEN; // 16 * 18
+			float * mclt_tmp =  mclt_tmps +     (camera_num * colors + color) * DTT_SIZE2 * DTT_SIZE21;
 			// no camera_num below
 #pragma unroll
 			for (int q = 0; q < 4; q++) {
@@ -3039,7 +3073,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				}
 			}
 			__syncthreads();
-#ifdef DEBUG7
+#ifdef DEBUG7AXXX
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 				printf("\ntextures_gen LPF for color = %d\n",color);
 				debug_print_lpf(lpf_data[(colors > 1)? color : 3]);
@@ -3050,7 +3084,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			__syncthreads();// __syncwarp();
 #endif

-#ifdef DBG_TILE		// perform idct
+#ifdef DBG_TILEXXX		// perform idct
 			imclt8threads(
 					0,          // int     do_acc,     // 1 - add to previous value, 0 - overwrite
 					clt_tile,   //        [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports [4][8][9]
@@ -3064,17 +3098,19 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 					0);
 #endif
 			__syncthreads();// __syncwarp();
-#ifdef DEBUG7
+#ifdef DEBUG7AXXX
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-				printf("\ntextures_gen mclt color = %d\n",color);
-				debug_print_mclt(
-						mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-						color);
+				for (int ncam = camera_num_offs; ncam < (camera_num_offs + 4); ncam++){
+					printf("\ntextures_gen mclt camera = % d,  color = %d\n",ncam, color);
+					debug_print_mclt(
+							mclt_tile + (ncam * colors + color) * 2 * DTT_SIZE * DTT_SIZE21, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							color);
+				}
 			}
 			__syncthreads();// __syncwarp();
 #endif
 			if (colors > 1) {
-#ifdef DBG_TILE
+#ifdef DBG_TILE_XXX
 				debayer_shot(
 						(color < 2), // const int rb_mode,    // 0 - green, 1 - r/b
 						min_shot,    // float     min_shot,   // 10.0
@@ -3098,11 +3134,42 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				// copy? - no, just remember to use mclt_tile, not mclt_dst
 				// will have to copy mclt_tiles -> mclt_dst as they have different gaps
 				// untested copy for mono mode
-#pragma unroll
+#ifdef DEBUG7AXXX
+				if (tile_num == DBG_TILE) {
+//					for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
+					int n = 0;
+						printf("textures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, threadIdx.x= %d, n=%d, msp=0x%x, dst=0x%x\n",
+								camera_num_offs,threadIdx.y, threadIdx.x, n,
+								(int) (mclt_tile + threadIdx.x + n), (int)(mclt_dst +  threadIdx.x + n));
+//					}
+
+				}
+				__syncthreads();// __syncwarp();
+#endif
+
+
+#ifdef DEBUG7AXX // Good here
+			if (tile_num == DBG_TILE) {
+				for (int ccam = 0; ccam < num_cams; ccam++) {
+					if ((threadIdx.x == 0) && (camera_num == ccam)){
+						printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
+						debug_print_mclt( // broken for camera 1
+								mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+								-1);
+					}
+					__syncthreads();// __syncwarp();
+				}
+			}
+			__syncthreads();// __syncwarp();
+#endif
+
+
+
+//#pragma unroll
 				for (int n = 0; n <= DTT_SIZE; n += DTT_SIZE){
 					float * msp = mclt_tile + threadIdx.x + n;
 					float * dst = mclt_dst +  threadIdx.x + n;
-#pragma unroll
+//#pragma unroll
 					for (int row = 0; row < DTT_SIZE2; row++){
 						*dst = *msp;
 						msp += DTT_SIZE21;
@@ -3111,22 +3178,29 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				}
 				__syncthreads();
 			}
-#ifdef DEBUG77
-			//		float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+#ifdef DEBUG7AXXX
+			if (tile_num == DBG_TILE) {
+				for (int ccam = 0; ccam < num_cams; ccam++) {
+					if ((threadIdx.x == 0) && (camera_num == ccam)){
+						printf("\ntextures_gen mclt_tile camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
+						debug_print_mclt( // broken for camera 1
+								mclt_tile, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+								-1);

-			for (int ccam = 0; ccam < num_cams; ccam++) {
-				if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
-					printf("\ntextures_gen AFTER DEBAER cam= %d, color = %d\n",threadIdx.y, color);
-					debug_print_mclt(
-							mclt_dst, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
-							-1);
+						printf("\ntextures_gen AFTER DEBAER camera_num_offs= %d threadIdx.y= %d, color = %d\n",camera_num_offs,threadIdx.y, color);
+						debug_print_mclt(
+								mclt_dst, //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+								-1);
+						/*
 					printf("\ntextures_gen AFTER DEBAER0 cam= %d, color = %d\n",threadIdx.y, 0);
 					debug_print_mclt(
-							(float *) shr.mclt_debayer[ccam][0], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							mclt_debayer + (ccam * colors * MCLT_UNION_LEN), //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 							-1);
+						 */

+					}
+					__syncthreads();// __syncwarp();
 				}
-				__syncthreads();// __syncwarp();
 			}
 			__syncthreads();// __syncwarp();
 #endif
@@ -3134,15 +3208,15 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)

 		__syncthreads(); // __syncwarp();
 		///	return;
-#ifdef DEBUG77
+#ifdef DEBUG7AXXX
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			for (int ccam = 0; ccam < num_cams; ccam++) {
 				//		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
 				for (int nncol = 0; nncol < colors; nncol++){
-					printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
+					printf("\ntextures_gen AFTER DEBAER1 camera_num_offs = %d, cam= %d, color = %d\n", camera_num_offs, ccam, nncol);
 					//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
 					debug_print_mclt(
-							(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 							-1);
 				}
 			}
@@ -3150,7 +3224,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		__syncthreads();// __syncwarp();
 #endif

-#ifdef DEBUG77
+#ifdef DEBUG7AXXX
 		//#ifdef DEBUG22
 		for (int ccam = 0; ccam < num_cams; ccam++) {
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
@@ -3158,7 +3232,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 					printf("\ntextures_gen AFTER DEBAER1 cam= %d, color = %d\n",ccam, nncol);
 					//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
 					debug_print_mclt(
-							(float *) shr.mclt_debayer[ccam][nncol], //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							mclt_debayer+ ((ccam * colors + nncol) * MCLT_UNION_LEN), //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
 							-1);
 				}
 			}
@@ -3168,6 +3242,24 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 #endif
 		//	__shared__ float mclt_tiles [num_cams][colors][2*DTT_SIZE][DTT_SIZE21];
 	} // end of sequential camera group: for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
+
+#ifdef DEBUG7A
+		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			for (int ccam = 0; ccam < num_cams; ccam++) {
+				//		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == ccam)){
+				for (int nncol = 0; nncol < colors; nncol++){
+					printf("\ntextures_gen AFTER DEBAYERs all cameras cam= %d, color = %d\n", ccam, nncol);
+					//				float * mclt_dst =  (float *) shr.mclt_debayer[camera_num][color];
+					debug_print_mclt(
+							mclt_debayer + ((ccam * colors + nncol) * MCLT_UNION_LEN), //         [4][DTT_SIZE][DTT_SIZE1], // +1 to alternate column ports)
+							-1);
+				}
+			}
+		}
+		__syncthreads();// __syncwarp();
+#endif
+
+
 #ifdef DBG_TILE
 		int debug = (tile_num == DBG_TILE);
 #else
@@ -3237,7 +3329,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			}

 		}
-#ifdef DEBUG7
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
 		}
@@ -3252,7 +3344,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 	}

 	if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
-#ifdef DEBUG12
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 //			printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
 //					tile_num, (int) tile_code, border_tile);
@@ -3261,7 +3353,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			for (int ncol = 0; ncol <= colors; ncol++) {
 				printf("\ntile[%d]\n",ncol);
 				debug_print_mclt(
-						(float *) (shr1.rgbaw[ncol]),
+//						(float *) (shr1.rgbaw[ncol]),
+						rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
 						-1);
 			}
 		}
@@ -3296,7 +3389,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE; //  - (DTT_SIZE/2); // may be negative == -4
 ///		int height = *(woi + 3) << DTT_SIZE_LOG2;

-#ifdef DEBUG12
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
 					tileX, tileY, tile_x0, tile_y0, slice_stride);
@@ -3304,7 +3397,8 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			for (int ncol = 0; ncol <= colors; ncol++) {
 				printf("\ntile[%d]\n",ncol);
 				debug_print_mclt(
-						(float *) (shr1.rgbaw[ncol]),
+//						(float *) (shr1.rgbaw[ncol]),
+						rgbaw + (ncol + (DTT_SIZE2 * DTT_SIZE21)),
 						-1);
 			}
 		}
@@ -3319,7 +3413,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 			int gi = g_row * texture_rbg_stride  + g_col; // offset to the top left corner
 			float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
 			float * rgba_i = rgbaw + i;
-#ifdef DEBUG12
+#ifdef DEBUG7A
 			if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 				printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
 						pass, row, col, g_row, g_col, i, gi);
@@ -3348,46 +3442,47 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 	} // 	if (gpu_texture_rbg) { // generate RGBA
 	if (calc_extra){ // gpu_diff_rgb_combo
 		__syncthreads(); // needed?
-#ifdef DEBUG22
+#ifdef DEBUG7A
 		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\n3. tile_indx=%d, tile_num=%d\n",tile_indx,tile_num);
-			printf("max_diff: %f, %f, %f, %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
-			printf("R:        %f, %f, %f, %f\n",ports_rgb_shared[0][0],ports_rgb_shared[0][1],ports_rgb_shared[0][2],ports_rgb_shared[0][3]);
-			printf("B:        %f, %f, %f, %f\n",ports_rgb_shared[1][0],ports_rgb_shared[1][1],ports_rgb_shared[1][2],ports_rgb_shared[1][3]);
-			printf("G:        %f, %f, %f, %f\n",ports_rgb_shared[2][0],ports_rgb_shared[2][1],ports_rgb_shared[2][2],ports_rgb_shared[2][3]);
-
-			printf("\n 3. max_diff\n");
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf    ("max_diff: ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
+			for (int ccol = 0; ccol < colors; ccol++){
+				printf("color%d:   ",ccol);for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ccol * num_cams +ccam]);} printf("\n");
+			}
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
 			}
 			for (int ncol = 0; ncol < colors; ncol++){
-				printf("\n%d:total %f %f %f %f\n",
-						ncol,
-						ports_rgb_shared[ncol][0],
-						ports_rgb_shared[ncol][1],
-						ports_rgb_shared[ncol][2],
-						ports_rgb_shared[ncol][3]);
+				printf("\n%d:total ",ncol);
+				for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams +ccam]);} printf("\n");
 				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-					printf("tmp[%d] %f %f %f %f\n",
-							i,
-							ports_rgb_tmp[ncol][0][i],
-							ports_rgb_tmp[ncol][1][i],
-							ports_rgb_tmp[ncol][2][i],
-							ports_rgb_tmp[ncol][3][i]);
+					printf("tmp[%d] ",i);
+					for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
 				}
 			}
+		}
+		__syncthreads();
+//DBG_TILE
+#endif// #ifdef DEBUG7A


+#ifdef DEBUG7A
+		if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
+			printf("\n4. tile_indx=%d, tile_num=%d, DBG_TILE = %d\n",tile_indx,tile_num, DBG_TILE);
 		}
 		__syncthreads();
 //DBG_TILE
-#endif// #ifdef DEBUG22
+#endif// #ifdef DEBUG7A
+

 		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
 			int camera_num = threadIdx.y + camera_num_offs;
 //			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * NUM_CAMS* (colors + 1) + camera_num;
-			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;
+//			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_indx * num_cams* (colors + 1) + camera_num;// tile_num
+
+// Maybe needs to be changed back if output data should match tile index in task list, not the tile absolute position
+
+			float * pdiff_rgb_combo = gpu_diff_rgb_combo + tile_num  * num_cams* (colors + 1) + camera_num;//
 			if (threadIdx.x == 0){
 				*pdiff_rgb_combo = max_diff_shared[camera_num];
 			}
@@ -3396,22 +3491,25 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 				*(pdiff_rgb_combo + (threadIdx.x + 1) * num_cams) = ports_rgb_shared[threadIdx.x * num_cams + camera_num];// [color][camera]
 			}
 		}
-	}
+	} // if (calc_extra){ // gpu_diff_rgb_combo
 } // textures_accumulate()

 __device__ int get_textures_shared_size( // in bytes
+//__device__ int get_textures_shared_size( // in bytes
 	    int                num_cams,     // actual number of cameras
 	    int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
 		int *              offsets){     // in floats
-	int shared_floats = 0;
+//	int shared_floats = 0;
 	int offs = 0;
 //	int texture_threads_per_tile = TEXTURE_THREADS/num_cams;
 	if (offsets) offsets[0] = offs;
-	offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
+	offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles         [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
 	if (offsets) offsets[1] = offs;
-	offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;  // float clt_tiles  [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
+	offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;  // float clt_tiles         [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
 	if (offsets) offsets[2] = offs;
-	offs += num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;    //float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
+	int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;                // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
+	int rgbaw_size =    (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21;     // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
+	offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
 	if (offsets) offsets[3] = offs;
 	offs += num_cams * 2;                                      // float port_offsets      [NUM_CAMS][2];
 	if (offsets) offsets[4] = offs;
@@ -3421,9 +3519,9 @@ __device__ int get_textures_shared_size( // in bytes
 	if (offsets) offsets[6] = offs;
 	offs += num_cams * TEXTURE_THREADS_PER_TILE;               // float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
 	if (offsets) offsets[7] = offs;
-	offs += offs += num_colors * num_cams *  TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
+	offs += num_colors * num_cams *  TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
 	if (offsets) offsets[8] = offs;
-	return sizeof(float) * shared_floats;
+	return sizeof(float) * offs; // shared_floats;

 }

@@ -4771,10 +4869,10 @@ __device__ void debayer( // 8 threads
 * @param calc_extra           calculate ports_rgb, max_diff. If not null - will ignore rbg_tile, so this mode
 *                             should not be combined with texture generation. It is intended to generate a
 *                             lo-res (1/8) images for macro correlation
- * @param ports_rgb_shared     shared memory data to be used to return lo-res images tile average color [NUM_COLORS][NUM_CAMS]
- * @param max_diff_shared      shared memory data to be used to return lo-res images tile mismatch form average [NUM_CAMS]
- * @param max_diff_tmp         shared memory to be used here for temporary storage [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
- * @param ports_rgb_tmp        shared memory to be used here for temporary storage [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE], [4*3][8]
+ * @param ports_rgb_shared     shared memory data to be used to return lo-res images tile average color [NUM_COLORS * NUM_CAMS]
+ * @param max_diff_shared      shared memory data to be used to return lo-res images tile mismatch from average [NUM_CAMS]
+ * @param max_diff_tmp         shared memory to be used here for temporary storage [NUM_CAMS * TEXTURE_THREADS_PER_TILE]
+ * @param ports_rgb_tmp        shared memory to be used here for temporary storage [NUM_COLORS *NUM_CAMS * TEXTURE_THREADS_PER_TILE], [4*3][8]
 * @param port_offsets         [port]{x_off, y_off} - just to scale pixel value differences (quad - {{-0.5, -0.5},{0.5,-0.5},{-0.5,0.5},{0.5,0.5}}
 * @param diff_sigma           pixel value/pixel change (1.5)
 * @param diff_threshold       pixel value/pixel change (10)
@@ -4843,9 +4941,18 @@ __device__ void tile_combine_rgba(
 	}
 	int colors_offset = colors * MCLT_UNION_LEN; // padded in union !

-#ifdef DEBUG8
+#ifdef DEBUG7A
 	__syncthreads();// __syncwarp();
 	if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+		printf("diff_sigma = %f\n",    diff_sigma);
+		printf("diff_threshold = %f\n",diff_threshold);
+		printf("min_agree = %f\n",     min_agree);
+		printf("chn_weights[0] = %f\n",chn_weights[0]);
+		printf("chn_weights[1] = %f\n",chn_weights[1]);
+		printf("chn_weights[2] = %f\n",chn_weights[2]);
+		printf("dust_remove =    %d\n",dust_remove);
+		printf("keep_weights =   %d\n",keep_weights);
+
 		printf("\ntile_combine_rgba ksigma = %f\n",ksigma);
 		for (int i = 0; i < indx; i++) {
 			printf("%02d: %d :%d %f\n",i,pair_ports[i][0], pair_ports[i][1], pair_dist2r[i]);
@@ -4941,10 +5048,10 @@ __device__ void tile_combine_rgba(
 		float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQ[col_sym];
 		float wnd2_inv = 1.0/wnd2;

-#pragma unroll
+//#pragma unroll
 		for (int ipair = 0; ipair < (num_cams*(num_cams-1)/2); ipair++){
 			float d = 0;
-#pragma unroll // non-constant
+//#pragma unroll // non-constant
 			for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
 				//					double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i];
 				float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
@@ -4993,7 +5100,7 @@ __device__ void tile_combine_rgba(
 		float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)));
 		float w2 = 1.0 - w1;
 		float * rgba_i = rgba + i;
-#pragma unroll  // non-constant
+//#pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
 			float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
 			* (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))=
@@ -5157,7 +5264,7 @@ __device__ void tile_combine_rgba(
 //		}
 		float a = 0;

-#pragma unroll
+//#pragma unroll
 		for (int cam = 0; cam < num_cams; cam++) {
 			a +=  *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam);
 		}
@@ -5165,7 +5272,7 @@ __device__ void tile_combine_rgba(
 	}// for (int pass = 0; pass < 8; pass ++)
 	__syncthreads();

-#ifdef DEBUG8
+#ifdef DEBUG7A // 8
 	if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 		printf("\ntile_combine_rgba() final\n");
 		for (int ncol = 0; ncol < colors; ncol++) {
@@ -5196,46 +5303,81 @@ __device__ void tile_combine_rgba(


 	if (calc_extra){
-		int cam = threadIdx.y;
-		int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
-//		max_diff_tmp[cam][threadIdx.x] = 0.0;
-		max_diff_tmp[indx] = 0.0;
+		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
+			int cam = camera_num_offs + threadIdx.y;
+			int indx0 = cam * TEXTURE_THREADS_PER_TILE;
+			int indx =  indx0 + threadIdx.x;
+			//		max_diff_tmp[cam][threadIdx.x] = 0.0;
+			max_diff_tmp[indx] = 0.0;
 #pragma unroll
-		for (int pass = 0; pass < 32; pass++){
-			int row = (pass >> 1);
-			int col = ((pass & 1) << 3) + threadIdx.x;
-			int i = row * DTT_SIZE21 + col;
-			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
-			float d2 = 0.0;
-#pragma unroll  // non-constant
-			for (int ncol = 0; ncol < colors; ncol++){
-				float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
-				d2 += *(chn_weights + ncol) * dc * dc;
+			for (int pass = 0; pass < 32; pass++){
+				int row = (pass >> 1);
+				int col = ((pass & 1) << 3) + threadIdx.x;
+				int i = row * DTT_SIZE21 + col;
+				float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+				float d2 = 0.0;
+//#pragma unroll  // non-constant
+				for (int ncol = 0; ncol < colors; ncol++){
+					float dc = *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
+					d2 += *(chn_weights + ncol) * dc * dc;
+				}
+				//max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
+				max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
 			}
-//			max_diff_tmp[cam][threadIdx.x] = fmaxf(max_diff_tmp[cam][threadIdx.x], d2);
-			max_diff_tmp[indx] = fmaxf(max_diff_tmp[indx], d2);
-		}
-		__syncthreads();
-		if (threadIdx.x == 0){ // combine results
-			float mx = 0.0;
+			__syncthreads();
+			if (threadIdx.x == 0){ // combine results
+				float mx = 0.0;
 #pragma unroll
-			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//				mx = fmaxf(mx, max_diff_tmp[cam][i]);
-				mx = fmaxf(mx, max_diff_tmp[indx]);
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					//				mx = fmaxf(mx, max_diff_tmp[cam][i]);
+					mx = fmaxf(mx, max_diff_tmp[indx0 + i]);
+				}
+				max_diff_shared[cam] = sqrtf(mx);
 			}
-			max_diff_shared[cam] = sqrtf(mx);
-		}
 #ifdef DEBUG22
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+				printf("\n 1. max_diff\n");
+				printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+					//				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
+					printf("tmp[%d] %f %f %f %f\n",i,
+							max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
+							max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
+				}
+				for (int ncol = 0; ncol < colors; ncol++){
+					printf("\n average for color %d\n",ncol);
+					debug_print_mclt(
+							rgba + (DTT_SIZE2*DTT_SIZE21) * ncol,
+							-1);
+					for (int ncam = 0; ncam < num_cams;ncam ++){
+						printf("\n mclt for color %d, camera %d\n",ncol,ncam);
+						debug_print_mclt(
+								mclt_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
+								-1);
+#if 0
+						printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
+						if (rgb_tile) {
+							debug_print_mclt(
+									rbg_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
+									-1);
+						}
+#endif
+					}
+				}
+			}
+			__syncthreads();// __syncwarp();
+
+#endif // #ifdef DEBUG22
+		} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y)
+
+#ifdef DEBUG7A
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\n 1. max_diff\n");
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("\n X2. max_diff\n");
+			printf("total ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//				printf("tmp[%d] %f %f %f %f\n",i, max_diff_tmp[0][i],max_diff_tmp[1][i],max_diff_tmp[2][i],max_diff_tmp[3][i]);
-				printf("tmp[%d] %f %f %f %f\n",i,
-						max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
 			}
 			for (int ncol = 0; ncol < colors; ncol++){
 				printf("\n average for color %d\n",ncol);
@@ -5247,88 +5389,97 @@ __device__ void tile_combine_rgba(
 					debug_print_mclt(
 							mclt_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
 							-1);
-#if 0
-					printf("\n rgb_tile for color %d, camera %d\n",ncol,ncam);
-					if (rgb_tile) {
-						debug_print_mclt(
-								rbg_tile +  (DTT_SIZE2*(DTT_SIZE21 + 1)) * ncol +  colors_offset * ncam,
-								-1);
-					}
-#endif
 				}
 			}
 		}
 		__syncthreads();// __syncwarp();
-
-#endif // #ifdef DEBUG22
+#endif // #ifdef DEBUG7A
 	}
+
+
+
 	if (calc_extra) {
 		int incr = num_cams * TEXTURE_THREADS_PER_TILE;
-		int cam =  threadIdx.y;
-		int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
-		int indx1 = indx;
-		for (int ncol = 0; ncol < colors; ncol++){
-//			ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
-			ports_rgb_tmp[indx1 += incr] = 0.0;
-		}
+		for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {// assuming num_cams is multiple blockDim.y
+			int cam = camera_num_offs + threadIdx.y;
+			//		int cam =  threadIdx.y;  // BUG!
+			int indx = cam * TEXTURE_THREADS_PER_TILE + threadIdx.x;
+			int indx1 = indx;
+			for (int ncol = 0; ncol < colors; ncol++){
+				//			ports_rgb_tmp[ncol][cam][threadIdx.x] = 0.0;
+				ports_rgb_tmp[indx1] = 0.0; // no difference in wrong zeros when removed
+				indx1 += incr;
+			}
+#ifdef DEBUG7AXX
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+				printf("\nAA: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
+				__syncthreads();// __syncwarp();
+			}
+#endif // #ifdef DEBUG7A

 #pragma unroll
-		for (int pass = 0; pass < 32; pass++){
-			int row = (pass >> 1);
-			int col = ((pass & 1) << 3) + threadIdx.x;
-			int i = row * DTT_SIZE21 + col;
-			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
-			indx1 = indx;
-			for (int ncol = 0; ncol < colors; ncol++){
-//				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
-				ports_rgb_tmp[indx1 += incr] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+			for (int pass = 0; pass < 32; pass++){
+				int row = (pass >> 1);
+				int col = ((pass & 1) << 3) + threadIdx.x;
+				int i = row * DTT_SIZE21 + col;
+				float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
+				indx1 = indx;
+				for (int ncol = 0; ncol < colors; ncol++){
+					//				ports_rgb_tmp[ncol][cam][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+//					ports_rgb_tmp[indx1 += incr] += 1.0; /// *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+					ports_rgb_tmp[indx1] += *(mclt_cam_i + (DTT_SIZE2*(DTT_SIZE21 +1)) * ncol);
+					indx1 += incr;
+
+				}
 			}
-		}
-		__syncthreads();
-		if (threadIdx.x == 0){ // combine results
-			for (int ncol = 0; ncol < colors; ncol++){
-				int indx2 = ncol * num_cams + cam;
-//				ports_rgb_shared[ncol][cam] = 0;
-				ports_rgb_shared[indx] = 0;
-				int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
+			__syncthreads();
+#ifdef DEBUG7AXX
+			if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
+				printf("\nBB: indx = %d, camera_num_offs=%d, indx1=%d, cam = %d\n",indx, camera_num_offs, indx1, cam);
+				__syncthreads();// __syncwarp();
+			}
+#endif // #ifdef DEBUG7A
+
+			if (threadIdx.x == 0){ // combine results
+				for (int ncol = 0; ncol < colors; ncol++){
+					int indx2 = ncol * num_cams + cam;
+					//				ports_rgb_shared[ncol][cam] = 0;
+					ports_rgb_shared[indx2] = 0;
+					int indx3 = indx2 * TEXTURE_THREADS_PER_TILE;
 #pragma unroll
-				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-//					ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
-					ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
+					for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
+						//					ports_rgb_shared[ncol][cam] += ports_rgb_tmp[ncol][cam][i];
+						ports_rgb_shared[indx2] += ports_rgb_tmp[indx3++];
+					}
+					ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 				}
-				ports_rgb_shared[indx2] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 			}
-		}
-#ifdef DEBUG22
+		} // for (int camera_num_offs = 0; camera_num_offs < num_cams; camera_num_offs+= blockDim.y) {
+
+		__syncthreads();
+#ifdef DEBUG7A
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-			printf("\n 2. max_diff\n");
-			printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("\n 2. max_diff, ports_rgb_shared, DBG_TILE = %d\n",DBG_TILE);
+			//				printf("total %f %f %f %f\n",max_diff_shared[0],max_diff_shared[1],max_diff_shared[2],max_diff_shared[3]);
+			printf("max_diff_shared ");for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_shared[ccam]);} printf("\n");
 			for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-				printf("tmp[%d] %f %f %f %f\n",i,
-						max_diff_tmp[0 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[1 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[2 * TEXTURE_THREADS_PER_TILE + i],
-						max_diff_tmp[3 * TEXTURE_THREADS_PER_TILE + i]);
+				printf("tmp[%d]:   ",i); for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",max_diff_tmp[ccam * TEXTURE_THREADS_PER_TILE + i]);} printf("\n");
 			}
+
 			for (int ncol = 0; ncol < colors; ncol++){
-				printf("\n%d:total %f %f %f %f\n", // only first 4 cameras
-						ncol,
-						ports_rgb_shared[ncol * num_cams + 0],
-						ports_rgb_shared[ncol * num_cams + 1],
-						ports_rgb_shared[ncol * num_cams + 2],
-						ports_rgb_shared[ncol * num_cams + 3]);
+				printf("\n%d:ports_rgb_shared ",ncol);
+				for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_shared[ ncol *num_cams + ccam]);} printf("\n");
+
 				for (int i = 0; i < TEXTURE_THREADS_PER_TILE; i++){
-					printf("tmp[%d] %f %f %f %f\n",
-							i,
-							ports_rgb_tmp[(ncol * num_cams + 0) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 1) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 2) * TEXTURE_THREADS_PER_TILE + i],
-							ports_rgb_tmp[(ncol * num_cams + 3) * TEXTURE_THREADS_PER_TILE + i]);
+					printf("ports_rgb_tmp[%d] ",i);
+					for (int ccam = 0; ccam < num_cams; ccam++) {printf("%f, ",ports_rgb_tmp[(ncol*num_cams + ccam) * TEXTURE_THREADS_PER_TILE+ i]);} printf("\n");
 				}
 			}
+
 		}
 		__syncthreads();// __syncwarp();
-#endif // #ifdef DEBUG22
+#endif // #ifdef DEBUG7A
+
 	}
 }


--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -30,8 +30,9 @@
 ** -----------------------------------------------------------------------------**
 */

-//#define NOCORR
+#define NOCORR
 #define NOCORR_TD
+//#define NOTEXTURES_HOST
 #define NOTEXTURES
 #define NOTEXTURE_RGBA
 #define SAVE_CLT
@@ -232,6 +233,38 @@ void set_clt_lpf(
 	}
 }

+int host_get_textures_shared_size( // in bytes
+//__device__ int get_textures_shared_size( // in bytes
+	    int                num_cams,     // actual number of cameras
+	    int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+		int *              offsets){     // in floats
+//	int shared_floats = 0;
+	int offs = 0;
+//	int texture_threads_per_tile = TEXTURE_THREADS/num_cams;
+	if (offsets) offsets[0] = offs;
+	offs += num_cams * num_colors * 2 * DTT_SIZE * DTT_SIZE21; //float mclt_tiles         [NUM_CAMS][NUM_COLORS][2*DTT_SIZE][DTT_SIZE21]
+	if (offsets) offsets[1] = offs;
+	offs += num_cams * num_colors * 4 * DTT_SIZE * DTT_SIZE1;  // float clt_tiles         [NUM_CAMS][NUM_COLORS][4][DTT_SIZE][DTT_SIZE1]
+	if (offsets) offsets[2] = offs;
+//	offs += num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;    //float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
+	int mclt_tmp_size = num_cams * num_colors * DTT_SIZE2 * DTT_SIZE21;                // [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21]
+	int rgbaw_size =    (2* (num_colors + 1) + num_cams) * DTT_SIZE2 * DTT_SIZE21;     // [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21]
+	offs += (rgbaw_size > mclt_tmp_size) ? rgbaw_size : mclt_tmp_size;
+	if (offsets) offsets[3] = offs;
+	offs += num_cams * 2;                                      // float port_offsets      [NUM_CAMS][2];
+	if (offsets) offsets[4] = offs;
+	offs += num_colors * num_cams;                             // float ports_rgb_shared  [NUM_COLORS][NUM_CAMS];
+	if (offsets) offsets[5] = offs;
+	offs += num_cams;                                          // float max_diff_shared   [NUM_CAMS];
+	if (offsets) offsets[6] = offs;
+	offs += num_cams * TEXTURE_THREADS_PER_TILE;               // float max_diff_tmp      [NUM_CAMS][TEXTURE_THREADS_PER_TILE]
+	if (offsets) offsets[7] = offs;
+	offs += num_colors * num_cams *  TEXTURE_THREADS_PER_TILE; //float ports_rgb_tmp     [NUM_COLORS][NUM_CAMS][TEXTURE_THREADS_PER_TILE];
+	if (offsets) offsets[8] = offs;
+	return sizeof(float) * offs; // shared_floats;
+
+}
+


 /**
@@ -371,13 +404,27 @@ int main(int argc, char **argv)
    const char* result_corr_file =          "/home/eyesis/git/tile_processor_gpu/clt/aux_corr.corr";
    const char* result_corr_quad_file =     "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-quad.corr";
 ///    const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_corr-cross.corr";
-    const char* result_textures_file =      "/home/eyesis/git/tile_processor_gpu/clt/texture_aux.rgba";
-    const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/texture_rgba_aux.rgba";
+    const char* result_textures_file =      "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_aux.rgba";
+    const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/aux_diff_rgb_combo.drbg";
+    const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/aux_texture_rgba_aux.rgba";

    const char* rByRDist_file =             "/home/eyesis/git/tile_processor_gpu/clt/aux.rbyrdist";
    const char* correction_vector_file =    "/home/eyesis/git/tile_processor_gpu/clt/aux.correction_vector";
    const char* geometry_correction_file =  "/home/eyesis/git/tile_processor_gpu/clt/aux.geometry_correction";

+    float color_weights [] = {
+            1.0,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
+            1.0,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
+            1.0};             // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
+	float generate_RBGA_params[]={
+			10.0,                  // float             min_shot,           // 10.0
+            3.0,                   // float             scale_shot,         // 3.0
+            10.0, // 1.5f,                  // float             diff_sigma,         // pixel value/pixel change
+            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
+            12.0    // 3.0         // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+	};
+
+
 #else
    const char* kernel_file[] = {
    		"/home/eyesis/git/tile_processor_gpu/clt/main_chn0_transposed.kernel",
@@ -418,12 +465,25 @@ int main(int argc, char **argv)
    const char* result_corr_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr.corr";
    const char* result_corr_quad_file =  "/home/eyesis/git/tile_processor_gpu/clt/main_corr-quad.corr";
 ///    const char* result_corr_cross_file = "/home/eyesis/git/tile_processor_gpu/clt/main_corr-cross.corr";
-    const char* result_textures_file =       "/home/eyesis/git/tile_processor_gpu/clt/texture.rgba";
-    const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/texture_rgba.rgba";
-
+    const char* result_textures_file =       "/home/eyesis/git/tile_processor_gpu/clt/main_texture.rgba";
+    const char* result_diff_rgb_combo_file ="/home/eyesis/git/tile_processor_gpu/clt/main_diff_rgb_combo.drbg";
+    const char* result_textures_rgba_file = "/home/eyesis/git/tile_processor_gpu/clt/main_texture_rgba.rgba";
    const char* rByRDist_file =            "/home/eyesis/git/tile_processor_gpu/clt/main.rbyrdist";
    const char* correction_vector_file =   "/home/eyesis/git/tile_processor_gpu/clt/main.correction_vector";
    const char* geometry_correction_file = "/home/eyesis/git/tile_processor_gpu/clt/main.geometry_correction";
+
+    float color_weights [] = {
+            0.294118,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
+            0.117647,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
+            0.588235};              // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
+	float generate_RBGA_params[]={
+			10.0,                  // float             min_shot,           // 10.0
+            3.0,                   // float             scale_shot,         // 3.0
+            1.5f,                  // float             diff_sigma,         // pixel value/pixel change
+            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
+            3.0                    // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+	};
+
 #endif


@@ -795,17 +855,6 @@ int main(int argc, char **argv)
    // number of border tiles

    // copy port indices to gpu
-    float color_weights [] = {
-            0.294118,              // float             weight0,            // scale for R 0.5 / (1.0 + 0.5 +0.2)
-            0.117647,              // float             weight1,            // scale for B 0.2 / (1.0 + 0.5 +0.2)
-            0.588235};              // float             weight2,            // scale for G 1.0 / (1.0 + 0.5 +0.2)
-	float generate_RBGA_params[]={
-			10.0,                  // float             min_shot,           // 10.0
-            3.0,                   // float             scale_shot,         // 3.0
-            1.5f,                  // float             diff_sigma,         // pixel value/pixel change
-            10.0f,                 // float             diff_threshold,     // pixel value/pixel change
-            3.0                    // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-	};

 	gpu_port_offsets =         (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_cams * 2); // num_ports * 2);
    gpu_color_weights =        (float *) copyalloc_kernel_gpu((float * ) color_weights, sizeof(color_weights));
@@ -1110,7 +1159,7 @@ int main(int argc, char **argv)

    	getLastCudaError("Kernel execution failed");
    	checkCudaErrors(cudaDeviceSynchronize());
-    	printf("%d\n",i);
+//    	printf("%d\n",i);
    }
    sdkStopTimer(&timerTP);
    float avgTime = (float)sdkGetTimerValue(&timerTP) / (float)numIterations;
@@ -1326,7 +1375,7 @@ int main(int argc, char **argv)
 				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
 				sel_pairs[3], // int               sel_pairs3,           // unused bits should be 0
        		gpu_clt,                       // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-				3,                             // int               colors,             // number of colors (3/1)
+				num_colors,                    // int               colors,             // number of colors (3/1)
 				color_weights[0], // 0.25,     // float             scale0,             // scale for R
 				color_weights[1], // 0.25,     // float             scale1,             // scale for B
 				color_weights[2], // 0.5,      // float             scale2,             // scale for G
@@ -1416,14 +1465,191 @@ int main(int argc, char **argv)


 // -----------------
+#ifndef NOTEXTURES_HOST
+    		//    cudaProfilerStart();
+    		    // testing textures
+    		//    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1); // not used
+    		//    dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // not used
+    		//    printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
+    		//    printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
+
+    		 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
+    		 dim3 blocks0 ((tp_task_size + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);
+
+     		printf("threads0=(%d, %d, %d)\n",threads0.x,threads0.y,threads0.z);
+     		printf("blocks0=(%d, %d, %d)\n",blocks0.x,blocks0.y,blocks0.z);
+     		int   cpu_pnum_texture_tiles = 0;
+     	    int * gpu_pnum_texture_tiles;
+     	    checkCudaErrors (cudaMalloc((void **)&gpu_pnum_texture_tiles, sizeof(int)));
+
+
+    		    StopWatchInterface *timerTEXTURE = 0;
+    		    sdkCreateTimer(&timerTEXTURE);
+
+    		    for (int i = i0; i < numIterations; i++)
+    		    {
+    		    	if (i == 0)
+    		    	{
+    		    		checkCudaErrors(cudaDeviceSynchronize());
+    		    		sdkResetTimer(&timerTEXTURE);
+    		    		sdkStartTimer(&timerTEXTURE);
+    		    	}
+
+    				//*pnum_texture_tiles = 0;
+    				cpu_pnum_texture_tiles = 0;
+    				checkCudaErrors(cudaMemcpy(
+    						gpu_pnum_texture_tiles,
+							&cpu_pnum_texture_tiles,
+							sizeof(int),
+							cudaMemcpyHostToDevice));
+
+    				 create_nonoverlap_list<<<blocks0,threads0>>>(
+    						 num_cams,                // int                num_cams,
+    						 gpu_ftasks,              // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+							 tp_task_size,            // int                num_tiles,           // number of tiles in task
+							 TILESX,                  // int                width,               // number of tiles in a row
+    						 gpu_texture_indices,     // int *              nonoverlap_list,     // pointer to the calculated number of non-zero tiles
+							 gpu_pnum_texture_tiles); // int *              pnonoverlap_length)  //  indices to gpu_tasks  // should be initialized to zero
+    				 cudaDeviceSynchronize();
+
+    				 checkCudaErrors(cudaMemcpy(
+    						 &cpu_pnum_texture_tiles,
+							 gpu_pnum_texture_tiles,
+							 sizeof(int),
+							 cudaMemcpyDeviceToHost));
+    				 printf("cpu_pnum_texture_tiles = %d\n",  cpu_pnum_texture_tiles);
+    				 int num_cams_per_thread = NUM_THREADS / TEXTURE_THREADS_PER_TILE; // 4 cameras parallel, then repeat
+    				 dim3 threads_texture1(TEXTURE_THREADS_PER_TILE, num_cams_per_thread, 1); // TEXTURE_TILES_PER_BLOCK, 1);
+    				 dim3 grid_texture1((cpu_pnum_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
+    		     		printf("threads_texture1=(%d, %d, %d)\n",threads_texture1.x,threads_texture1.y,threads_texture1.z);
+    		     		printf("grid_texture1=(%d, %d, %d)\n",grid_texture1.x,grid_texture1.y,grid_texture1.z);
+
+    				 int shared_size = host_get_textures_shared_size( // in bytes
+    						 num_cams,         // int                num_cams,     // actual number of cameras
+							 texture_colors,   // int                num_colors,   // actual number of colors: 3 for RGB, 1 for LWIR/mono
+    						 0);           // int *              offsets);     // in floats
+    				printf("\n1. shared_size=%d, num_cams=%d, colors=%d\n",shared_size,num_cams, texture_colors);
+
+    				cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
+    				cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
+
+    				textures_accumulate <<<grid_texture1,threads_texture1,  shared_size>>>( // 65536>>>( //
+    						num_cams,                        // 	int               num_cams,           // number of cameras used
+							(int *) 0,                       // int             * woi,                // x, y, width,height
+							gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+							cpu_pnum_texture_tiles, // *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+							gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+							gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
+							texture_colors,                  // int               colors,             // number of colors (3/1)
+							(texture_colors == 1),           // int               is_lwir,            // do not perform shot correction
+							generate_RBGA_params[0], // min_shot,      // float             min_shot,           // 10.0
+							generate_RBGA_params[1], // scale_shot,    // float             scale_shot,         // 3.0
+							generate_RBGA_params[2], // diff_sigma,    // float             diff_sigma,         // pixel value/pixel change
+							generate_RBGA_params[3], // diff_threshold,// float             diff_threshold,     // pixel value/pixel change
+							generate_RBGA_params[4], // min_agree,     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+							gpu_color_weights,               // float             weights[3],         // scale for R,B,G
+							1,                       // dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
+							1, // 0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
+							// combining both non-overlap and overlap (each calculated if pointer is not null )
+							0,                               // size_t      texture_rbg_stride, // in floats
+							(float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+							0,                       // texture_stride,                  // size_t      texture_stride,     // in floats (now 256*4 = 1024)
+							(float *) 0,             // gpu_texture_tiles,               //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+							gpu_diff_rgb_combo, //);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
+							TILESX);
+    				getLastCudaError("Kernel failure");
+    				checkCudaErrors(cudaDeviceSynchronize());
+    				printf("test pass: %d\n",i);
+    		    }
+    		///	cudaProfilerStop();
+    		    sdkStopTimer(&timerTEXTURE);
+    		    float avgTimeTEXTURES = (float)sdkGetTimerValue(&timerTEXTURE) / (float)numIterations;
+    		    sdkDeleteTimer(&timerTEXTURE);
+    		    printf("Average Texture run time =%f ms\n",  avgTimeTEXTURES);
+
+    		    int rslt_texture_size =   num_textures * tile_texture_size;
+    		    float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));
+
+    		    checkCudaErrors(cudaMemcpy2D( // something wrong with size
+    		    		cpu_textures,
+    					tile_texture_size * sizeof(float),
+    					gpu_textures,
+    					dstride_textures,
+    					tile_texture_size * sizeof(float),
+    					num_textures,
+    		    		cudaMemcpyDeviceToHost));
+
+    		    int ntiles = TILESX * TILESY;
+    		    int nlayers = num_cams * (num_colors + 1);
+    		    int diff_rgb_combo_size = ntiles * nlayers;
+    		    float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size  * sizeof(float));
+    			checkCudaErrors(cudaMemcpy(
+    					cpu_diff_rgb_combo,
+    					gpu_diff_rgb_combo,
+    					diff_rgb_combo_size  * sizeof(float),
+    					cudaMemcpyDeviceToHost));
+    		    float * cpu_diff_rgb_combo_out = (float *)malloc(diff_rgb_combo_size  * sizeof(float));
+    		    for (int nl = 0; nl <nlayers; nl++){
+    		    	for (int ntile = 0; ntile < ntiles; ntile++){
+    		    		cpu_diff_rgb_combo_out[nl * ntiles + ntile] = cpu_diff_rgb_combo[ntile * nlayers + nl];
+    		    	}
+    		    }
+
+
+
+    		#ifndef NSAVE_TEXTURES
+    		    		printf("Writing phase texture data to %s\n",  result_textures_file);
+    		    		/*
+    		    		writeFloatsToFile(
+    		    				cpu_textures,    // float *       data, // allocated array
+    							rslt_texture_size,    // int           size, // length in elements
+    							result_textures_file); // 			   const char *  path) // file path
+    							*/
+    		    		writeFloatsToFile(
+    		    				cpu_diff_rgb_combo, // cpu_diff_rgb_combo,    // float *       data, // allocated array
+    							diff_rgb_combo_size,    // int           size, // length in elements
+								result_textures_file); // 			   const char *  path) // file path
+
+    		    		printf("Writing low-res data to %s\n",  result_diff_rgb_combo_file);
+    		    		writeFloatsToFile(
+    		    				cpu_diff_rgb_combo_out, // cpu_diff_rgb_combo,    // float *       data, // allocated array
+    							diff_rgb_combo_size,    // int           size, // length in elements
+    							result_diff_rgb_combo_file); // 			   const char *  path) // file path
+
+    		//DBG_TILE
+    		#ifdef DEBUG10
+    		    		int texture_offset = DBG_TILE * tile_texture_size;
+    		    		int chn = 0;
+    		    		for (int i = 0; i < tile_texture_size; i++){
+    		    			if ((i % 256) == 0){
+    		    				printf("\nchn = %d\n", chn++);
+    		    			}
+    		    			printf("%10.4f", *(cpu_textures + texture_offset + i));
+    		    			if (((i + 1) % 16) == 0){
+    		    				printf("\n");
+    		    			} else {
+    		    				printf(" ");
+    		    			}
+    		    		}
+    		#endif // DEBUG9
+    		#endif
+    		    		free(cpu_textures);
+    		    		free (cpu_diff_rgb_combo);
+    		    		free (cpu_diff_rgb_combo_out);
+    		    		checkCudaErrors(cudaFree(gpu_pnum_texture_tiles));
+
+#endif //NOTEXTURES_HOST
+
+
+

 #ifndef NOTEXTURES
 //    cudaProfilerStart();
    // testing textures
-    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1);
-    dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
-    printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
-    printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
+//    dim3 threads_texture(TEXTURE_THREADS_PER_TILE, num_cams, 1); // TEXTURE_TILES_PER_BLOCK, 1); // not used
+//    dim3 grid_texture((num_textures + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1); // not used
+//    printf("threads_texture=(%d, %d, %d)\n",threads_texture.x,threads_texture.y,threads_texture.z);
+//    printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
    StopWatchInterface *timerTEXTURE = 0;
    sdkCreateTimer(&timerTEXTURE);

@@ -1441,8 +1667,11 @@ int main(int argc, char **argv)
 		// Channel2 weight = 0.588235

    	// FIXME: update to use new correlations and num_cams
-		cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
-    	textures_nonoverlap<<<1,1>>> (
+//		cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
+		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributeMaxDynamicSharedMemorySize, 65536); // for CC 7.5
+//		cudaFuncSetAttribute(textures_nonoverlap, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
+		cudaFuncSetAttribute(textures_accumulate, cudaFuncAttributePreferredSharedMemoryCarveout,cudaSharedmemCarveoutMaxShared);
+    	textures_nonoverlap<<<1,1>>> ( //,65536>>> (
    			num_cams,              // int                num_cams,           // number of cameras used
 				gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats
 //                gpu_tasks,             // struct tp_task   * gpu_tasks,
@@ -1476,7 +1705,7 @@ int main(int argc, char **argv)
    int rslt_texture_size =   num_textures * tile_texture_size;
    float * cpu_textures = (float *)malloc(rslt_texture_size * sizeof(float));

-    checkCudaErrors(cudaMemcpy2D(
+    checkCudaErrors(cudaMemcpy2D( // somethong wrong with size
    		cpu_textures,
 			tile_texture_size * sizeof(float),
 			gpu_textures,
@@ -1485,6 +1714,16 @@ int main(int argc, char **argv)
 			num_textures,
    		cudaMemcpyDeviceToHost));

+    int diff_rgb_combo_size = TILESX * TILESY * num_cams * (num_colors + 1);
+    float * cpu_diff_rgb_combo = (float *)malloc(diff_rgb_combo_size  * sizeof(float));
+	checkCudaErrors(cudaMemcpy(
+			cpu_diff_rgb_combo,
+			gpu_diff_rgb_combo,
+			diff_rgb_combo_size  * sizeof(float),
+			cudaMemcpyDeviceToHost));
+
+
+
 #ifndef NSAVE_TEXTURES
    		printf("Writing phase texture data to %s\n",  result_textures_file);
    		writeFloatsToFile(
@@ -1492,6 +1731,12 @@ int main(int argc, char **argv)
 					rslt_texture_size,    // int           size, // length in elements
 					result_textures_file); // 			   const char *  path) // file path

+    		printf("Writing low-res data to %s\n",  result_diff_rgb_combo_file);
+    		writeFloatsToFile(
+    				cpu_diff_rgb_combo,    // float *       data, // allocated array
+					diff_rgb_combo_size,    // int           size, // length in elements
+					result_diff_rgb_combo_file); // 			   const char *  path) // file path
+
 //DBG_TILE
 #ifdef DEBUG10
    		int texture_offset = DBG_TILE * tile_texture_size;
@@ -1510,6 +1755,7 @@ int main(int argc, char **argv)
 #endif // DEBUG9
 #endif
    		free(cpu_textures);
+    		free (cpu_diff_rgb_combo);
 #endif // ifndef NOTEXTURES



--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -106,8 +106,8 @@
 //#define DBG_TILE_X     40
 //#define DBG_TILE_Y     80
 #if TEST_LWIR
-	#define DBG_TILE_X     32 // 162 // 151 // 161 // 49
-	#define DBG_TILE_Y     36 // 88 // 121 // 69  // 111 // 66
+	#define DBG_TILE_X    52 // 32 // 162 // 151 // 161 // 49
+	#define DBG_TILE_Y     5 // 36 // 88 // 121 // 69  // 111 // 66
 	#define DBG_TILE    (DBG_TILE_Y * 80 + DBG_TILE_X)
 #else
 	#define DBG_TILE_X     114 // 32 // 162 // 151 // 161 // 49
@@ -126,6 +126,9 @@
 //#define DEBUG4 1
 //#define DEBUG5 1
 //#define DEBUG6 1
+
+// #define DEBUG7 1
+ #define DEBUG7A 1
 /*
 #define DEBUG7 1
 #define DEBUG8 1