Commit 16bc5c47 authored by Andrey Filippov

debugging rgba texture generation

parent 11455aa9
......@@ -72,8 +72,8 @@
#define THREADS_DYNAMIC_BITS 5 // threads in block for CDP creation of the texture list
#undef HAS_PRINTF
//#define HAS_PRINTF
//#undef HAS_PRINTF
#define HAS_PRINTF
//7
//#define DEBUG1 1
//#define DEBUG2 1
......@@ -87,7 +87,8 @@
#define DEBUG9 1
*/
#define DEBUG10 1
#define DEBUG11 1
#define DEBUG12 1
//#define USE_textures_gen
#endif //#ifndef JCUDA
......@@ -1533,10 +1534,15 @@ __global__ void generate_RBGA(
int texture_slices = colors + 1;
if (threadIdx.x == 0) {
//DTT_SIZE_LOG2
// dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
// int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
// dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
dim3 threads2((1 << THREADS_DYNAMIC_BITS), 1, 1);
int blocks_x = (texture_width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
int blocks_x = (texture_width + ((1 << (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2 )) - 1)) >> (THREADS_DYNAMIC_BITS + DTT_SIZE_LOG2);
dim3 blocks2 (blocks_x, texture_tiles_height * texture_slices, 1); // each thread - 8 vertical
clear_texture_rbga<<<blocks2,threads2>>>(
clear_texture_rbga<<<blocks2,threads2>>>( // illegal value error
texture_width,
texture_tiles_height * texture_slices, // int texture_slice_height,
texture_rbga_stride, // const size_t texture_rbga_stride, // in floats 8*stride
......@@ -1547,12 +1553,23 @@ __global__ void generate_RBGA(
for (int pass = 0; pass < 8; pass++){
dim3 threads_texture(TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1); // TEXTURE_TILES_PER_BLOCK, 1);
int border_tile = pass >> 2;
size_t ntt = *(num_texture_tiles + (2* (pass & 3)) + border_tile);
int ntt = *(num_texture_tiles + ((pass & 3) << 1) + border_tile);
dim3 grid_texture((ntt + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1);
int ti_offset = (pass & 3) * (TILESX * (TILESYA >> 2)); // 1/4
if (border_tile){
ti_offset += TILESX * (TILESYA >> 2) - ntt;
}
#ifdef DEBUG12
printf("\ngenerate_RBGA() pass= %d, border_tile= %d, ti_offset= %d, ntt=%d\n",
pass, border_tile,ti_offset, ntt);
printf("\ngenerate_RBGA() gpu_texture_indices= 0x%x, gpu_texture_indices + ti_offset=0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + ti_offset));
printf("\ngenerate_RBGA() grid_texture={%d, %d, %d)\n",
grid_texture.x, grid_texture.y, grid_texture.z);
printf("\ngenerate_RBGA() threads_texture={%d, %d, %d)\n",
threads_texture.x, threads_texture.y, threads_texture.z);
printf("\n");
#endif
/* */
textures_accumulate<<<grid_texture,threads_texture>>>(
border_tile, // int border_tile, // if 1 - watch for border
......@@ -1578,9 +1595,8 @@ __global__ void generate_RBGA(
gpu_texture_tiles, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
0, // size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_texture_tiles); // (float *) 0 ); // float * gpu_texture_tiles); // (number of colors +1 + ?)*16*16 rgba texture tiles
cudaDeviceSynchronize(); // not needed yet, just for testing
/* */
}
}
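The pass loop above walks eight sub-lists of texture tiles: four (tile-X parity, tile-Y parity) quarters of one index buffer, each holding non-border entries that grow from the quarter's start and border entries that grow down from its end, which is why ti_offset for a border pass (pass >= 4) steps back by ntt. A stand-alone restatement of that addressing, as a sketch only (the quarter size in the comment is an assumed example, not taken from the commit):

// Hypothetical restatement of the ti_offset selection used in the pass loop above.
static int rbga_ti_offset(int pass, int ntt) // ntt = number of tiles in this sub-list
{
    int quarter   = TILESX * (TILESYA >> 2);  // entries per parity quarter (e.g. 324 * 64)
    int ti_offset = (pass & 3) * quarter;     // quarter selected by (x parity, y parity)
    if (pass >> 2) {                          // border pass: entries stored from the end
        ti_offset += quarter - ntt;
    }
    return ti_offset;
}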
......@@ -1590,21 +1606,20 @@ __global__ void generate_RBGA(
// blockDim.x * gridDim.x >= width
extern "C" __global__ void clear_texture_rbga(
int texture_width,
int texture_width, // aligned to DTT_SIZE
int texture_slice_height,
const size_t texture_rbga_stride, // in floats 8*stride
float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
{
int col = blockDim.x * blockIdx.x + threadIdx.x;
int col = (blockDim.x * blockIdx.x + threadIdx.x) << DTT_SIZE_LOG2;
if (col > texture_width) {
return;
}
int row = (blockIdx.y << 3); // includes slices
int row = blockIdx.y; // includes slices
float * pix = gpu_texture_tiles + col + row * texture_rbga_stride;
#pragma unroll
for (int n = 0; n < DTT_SIZE; n++) {
*(pix) = 0.0;
pix += texture_rbga_stride;
*(pix++) = 0.0;
}
}
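In the revised kernel each thread now zeroes DTT_SIZE consecutive floats of a single row (hence col is shifted left by DTT_SIZE_LOG2, and blocks_x in generate_RBGA divides the width by the block width times DTT_SIZE), while blockIdx.y enumerates output rows across all slices. A host-side model of the region that gets cleared, offered only as a sketch and assuming the y-grid covers every output row:

// Hypothetical CPU model of what clear_texture_rbga writes (not part of the commit).
static void clear_texture_rbga_cpu(int texture_width, int texture_slice_height,
                                   size_t texture_rbga_stride, float *gpu_texture_tiles)
{
    for (int row = 0; row < texture_slice_height; row++) {   // one y-block per row on the GPU
        for (int col = 0; col < texture_width; col++) {      // DTT_SIZE pixels per GPU thread
            gpu_texture_tiles[row * texture_rbga_stride + col] = 0.0f;
        }
    }
}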
......@@ -1778,26 +1793,51 @@ __global__ void gen_texture_list(
int cxy = gpu_tasks[task_num].txy;
int x = (cxy & 0xffff);
int y = (cxy >> 16);
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() x = %d, y= %d\n",x, y);
printf("\ngen_texture_list() num_texture_tiles = %d(%d) %d(%d) %d(%d) %d(%d)\n",
num_texture_tiles[0],num_texture_tiles[1],num_texture_tiles[2],num_texture_tiles[3],
num_texture_tiles[4],num_texture_tiles[5],num_texture_tiles[6],num_texture_tiles[7]);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
// int is_border = (x == woi[0]) || (y == woi[1]) || (x == woi[2]) || (y == woi[3]);
// don't care if we calculate extra pixels that still fit into memory
int is_border = (x == woi[0]) || (y == woi[1]) || (x == (TILESX - 1)) || (y == (TILESY - 1));
int buff_head = 0;
int num_offset = 0;
if (x & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_texture_tiles += 2; // int *
buff_head += TILESX * (TILESYA >> 2); //TILESYA - 2 LSB == 00
num_offset += 2; // int *
}
if (y & 1) {
gpu_texture_indices += TILESX * (TILESYA >> 1);
num_texture_tiles += 4; // int *
buff_head += TILESX * (TILESYA >> 1);
num_offset += 4; // int *
}
if (is_border){
gpu_texture_indices += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_texture_tiles += 1; // int *
buff_head += (TILESX * (TILESYA >> 2) - 1); // end of the buffer
num_offset += 1; // int *
}
gpu_texture_indices += buff_head;
num_texture_tiles += num_offset;
// using atomic operation in global memory - slow, but as operations here are per-tile, not per-pixel, it should be OK
int buf_offset = atomicAdd(num_texture_tiles, 1);
if (is_border){
buf_offset = -buf_offset;
}
#ifdef DEBUG12
if ((x == DBG_TILE_X) && (y == DBG_TILE_Y)){
printf("\ngen_texture_list() buff_head=%d, buf_offset = %d, num_offset= %d, is_border=%d\n",
buff_head, buf_offset, num_offset,is_border);
printf("\ngen_texture_list() gpu_texture_indices = 0x%x, gpu_texture_indices + buf_offset = 0x%x\n",
(int) gpu_texture_indices, (int) (gpu_texture_indices + buf_offset));
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
*(gpu_texture_indices + buf_offset) = task | ((x + y * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
}
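The offsets above implement a single index buffer split into four parity quarters, with non-border tiles appended upward from a quarter's start and border tiles appended downward from its end (hence the negated atomicAdd result). A compact single-threaded restatement of the slot selection, as a sketch in which the CUDA atomicAdd is modeled by a plain post-increment:

// Hypothetical single-threaded model of the slot selection in gen_texture_list above.
static int texture_list_slot(int x, int y, int is_border, int *num_texture_tiles)
{
    int quarter    = TILESX * (TILESYA >> 2);          // entries per parity quarter
    int buff_head  = ((x & 1) ? quarter : 0) + ((y & 1) ? 2 * quarter : 0);
    int num_offset = ((x & 1) ? 2 : 0) + ((y & 1) ? 4 : 0) + (is_border ? 1 : 0);
    if (is_border) {
        buff_head += quarter - 1;                      // last entry of the quarter
    }
    int cnt = num_texture_tiles[num_offset]++;         // atomicAdd() on the GPU
    return buff_head + (is_border ? -cnt : cnt);       // border entries grow downward
}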
......@@ -2420,7 +2460,7 @@ __global__ void textures_accumulate(
}
#ifdef DEBUG7
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
printf("\textures_accumulate tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
}
__syncthreads();// __syncwarp();
#endif
......@@ -2432,6 +2472,20 @@ __global__ void textures_accumulate(
}
if (gpu_texture_rbg && (texture_rbg_stride != 0)) { // generate RGBA
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate accumulating tile = %d, tile_code= %d, border_tile=%d\n",
tile_num, (int) tile_code, border_tile);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (tile_code != TASK_TEXTURE_BITS){ // only multiply if needed, for tile_code == TASK_TEXTURE_BITS keep as is.
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1);
......@@ -2453,12 +2507,26 @@ __global__ void textures_accumulate(
}
}
}
int slice_stride = texture_rbg_stride * *(woi + 3); // offset to the next color
int slice_stride = texture_rbg_stride * *(woi + 3) * DTT_SIZE; // offset to the next color
int tileY = tile_num / TILESX; // slow, but 1 per tile
int tileX = tile_num - tileY * TILESX;
int tile_x0 = (tileX - *(woi + 0)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
int tile_y0 = (tileY - *(woi + 1)) * DTT_SIZE - (DTT_SIZE/2); // may be negative == -4
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () tileX=%d, tileY=%d, tile_x0=%d, tile_y0=%d, slice_stride=%d\n",
tileX, tileY, tile_x0, tile_y0, slice_stride);
for (int ncol = 0; ncol <= colors; ncol++) {
printf("\ntile[%d]\n",ncol);
debug_print_mclt(
(float *) (shr1.rgbaw[ncol]),
-1);
}
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
for (int pass = 0; pass < 8; pass ++) {
int row = pass * 2 + (threadIdx.y >> 1); // row inside a tile (0..15)
int col = ((threadIdx.y & 1) << 3) + threadIdx.x; // column inside a tile (0..15)
......@@ -2468,6 +2536,15 @@ __global__ void textures_accumulate(
int gi = g_row * texture_rbg_stride + g_col; // offset to the top left corner
float * gpu_texture_rbg_gi = gpu_texture_rbg + gi;
float * rgba_i = ((float *) shr1.rgbaw) + i;
#ifdef DEBUG12
if ((tile_num == DBG_TILE) && (threadIdx.x == 0) && (threadIdx.y == 0)){
printf("\ntextures_accumulate () pass=%d, row=%d, col=%d, g_row=%d, g_col=%d, i=%d, gi=%d\n",
pass, row, col, g_row, g_col, i, gi);
}
__syncthreads();// __syncwarp();
#endif // DEBUG12
if (!border_tile ||
((g_row >= 0) && (g_col >= 0) && (g_row < (DTT_SIZE * TILESX)) && (g_col < (DTT_SIZE * TILESY)))){
// always copy 3 (1) colors + alpha
......
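The placement math above positions each 16x16 texture tile so that it overlaps its neighbors by half a tile: the top-left corner sits DTT_SIZE/2 pixels up and left of the tile's 8x8 cell within the WOI, which is why tile_x0 and tile_y0 can be -4 on the first column and row. A hypothetical helper mirroring that indexing; the worked numbers in the trailing comment are illustrative assumptions, not values from the commit:

// Hypothetical restatement of the per-pixel output offset used in textures_accumulate above.
static int rgba_pixel_offset(int row, int col,            // 0..15 inside the 16x16 tile
                             int tileX, int tileY,        // tile coordinates in the full grid
                             const int *woi,              // {x0, y0, width, height} in tiles
                             int texture_rbg_stride)      // floats per output row
{
    int tile_x0 = (tileX - woi[0]) * DTT_SIZE - (DTT_SIZE / 2); // may be -4 on the WOI edge
    int tile_y0 = (tileY - woi[1]) * DTT_SIZE - (DTT_SIZE / 2);
    int g_row = tile_y0 + row;
    int g_col = tile_x0 + col;
    return g_row * texture_rbg_stride + g_col;           // add n * slice_stride for color plane n
}
// Worked example (assumed): woi = {10, 20, 40, 30}, stride = 320, tileX = 12, tileY = 22
//   -> tile_x0 = tile_y0 = 12, slice_stride = 320 * 30 * 8 = 76800 floats between planes.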
......@@ -260,7 +260,8 @@ int main(int argc, char **argv)
"/data_ssd/git/tile_processor_gpu/clt/main_chn2.rbg",
"/data_ssd/git/tile_processor_gpu/clt/main_chn3.rbg"};
const char* result_corr_file = "/data_ssd/git/tile_processor_gpu/clt/main_corr.corr";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_file = "/data_ssd/git/tile_processor_gpu/clt/texture.rgba";
const char* result_textures_rgba_file = "/data_ssd/git/tile_processor_gpu/clt/texture_rgba.rgba";
// not yet used
float lpf_sigmas[3] = {0.9f, 0.9f, 0.9f}; // G, B, G
......@@ -429,7 +430,6 @@ int main(int argc, char **argv)
int tp_task_size = sizeof(task_data)/sizeof(struct tp_task);
#ifdef DBG_TILE
#ifdef DBG0
//#define NUM_TEST_TILES 128
#define NUM_TEST_TILES 1
......@@ -445,7 +445,6 @@ int main(int argc, char **argv)
}
tp_task_size = NUM_TEST_TILES; // sizeof(task_data)/sizeof(float);
#endif
#endif
// segfault in the next
......@@ -660,24 +659,10 @@ int main(int argc, char **argv)
}
}
}
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerIMCLT);
......@@ -746,12 +731,6 @@ int main(int argc, char **argv)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
sdkStopTimer(&timerCORR);
......@@ -809,8 +788,6 @@ int main(int argc, char **argv)
// Channel0 weight = 0.294118
// Channel1 weight = 0.117647
// Channel2 weight = 0.588235
#define TEXTURES_ACCUMULATE
#ifdef TEXTURES_ACCUMULATE
textures_accumulate<<<grid_texture,threads_texture>>> (
0, // int border_tile, // if 1 - watch for border
(int *) 0, // int * woi, // x, y, width,height
......@@ -835,37 +812,9 @@ int main(int argc, char **argv)
(float *) 0, // float * gpu_texture_rbg, // (number of colors +1 + ?)*16*16 rgba texture tiles
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#else
textures_gen<<<grid_texture,threads_texture>>> (
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
num_textures, // size_t num_texture_tiles, // number of texture tiles to process
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
keep_texture_weights, // int keep_weights, // return channel weights after A in RGBA
dstride_textures/sizeof(float), // const size_t texture_stride, // in floats (now 256*4 = 1024)
gpu_textures); // float * gpu_texture_tiles); // 4*16*16 rgba texture tiles
#endif
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURE);
......@@ -945,12 +894,6 @@ int main(int argc, char **argv)
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
#ifdef DEBUG4
break;
#endif
#ifdef DEBUG5
break;
#endif
}
/// cudaProfilerStop();
sdkStopTimer(&timerTEXTURELIST);
......@@ -958,7 +901,7 @@ int main(int argc, char **argv)
sdkDeleteTimer(&timerTEXTURELIST);
printf("Average TextureList run time =%f ms\n", avgTimeTEXTURESLIST);
int cpu_num_texture_tiles[4];
int cpu_num_texture_tiles[8];
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
......@@ -966,7 +909,6 @@ int main(int argc, char **argv)
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
checkCudaErrors(cudaMemcpy(
// &cpu_num_texture_tiles,
cpu_num_texture_tiles,
gpu_num_texture_tiles,
8 * sizeof(float), // 8 sequences (0,2,4,6 - non-border, growing up;
......@@ -1003,6 +945,121 @@ int main(int argc, char **argv)
}
#endif //GEN_TEXTURE_LIST
#ifndef NOTEXTURE_RGBA
dim3 threads_rgba(1, 1, 1);
dim3 grid_rgba(1,1,1);
printf("threads_rgba=(%d, %d, %d)\n", threads_rgba.x,threads_rgba.y,threads_rgba.z);
printf("grid_rgba=(%d, %d, %d)\n", grid_rgba.x,grid_rgba.y,grid_rgba.z);
StopWatchInterface *timerRGBA = 0;
sdkCreateTimer(&timerRGBA);
for (int i = i0; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&timerRGBA);
sdkStartTimer(&timerRGBA);
}
generate_RBGA<<<grid_rgba,threads_rgba>>> (
// Parameters to generate texture tasks
gpu_tasks, // struct tp_task * gpu_tasks,
tp_task_size, // int num_tiles, // number of tiles in task list
// declare arrays in device code?
gpu_texture_indices, // int * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
gpu_num_texture_tiles, // int * num_texture_tiles, // number of texture tiles to process (8 elements)
gpu_woi, // int * woi, // x,y,width,height of the woi
TILESX, // int width, // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
TILESY, // int height); // <= TILESY, use for faster processing of LWIR images
// Parameters for the texture generation
gpu_clt , // float ** gpu_clt, // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
gpu_port_offsets, // float * port_offsets, // relative ports x,y offsets - just to scale differences, may be approximate
texture_colors, // int colors, // number of colors (3/1)
(texture_colors == 1), // int is_lwir, // do not perform shot correction
10.0, // float min_shot, // 10.0
3.0, // float scale_shot, // 3.0
1.5f, // float diff_sigma, // pixel value/pixel change
10.0f, // float diff_threshold, // pixel value/pixel change
3.0, // float min_agree, // minimal number of channels to agree on a point (real number to work with fuzzy averages)
0.294118, // float weight0, // scale for R
0.117647, // float weight1, // scale for B
0.588235, // float weight2, // scale for G
1, // int dust_remove, // Do not reduce average weight when only one image differs much from the average
0, // int keep_weights, // return channel weights after A in RGBA
dstride_textures_rbga/sizeof(float), // const size_t texture_rbga_stride, // in floats
gpu_textures_rbga); // float * gpu_texture_tiles) // (number of colors +1 + ?)*16*16 rgba texture tiles
getLastCudaError("Kernel failure");
checkCudaErrors(cudaDeviceSynchronize());
printf("test pass: %d\n",i);
}
sdkStopTimer(&timerRGBA);
float avgTimeRGBA = (float)sdkGetTimerValue(&timerRGBA) / (float)numIterations;
sdkDeleteTimer(&timerRGBA);
printf("Average Texture run time =%f ms\n", avgTimeRGBA);
checkCudaErrors(cudaMemcpy(
cpu_woi,
gpu_woi,
4 * sizeof(float),
cudaMemcpyDeviceToHost));
printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
int rgba_woi_width = cpu_woi[2] * DTT_SIZE;
int rgba_woi_height = cpu_woi[3] * DTT_SIZE;
int rslt_rgba_size = rgba_woi_width * rgba_woi_height * rbga_slices;
float * cpu_textures_rgba = (float *)malloc(rslt_rgba_size * sizeof(float));
checkCudaErrors(cudaMemcpy2D(
cpu_textures_rgba,
rgba_width * sizeof(float),
gpu_textures_rbga,
dstride_textures_rbga,
rgba_width * sizeof(float),
rgba_height * rbga_slices,
cudaMemcpyDeviceToHost));
#ifndef NSAVE_TEXTURES
printf("Writing RBGA texture slices to %s\n", result_textures_rgba_file);
writeFloatsToFile(
cpu_textures_rgba, // float * data, // allocated array
rslt_rgba_size, // int size, // length in elements
result_textures_rgba_file); // const char * path) // file path
#endif
#ifdef DEBUG11
int rgba_offset = (DBG_TILE_Y - cpu_woi[1]) * DTT_SIZE * rgba_woi_width + (DBG_TILE_X - cpu_woi[0]);
for (int chn = 0; chn < rbga_slices; chn++){
printf("\nchn = %d\n", chn);
int rgba_offset_chn = rgba_offset + chn * rgba_woi_width * rgba_woi_height;
for (int i = 0; i < 8; i++){
for (int j = 0; j < 8; j++){
printf("%10.4f ", *(cpu_textures_rgba + rgba_offset_chn + i * rgba_woi_width + j));
}
printf("\n");
}
}
#endif // DEBUG11
free(cpu_textures_rgba);
#endif // ifndef NOTEXTURE_RGBA
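After the cudaMemcpy2D above the host-side buffer can be treated as tightly packed: rbga_slices planes, each rgba_woi_height rows of rgba_woi_width floats. A small accessor written under that assumed layout (a sketch; the function and its packing assumption are not part of the commit):

// Hypothetical accessor for the packed host-side RBGA buffer read back above.
static float rbga_sample(const float *cpu_textures_rgba,
                         int rgba_woi_width, int rgba_woi_height,
                         int chn, int px, int py)          // plane index and WOI pixel coordinates
{
    return cpu_textures_rgba[(chn * rgba_woi_height + py) * rgba_woi_width + px];
}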
#ifdef SAVE_CLT
free(cpu_clt);
#endif
......
......@@ -45,9 +45,11 @@
* with Nvidia Nsight, driver API when calling these kernels from Java
*/
#ifndef JCUDA
#define DTT_SIZE 8
#define DTT_SIZE_LOG2 3
//#define DTT_SIZE 8
#endif
#pragma once
#define DTT_SIZE (1 << DTT_SIZE_LOG2)
#define DTTTEST_BLOCK_WIDTH 32
#define DTTTEST_BLOCK_HEIGHT 16
#define DTTTEST_BLK_STRIDE (DTTTEST_BLOCK_WIDTH+1)
......