tested texture transferred to the CPU memory

edce489f · Andrey Filippov · 9e1a74b7 · edce489f · edce489f
Commit edce489f authored Mar 25, 2020 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 91 additions and 25 deletions

TileProcessor.cuh src/TileProcessor.cuh +60 -20

dtt8x8.cu src/dtt8x8.cu +31 -5

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -768,7 +768,6 @@ Java code:
 #endif
 #endif
-// searching for bug. Uncomment later
    // copy 15x15 tile to main memory (2 * corr_radius +1) x (2 * corr_radius +1)
 	int size2r1 = 2 * corr_radius + 1;
 	int len2r1x2r1 = size2r1 * size2r1;
@@ -893,7 +892,7 @@ __global__ void textures_gen(
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_stride,     // in floats (now 256*4 = 1024)
-		float           * gpu_texture_tiles)  // (number of colors +1)*16*16 rgba texture tiles
+		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
 {
 	float weights[3] = {weight0, weight1, weight2};
 	// will process exactly 4 cameras in one block (so this number is not adjustable here NUM_CAMS should be == 4 !
@@ -912,7 +911,7 @@ __global__ void textures_gen(
 	} shr;
 	__shared__ union {
 		float mclt_tmp           [NUM_CAMS][NUM_COLORS][DTT_SIZE2][DTT_SIZE21];
-		float rgbaw              [NUM_COLORS+1 + NUM_CAMS + NUM_COLORS+1][DTT_SIZE2][DTT_SIZE21];
+		float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
 		// add more
 	} shr1;
 	//	__shared__ float port_weights[NUM_CAMS][DTT_SIZE2 * DTT_SIZE21];
@@ -1067,14 +1066,54 @@ __global__ void textures_gen(
 			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
 			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
 			min_agree,                 // float   min_agree,   NOT USED?   // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-			weights,                   // float * chn_weights,       // color channel weights, sum == 1.0
+			weights,                   // float * chn_weights,    // color channel weights, sum == 1.0
-			dust_remove,               // int     dust_remove,       // Do not reduce average weight when only one image differes much from the average
+			dust_remove,               // int     dust_remove,    // Do not reduce average weight when only one image differes much from the average
-			dust_remove,               // int     keep_weights,   // return channel weights after A in RGBA - ALWAYS
+			keep_weights,              // int     keep_weights,   // return channel weights and rms after A in RGBA (weight are always calculated)
 			(tile_num == DBG_TILE) );  //int     debug );
+// return either only 4 slices (RBGA) or all 12 (with weights and rms) if keep_weights
+// float rgbaw              [NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1][DTT_SIZE2][DTT_SIZE21];
+//	size_t texture_tile_offset = + tile_indx * texture_stride;
+	float * gpu_texture_tile = gpu_texture_tiles + tile_indx * texture_stride;
+	for (int pass = 0; pass < 8; pass ++) {
+		int row = pass * 2 + (threadIdx.y >> 1);
+		int col = ((threadIdx.y & 1) << 3) + threadIdx.x;
+		int i  = row * DTT_SIZE21 + col;
+		int gi = row * DTT_SIZE2  + col;
+		float * gpu_texture_tile_gi = gpu_texture_tile + gi;
+		float * rgba_i = ((float *) shr1.rgbaw) + i;
+		// always copy 3 (1) colors + alpha
+		if (colors == 3){
+			if (keep_weights) {
+#pragma unroll
+				for (int ncol = 0; ncol < NUM_COLORS + 1 + NUM_CAMS + NUM_COLORS + 1 ; ncol++) { // 12
+					*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+				}
+			} else {
+#pragma unroll
+				for (int ncol = 0; ncol < NUM_COLORS + 1; ncol++) { // 4
+					*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+				}
+			}
+		} else { // assuming colors = 1
+			if (keep_weights) {
+#pragma unroll
+				for (int ncol = 0; ncol < 1 + 1 + NUM_CAMS + 1 + 1 ; ncol++) { // 8
+					*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+				}
+			} else {
+#pragma unroll
+				for (int ncol = 0; ncol < 1 + 1; ncol++) { // 2
+					*(gpu_texture_tile_gi + ncol * (DTT_SIZE2 * DTT_SIZE2)) = *(rgba_i + ncol * (DTT_SIZE2 * DTT_SIZE21));
+				}
+			}
+		}
+	}
 #ifdef DEBUG7
 	if ((tile_num == DBG_TILE)  && (threadIdx.x == 0) && (threadIdx.y == 0)){
-        printf("\ntextures_gen tile done = %d\n",tile_num);
+        printf("\ntextures_gen tile done = %d, texture_stride= %d\n",tile_num, (int) texture_stride);
    }
     __syncthreads();// __syncwarp();
 #endif
@@ -2684,13 +2723,13 @@ __device__ void tile_combine_rgba(
 		float   min_agree,      // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float * chn_weights,     // color channel weights, sum == 1.0
 		int     dust_remove,     // Do not reduce average weight when only one image differes much from the average
-		int     keep_weights,   // return channel weights after A in RGBA - ALWAYS
+		int     keep_weights,   // eturn channel weights and rms after A in RGBA (weight are always calculated, not so for the crms)
 		int     debug
 )
 {
 	float * alpha =        rgba + (colors * (DTT_SIZE2*DTT_SIZE21));
 	float * port_weights = alpha + (DTT_SIZE2*DTT_SIZE21);
-	float * crms =         port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // results are never used?
+	float * crms =         port_weights + NUM_CAMS*(DTT_SIZE2*DTT_SIZE21); // calculated only if keep_weights
 	float  threshold2 = diff_sigma * diff_threshold;
 	threshold2 *= threshold2; // squared to compare with diff^2
 	float  pair_dist2r [NUM_CAMS*(NUM_CAMS-1)/2]; // new double [ports*(ports-1)/2]; // reversed squared distance between images - to be used with gaussian. Can be calculated once !
@@ -2809,7 +2848,7 @@ __device__ void tile_combine_rgba(
 #pragma unroll
 		for (int ipair = 0; ipair < (NUM_CAMS*(NUM_CAMS-1)/2); ipair++){
 			float d = 0;
-#pragma unroll
+#pragma unroll // non-constant
 			for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null){
 				//					double dc = iclt_tile[pair_ports[ip][0]][ncol][i] - iclt_tile[pair_ports[ip][1]][ncol][i];
 				float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
@@ -2828,7 +2867,7 @@ __device__ void tile_combine_rgba(
 		int bestPort1 = 0;
 		float best_val= *port_weights_i;
 #pragma unroll
-		for (int cam = bestPort1 + 1; cam < NUM_CAMS; cam++) {
+		for (int cam = 1; cam < NUM_CAMS; cam++) {
 			float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21));
 			if (val > best_val){
 				bestPort1 = cam;
@@ -2837,7 +2876,7 @@ __device__ void tile_combine_rgba(
 		}
 		int bestPort2 = (bestPort1 == 0) ? 1 : 0;
 		best_val= *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21));
-#pragma unroll
+#pragma unroll  // non-constant
 		for (int cam = bestPort2 + 1; cam < NUM_CAMS; cam++){
 			float val = *(port_weights_i + cam * (DTT_SIZE2*DTT_SIZE21));
 			if ((cam != bestPort1) && (val > best_val)){
@@ -2855,7 +2894,7 @@ __device__ void tile_combine_rgba(
 		float w1 = pw1/(pw1 + *(port_weights_i + bestPort2 * (DTT_SIZE2*DTT_SIZE21)));
 		float w2 = 1.0 - w1;
 		float * rgba_i = rgba + i;
-#pragma unroll
+#pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
 			float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
 			* (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21))=
@@ -2912,9 +2951,11 @@ __device__ void tile_combine_rgba(
 #endif // #ifdef DEBUG9
 		// recalculate all weights using difference from this average of the best pair
+#pragma unroll
 		for (int cam = 0; cam < NUM_CAMS; cam++) { // if ((port_mask & ( 1 << ip)) != 0){
 			float * mclt_cam_i = mclt_tile_i + cam * colors_offset;
 			float d2_ip = 0;
+#pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++) { //  if (iclt_tile[0][ncol] != null){
 				float * mclt_cam_col_i = mclt_cam_i + MCLT_UNION_LEN * ncol; // DTT_SIZE2*DTT_SIZE21 * ncol;
 				float dc = *(mclt_cam_col_i) - * (rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21));
@@ -2998,7 +3039,7 @@ __device__ void tile_combine_rgba(
 			k += *(port_weights_i + (DTT_SIZE2*DTT_SIZE21) * cam); // port_weights[ip][i];
 		}
 		k = 1.0/k;
-#pragma unroll
+#pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++) { // if (iclt_tile[0][ncol] != null) {
 			float * rgba_col_i = rgba_i + ncol * (DTT_SIZE2*DTT_SIZE21);
 			float * mclt_col_i = mclt_tile_i + MCLT_UNION_LEN * ncol;
@@ -3036,7 +3077,7 @@ __device__ void tile_combine_rgba(
 		debug_print_mclt(
 				alpha, //
 				-1);
-		for (int cam = 0; cam < colors; cam++) {
+		for (int cam = 0; cam < NUM_CAMS; cam++) {
 			printf("\ntile_combine_rgba() port_weights[%d]\n",cam);
 			debug_print_mclt(
 					port_weights + (cam * (DTT_SIZE2*DTT_SIZE21)),
@@ -3066,10 +3107,9 @@ __device__ void tile_combine_rgba(
 			int row_sym = row ^ ((row & 8)? 0xf : 0);
 			int col_sym = col ^ ((col & 8)? 0xf : 0);
 			float wnd2 = HWINDOW_SQ[row_sym] * HWINDOW_SQi[col_sym];
-//			float * port_weights_i = port_weights + i;
 			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
 			float d2 = 0.0;
-#pragma unroll
+#pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
 				float dc = *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol) - *(rgba + (DTT_SIZE2*DTT_SIZE21) * ncol + i);
 				d2 += *(chn_weights + ncol) * dc * dc;
@@ -3091,7 +3131,7 @@ __device__ void tile_combine_rgba(
 	if (ports_rgb) {
 		__shared__ float ports_rgb_tmp [NUM_CAMS][NUM_COLORS][TEXTURE_THREADS_PER_TILE]; // [4*3][8]
 		int cam = threadIdx.y;
-#pragma unroll
+#pragma unroll  // non-constant
 		for (int ncol = 0; ncol < colors; ncol++){
 			ports_rgb_tmp[cam][ncol][threadIdx.x] = 0.0;
 		}
@@ -3103,14 +3143,14 @@ __device__ void tile_combine_rgba(
 			int i = row * DTT_SIZE21 + col;
 //			int row_sym = row ^ ((row & 8)? 0xf : 0);
 			float * mclt_cam_i = mclt_tile +  colors_offset * cam + i;
-#pragma unroll
+#pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
 				ports_rgb_tmp[cam][ncol][threadIdx.x] += *(mclt_cam_i + (DTT_SIZE2*DTT_SIZE21) * ncol);
 			}
 		}
 		__syncthreads();
 		if (threadIdx.x == 0){ // combine results
-#pragma unroll
+#pragma unroll  // non-constant
 			for (int ncol = 0; ncol < colors; ncol++){
 				ports_rgb[ncol * NUM_CAMS + cam] = 0;
 #pragma unroll

--- a/src/dtt8x8.cu
+++ b/src/dtt8x8.cu
@@ -103,7 +103,7 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer,
 float * copyalloc_image_gpu(float * image_host,
-						size_t* dstride, // in bytes!!
+						size_t* dstride, // in floats !
 		                int width,
 						int height)
 {
@@ -257,6 +257,8 @@ int main(int argc, char **argv)
 			{ 0.5, -0.5},
 			{-0.5,  0.5},
 			{ 0.5,  0.5}};
+    int keep_texture_weights = 1; // try with 0 also
    int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
@@ -465,11 +467,18 @@ int main(int argc, char **argv)
    // copy port indices to gpu
    gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
-    // allocates one correlation kernel per line (15x15 floats), number of rows - number of tiles * number of pairs
-    int tile_texture_size = (texture_colors+1)*256;
+//    int keep_texture_weights = 1; // try with 0 also
+//    int texture_colors = 3; // result will be 3+1 RGBA (for mono - 2)
+//		double [][] rgba = new double[numcol + 1 + (keep_weights?(ports + numcol + 1):0)][];
+    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
    gpu_textures = alloc_image_gpu(
    		&dstride_textures,              // in bytes ! for one rgba/ya 16x16 tile
-			tile_texture_size,         // int width,
+			tile_texture_size,              // int width (floats),
 			TILESX * TILESY);               // int height);
@@ -777,7 +786,7 @@ int main(int argc, char **argv)
 		0.117647,              // float             weight1,            // scale for B
 		0.588235,              // float             weight2,            // scale for G
 		1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
-		1,                     // int               keep_weights,       // return channel weights after A in RGBA
+		keep_texture_weights,  // int               keep_weights,       // return channel weights after A in RGBA
 		dstride_textures/sizeof(float), // const size_t      texture_stride,     // in floats (now 256*4 = 1024)
 		gpu_textures);    // float           * gpu_texture_tiles);  // 4*16*16 rgba texture tiles
@@ -817,6 +826,23 @@ int main(int argc, char **argv)
    				cpu_textures,    // float *       data, // allocated array
 					rslt_texture_size,    // int           size, // length in elements
 					result_textures_file); // 			   const char *  path) // file path
+//DBG_TILE
+    		int texture_offset = DBG_TILE * tile_texture_size;
+    		int chn = 0;
+    		for (int i = 0; i < tile_texture_size; i++){
+    			if ((i % 256) == 0){
+    				printf("\nchn = %d\n", chn++);
+    			}
+    			printf("%10.4f", *(cpu_textures + texture_offset + i));
+    			if (((i + 1) % 16) == 0){
+    				printf("\n");
+    			} else {
+    				printf(" ");
+    			}
+    		}
+//    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
 #endif
    		free(cpu_textures);
 #endif // ifndef NOTEXTURES