implementing prepare_texture_list with CDP

9619c4ac · Andrey Filippov · 891a0d82 · 9619c4ac · 9619c4ac
Commit 9619c4ac authored Apr 02, 2020 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 291 additions and 23 deletions

TileProcessor.cuh src/TileProcessor.cuh +177 -12

dtt8x8.cu src/dtt8x8.cu +114 -11

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -61,6 +61,10 @@
 #define CORR_PAIRS_MASK        0x3f// lower bits used to address correlation pair for the selected tile
 #define CORR_TEXTURE_BIT          7 // bit 7 used to request texture for the tile
 #define TASK_CORR_BITS            4
+#define TASK_TEXTURE_N_BIT        0 // Texture with North neighbor
+#define TASK_TEXTURE_E_BIT        1 // Texture with East  neighbor
+#define TASK_TEXTURE_S_BIT        2 // Texture with South neighbor
+#define TASK_TEXTURE_W_BIT        3 // Texture with West  neighbor
 #define TASK_TEXTURE_BIT          3 // bit to request texture calculation int task field of struct tp_task
 #define LIST_TEXTURE_BIT          7 // bit to request texture calculation
 #define CORR_OUT_RAD              4
@@ -82,7 +86,12 @@
 #define DEBUG9 1
 */
-#endif
+#endif //#ifndef JCUDA
+#define TASK_TEXTURE_BITS ((1 << TASK_TEXTURE_N_BIT) | (1 << TASK_TEXTURE_E_BIT) | (1 << TASK_TEXTURE_S_BIT) | (1 << TASK_TEXTURE_W_BIT))
 //#define IMCLT14
 //#define NOICLT 1
 //#define TEST_IMCLT
@@ -131,6 +140,8 @@
 #define KERNELS_STEP  (1 << KERNELS_LSTEP)
 #define TILESX        (IMG_WIDTH / DTT_SIZE)
 #define TILESY        (IMG_HEIGHT / DTT_SIZE)
+// Make TILESYA >= TILESY and a multiple of 4
+#define TILESYA       ((TILESY +3) & (~3))
 // increase row length by 1 so vertical passes will use different ports
 #define DTT_SIZE1        (DTT_SIZE + 1)
 #define DTT_SIZE2        (2 * DTT_SIZE)
@@ -162,9 +173,10 @@
 //#define TASK_SIZE      12
 struct tp_task {
 	int   task;
+	union { // txy and sxy[] alias the same 32 bits; kernels decode txy as column = (txy & 0xffff), row = (txy >> 16)
 		int      txy;
-//	short ty;
+		unsigned short sxy[2]; // 16-bit halves of txy - presumably sxy[0] = x, sxy[1] = y on little-endian targets, TODO confirm
-//	short tx;
+	}; // anonymous union: members are accessed directly as tp_task.txy / tp_task.sxy
 	float xy[NUM_CAMS][2];
 };
 struct CltExtra{
@@ -1101,11 +1113,29 @@ __device__ void imclt_plane( // not implemented, not used
 		float           * gpu_rbg,            // WIDTH, HEIGHT
 		const size_t      dstride);            // in floats (pixels)
+extern "C"
 __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
 		int                height); // <= TILESY, use for faster processing of LWIR images
+extern "C"
+__global__ void mark_texture_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,            // number of tiles in task list
+		int              * gpu_texture_indices); // written as a per-tile flag map (row stride TILESX): 1 for each tile whose task has any TASK_TEXTURE_BITS set
+extern "C"
+__global__ void mark_texture_neighbor_tiles(
+		struct tp_task   * gpu_tasks,           // in/out: the TASK_TEXTURE_BITS of .task are replaced with N/E/S/W neighbor flags
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices); // read as the per-tile flag map (row stride TILESX) filled by mark_texture_tiles
+extern "C"
+__global__ void gen_texture_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices, // output: 4 sub-lists of packed entries (tile index << CORR_NTILE_SHIFT | neighbor bits | 1 << LIST_TEXTURE_BIT), selected by parity of tile x and y
+		int              * num_texture_tiles,  // in/out: 4 counters (one per sub-list), incremented atomically
+		int              * woi);                // in/out: min x, min y, max x, max y updated atomically; caller pre-initializes and converts to x,y,width,height
 extern "C"
 __global__ void correlate2D(
@@ -1386,41 +1416,176 @@ __global__ void prepare_texture_list(
 			struct tp_task   * gpu_tasks,
 			int                num_tiles,          // number of tiles in task list
 			int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-			int              * num_texture_tiles,  // number of texture tiles to process
+			int              * num_texture_tiles,  // number of texture tiles to process  (4 separate elements for accumulation)
 			int              * woi,                // x,y,width,height of the woi
-			int                width,  // <= TILESX, use for faster processing of LWIR images
+			int                width,  // <= TILESX, number of tile columns to clear (smaller values speed up processing of LWIR images)
 			int                height) // <= TILESY, use for faster processing of LWIR images
 {
+// Controller kernel (CUDA Dynamic Parallelism): only the thread with threadIdx.x == 0 works. It launches in sequence
+//  1) clear_texture_list          - zero the per-tile flag map kept in gpu_texture_indices (width x height, row stride TILESX)
+//  2) mark_texture_tiles          - set the flag of every tile that requests texture
+//  3) mark_texture_neighbor_tiles - store N/E/S/W neighbor flags in the lower 4 bits of each task
+//  4) gen_texture_list            - build 4 packed sub-lists in gpu_texture_indices, count them in num_texture_tiles[0..3]
+// On return woi holds {min x, min y, max x - min x + 1, max y - min y + 1} over all listed tiles.
+// NOTE(review): device-side cudaDeviceSynchronize() is deprecated in recent CUDA toolkits - confirm against the toolkit in use.
 //	int task_num = blockIdx.x;
 //	int tid = threadIdx.x; // maybe it will be just <<<1,1>>>
-    dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+    dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
-    int blocks_x = (width + 1) >> THREADS_DYNAMIC_BITS;
+    // ceil(width / threads-per-block): a partial last block is needed to cover the tail columns
+    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
-    dim3 blocks (blocks_x, height, 1);
+    dim3 blocks0 (blocks_x, height, 1);
 	if (threadIdx.x == 0) {
-		clear_texture_list<<<blocks,threads>>>(
+		clear_texture_list<<<blocks0,threads0>>>(
 				gpu_texture_indices,
 				width,
 				height);
 		cudaDeviceSynchronize(); // not needed yet, just for testing
+		dim3 threads((1 << THREADS_DYNAMIC_BITS), 1, 1);
+		// ceil(num_tiles / threads-per-block): otherwise the last partial block of tasks is silently skipped
+		int blocks_t =   (num_tiles + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;//
+	    dim3 blocks(blocks_t, 1, 1);
+	    // mark used tiles in gpu_texture_indices memory
+		mark_texture_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+		cudaDeviceSynchronize(); // not needed yet, just for testing
+	    // mark n/e/s/w used tiles from gpu_texture_indices memory to gpu_tasks lower 4 bits
+		mark_texture_neighbor_tiles <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices); // packed tile + bits (now only (1 << 7)
+		cudaDeviceSynchronize(); // not needed yet, just for testing
+	    // Generate tile indices list, upper 24 bits - tile index, lower 4 bits: n/e/s/w neighbors, bit 7 - set to 1
+		// seed the min/max accumulators: minimums start above any valid tile coordinate, maximums at 0
+		*(woi + 0) = TILESX;
+		*(woi + 1) = TILESY;
+		*(woi + 2) = 0;
+		*(woi + 3) = 0;
+		*(num_texture_tiles+0) = 0;
+		*(num_texture_tiles+1) = 0;
+		*(num_texture_tiles+2) = 0;
+		*(num_texture_tiles+3) = 0;
+		gen_texture_list <<<blocks,threads>>>(
+				gpu_tasks,
+				num_tiles,          // number of tiles in task list
+				gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+				num_texture_tiles,  // number of texture tiles to process
+				woi);                // x,y,width,height of the woi
+		cudaDeviceSynchronize(); // required: woi written by the child grid is read right below
+		// convert max x / max y into width / height
+		*(woi + 2) += 1 - *(woi + 0);
+		*(woi + 3) += 1 - *(woi + 1);
 	}
 	 __syncthreads();
 }
 // blockDim.x * gridDim.x >= width
+// Zeroes gpu_texture_indices[col + row * TILESX] for col in [0, width), row = blockIdx.y (grid y dimension = height)
+extern "C"
 __global__ void clear_texture_list(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int                width,  // <= TILESX, use for faster processing of LWIR images
 		int                height) // <= TILESY, use for faster processing of LWIR images
 {
-	int col = threadIdx.x + blockDim.x * blockIdx.x;
+	int col = blockDim.x * blockIdx.x + threadIdx.x;
 	int row = blockIdx.y;
-	if (col > width) {
+	if (col >= width) { // valid columns are 0 .. width-1; col == width would spill into the next row (or past the last one)
 		return;
 	}
-	*(gpu_texture_indices + col + row * TILESX) = 0.0;
+	*(gpu_texture_indices + col + row * TILESX) = 0;
+}
+// threads (*,1,1), blocks = (*,1,1)
+// One thread per task entry: raises the flag of the task's tile in the per-tile map when the task requests texture.
+extern "C"
+__global__ void mark_texture_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices) // used here as a per-tile flag map, row stride TILESX
+{
+	const int itask = blockDim.x * blockIdx.x + threadIdx.x;
+	if (itask < num_tiles) {
+		const struct tp_task * tp = gpu_tasks + itask;
+		if (tp->task & TASK_TEXTURE_BITS) { // any of the N/E/S/W texture bits is sufficient
+			const int txy = tp->txy;
+			const int col = txy & 0xffff; // lower 16 bits - tile column
+			const int row = txy >> 16;    // upper bits    - tile row
+			gpu_texture_indices[col + row * TILESX] = 1;
+		}
+	}
 }
+// threads (*,1,1), blocks = (*,1,1)
+// One thread per task entry: for a task that requests texture, looks up the four adjacent tiles in the per-tile
+// flag map and replaces the TASK_TEXTURE_BITS of the task word with the N/E/S/W "neighbor is used" flags.
+// NOTE(review): a tile with no flagged neighbors ends up with all TASK_TEXTURE_BITS cleared - confirm this is intended.
+extern "C"
+__global__ void mark_texture_neighbor_tiles(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,           // number of tiles in task list
+		int              * gpu_texture_indices) // read here as a per-tile flag map, row stride TILESX
+{
+	const int itask = blockDim.x * blockIdx.x + threadIdx.x;
+	if (itask >= num_tiles) return; // thread beyond the end of the task list
+	const int task = gpu_tasks[itask].task;
+	if ((task & TASK_TEXTURE_BITS) == 0) return; // tile does not request texture
+	const int txy = gpu_tasks[itask].txy;
+	const int col = txy & 0xffff;
+	const int row = txy >> 16;
+	const int center = col + row * TILESX; // index of this tile in the flag map
+	int nbr = 0;
+	// image-edge tests come first so the map is never read outside of the tile grid
+	if ((row > 0) && gpu_texture_indices[center - TILESX]) {
+		nbr |= (1 << TASK_TEXTURE_N_BIT);
+	}
+	if ((col < (TILESX - 1)) && gpu_texture_indices[center + 1]) {
+		nbr |= (1 << TASK_TEXTURE_E_BIT);
+	}
+	if ((row < (TILESY - 1)) && gpu_texture_indices[center + TILESX]) {
+		nbr |= (1 << TASK_TEXTURE_S_BIT);
+	}
+	if ((col > 0) && gpu_texture_indices[center - 1]) {
+		nbr |= (1 << TASK_TEXTURE_W_BIT);
+	}
+	// keep all bits outside of TASK_TEXTURE_BITS, substitute the texture bits with the neighbor flags
+	gpu_tasks[itask].task = (task & ~TASK_TEXTURE_BITS) | nbr;
+}
+// threads (*,1,1), blocks = (*,1,1)
+// One thread per task entry: appends a packed entry for every task that has any TASK_TEXTURE_BITS set to one of
+// four sub-lists (selected by the parity of tile x and tile y) and widens the min/max bounds kept in woi.
+extern "C"
+__global__ void gen_texture_list(
+		struct tp_task   * gpu_tasks,
+		int                num_tiles,            // number of tiles in task list
+		int              * gpu_texture_indices, // packed tile + bits (now only (1 << 7)
+		int              * num_texture_tiles,    // number of texture tiles to process
+		int              * woi)                  // x,y,width,height of the woi
+{
+	const int itask = blockDim.x * blockIdx.x + threadIdx.x;
+	if (itask >= num_tiles) return; // thread beyond the end of the task list
+	const int nbr_bits = gpu_tasks[itask].task & TASK_TEXTURE_BITS;
+	if (nbr_bits == 0) return;      // NOP tile
+	const int txy = gpu_tasks[itask].txy;
+	const int col = txy & 0xffff;
+	const int row = txy >> 16;
+	// select the sub-list and its counter: odd x -> +1 quarter, odd y -> +2 quarters (TILESYA has 2 LSB == 00)
+	int list_offset = 0;
+	int counter     = 0;
+	if (col & 1) {
+		list_offset += TILESX * (TILESYA >> 2);
+		counter     += 1;
+	}
+	if (row & 1) {
+		list_offset += TILESX * (TILESYA >> 1);
+		counter     += 2;
+	}
+	// atomics in global memory are slow, but this runs once per tile, not per pixel
+	atomicMin(woi + 0, col);
+	atomicMin(woi + 1, row);
+	atomicMax(woi + 2, col);
+	atomicMax(woi + 3, row);
+	const int slot = atomicAdd(num_texture_tiles + counter, 1); // reserve the next free position in the sub-list
+	gpu_texture_indices[list_offset + slot] =
+			nbr_bits | ((col + row * TILESX) << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
+}
+//CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
 #endif //#ifdef USE_CDP

--- a/src/dtt8x8.cu
+++ b/src/dtt8x8.cu
@@ -51,10 +51,11 @@
 float * copyalloc_kernel_gpu(float * kernel_host,
-		                int size) // size in floats
+		                int size, // size in floats
+						int full_size)
 {
 	float *kernel_gpu;
-    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, size * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&kernel_gpu, full_size * sizeof(float)));
    checkCudaErrors(cudaMemcpy( // segfault
    		kernel_gpu,
    		kernel_host,
@@ -62,6 +63,17 @@ float * copyalloc_kernel_gpu(float * kernel_host,
            cudaMemcpyHostToDevice));
    return kernel_gpu;
 }
+// Convenience overload: the allocation size equals the number of floats copied.
+float * copyalloc_kernel_gpu(float * kernel_host,
+		                int size) // size in floats (used for both the copy and the allocation)
+{
+	float * kernel_gpu = copyalloc_kernel_gpu(kernel_host, size, size);
+	return kernel_gpu;
+}
 float * alloccopy_from_gpu(
 		float * gpu_data,
 		float * cpu_data, // if null, will allocate
@@ -294,7 +306,9 @@ int main(int argc, char **argv)
    struct tp_task     task_data [TILESX*TILESY]; // maximal length - each tile
    int                corr_indices         [NUM_PAIRS*TILESX*TILESY];
-    int                texture_indices      [TILESX*TILESY];
+//    int                texture_indices      [TILESX*TILESY];
+    int                texture_indices      [TILESX*TILESYA];
+    int                cpu_woi              [4];
    // host array of pointers to GPU memory
    float            * gpu_kernels_h        [NUM_CAMS];
@@ -312,6 +326,8 @@ int main(int argc, char **argv)
    float            * gpu_textures;
    int              * gpu_texture_indices;
+    int              * gpu_woi;
+    int              * gpu_num_texture_tiles;
    float            * gpu_port_offsets;
    int                num_corrs;
    int                num_textures;
@@ -448,14 +464,19 @@ int main(int argc, char **argv)
    }
    // num_corrs now has the total number of correlations
    // copy corr_indices to gpu
-    gpu_corr_indices = (int  *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
+//    gpu_corr_indices = (int  *) copyalloc_kernel_gpu((float * ) corr_indices, num_corrs);
+    gpu_corr_indices = (int  *) copyalloc_kernel_gpu(
+    		(float * ) corr_indices,
+			num_corrs,
+			NUM_PAIRS * TILESX * TILESY);
    // build texture_indices
    num_textures = 0;
    for (int ty = 0; ty < TILESY; ty++){
    	for (int tx = 0; tx < TILESX; tx++){
    		int nt = ty * TILESX + tx;
-    		int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
+//    		int cm = (task_data[nt].task >> TASK_TEXTURE_BIT) & 1;
+    		int cm = task_data[nt].task & TASK_TEXTURE_BITS;
    		if (cm){
    			texture_indices[num_textures++] = (nt << CORR_NTILE_SHIFT) | (1 << LIST_TEXTURE_BIT);
    		}
@@ -463,7 +484,15 @@ int main(int argc, char **argv)
    }
    // num_textures now has the total number of textures
    // copy corr_indices to gpu
-    gpu_texture_indices = (int  *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
+//  gpu_texture_indices = (int  *) copyalloc_kernel_gpu((float * ) texture_indices, num_textures);
+    gpu_texture_indices = (int  *) copyalloc_kernel_gpu(
+    		(float * ) texture_indices,
+			num_textures,
+			TILESX * TILESYA); // number of rows - multiple of 4
+    // just allocate
+    checkCudaErrors(cudaMalloc((void **)&gpu_woi,               4 * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&gpu_num_texture_tiles, 4 * sizeof(float)));
    // copy port indices to gpu
    gpu_port_offsets = (float *) copyalloc_kernel_gpu((float * ) port_offsets, num_ports * 2);
@@ -780,7 +809,6 @@ int main(int argc, char **argv)
 		3.0,                   // float             scale_shot,         // 3.0
 		1.5f,                  // float             diff_sigma,         // pixel value/pixel change
 		10.0f,                 // float             diff_threshold,     // pixel value/pixel change
-//		int               diff_gauss,         // when averaging images, use gaussian around average as weight (false - sharp all/nothing)
 		3.0,                   // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		0.294118,              // float             weight0,            // scale for R
 		0.117647,              // float             weight1,            // scale for B
@@ -828,6 +856,7 @@ int main(int argc, char **argv)
 					result_textures_file); // 			   const char *  path) // file path
 //DBG_TILE
+#ifdef DEBUG9
    		int texture_offset = DBG_TILE * tile_texture_size;
    		int chn = 0;
    		for (int i = 0; i < tile_texture_size; i++){
@@ -842,16 +871,88 @@ int main(int argc, char **argv)
    			}
    		}
 //    int tile_texture_size = (texture_colors + 1 + (keep_texture_weights? (NUM_CAMS + texture_colors + 1): 0)) *256;
+#endif // DEBUG9
 #endif
    		free(cpu_textures);
 #endif // ifndef NOTEXTURES
+#define GEN_TEXTURE_LIST
+#ifdef  GEN_TEXTURE_LIST
+    		dim3 threads_list(1,1, 1); // TEXTURE_TILES_PER_BLOCK, 1);
+    		dim3 grid_list   (1,1,1);
+    		printf("threads_list=(%d, %d, %d)\n",threads_list.x,threads_list.y,threads_list.z);
+    		printf("grid_list=(%d, %d, %d)\n",grid_list.x,grid_list.y,grid_list.z);
+    		StopWatchInterface *timerTEXTURELIST = 0;
+    		sdkCreateTimer(&timerTEXTURELIST);
+    		for (int i = i0; i < numIterations; i++)
+    		{
+    			if (i == 0)
+    			{
+    				checkCudaErrors(cudaDeviceSynchronize());
+    				sdkResetTimer(&timerTEXTURELIST);
+    				sdkStartTimer(&timerTEXTURELIST);
+    			}
+    			prepare_texture_list<<<grid_list,threads_list>>> (
+    					gpu_tasks,             // struct tp_task   * gpu_tasks,
+						tp_task_size,          // int                num_tiles,          // number of tiles in task list
+						gpu_texture_indices,   // int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+						gpu_num_texture_tiles, // int              * num_texture_tiles,  // number of texture tiles to process (4 elements)
+						gpu_woi,               // int              * woi,                // x,y,width,height of the woi
+						TILESX,                // int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+						TILESY);               // int                height); // <= TILESY, use for faster processing of LWIR images
+    			getLastCudaError("Kernel failure");
+    			checkCudaErrors(cudaDeviceSynchronize());
+    			printf("test pass: %d\n",i);
+#ifdef DEBUG4
+    			break;
+#endif
+#ifdef DEBUG5
+    			break;
+#endif
+    		}
+    		///	cudaProfilerStop();
+    		sdkStopTimer(&timerTEXTURELIST);
+    		float avgTimeTEXTURESLIST = (float)sdkGetTimerValue(&timerTEXTURELIST) / (float)numIterations;
+    		sdkDeleteTimer(&timerTEXTURELIST);
+    		printf("Average TextureList run time =%f ms\n",  avgTimeTEXTURESLIST);
+    		int cpu_num_texture_tiles[4];
+    		checkCudaErrors(cudaMemcpy(
+    				cpu_woi,
+					gpu_woi,
+					4 * sizeof(float),
+					cudaMemcpyDeviceToHost));
+    		printf("WOI x=%d, y=%d, width=%d, height=%d\n", cpu_woi[0], cpu_woi[1], cpu_woi[2], cpu_woi[3]);
+    		checkCudaErrors(cudaMemcpy(
+//    				&cpu_num_texture_tiles,
+    				cpu_num_texture_tiles,
+					gpu_num_texture_tiles,
+					4 * sizeof(float), // 4 separate sequences
+					cudaMemcpyDeviceToHost));
+    		printf("cpu_num_texture_tiles=(%d, %d, %d, %d)\n", cpu_num_texture_tiles[0],
+    				cpu_num_texture_tiles[1], cpu_num_texture_tiles[2], cpu_num_texture_tiles[3]);
+    		for (int q = 0; q < 4; q++) {
+    			checkCudaErrors(cudaMemcpy(
+    					texture_indices  + q * TILESX * (TILESYA >> 2),
+						gpu_texture_indices  + q * TILESX * (TILESYA >> 2),
+						cpu_num_texture_tiles[q] * sizeof(float), // change to cpu_num_texture_tiles when ready
+						cudaMemcpyDeviceToHost));
+    		}
+    		for (int q = 0; q < 4; q++) {
+        		printf("%d: %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x %3x:%3x \n",q,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 0] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 1] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 2] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 3] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 4] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 5] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 6] >> 8) % TILESX,
+        				(texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) / TILESX, (texture_indices[q * TILESX * (TILESYA >> 2) + 7] >> 8) % TILESX);
+    		}
+#endif //GEN_TEXTURE_LIST
 #ifdef SAVE_CLT
    free(cpu_clt);
@@ -879,6 +980,8 @@ int main(int argc, char **argv)
 	checkCudaErrors(cudaFree(gpu_texture_indices));
 	checkCudaErrors(cudaFree(gpu_port_offsets));
 	checkCudaErrors(cudaFree(gpu_textures));
+	checkCudaErrors(cudaFree(gpu_woi));
+	checkCudaErrors(cudaFree(gpu_num_texture_tiles));