more TILESX

44e87f14 · Andrey Filippov · bb7792f8 · 44e87f14 · 44e87f14 · 44e87f14
Commit 44e87f14 authored Aug 07, 2020 by Andrey Filippov
Showing with 115 additions and 91 deletions

TileProcessor.cuh src/TileProcessor.cuh +93 -81

TileProcessor.h src/TileProcessor.h +11 -9

test_tp.cu src/test_tp.cu +3 -1

tp_defines.h src/tp_defines.h +8 -0

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -48,7 +48,7 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		float           ** gpu_kernels,        // [NUM_CAMS],
 		float           ** gpu_images,         // [NUM_CAMS],
 		struct tp_task   * gpu_tasks,
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
 		int                num_tiles,          // number of tiles in task
 		int                lpf_mask,           // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green. Now - always 0 !
@@ -57,10 +57,12 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		int                kernels_hor,
 		int                kernels_vert,
 		int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
-		int *              pnum_active_tiles);  //  indices to gpu_tasks
+		int *              pnum_active_tiles,  //  indices to gpu_tasks
+		int                tilesx);
 extern "C" __global__ void correlate2D(
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		int               colors,             // number of colors (3/1)
 		float             scale0,             // scale for R
 		float             scale1,             // scale for B
@@ -83,7 +85,7 @@ extern "C" __global__ void textures_nonoverlap(
 // declare arrays in device code?
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int             * pnum_texture_tiles,  // returns total number of elements in gpu_texture_indices array
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
@@ -99,7 +101,7 @@ extern "C" __global__ void textures_nonoverlap(
 extern "C"
 __global__ void imclt_rbg_all(
-		float           ** gpu_clt,            // [NUM_CAMS][TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           ** gpu_corr_images,    // [NUM_CAMS][WIDTH, 3 * HEIGHT]
 		int                apply_lpf,
 		int                colors,
@@ -108,7 +110,7 @@ __global__ void imclt_rbg_all(
 		const size_t       dstride);            // in floats (pixels)
 extern "C" __global__ void imclt_rbg(
-		float           * gpu_clt,            // [TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float           * gpu_clt,            // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
 		int               apply_lpf,
 		int               mono,               // defines lpf filter
@@ -127,10 +129,10 @@ extern "C" __global__ void generate_RBGA(
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
 		int              * woi,                // x,y,width,height of the woi
-		int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
+		int                width,  // <= TILES-X, use for faster processing of LWIR images (should be actual + 1)
-		int                height, // <= TILESY, use for faster processing of LWIR images
+		int                height, // <= TILES-Y, use for faster processing of LWIR images
 		// Parameters for the texture generation
-		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
+		float          ** gpu_clt,            // [NUM_CAMS] ->[TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -870,7 +870,9 @@ int main(int argc, char **argv)
 				KERNELS_HOR,           // int                kernels_hor,
 				KERNELS_VERT,          // int                kernels_vert);
 				gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
-    			gpu_num_active);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
+    			gpu_num_active,        // int *              pnum_active_tiles,   //  indices to gpu_tasks
+				TILESX); // int                tilesx)
    	getLastCudaError("Kernel execution failed");
    	checkCudaErrors(cudaDeviceSynchronize());

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -77,6 +77,14 @@
 #define RBYRDIST_STEP             0.0004 // for doubles, 0.0002 - floats // to fit into GPU shared memory (was 0.001);
 #define TILES_PER_BLOCK_GEOM     (32/NUM_CAMS)   // each tile has NUM_CAMS threads
+// only used in C++ test
+#define TILESX        (IMG_WIDTH / DTT_SIZE)
+//#define TILESY        (IMG_HEIGHT / DTT_SIZE)
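+#define TILESYA       ((TILESY +3) & (~3)) // NOTE(review): expands to TILESY, whose #define above is commented out - confirm TILESY is defined elsewhere before using TILESYA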
 #define DEBUG_OOB1 1
 // Use CORR_OUT_RAD for the correlation output