Before adding per-tile weghts for fat zeros

53f7f2ae · Andrey Filippov · 12758c3e · 53f7f2ae · 53f7f2ae · 53f7f2ae
Commit 53f7f2ae authored Dec 14, 2021 by Andrey Filippov
Hide whitespace changes
Inline Side-by-side

Showing with 67 additions and 22 deletions

TileProcessor.cuh src/TileProcessor.cuh +48 -10

TileProcessor.h src/TileProcessor.h +9 -4

test_tp.cu src/test_tp.cu +9 -7

tp_defines.h src/tp_defines.h +1 -1

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -1109,6 +1109,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
@@ -1151,7 +1152,7 @@ __device__ int get_textures_shared_size( // in bytes
 * @param scale1           scale blue (if colors = 3) component before mixing
 * @param scale2           scale green (if colors = 3) component before mixing
 * @param fat_zero         add this value squared to the sum of squared components before normalization\
- * @param gpu_ftasks           flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+ * @param gpu_ftasks           flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 // * @param gpu_tasks        array of per-tile tasks (now bits 4..9 - correlation pairs)
 * @param num_tiles        number of tiles int gpu_tasks array prepared for processing
 * @param tilesx           number of tile rows
@@ -1190,14 +1191,12 @@ extern "C" __global__ void correlate2D(
 		 *pnum_corr_tiles = 0;
 		 index_correlate<<<blocks0,threads0>>>(
 				 num_cams,            // int               num_cams,
-//				 sel_pairs,           // int *             sel_pairs,
 				 sel_pairs0,          // int               sel_pairs0,
 				 sel_pairs1,          // int               sel_pairs1,
 				 sel_pairs2,          // int               sel_pairs2,
 				 sel_pairs3,          // int               sel_pairs3,
-				 gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+				 gpu_ftasks,          // float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
-				 //				 gpu_tasks,           // struct tp_task   * gpu_tasks,
 				 num_tiles,           // int                num_tiles,          // number of tiles in task
 				 tilesx,              // int                width,                // number of tiles in a row
 				 gpu_corr_indices,    // int *              gpu_corr_indices,  // array of correlation tasks
@@ -2062,7 +2061,8 @@ extern "C" __global__ void generate_RBGA(
 			    		woi,                             // int             * woi,                // x, y, width,height
 						gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
-						gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+						ti_offset,                       //                   gpu_texture_indices_offset,// add to gpu_texture_indices
+						gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 						gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 						colors,                          // int               colors,             // number of colors (3/1)
 						is_lwir,                         // int               is_lwir,            // do not perform shot correction
@@ -2789,7 +2789,9 @@ __global__ void convert_correct_tiles(
 * @param dust_remove          do not reduce average weight when only one image differs much from the average (true)
 * @param texture_stride       output stride in floats (now 256*4 = 1024)
 * @param gpu_texture_tiles    output array (number of colors +1 + ?)*16*16 rgba texture tiles) float values. Will not be calculated if null
+ * @param inescan_order        0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
 * @param gpu_diff_rgb_combo   low-resolution output, with per-camera mismatch an each color average. Will not be calculated if null
+ * @param num_tilesx           number of tiles in a row
 */
 extern "C" __global__ void textures_nonoverlap(
 		int               num_cams,           // number of cameras
@@ -2810,6 +2812,7 @@ extern "C" __global__ void textures_nonoverlap(
 // combining both non-overlap and overlap (each calculated if pointer is not null )
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		int               linescan_order,     // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
 		float           * gpu_diff_rgb_combo, // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 		int               num_tilesx)
 // num_tilesx in the end - worked, after num_tiles - did not compile with JIT in Eclipse
@@ -2856,6 +2859,7 @@ extern "C" __global__ void textures_nonoverlap(
 				(int *) 0,                       // int             * woi,                // x, y, width,height
 				gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 				*pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+				0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
 				gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 				gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 				colors,                          // int               colors,             // number of colors (3/1)
@@ -2872,8 +2876,8 @@ extern "C" __global__ void textures_nonoverlap(
 				0,                               // size_t      texture_rbg_stride, // in floats
 				(float *) 0,                     // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
 				texture_stride,                  // size_t      texture_stride,     // in floats (now 256*4 = 1024)
-				gpu_texture_tiles,               //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+				gpu_texture_tiles,               // (float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-				1, // 	int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
+				linescan_order,                  //	int               linescan_order,     // if !=0 then output gpu_diff_rgb_combo in linescan order, else  - in gpu_texture_indices order
 				gpu_diff_rgb_combo, //);             // float           * gpu_diff_rgb_combo) // diff[num_cams], R[num_cams], B[num_cams],G[num_cams]
 				num_tilesx);
 	 }
@@ -2892,6 +2896,7 @@ extern "C" __global__ void textures_nonoverlap(
 * @param woi                  WoI for the output texture (x,y,width,height of the woi), may be null if overlapped output is not used
 * @param gpu_clt              array of num_cams pointers to the CLT (frequency domain) data [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 * @param num_texture_tiles    number of texture tiles to process
+ * @param gpu_texture_indices_offset add to gpu_texture_indices
 * @param gpu_texture_indices  array - 1 integer per tile to process
 * @param gpu_geometry_correction geometry correction structure, used for rXY to determine pairs weight
 * @param colors               number of colors used:  3 for RGB or 1 for monochrome
@@ -2918,6 +2923,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
+		int               gpu_texture_indices_offset,// add to gpu_texture_indices
 		int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
@@ -2959,7 +2965,7 @@ extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		return; // nothing to do
 	}
 	// get number of tile
-	int tile_code = gpu_texture_indices[tile_indx];
+	int tile_code = gpu_texture_indices[tile_indx + gpu_texture_indices_offset]; // Added for Java, no DP
 	if ((tile_code & (1 << CORR_TEXTURE_BIT)) == 0){
 		return; // nothing to do
 	}
@@ -3513,11 +3519,25 @@ __global__ void imclt_rbg_all(
 		int                woi_theight,
 		const size_t       dstride)            // in floats (pixels)
 {
 //	int num_cams = sizeof(gpu_clt)/sizeof(&gpu_clt[0]);
+	dim3 threads_erase8x8(DTT_SIZE, NUM_THREADS/DTT_SIZE, 1);
+	dim3 grid_erase8x8_right_col (1, woi_theight + 1, 1);
+	dim3 grid_erase8x8_bottom_row(woi_twidth + 1, 1, 1);
 	dim3 threads_imclt(IMCLT_THREADS_PER_TILE, IMCLT_TILES_PER_BLOCK, 1);
 	if (threadIdx.x == 0) { // anyway 1,1,1
 		for (int ncam = 0; ncam < num_cams; ncam++) { // was NUM_CAMS
 			for (int color = 0; color < colors; color++) {
+				// clear right and bottom 8-pixel column and row
+				float *right_col = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color + (woi_twidth * DTT_SIZE);
+				erase8x8<<<grid_erase8x8_right_col,threads_erase8x8>>>(
+						right_col,             // float           * gpu_top_left,
+						dstride);              // const size_t      dstride);
+				float *bottom_row = gpu_corr_images[ncam] + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color +  dstride * (woi_theight * DTT_SIZE);
+				erase8x8<<<grid_erase8x8_bottom_row,threads_erase8x8>>>(
+						bottom_row,             // float           * gpu_top_left,
+						dstride);              // const size_t      dstride);
 				for (int v_offs = 0; v_offs < 2; v_offs++){
 					for (int h_offs = 0; h_offs < 2; h_offs++){
 						int tilesy_half = (woi_theight + (v_offs ^ 1)) >> 1;
@@ -3544,7 +3564,25 @@ __global__ void imclt_rbg_all(
 	}
 }
+/**
+ * Clear 8x8 tiles, used to erase right and bottom 8-pixel wide column/row before imclt_rbg
+ * @param gpu_top_left - pointer to the top-left corner of the firsr tile to erase
+ * @param dstride - offset for 1 pixel step down
+ * block.x - horizontal tile  offset
+ * block.y - vertical tile offset
+ * 0<=thread.x < 8 - horizontal pixel offset
+ * 0<=thread.y < 4 - vertical pixel offset
+ */
+extern "C"
+__global__ void erase8x8(
+		float           * gpu_top_left,
+		const size_t      dstride)
+{
+	float * pixel = gpu_top_left + (((blockIdx.y * DTT_SIZE) + threadIdx.y) * dstride) + ((blockIdx.x * DTT_SIZE) + threadIdx.x);
+	* pixel = 0.0f;
+	pixel += dstride * blockDim.y; // add 4 pixel rows (assuming blockDim.x==4)
+	* pixel = 0.0f;
+}
 /**
 * Helper kernel for imclt_rbg_all(), generate per-camera -per color image from the in-memory frequency domain representation.
@@ -3573,7 +3611,7 @@ __global__ void imclt_rbg(
 		const size_t      dstride)            // in floats (pixels)
 {
 	float *color_plane = gpu_rbg + dstride * (woi_theight * DTT_SIZE + DTT_SIZE) * color;
-	int pass =           (v_offset << 1) + h_offset;     		// 0..3 to correctly acummulate 16x16 tiles stride 8
+	int pass =           (v_offset << 1) + h_offset;     		// 0..3 to correctly accumulate 16x16 tiles stride 8
 	int tile_in_block = threadIdx.y;
 	int tile_num = blockIdx.x * IMCLT_TILES_PER_BLOCK + tile_in_block;
 	int tilesx_half = (woi_twidth + (h_offset ^ 1)) >> 1;

--- a/src/TileProcessor.h
+++ b/src/TileProcessor.h
@@ -49,7 +49,7 @@ extern "C" __global__ void convert_direct( // called with a single block, single
 		float           ** gpu_kernel_offsets, // [NUM_CAMS],
 		float           ** gpu_kernels,        // [NUM_CAMS],
 		float           ** gpu_images,         // [NUM_CAMS],
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		float           ** gpu_clt,            // [NUM_CAMS][TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t             dstride,            // in floats (pixels)
@@ -76,7 +76,7 @@ extern "C" __global__ void correlate2D(
 		float             scale1,             // scale for B
 		float             scale2,             // scale for G
 		float             fat_zero,           // here - absolute
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 //		struct tp_task  * gpu_tasks,          // array of per-tile tasks (now bits 4..9 - correlation pairs)
 		int               num_tiles,          // number of tiles in task
 		int               tilesx,             // number of tile rows
@@ -109,7 +109,7 @@ extern "C" __global__ void corr2D_combine(
 extern "C" __global__ void textures_nonoverlap(
 		int               num_cams,           // number of cameras
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats
+		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats
 		//		struct tp_task  * gpu_tasks,
 		int               num_tiles,          // number of tiles in task list
 //		int               num_tilesx,         // number of tiles in a row
@@ -127,6 +127,7 @@ extern "C" __global__ void textures_nonoverlap(
 // combining both non-overlap and overlap (each calculated if pointer is not null )
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)  // may be 0 if not needed
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
+		int               linescan_order,     // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
 		float           * gpu_diff_rgb_combo, //); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
 		int               num_tilesx);
@@ -141,6 +142,10 @@ __global__ void imclt_rbg_all(
 		int                woi_theight,
 		const size_t       dstride);            // in floats (pixels)
+extern "C" __global__ void erase8x8(
+		float           * gpu_top_left,
+		const size_t      dstride);
 extern "C" __global__ void imclt_rbg(
 		float           * gpu_clt,            // [TILES-Y][TILES-X][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		float           * gpu_rbg,            // WIDTH, 3 * HEIGHT
@@ -156,7 +161,7 @@ extern "C" __global__ void imclt_rbg(
 extern "C" __global__ void generate_RBGA(
 		int                num_cams,           // number of cameras used
 		// Parameters to generate texture tasks
-		float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+		float            * gpu_ftasks,         // flattened tasks, 29 floats for quad EO, 101 floats for LWIR16
 //		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task list
 		// declare arrays in device code?

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -31,8 +31,8 @@
 */
 #define NOCORR
-#define NOCORR_TD
+//#define NOCORR_TD
-#define NOTEXTURES_HOST
+//#define NOTEXTURES_HOST
 #define NOTEXTURES
 #define NOTEXTURE_RGBA
 #define SAVE_CLT
@@ -492,7 +492,9 @@ void generate_RBGA_host(
 				 gpu_woi,                             // int             * woi,                // x, y, width,height
 				 gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 				 ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
-				 gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+				 ti_offset,                       //                gpu_texture_indices_offset,// add to gpu_texture_indices
+				 gpu_texture_indices, //  + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
+//				 gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 				 gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 				 colors,                          // int               colors,             // number of colors (3/1)
 				 is_lwir,                         // int               is_lwir,            // do not perform shot correction
@@ -1626,7 +1628,6 @@ int main(int argc, char **argv)
    	// FIXME: provide sel_pairs
        correlate2D<<<1,1>>>( // output TD tiles, no normalization
        		num_cams,                      // int               num_cams,
-				//				0,                             // int *             sel_pairs,           // unused bits should be 0
 				sel_pairs[0], // int               sel_pairs0           // unused bits should be 0
 				sel_pairs[1], // int               sel_pairs1,           // unused bits should be 0
 				sel_pairs[2], // int               sel_pairs2,           // unused bits should be 0
@@ -1638,7 +1639,6 @@ int main(int argc, char **argv)
 				color_weights[2], // 0.5,      // float             scale2,             // scale for G
 				30.0,                          // float             fat_zero,           // here - absolute
 				gpu_ftasks,                    // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
-//				gpu_tasks,                     // struct tp_task  * gpu_tasks,
 				tp_task_size,                  // int               num_tiles) // number of tiles in task
 				TILESX,                        // int               tilesx,             // number of tile rows
 				gpu_corr_indices,              // int             * gpu_corr_indices,   // packed tile+pair
@@ -1795,6 +1795,7 @@ int main(int argc, char **argv)
 							(int *) 0,                       // int             * woi,                // x, y, width,height
 							gpu_clt,                         // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 							cpu_pnum_texture_tiles, // *pnum_texture_tiles,             // size_t            num_texture_tiles,  // number of texture tiles to process
+							0,                               //                gpu_texture_indices_offset,// add to gpu_texture_indices
 							gpu_texture_indices,             // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 							gpu_geometry_correction,         // struct gc       * gpu_geometry_correction,
 							texture_colors,                  // int               colors,             // number of colors (3/1)
@@ -1949,7 +1950,7 @@ int main(int argc, char **argv)
 //    printf("grid_texture=(%d, %d, %d)\n",grid_texture.x,grid_texture.y,grid_texture.z);
    StopWatchInterface *timerTEXTURE = 0;
    sdkCreateTimer(&timerTEXTURE);
+	int  linescan_order = 1; // output low-res in linescan order, 0 - in gpu_texture_indices order
    for (int i = i0; i < numIterations; i++)
    {
    	if (i == 0)
@@ -1986,7 +1987,8 @@ int main(int argc, char **argv)
 				1,                     // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
    	// combining both non-overlap and overlap (each calculated if pointer is not null )
 				0, // dstride_textures/sizeof(float), // size_t            texture_stride,     // in floats (now 256*4 = 1024)  // may be 0 if not needed
-				(float *) 0,          // gpu_textures,         // float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed
+				(float *) 0,          // gpu_textures,         // float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles    // may be 0 if not needed\
+				linescan_order,       // int               linescan_order,     // 0 low-res tiles have tghe same order, as gpu_texture_indices, 1 - in linescan order
 				gpu_diff_rgb_combo, //);  // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS] // may be 0 if not needed
 				TILESX);
    	getLastCudaError("Kernel failure");

--- a/src/tp_defines.h
+++ b/src/tp_defines.h
@@ -134,7 +134,7 @@
 #define DEBUG8 1
 #define DEBUG9 1
 */
-#define DEBUG8A 1
+//#define DEBUG8A 1 // generate_RBGA_host
 //textures
 //#define DEBUG10 1
 //#define DEBUG11 1