Changes from May 2022

6fc2c45f · Andrey Filippov · f72f2a9c · 6fc2c45f · 6fc2c45f
Commit 6fc2c45f authored Jun 16, 2022 by Andrey Filippov
Show whitespace changes
Inline Side-by-side

Showing with 233 additions and 13 deletions

TileProcessor.cuh src/TileProcessor.cuh +3 -2

test_tp.cu src/test_tp.cu +230 -11

No files found.
--- a/src/TileProcessor.cuh
+++ b/src/TileProcessor.cuh
@@ -437,6 +437,7 @@ __constant__ float lpf_rb_corr[64]={ // modify if needed
 				0.32891038f, 0.30456742f, 0.26124917f, 0.20786692f, 0.15413642f, 0.10818204f, 0.07536856f, 0.05845371f,
 				0.22914618f, 0.21218686f, 0.18200779f, 0.14481729f, 0.10738418f, 0.07536856f, 0.05250797f, 0.04072369f,
 				0.17771927f, 0.16456610f, 0.14116007f, 0.11231618f, 0.08328412f, 0.05845371f, 0.04072369f, 0.03158414f
 		};
 __constant__ float lpf_corr[64]={ // modify if needed
 				1.00000000f, 0.87041007f, 0.65943687f, 0.43487258f, 0.24970076f, 0.12518080f, 0.05616371f, 0.02728573f,
@@ -447,6 +448,7 @@ __constant__ float lpf_corr[64]={ // modify if needed
 				0.12518080f, 0.10895863f, 0.08254883f, 0.05443770f, 0.03125774f, 0.01567023f, 0.00703062f, 0.00341565f,
 				0.05616371f, 0.04888546f, 0.03703642f, 0.02442406f, 0.01402412f, 0.00703062f, 0.00315436f, 0.00153247f,
 				0.02728573f, 0.02374977f, 0.01799322f, 0.01186582f, 0.00681327f, 0.00341565f, 0.00153247f, 0.00074451f
 		};
 __constant__ float LoG_corr[64]={ // modify if needed high-pass filter before correlation to fit into float range
@@ -1389,7 +1391,6 @@ extern "C" __global__ void combine_inter(     // combine per-senor interscene co
    	clt        += DTT_SIZE1;
    	mem_corr   += DTT_SIZE;
    }
-    corr_offset++;
 }
@@ -1825,7 +1826,7 @@ extern "C" __global__ void correlate2D_inner(
 * Combine multiple correlation pairs for quad (square) camera: 2 or 4 ortho into a single clt tile,
 * and separately the two diagonals into another single one
 * When adding vertical pairs to the horizontal, each quadrant is transposed, and the Q1 and Q2 are also swapped.
- * when combining tho diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
+ * when combining two diagonals (down-right and up-right), the data in quadrants Q2 and Q3 is negated
 * (corresponds to a vertical flip).
 * Data can be added to the existing one (e.g. for the inter-scene accumulation of the compatible correlations).
 * This is an outer kernel that calls the inner one with CDP, this one should be configured as corr2D_combine<<<1,1>>>

--- a/src/test_tp.cu
+++ b/src/test_tp.cu
@@ -138,14 +138,97 @@ float ** copyalloc_pointers_gpu(float ** gpu_pointer,
    return gpu_pointer_to_gpu_pointers;
 }
+// Shift an image in place by (dx, dy) pixels. Pixels that would come from outside
+// the image are filled by repeating the border lines/columns.
+//   image  - row-major float pixels, width * height elements, modified in place
+//   width  - number of pixels per row
+//   height - number of rows
+//   bayer  - non-zero: move in steps of 2 pixels so the 2x2 mosaic phase is kept;
+//            dx and dy are then forced even with "& -2", which rounds toward
+//            negative infinity (1 -> 0, -1 -> -2)
+//   dx     - horizontal shift, > 0 moves the content right, < 0 moves it left
+//   dy     - vertical shift,   > 0 moves the content down,  < 0 moves it up
+// The shift is applied as repeated moves by one step, so each pass re-duplicates
+// the border line(s)/column(s).
+void shift_image (
+		float * image,
+		int width,
+		int height,
+		int bayer,
+		int dx,
+		int dy)
+{
+	int step = 1;
+	if (bayer){
+		step = 2;
+		dx &= -2;
+		dy &= -2;
+	}
+	// vertical shift dy > 0 - down
+	for (int m = 0; m < dy; m+= step) { // only if dy > 0 (down)
+		// Walk from the bottom row upwards so every source row is read before it is
+		// overwritten. (The loop must count down: counting up from height - 1 runs
+		// past the end of the image.)
+		for (int y = height - 1; y >= step; y--){
+			float * dp = image + (y * width);
+			float * sp = dp - step * width;
+			for (int x = 0; x < width; x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+	// vertical shift dy < 0 - up
+	for (int m = 0; m > dy; m-= step) { // only if dy < 0 (up)
+		// Top-down: row y takes the content of row y + step
+		for (int y = 0; y < height - step; y++){
+			float * dp = image + (y * width);
+			float * sp = dp + step * width;
+			for (int x = 0; x < width; x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+	// horizontal shift dx > 0 - right
+	for (int m = 0; m < dx; m+= step) { // only if dx > 0 (right)
+		for (int y = 0; y < height; y++){
+			// Right-to-left within the row so sources are read before being overwritten
+			float * dp = image + (y * width) + width - 1;
+			float * sp = dp - step;
+			for (int x = 0; x < (width - step); x++){
+				(*dp--) = (*sp--);
+			}
+		}
+	}
+	// horizontal shift dx < 0 - left
+	for (int m = 0; m > dx; m-= step) { // only if dx < 0 (left)
+		for (int y = 0; y < height; y++){
+			// Left-to-right within the row: pixel x takes the content of pixel x + step
+			float * dp = image + (y * width);
+			float * sp = dp + step;
+			for (int x = 0; x < (width - step); x++){
+				(*dp++) = (*sp++);
+			}
+		}
+	}
+}
+// Copy a host image into an existing pitched device buffer (host -> device).
+//   image_host - source pixels on the host, rows tightly packed (width floats each)
+//   image_gpu  - destination buffer on the device
+//   dstride    - destination pitch; it is handed to cudaMemcpy2D unchanged, and that
+//                API takes the pitch in bytes (e.g. the value cudaMallocPitch returns)
+//   width      - pixels (floats) per row
+//   height     - number of rows
+void update_image_gpu(
+		float * image_host,
+		float * image_gpu,
+		size_t  dstride,
+		int     width,
+		int     height){
+	// One packed host row in bytes: used both as the source pitch and as the
+	// number of bytes copied per row (original note: make in 16*n?)
+	const size_t row_bytes = width * sizeof(float);
+	checkCudaErrors(cudaMemcpy2D(
+			image_gpu,  // dst
+			dstride,    // dst pitch
+			image_host, // src
+			row_bytes,  // src pitch
+			row_bytes,  // bytes per row to copy
+			height,     // rows
+			cudaMemcpyHostToDevice));
+}
-float * copyalloc_image_gpu(float * image_host,
+float * copyalloc_image_gpu(
+		float * image_host,
 		size_t* dstride, // in floats !
 		int width,
 		int height)
 {
 	float *image_gpu;
    checkCudaErrors(cudaMallocPitch((void **)&image_gpu, dstride, width * sizeof(float), height));
+    update_image_gpu(
+    		image_host,
+    		image_gpu,
+    		*dstride, // in floats !
+    		width,
+    		height);
+    /*
    checkCudaErrors(cudaMemcpy2D(
    		image_gpu,
            *dstride, //  * sizeof(float),
@@ -154,6 +237,7 @@ float * copyalloc_image_gpu(float * image_host,
            width * sizeof(float),
 			height,
 			cudaMemcpyHostToDevice));
+	*/
    return image_gpu;
 }
@@ -569,6 +653,8 @@ int main(int argc, char **argv)
    //initialize CUDA
    findCudaDevice(argc, (const char **)argv);
+    float fat_zero = 1000.0f; // 300.0f; // 30.0;
 #if TEST_LWIR
    const char* kernel_file[] = {
    		"clt/aux_chn0_transposed.kernel",
@@ -996,7 +1082,6 @@ int main(int argc, char **argv)
 			TILESX * TILESY);                // int height);
-//    for (int ncam = 0; ncam < NUM_CAMS; ncam++) {
    for (int ncam = 0; ncam < num_cams; ncam++) {
        readFloatsFromFile(
        		host_kern_buf, // float * data, // allocated array
@@ -1493,7 +1578,7 @@ int main(int argc, char **argv)
 				color_weights[0], // 0.25,  // float             scale0,             // scale for R
 				color_weights[1], // 0.25,  // float             scale1,             // scale for B
 				color_weights[2], // 0.5,   // float             scale2,             // scale for G
-				30.0 * 30.0,                // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,                // float             fat_zero2,           // here - absolute
 				gpu_ftasks,                 // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				tp_task_size,               // int               num_tiles) // number of tiles in task
 				TILESX,                     // int               tilesx,             // number of tile rows
@@ -1618,7 +1703,7 @@ int main(int argc, char **argv)
 				color_weights[0], // 0.25,     // float             scale0,             // scale for R
 				color_weights[1], // 0.25,     // float             scale1,             // scale for B
 				color_weights[2], // 0.5,      // float             scale2,             // scale for G
-				30.0*30.0,                     // float             fat_zero2,          // here - absolute (squared)
+				fat_zero*fat_zero,                     // float             fat_zero2,          // here - absolute (squared)
 				gpu_ftasks,                    // float            * gpu_ftasks,        // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
 				tp_task_size,                  // int               num_tiles) // number of tiles in task
 				TILESX,                        // int               tilesx,             // number of tile rows
@@ -1662,7 +1747,7 @@ int main(int argc, char **argv)
 				(float *) 0, // corr_weights,                  // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
 				dstride_corr_combo/sizeof(float),    // const size_t      corr_stride,        // in floats
 				gpu_corrs_combo,                     // float           * gpu_corrs,          // correlation output data (pixel domain)
-				30.0 * 30.0,                         // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,                         // float             fat_zero2,           // here - absolute
 				CORR_OUT_RAD);                       // int               corr_radius);        // radius of the output correlation (7 for 15x15)
 #else
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -1674,7 +1759,7 @@ int main(int argc, char **argv)
 				(float *) 0, // corr_weights,        // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
 				dstride_corr/sizeof(float),          // const size_t      corr_stride,        // in floats
 				gpu_corrs,                           // float           * gpu_corrs,          // correlation output data (pixel domain)
-				30.0 * 30.0,                         // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,                         // float             fat_zero2,           // here - absolute
 				CORR_OUT_RAD);                       // int               corr_radius);        // radius of the output correlation (7 for 15x15)
 #endif
    	getLastCudaError("Kernel failure:corr2D_normalize");
@@ -1785,6 +1870,54 @@ int main(int argc, char **argv)
 			corr_img_size,             // int           size, // length in elements
 			result_corr_td_norm_file); // 			   const char *  path) // file path
 #endif
+#if 1 // export TD intra
+    int intra_corr_size_td =   num_corrs * DTT_SIZE2*DTT_SIZE2;
+    cpu_corr_td = (float *)malloc(intra_corr_size_td * sizeof(float));
+    checkCudaErrors(cudaMemcpy2D(
+    		cpu_corr_td,
+			(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
+			gpu_corrs_td,
+			dstride_corr_td,
+			(DTT_SIZE2*DTT_SIZE2) * sizeof(float),
+			num_corrs,
+			cudaMemcpyDeviceToHost));
+    // Reuse the same corr_img for TD images - each tile is still 16x16 (corr was 15x15 and gap)
+    for (int ict = 0; ict < num_corr_indices; ict++){
+    	int ctt = ( cpu_corr_indices[ict] >>  CORR_NTILE_SHIFT);
+    	int cpair = cpu_corr_indices[ict] & ((1 << CORR_NTILE_SHIFT) - 1);
+    	int ty = ctt / TILESX;
+    	int tx = ctt % TILESX;
+    	int src_offs0 = ict * DTT_SIZE2*DTT_SIZE2;
+    	int dst_offs0 = cpair * (num_tiles * 16 * 16) +  (ty * 16 * TILESX * 16) + (tx * 16);
+    	for (int iquad = 0; iquad < 4; iquad ++) {
+    		int iqy = (iquad >> 1) & 1;
+    		int iqx = (iquad >> 0) & 1;
+    		for (int iy = 0; iy < DTT_SIZE; iy++){
+    			int src_offs = src_offs0 + iy * DTT_SIZE + iquad * DTT_SIZE * DTT_SIZE;
+    			int dst_offs = dst_offs0 + (iy + DTT_SIZE * iqy)* (TILESX * 16) + iqx * DTT_SIZE;
+    			for (int ix = 0; ix < DTT_SIZE; ix++){
+    				corr_img[dst_offs++] = cpu_corr_td[src_offs++];
+    			}
+    		}
+    	}
+    }
+#ifndef NSAVE_CORR
+    printf("Writing intrascene phase correlation TD data");
+    writeFloatsToFile(
+    		corr_img,                  // float *       data, // allocated array
+			corr_img_size,             // int           size, // length in elements
+			"clt/aux_intrascene-TD.raw"); // 			   const char *  path) // file path
+#endif
+    free (cpu_corr_td);
+#endif  // if 1
+    // reuse image, export TD data
    free (cpu_corr);
    free (cpu_corr_indices);
    free (corr_img);
@@ -1793,6 +1926,8 @@ int main(int argc, char **argv)
 #endif // ifndef NOCORR_TD
 // Testing "interframe" correlation with itself, assuming direct convert already ran
@@ -1803,6 +1938,84 @@ int main(int argc, char **argv)
    int num_sel_sensors = 16; // 15; // 16;
    num_pairs = num_sel_sensors+1;
    num_corr_indices = num_pairs * num_tiles;
+    int is_bayer = 0;
+    int image_dx = 2;
+    int image_dy = 0;
+    float            * gpu_clt_ref_h            [num_cams];
+    float           ** gpu_clt_ref;    //            [NUM_CAMS];
+    for (int ncam = 0; ncam < num_cams; ncam++) {
+       gpu_clt_ref_h[ncam] = alloc_kernel_gpu(TILESY * TILESX * num_colors * 4 * DTT_SIZE * DTT_SIZE);
+    }
+    gpu_clt_ref =      copyalloc_pointers_gpu (gpu_clt_ref_h,  num_cams); // NUM_CAMS);
+    // use gpu_images and convert to gpu_clt_ref
+	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+			num_cams,              // int                num_cams,           // actual number of cameras
+			num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
+			gpu_kernels,           // float           ** gpu_kernels,
+			gpu_images,            // float           ** gpu_images,
+			gpu_ftasks,            // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			gpu_clt_ref, //******  // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+			dstride/sizeof(float), // size_t             dstride, // for gpu_images
+			tp_task_size,          // int                num_tiles) // number of tiles in task
+			0,                     // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
+			IMG_WIDTH,             // int                woi_width,
+			IMG_HEIGHT,            // int                woi_height,
+			KERNELS_HOR,           // int                kernels_hor,
+			KERNELS_VERT,          // int                kernels_vert);
+			gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			gpu_num_active, //);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
+			TILESX); // int                tilesx)
+	getLastCudaError("Kernel execution failed");
+	checkCudaErrors(cudaDeviceSynchronize());
+	// re-read same images. shift them, update gpu_images and convert to gpu_clt;
+    for (int ncam = 0; ncam < num_cams; ncam++) {
+        readFloatsFromFile(
+        		host_kern_buf, // float * data, // allocated array
+				image_files[ncam]); // 			   char *  path) // file path
+        shift_image (
+        		host_kern_buf, // float * image,
+				IMG_WIDTH,     // int width,
+				IMG_HEIGHT,    // int height,
+				is_bayer,      // int bayer,
+				image_dx,      // int dx,
+				image_dy);     // int dy);
+        update_image_gpu(
+        		host_kern_buf,          // float * image_host,
+				gpu_images_h[ncam], // float * image_gpu,
+				dstride,                // size_t  dstride, // in floats !
+				IMG_WIDTH,              // int     width,
+				IMG_HEIGHT);            // int     height);
+    }
+	convert_direct<<<grid_tp,threads_tp>>>( // called with a single block, CONVERT_DIRECT_INDEXING_THREADS threads
+			num_cams,              // int                num_cams,           // actual number of cameras
+			num_colors,            // int                num_colors,         // actual number of colors: 3 for RGB, 1 for LWIR/mono
+			fgpu_kernel_offsets,   // struct CltExtra ** gpu_kernel_offsets,
+			gpu_kernels,           // float           ** gpu_kernels,
+			gpu_images,            // float           ** gpu_images,
+			gpu_ftasks,            // float            * gpu_ftasks,         // flattened tasks, 27 floats for quad EO, 99 floats for LWIR16
+			gpu_clt,     //******  // float           ** gpu_clt,            // [num_cams][TILESY][TILESX][num_colors][DTT_SIZE*DTT_SIZE]
+			dstride/sizeof(float), // size_t             dstride, // for gpu_images
+			tp_task_size,          // int                num_tiles) // number of tiles in task
+			0,                     // int                lpf_mask)            // apply lpf to colors : bit 0 - red, bit 1 - blue, bit2 - green
+			IMG_WIDTH,             // int                woi_width,
+			IMG_HEIGHT,            // int                woi_height,
+			KERNELS_HOR,           // int                kernels_hor,
+			KERNELS_VERT,          // int                kernels_vert);
+			gpu_active_tiles,      // int *              gpu_active_tiles,      // pointer to the calculated number of non-zero tiles
+			gpu_num_active, //);       // int *              pnum_active_tiles);  //  indices to gpu_tasks
+			TILESX); // int                tilesx)
+	getLastCudaError("Kernel execution failed");
+	checkCudaErrors(cudaDeviceSynchronize());
    StopWatchInterface *timerINTERSELF = 0;
    sdkCreateTimer(&timerINTERSELF);
 //    int num_corr_combo_inter;
@@ -1818,7 +2031,7 @@ int main(int argc, char **argv)
    			num_cams,                      // int               num_cams,
 				sel_sensors,                   // int               sel_sensors,
 				gpu_clt,                       // float          ** gpu_clt,            // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
-				gpu_clt,                       // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
+				gpu_clt_ref, // *********      // float          ** gpu_clt_ref,        // [num_cams] ->[TILES-Y][TILES-X][colors][DTT_SIZE*DTT_SIZE]
 				num_colors,                    // int               colors,             // number of colors (3/1)
 				color_weights[0], // 0.25,     // float             scale0,             // scale for R
 				color_weights[1], // 0.25,     // float             scale1,             // scale for B
@@ -1846,7 +2059,7 @@ int main(int argc, char **argv)
 				(float *) 0, // corr_weights,        // float           * corr_weights,       // null or per-tile weight (fat_zero2 will be divided by it)
 				dstride_corr/sizeof(float),          // const size_t      corr_stride,        // in floats
 				gpu_corrs,                           // float           * gpu_corrs,          // correlation output data (pixel domain)
-				30.0 * 30.0,                         // float             fat_zero2,           // here - absolute
+				fat_zero * fat_zero,                         // float             fat_zero2,           // here - absolute
 				CORR_OUT_RAD);                       // int               corr_radius);        // radius of the output correlation (7 for 15x15)
    	getLastCudaError("Kernel failure:corr2D_normalize");
    	checkCudaErrors(cudaDeviceSynchronize());
@@ -1989,6 +2202,12 @@ int main(int argc, char **argv)
    free (cpu_corr);
    free (cpu_corr_indices);
    free (corr_img);
+    for (int ncam = 0; ncam < num_cams; ncam++){
+    	checkCudaErrors(cudaFree(gpu_clt_ref_h[ncam]));
+    }
+    checkCudaErrors(cudaFree(gpu_clt_ref));
 #endif    // #ifdef CORR_INTER_SELF