adding generation of data for macroblocks

4203dbbb · Andrey Filippov · fbab1093 · 4203dbbb · 4203dbbb · 4203dbbb
Commit 4203dbbb authored Apr 17, 2020 by Andrey Filippov
5 changed files
--- a/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
+++ b/src/main/java/com/elphel/imagej/correction/Eyesis_Correction.java
@@ -702,9 +702,9 @@ private Panel panel1,
 			panelClt_GPU.setLayout(new GridLayout(1, 0, 5, 5)); // rows, columns, vgap, hgap
 			addButton("JCUDA TEST",                 panelClt_GPU);
 			addButton("TF TEST",                    panelClt_GPU);
-			addButton("GPU files",                  panelClt_GPU, color_conf_process);
-			addButton("Rig8 gpu",                   panelClt_GPU, color_conf_process);
-			addButton("ShowGPU",                    panelClt_GPU, color_conf_process);
+			addButton("GPU simulate",                  panelClt_GPU, color_conf_process);
+			addButton("GPU RUN",                   panelClt_GPU, color_conf_process);
+//			addButton("ShowGPU",                    panelClt_GPU, color_conf_process);

 			addButton("LWIR_TEST",                  panelClt_GPU, color_conf_process);
 			addButton("LWIR_ACQUIRE",               panelClt_GPU, color_conf_process);
@@ -4915,13 +4915,13 @@ private Panel panel1,
    	getPairImages2();
    	return;
 /* ======================================================================== */
-    } else if (label.equals("GPU files")) {
+    } else if (label.equals("GPU simulate")) {
        DEBUG_LEVEL=MASTER_DEBUG_LEVEL;
    	EYESIS_CORRECTIONS.setDebug(DEBUG_LEVEL);
    	generateGPUDebugFiles();
    	return;
 /* ======================================================================== */
-    } else if (label.equals("Rig8 gpu")) {
+    } else if (label.equals("GPU RUN")) {
        DEBUG_LEVEL=MASTER_DEBUG_LEVEL;
    	EYESIS_CORRECTIONS.setDebug(DEBUG_LEVEL);
    	getPairImages2Gpu();

--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -184,21 +184,22 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_kernels =             new CUdeviceptr();
    private CUdeviceptr gpu_kernel_offsets =      new CUdeviceptr();
    private CUdeviceptr gpu_bayer =               new CUdeviceptr();
-    private CUdeviceptr gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.POINTER
-    private CUdeviceptr gpu_corrs =               new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.POINTER
-    private CUdeviceptr gpu_textures =            new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
+    private CUdeviceptr gpu_tasks =               new CUdeviceptr(); //  allocate tilesX * tilesY * TPTASK_SIZE * Sizeof.FLOAT
+    private CUdeviceptr gpu_corrs =               new CUdeviceptr(); //  allocate tilesX * tilesY * NUM_PAIRS * CORR_SIZE * Sizeof.FLOAT
+    private CUdeviceptr gpu_textures =            new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
    private CUdeviceptr gpu_clt =                 new CUdeviceptr();
    private CUdeviceptr gpu_4_images =            new CUdeviceptr();
-    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
-    private CUdeviceptr gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
-    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
+    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+    private CUdeviceptr gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
+    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesYa * Sizeof.FLOAT (tilesYa = tilesY rounded up to a multiple of 4)
+    private CUdeviceptr gpu_diff_rgb_combo =      new CUdeviceptr(); //  allocate tilesX * tilesYa * NUM_CAMS * (NUM_COLORS + 1) * Sizeof.FLOAT

-//    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
-    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
+//    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT
+    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT

    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
    private CUdeviceptr gpu_num_texture_tiles =   new CUdeviceptr(); //  8 ints
-    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
+    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT

    private CUdeviceptr gpu_correction_vector=    new CUdeviceptr();
    private CUdeviceptr gpu_rot_deriv=            new CUdeviceptr(); //  used internally by device, may be read to CPU for testing
@@ -562,7 +563,9 @@ public class GPUTileProcessor {
    	//#define TILESYA       ((TILESY +3) & (~3))
    	int tilesYa = (tilesY + 3) & ~3;
 //    	cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
-    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
+    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.FLOAT);
+
+    	cuMemAlloc(gpu_diff_rgb_combo, tilesX * tilesYa * NUM_CAMS* (NUM_COLORS + 1) *  Sizeof.FLOAT);

 //    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.FLOAT);
    	cuMemAlloc(gpu_color_weights,             3 * Sizeof.FLOAT);
@@ -1272,7 +1275,9 @@ public class GPUTileProcessor {
 	            Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
 	            Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
-	            Pointer.to(gpu_textures_rgba));                  // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+	            Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
+	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+
    	cuCtxSynchronize();
    	// Call the kernel function
    	cuLaunchKernel(GPU_RBGA_kernel,
@@ -1333,8 +1338,8 @@ public class GPUTileProcessor {
    			Pointer.to(new int[] {0}),//  0, // const size_t      texture_rbg_stride, // in floats - DISABLE GENERATION!
    			Pointer.to(new int[] {0}), // null, //  new Pointer(),  //Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
    			Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
-    			Pointer.to(gpu_textures)
-    			);
+    			Pointer.to(gpu_textures),
+	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
    	cuCtxSynchronize();
    	// Call the kernel function
    	cuLaunchKernel(GPU_TEXTURES_kernel,
@@ -1574,7 +1579,7 @@ public class GPUTileProcessor {
 //    	for (String sourceCode: sourceCodeUnits) {
       	for (int cunit = 0; cunit < ptxDataUnits.length; cunit++) {
       		String sourceCode = sourceCodeUnits[cunit];
-       		//System.out.print(sourceCode);
+//       		System.out.print(sourceCode);
    		// Use the NVRTC to create a program by compiling the source code
    		nvrtcProgram program = new nvrtcProgram();
    		nvrtcCreateProgram(	program, sourceCode, null, 0, null, null);

--- a/src/main/java/com/elphel/imagej/tileprocessor/MacroCorrelation.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/MacroCorrelation.java
@@ -45,11 +45,11 @@ public class MacroCorrelation {
 			double trusted_correlation,
 	    	double     weight_var, //  = 1.0;   // weight of variance data (old, detects thin wires?)
 	    	double     weight_Y, //  =   1.0;     // weight of average intensity
-	    	double     weight_RBmG //  = 5.0;  // weight of average color difference (0.5*(R+B)-G), shoukld be ~5*weight_Y
+	    	double     weight_RBmG //  = 5.0;  // weight of average color difference (0.5*(R+B)-G), should be ~5*weight_Y
 			){
    	this.weight_var = weight_var;   // weight of variance data (old, detects thin wires?)
    	this.weight_Y = weight_Y;     // weight of average intensity
-    	this.weight_RBmG = weight_RBmG; //  = 5.0;  // weight of average color difference (0.5*(R+B)-G), shoukld be ~5*weight_Y
+    	this.weight_RBmG = weight_RBmG; //  = 5.0;  // weight of average color difference (0.5*(R+B)-G), should be ~5*weight_Y

 		this.tp = tp;
 		  final int pTilesX = tp.getTilesX();

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -1196,7 +1196,8 @@ __global__ void generate_RBGA(
 			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 			int               keep_weights,       // return channel weights after A in RGBA (was removed)
 			const size_t      texture_rbga_stride,     // in floats
-			float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+			float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+			float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
 {
 // TODO use atomic_add to increment	num_texture_tiles
 // TODO calculate woi
@@ -1329,7 +1330,9 @@ __global__ void generate_RBGA(
 						texture_rbga_stride,             // size_t      texture_rbg_stride, // in floats
 						gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
 			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
-						gpu_texture_tiles); // (float *) 0 );                   // float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+						gpu_texture_tiles, //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+						gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+
 				cudaDeviceSynchronize(); // not needed yet, just for testing
 				/* */
 			}
@@ -1788,8 +1791,8 @@ __global__ void textures_accumulate(
 		size_t            texture_rbg_stride, // in floats
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
-		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
-
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
 {
 	//						(float *) gpu_geometry_correction ->pXY0,
 //	float weights[3] = {weight0, weight1, weight2};
@@ -1997,8 +2000,8 @@ __global__ void textures_accumulate(
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
 			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
 			(float *) shr1.rgbaw,      // float * rgba,          // result
-			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
-			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float * ) ports_rgb,      // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) max_diff,       // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
 			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
 			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
 			diff_threshold,            // float   diff_threshold, // pixel value/pixel change
@@ -2013,8 +2016,8 @@ __global__ void textures_accumulate(
 			(float*) shr.mclt_debayer, // float * mclt_tile,     // debayer // has gaps to align with union !
 			(float*) mclt_tiles,       // float * rbg_tile,      // if not null - original (not-debayered) rbg tile to use for the output
 			(float *) shr1.rgbaw,      // float * rgba,          // result
-			(float * ) 0,              // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
-			(float * ) 0,              // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
+			(float * ) ports_rgb,      // float * ports_rgb,     // average values of R,G,B for each camera (R0,R1,...,B2,B3) // null
+			(float * ) max_diff,       // float * max_diff,      // maximal (weighted) deviation of each channel from the average /null
 			(float *) port_offsets,    // float * port_offsets,  // [port]{x_off, y_off} - just to scale pixel value differences
 			diff_sigma,                // float   diff_sigma,     // pixel value/pixel change
 			diff_threshold,            // float   diff_threshold, // pixel value/pixel change

--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -96,7 +96,9 @@ extern "C" __global__ void textures_accumulate(
 		size_t            texture_rbg_stride, // in floats
 		float           * gpu_texture_rbg,    // (number of colors +1 + ?)*16*16 rgba texture tiles
 		size_t            texture_stride,     // in floats (now 256*4 = 1024)
-		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
+		float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+

 extern "C"
 __global__ void imclt_rbg_all(