using geometry correction for port offsets, found mitigation for an nvrtc problem

8893c688 · Andrey Filippov · 4fb94627 · 8893c688 · 8893c688 · 8893c688
Commit 8893c688 authored Apr 16, 2020 by Andrey Filippov
4 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -192,7 +192,10 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_corr_indices =        new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
    private CUdeviceptr gpu_num_corr_tiles =      new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
    private CUdeviceptr gpu_texture_indices =     new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.POINTER
+
    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
+    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.POINTER
+
    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
    private CUdeviceptr gpu_num_texture_tiles =   new CUdeviceptr(); //  8 ints
    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.POINTER
@@ -429,7 +432,7 @@ public class GPUTileProcessor {
                		file = new File(src_dir.getPath(), src_file);
                		System.out.println("Loading resource "+file);
                	}
-                	System.out.println(file.getAbsolutePath());
+//                	System.out.println(file.getAbsolutePath());
                	String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
                	String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);
                	if (sourceFile == null) {
@@ -441,29 +444,6 @@ public class GPUTileProcessor {
            	}
            }
        }
-        /*
-        String kernelSource =     getTpDefines();
-        for (String src_file:GPU_KERNEL_FILES) {
-        	File file = null;
-        	if ((cuda_project_directory == null) || cuda_project_directory.isEmpty()) {
-        		file = new File(classLoader.getResource(GPU_RESOURCE_DIR+"/"+src_file).getFile());
-        		System.out.println("Loading resource "+file);
-        	} else {
-        		File src_dir = new File(cuda_project_directory, "src");
-        		file = new File(src_dir.getPath(), src_file);
-        		System.out.println("Loading resource "+file);
-        	}
-        	System.out.println(file.getAbsolutePath());
-        	String cuFileName = file.getAbsolutePath(); // /home/eyesis/workspace-python3/nvidia_dct8x8/src/dtt8x8.cuh";// "dtt8x8.cuh";
-        	String sourceFile = readFileAsString(cuFileName); // readResourceAsString(cuFileName);
-        	if (sourceFile == null) {
-        		String msg = "Could not read the kernel source code";
-        		IJ.showMessage("Error",	msg);
-        		new IllegalArgumentException (msg);
-        	}
-        	kernelSource += sourceFile;
-        }
-        */

        // Create the kernel functions (first - just test)
        String [] func_names = {
@@ -583,7 +563,10 @@ public class GPUTileProcessor {
    	int tilesYa = (tilesY + 3) & ~3;
 //    	cuMemAlloc(gpu_texture_indices,tilesX * tilesY * Sizeof.POINTER);
    	cuMemAlloc(gpu_texture_indices,tilesX * tilesYa * Sizeof.POINTER);
-    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.POINTER);
+
+    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_color_weights,             3 * Sizeof.FLOAT);
+

    	cuMemAlloc(gpu_woi,                               4 * Sizeof.FLOAT);
    	cuMemAlloc(gpu_num_texture_tiles,                 8 * Sizeof.FLOAT);
@@ -1237,7 +1220,6 @@ public class GPUTileProcessor {
    }

    public void execRBGA(
-    		double [][] port_offsets,
    		double [] color_weights,
    		boolean   is_lwir,
    		double    min_shot,           // 10.0
@@ -1251,18 +1233,14 @@ public class GPUTileProcessor {
    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
    		return;
    	}
-    	float [] fport_offsets = new float[port_offsets.length * 2];
-    	for (int cam = 0; cam < port_offsets.length; cam++) {
-    		fport_offsets[2*cam + 0] = (float) port_offsets[cam][0];
-    		fport_offsets[2*cam + 1] = (float) port_offsets[cam][1];
-    	}
-        cuMemcpyHtoD(gpu_port_offsets, Pointer.to(fport_offsets),  fport_offsets.length * Sizeof.FLOAT);
-
    	int num_colors = color_weights.length;
    	if (num_colors > 3) num_colors = 3;
-    	float weighht0 = (float) color_weights[0];
-    	float weighht1 = (num_colors >1)?((float) color_weights[1]):0.0f;
-    	float weighht2 = (num_colors >2)?((float) color_weights[2]):0.0f;
+    	float [] fcolor_weights = new float[3];
+    	fcolor_weights[0] = (float) color_weights[0];
+    	fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
+    	fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
+        cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);
+
    	int iis_lwir =      (is_lwir)? 1:0;
    	int idust_remove =  (dust_remove)? 1 : 0;

@@ -1282,7 +1260,8 @@ public class GPUTileProcessor {
    			Pointer.to(new int[] {IMG_HEIGHT / DTT_SIZE}),   // int                height); // <= TILESY, use for faster processing of LWIR images
    	    	// Parameters for the texture generation
    			Pointer.to(gpu_clt),                             // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
-	            Pointer.to(gpu_port_offsets),                    // float           * port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+        		Pointer.to(gpu_geometry_correction),             //	struct gc          * gpu_geometry_correction,
+//	            Pointer.to(gpu_port_offsets),                    // float           * port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 	            Pointer.to(new int[]   {num_colors}),            // int               colors,             // number of colors (3/1)
 	            Pointer.to(new int[]   {iis_lwir}),              // int               is_lwir,            // do not perform shot correction
 	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
@@ -1290,9 +1269,7 @@ public class GPUTileProcessor {
 	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
 	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
 	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-	            Pointer.to(new float[] {weighht0}),              // float             weight0,            // scale for R
-	            Pointer.to(new float[] {weighht1}),              // float             weight1,            // scale for B
-	            Pointer.to(new float[] {weighht2}),              // float             weight2,            // scale for G
+	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
 	            Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
 	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
 	            Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
@@ -1307,6 +1284,76 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

+    public void execTextures(
+//    		double [][] port_offsets,
+    		double [] color_weights,
+    		boolean   is_lwir,
+    		double    min_shot,           // 10.0
+    		double    scale_shot,         // 3.0
+    		double    diff_sigma,         // pixel value/pixel change
+    		double    diff_threshold,     // pixel value/pixel change
+    		double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    		boolean   dust_remove,
+    		boolean   keep_weights) {
+    	if (GPU_TEXTURES_kernel == null)
+    	{
+    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
+    		return;
+    	}
+//    	float [] fport_offsets = new float[port_offsets.length * 2];
+//    	for (int cam = 0; cam < port_offsets.length; cam++) {
+//    		fport_offsets[2*cam + 0] = (float) port_offsets[cam][0];
+//    		fport_offsets[2*cam + 1] = (float) port_offsets[cam][1];
+//    	}
+//        cuMemcpyHtoD(gpu_port_offsets, Pointer.to(fport_offsets),  fport_offsets.length * Sizeof.FLOAT);
+
+    	int num_colors = color_weights.length;
+    	if (num_colors > 3) num_colors = 3;
+    	float weighht0 = (float) color_weights[0];
+    	float weighht1 = (num_colors >1)?((float) color_weights[1]):0.0f;
+    	float weighht2 = (num_colors >2)?((float) color_weights[2]):0.0f;
+    	int iis_lwir =      (is_lwir)? 1:0;
+    	int idust_remove =  (dust_remove)? 1 : 0;
+    	int ikeep_weights = (keep_weights)? 1 : 0;
+
+		int [] GridFullWarps =    {(num_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1};
+    	int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
+
+    	Pointer kernelParameters = Pointer.to(
+//    			Pointer.to(new int[] {0}), // 0,          // int               border_tile,        // if 1 - watch for border
+    			Pointer.to(new int[] {0}), //  int             * woi, - not used
+    			Pointer.to(gpu_clt),
+    			Pointer.to(new int[] { num_texture_tiles }),
+    			Pointer.to(gpu_texture_indices),
+//    			Pointer.to(gpu_port_offsets),
+        		Pointer.to(gpu_geometry_correction),     //	struct gc          * gpu_geometry_correction,
+    			Pointer.to(new int[] { num_colors }),
+    			Pointer.to(new int[] { iis_lwir }),
+    			Pointer.to(new float[] {(float) min_shot }),
+    			Pointer.to(new float[] {(float) scale_shot }),
+    			Pointer.to(new float[] {(float) diff_sigma }),
+    			Pointer.to(new float[] {(float) diff_threshold }),
+    			Pointer.to(new float[] {(float) min_agree }),
+    			Pointer.to(new float[] {weighht0 }),
+    			Pointer.to(new float[] {weighht1 }),
+    			Pointer.to(new float[] {weighht2 }),
+    			Pointer.to(new int[] { idust_remove }),
+    			Pointer.to(new int[] { ikeep_weights }),
+    			Pointer.to(new int[] {0}),//  0, // const size_t      texture_rbg_stride, // in floats - DISABLE GENERATION!
+    			Pointer.to(new int[] {0}), // null, //  new Pointer(),  //Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
+    			Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
+    			Pointer.to(gpu_textures)
+    			);
+    	cuCtxSynchronize();
+    	// Call the kernel function
+    	cuLaunchKernel(GPU_TEXTURES_kernel,
+    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
+    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
+    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
+    			kernelParameters, null);   // Kernel- and extra parameters
+    	cuCtxSynchronize();
+    }
+
    public void execTextures(
    		double [][] port_offsets,
    		double [] color_weights,
@@ -1349,6 +1396,7 @@ public class GPUTileProcessor {
    			Pointer.to(new int[] { num_texture_tiles }),
    			Pointer.to(gpu_texture_indices),
    			Pointer.to(gpu_port_offsets),
+//        		Pointer.to(gpu_geometry_correction),     //	struct gc          * gpu_geometry_correction,
    			Pointer.to(new int[] { num_colors }),
    			Pointer.to(new int[] { iis_lwir }),
    			Pointer.to(new float[] {(float) min_shot }),

--- a/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
+++ b/src/main/java/com/elphel/imagej/tileprocessor/TwoQuadCLT.java
@@ -2139,7 +2139,7 @@ public class TwoQuadCLT {
 // run textures
 		long startTextures = System.nanoTime();   // System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execTextures(
-				port_offsets,                  // double [][] port_offsets,
+/** remove when done **/				port_offsets,                  // double [][] port_offsets,
 				col_weights,                   // double [] color_weights,
 				quadCLT_main.isLwir(),         // boolean   is_lwir,
 				clt_parameters.min_shot,       // double    min_shot,           // 10.0
@@ -2154,7 +2154,6 @@ public class TwoQuadCLT {
 // run texturesRBGA
 		long startTexturesRBGA = System.nanoTime();   // System.nanoTime();
 		for (int i = 0; i < NREPEAT; i++ ) gPUTileProcessor.execRBGA(
-				port_offsets,                  // double [][] port_offsets,
 				col_weights,                   // double [] color_weights,
 				quadCLT_main.isLwir(),         // boolean   is_lwir,
 				clt_parameters.min_shot,       // double    min_shot,           // 10.0

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -1182,7 +1182,9 @@ __global__ void generate_RBGA(
 // Parameters for the texture generation
 			float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 			// TODO: use geometry_correction rXY !
-			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+			struct gc       * gpu_geometry_correction,
+//			float           * gpu_geometry_correction,
+//			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 			int               colors,             // number of colors (3/1)
 			int               is_lwir,            // do not perform shot correction
 			float             min_shot,           // 10.0
@@ -1190,9 +1192,7 @@ __global__ void generate_RBGA(
 			float             diff_sigma,         // pixel value/pixel change
 			float             diff_threshold,     // pixel value/pixel change
 			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-			float             weight0,            // scale for R
-			float             weight1,            // scale for B
-			float             weight2,            // scale for G
+			float             weights[3],         // scale for R,B,G
 			int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 			int               keep_weights,       // return channel weights after A in RGBA (was removed)
 			const size_t      texture_rbga_stride,     // in floats
@@ -1200,7 +1200,8 @@ __global__ void generate_RBGA(
 {
 // TODO use atomic_add to increment	num_texture_tiles
 // TODO calculate woi
-    dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
+
+	dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
    dim3 blocks0 (blocks_x, height, 1);

@@ -1311,7 +1312,8 @@ __global__ void generate_RBGA(
 						gpu_clt,                         // float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 						ntt,                             // size_t            num_texture_tiles,  // number of texture tiles to process
 						gpu_texture_indices + ti_offset, // int             * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
-						gpu_port_offsets,                // float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+//						gpu_port_offsets,                // float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
+						(float *) gpu_geometry_correction ->pXY0,
 						colors,                          // int               colors,             // number of colors (3/1)
 						is_lwir,                         // int               is_lwir,            // do not perform shot correction
 						min_shot,                        // float             min_shot,           // 10.0
@@ -1319,9 +1321,9 @@ __global__ void generate_RBGA(
 						diff_sigma,                      // float             diff_sigma,         // pixel value/pixel change
 						diff_threshold,                  // float             diff_threshold,     // pixel value/pixel change
 						min_agree,                       // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-						weight0,                         // float             weight0,            // scale for R
-						weight1,                         // float             weight1,            // scale for B
-						weight2,                         // float             weight2,            // scale for G
+						weights[0],                      // float             weight0,            // scale for R
+						weights[1],                      // float             weight1,            // scale for B
+						weights[2],                      // float             weight2,            // scale for G
 						dust_remove,                     // int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 			    		0,                               // int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
 			    // combining both non-overlap and overlap (each calculated if pointer is not null )

--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -121,7 +121,7 @@ extern "C" __global__ void imclt_rbg(
 		int               woi_twidth,
 		int               woi_theight,
 		const size_t      dstride);            // in floats (pixels)
-
+/*
 extern "C" __global__ void generate_RBGA(
 		// Parameters to generate texture tasks
 		struct tp_task   * gpu_tasks,
@@ -135,6 +135,7 @@ extern "C" __global__ void generate_RBGA(
 		// Parameters for the texture generation
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
+		struct gc       * gpu_geometry_correction,
 		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
@@ -150,4 +151,4 @@ extern "C" __global__ void generate_RBGA(
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-
+*/