consolidated 5 float inputs to an array of 5 floats to solve CUDA_ERROR_INVALID_PTX

0182bb3c · Andrey Filippov · 02f6e62d · 0182bb3c · 0182bb3c · 0182bb3c
Commit 0182bb3c authored Apr 20, 2020 by Andrey Filippov
3 changed files
--- a/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
+++ b/src/main/java/com/elphel/imagej/gpu/GPUTileProcessor.java
@@ -198,8 +198,8 @@ public class GPUTileProcessor {
    private CUdeviceptr gpu_texture_indices_len = new CUdeviceptr(); //  allocate tilesX * tilesY * 6 * Sizeof.FLOAT
    private CUdeviceptr gpu_diff_rgb_combo =      new CUdeviceptr(); //  1 int

-//    private CUdeviceptr gpu_port_offsets =        new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT
-    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate Quad * 2 * Sizeof.FLOAT
+    private CUdeviceptr gpu_color_weights =       new CUdeviceptr(); //  allocate 3 * Sizeof.FLOAT
+    private CUdeviceptr gpu_generate_RBGA_params =new CUdeviceptr(); //  allocate 5 * Sizeof.FLOAT

    private CUdeviceptr gpu_woi =                 new CUdeviceptr(); //  4 integers (x, y, width, height) Rectangle - in tiles
    private CUdeviceptr gpu_textures_rgba =       new CUdeviceptr(); //  allocate tilesX * tilesY * ? * 256 * Sizeof.FLOAT
@@ -570,8 +570,8 @@ public class GPUTileProcessor {

    	cuMemAlloc(gpu_diff_rgb_combo, tilesX * tilesYa * NUM_CAMS* (NUM_COLORS + 1) *  Sizeof.FLOAT);

-//    	cuMemAlloc(gpu_port_offsets,   NUM_CAMS * 2 * Sizeof.FLOAT);
    	cuMemAlloc(gpu_color_weights,             3 * Sizeof.FLOAT);
+    	cuMemAlloc(gpu_generate_RBGA_params,      5 * Sizeof.FLOAT);


    	cuMemAlloc(gpu_woi,                               4 * Sizeof.FLOAT);
@@ -1268,6 +1268,15 @@ public class GPUTileProcessor {
    	fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
        cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);

+        float [] generate_RBGA_params = {
+    			(float)             min_shot,           // 10.0
+    			(float)             scale_shot,         // 3.0
+    			(float)             diff_sigma,         // pixel value/pixel change
+    			(float)             diff_threshold,     // pixel value/pixel change
+    			(float)             min_agree          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    	};
+        cuMemcpyHtoD(gpu_generate_RBGA_params, Pointer.to(generate_RBGA_params),  generate_RBGA_params.length * Sizeof.FLOAT);
+
    	int iis_lwir =      (is_lwir)? 1:0;
    	int idust_remove =  (dust_remove)? 1 : 0;

@@ -1290,18 +1299,17 @@ public class GPUTileProcessor {
        		Pointer.to(gpu_geometry_correction),             //	struct gc          * gpu_geometry_correction,
 	            Pointer.to(new int[]   {num_colors}),            // int               colors,             // number of colors (3/1)
 	            Pointer.to(new int[]   {iis_lwir}),              // int               is_lwir,            // do not perform shot correction
-	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
-	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
-	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
-	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
-	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+	            Pointer.to(gpu_generate_RBGA_params),            // float             generate_RBGA_params[5],
+//	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
+//	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
+//	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
+//	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
+//	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
 	            Pointer.to(new int[]   { idust_remove }),        // int               dust_remove,        // Do not reduce average weight when only one image differes much from the average
 	            Pointer.to(new int[]   {0}),                     // int               keep_weights,       // return channel weights after A in RGBA
 	            Pointer.to(new int[]   { texture_stride_rgba }), // const size_t      texture_rbga_stride,     // in floats
-	            Pointer.to(gpu_textures_rgba),                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles
-//	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
-	            Pointer.to(new int[]   {0})); // gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+	            Pointer.to(gpu_textures_rgba));                   // float           * gpu_texture_tiles)    // (number of colors +1 + ?)*16*16 rgba texture tiles

    	cuCtxSynchronize();
    	// Call the kernel function
@@ -1313,68 +1321,6 @@ public class GPUTileProcessor {
    	cuCtxSynchronize();
    }

-    public void execTextures_old( // old
-    		double [] color_weights,
-    		boolean   is_lwir,
-    		double    min_shot,           // 10.0
-    		double    scale_shot,         // 3.0
-    		double    diff_sigma,         // pixel value/pixel change
-    		double    diff_threshold,     // pixel value/pixel change
-    		double    min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-    		boolean   dust_remove,
-    		boolean   keep_weights) {
-    	if (GPU_TEXTURES_kernel == null)
-    	{
-    		IJ.showMessage("Error", "No GPU kernel: GPU_TEXTURES_kernel");
-    		return;
-    	}
-
-    	int num_colors = color_weights.length;
-    	if (num_colors > 3) num_colors = 3;
-    	float [] fcolor_weights = new float[3];
-    	fcolor_weights[0] = (float) color_weights[0];
-    	fcolor_weights[1] = (num_colors >1)?((float) color_weights[1]):0.0f;
-    	fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
-        cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);
-
-    	int iis_lwir =      (is_lwir)? 1:0;
-    	int idust_remove =  (dust_remove)? 1 : 0;
-    	int ikeep_weights = (keep_weights)? 1 : 0;
-
-		int [] GridFullWarps =    {(num_texture_tiles + TEXTURE_TILES_PER_BLOCK-1) / TEXTURE_TILES_PER_BLOCK,1,1};
-    	int [] ThreadsFullWarps = {TEXTURE_THREADS_PER_TILE, NUM_CAMS, 1};
-
-    	Pointer kernelParameters = Pointer.to(
-    			Pointer.to(new int[] {0}), //  int             * woi, - not used
-    			Pointer.to(gpu_clt),
-    			Pointer.to(new int[] { num_texture_tiles }),
-    			Pointer.to(gpu_texture_indices),
-        		Pointer.to(gpu_geometry_correction),     //	struct gc          * gpu_geometry_correction,
-    			Pointer.to(new int[] { num_colors }),
-    			Pointer.to(new int[] { iis_lwir }),
-    			Pointer.to(new float[] {(float) min_shot }),
-    			Pointer.to(new float[] {(float) scale_shot }),
-    			Pointer.to(new float[] {(float) diff_sigma }),
-    			Pointer.to(new float[] {(float) diff_threshold }),
-    			Pointer.to(new float[] {(float) min_agree }),
-	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
-    			Pointer.to(new int[] { idust_remove }),
-    			Pointer.to(new int[] { ikeep_weights }),
-    			Pointer.to(new int[] {0}),//  0, // const size_t      texture_rbg_stride, // in floats - DISABLE GENERATION!
-    			Pointer.to(new int[] {0}), // null, //  new Pointer(),  //Pointer.to(gpu_textures), // new Pointer(),  // Pointer.to(gpu_textures),
-    			Pointer.to(new int[] { texture_stride }), // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
-    			Pointer.to(gpu_textures),
-	            Pointer.to(gpu_diff_rgb_combo));                 // float           * gpu_diff_rgb_combo); // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
-    	cuCtxSynchronize();
-    	// Call the kernel function
-    	cuLaunchKernel(GPU_TEXTURES_kernel,
-    			GridFullWarps[0],    GridFullWarps[1],   GridFullWarps[2],   // Grid dimension
-    			ThreadsFullWarps[0], ThreadsFullWarps[1],ThreadsFullWarps[2],// Block dimension
-    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
-    			kernelParameters, null);   // Kernel- and extra parameters
-    	cuCtxSynchronize();
-    }
-
     public void execTextures(
    		double [] color_weights,
    		boolean   is_lwir,
@@ -1398,6 +1344,15 @@ public class GPUTileProcessor {
    	fcolor_weights[2] = (num_colors >2)?((float) color_weights[2]):0.0f;
        cuMemcpyHtoD(gpu_color_weights, Pointer.to(fcolor_weights),  fcolor_weights.length * Sizeof.FLOAT);

+        float [] generate_RBGA_params = {
+    			(float)             min_shot,           // 10.0
+    			(float)             scale_shot,         // 3.0
+    			(float)             diff_sigma,         // pixel value/pixel change
+    			(float)             diff_threshold,     // pixel value/pixel change
+    			(float)             min_agree           // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+    	};
+        cuMemcpyHtoD(gpu_generate_RBGA_params, Pointer.to(generate_RBGA_params),  generate_RBGA_params.length * Sizeof.FLOAT);
+
    	int iis_lwir =      (is_lwir)? 1:0;
    	int idust_remove =  (dust_remove)? 1 : 0;

@@ -1415,11 +1370,12 @@ public class GPUTileProcessor {
        		Pointer.to(gpu_geometry_correction),             //	struct gc          * gpu_geometry_correction,
    			Pointer.to(new int[] { num_colors }),
    			Pointer.to(new int[] { iis_lwir }),
-    			Pointer.to(new float[] {(float) min_shot }),
-    			Pointer.to(new float[] {(float) scale_shot }),
-    			Pointer.to(new float[] {(float) diff_sigma }),
-    			Pointer.to(new float[] {(float) diff_threshold }),
-    			Pointer.to(new float[] {(float) min_agree }),
+	            Pointer.to(gpu_generate_RBGA_params),            // float             generate_RBGA_params[5],
+//	            Pointer.to(new float[] {(float) min_shot}),      // float             min_shot,           // 10.0
+//	            Pointer.to(new float[] {(float) scale_shot}),    // float             scale_shot,         // 3.0
+//	            Pointer.to(new float[] {(float) diff_sigma}),    // float             diff_sigma,         // pixel value/pixel change
+//	            Pointer.to(new float[] {(float) diff_threshold}),// float             diff_threshold,     // pixel value/pixel change
+//	            Pointer.to(new float[] {(float) min_agree}),     // float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 	            Pointer.to(gpu_color_weights),                   // float             weights[3],         // scale for R,B,G
    			Pointer.to(new int[] { idust_remove }),
 //    			Pointer.to(new int[] { 0}), // texture_stride }),        // can be a null pointer - will not be used! float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles

--- a/src/main/resources/kernels/TileProcessor.cuh
+++ b/src/main/resources/kernels/TileProcessor.cuh
@@ -1217,41 +1217,48 @@ extern "C" __global__ void correlate2D_inner(
 }
 #define USE_CDP
 #ifdef USE_CDP
-extern "C"
-__global__ void generate_RBGA(
-// Parameters to generate texture tasks
+extern "C" __global__ void generate_RBGA(
+		// Parameters to generate texture tasks
 		struct tp_task   * gpu_tasks,
 		int                num_tiles,          // number of tiles in task list
-// declare arrays in device code?
+		// declare arrays in device code?
 		int              * gpu_texture_indices,// packed tile + bits (now only (1 << 7)
 		int              * num_texture_tiles,  // number of texture tiles to process  (8 separate elements for accumulation)
 		int              * woi,                // x,y,width,height of the woi
 		int                width,  // <= TILESX, use for faster processing of LWIR images (should be actual + 1)
 		int                height, // <= TILESY, use for faster processing of LWIR images
-// Parameters for the texture generation
+		// Parameters for the texture generation
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
-//			float           * gpu_geometry_correction,
-//			float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
+		float             params[5],          // mitigating CUDA_ERROR_INVALID_PTX
+		/*
 			float             min_shot,           // 10.0
 			float             scale_shot,         // 3.0
 			float             diff_sigma,         // pixel value/pixel change
 			float             diff_threshold,     // pixel value/pixel change
 			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		 */
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
-			float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
-			float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+		float           * gpu_texture_tiles)  // (number of colors +1 + ?)*16*16 rgba texture tiles
+//			float             aaaa)
+//			float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
 {
+	float             min_shot = params[0];           // 10.0
+	float             scale_shot = params[1];         // 3.0
+	float             diff_sigma = params[2];         // pixel value/pixel change
+	float             diff_threshold = params[3];     // pixel value/pixel change
+	float             min_agree = params[4];          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+
+
 // TODO use atomic_add to increment	num_texture_tiles
 // TODO calculate woi
-
-	dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
+dim3 threads0((1 << THREADS_DYNAMIC_BITS), 1, 1);
    int blocks_x = (width + ((1 << THREADS_DYNAMIC_BITS) - 1)) >> THREADS_DYNAMIC_BITS;
    dim3 blocks0 (blocks_x, height, 1);

@@ -1380,8 +1387,7 @@ __global__ void generate_RBGA(
 						gpu_texture_tiles,               // float           * gpu_texture_rbg,     // (number of colors +1 + ?)*16*16 rgba texture tiles
 			    		0,                               // size_t      texture_stride,     // in floats (now 256*4 = 1024)
 						gpu_texture_tiles, //(float *)0);// float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-						gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
-//						gpu_diff_rgb_combo + ti_offset * NUM_CAMS*(colors+1)); // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
+						(float *)0);//gpu_diff_rgb_combo);             // float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]

 				cudaDeviceSynchronize(); // not needed yet, just for testing
 				/* */
@@ -1848,11 +1854,12 @@ extern "C" __global__ void textures_nonoverlap(
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
-		float             min_shot,           // 10.0
-		float             scale_shot,         // 3.0
-		float             diff_sigma,         // pixel value/pixel change
-		float             diff_threshold,     // pixel value/pixel change
-		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             params[5],
+//		float             min_shot,           // 10.0
+//		float             scale_shot,         // 3.0
+//		float             diff_sigma,         // pixel value/pixel change
+//		float             diff_threshold,     // pixel value/pixel change
+//		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 //		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
@@ -1861,6 +1868,12 @@ extern "C" __global__ void textures_nonoverlap(
 		float           * gpu_texture_tiles,  // (number of colors +1 + ?)*16*16 rgba texture tiles
 		float           * gpu_diff_rgb_combo) // diff[NUM_CAMS], R[NUM_CAMS], B[NUM_CAMS],G[NUM_CAMS]
 {
+	float             min_shot = params[0];           // 10.0
+	float             scale_shot = params[1];         // 3.0
+	float             diff_sigma = params[2];         // pixel value/pixel change
+	float             diff_threshold = params[3];     // pixel value/pixel change
+	float             min_agree = params[4];          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+
 	 dim3 threads0(CONVERT_DIRECT_INDEXING_THREADS, 1, 1);
 	 dim3 blocks0 ((num_tiles + CONVERT_DIRECT_INDEXING_THREADS -1) >> CONVERT_DIRECT_INDEXING_THREADS_LOG2,1, 1);

@@ -1902,8 +1915,7 @@ extern "C" __global__ void textures_nonoverlap(


 //#undef USE_textures_gen
-extern "C"
-__global__ void textures_accumulate( // (8,4,1) (N,1,1)
+extern "C" __global__ void textures_accumulate( // (8,4,1) (N,1,1)
 		int             * woi,                // x, y, width,height
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		size_t            num_texture_tiles,  // number of texture tiles to process
@@ -3952,7 +3964,6 @@ __device__ void tile_combine_rgba(
 			}
 			max_diff_shared[cam] = sqrtf(mx);
 		}
-		__syncthreads(); //?
 #ifdef DEBUG22
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\n 1. max_diff\n");
@@ -4022,7 +4033,6 @@ __device__ void tile_combine_rgba(
 				ports_rgb_shared[ncol][cam] /= DTT_SIZE2*DTT_SIZE2; // correct for window?
 			}
 		}
-		__syncthreads(); //?
 #ifdef DEBUG22
 		if (debug  && (threadIdx.x == 0) && (threadIdx.y == 0)){
 			printf("\n 2. max_diff\n");

--- a/src/main/resources/kernels/TileProcessor.h
+++ b/src/main/resources/kernels/TileProcessor.h
@@ -86,11 +86,12 @@ extern "C" __global__ void textures_nonoverlap(
 		struct gc       * gpu_geometry_correction,
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
-		float             min_shot,           // 10.0
-		float             scale_shot,         // 3.0
-		float             diff_sigma,         // pixel value/pixel change
-		float             diff_threshold,     // pixel value/pixel change
-		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
+		float             params[5],
+//		float             min_shot,           // 10.0
+//		float             scale_shot,         // 3.0
+//		float             diff_sigma,         // pixel value/pixel change
+//		float             diff_threshold,     // pixel value/pixel change
+//		float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
 		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 //		int               keep_weights,       // return channel weights after A in RGBA (was removed) (should be 0 if gpu_texture_rbg)?
@@ -120,7 +121,7 @@ extern "C" __global__ void imclt_rbg(
 		int               woi_twidth,
 		int               woi_theight,
 		const size_t      dstride);            // in floats (pixels)
-/*
+
 extern "C" __global__ void generate_RBGA(
 		// Parameters to generate texture tasks
 		struct tp_task   * gpu_tasks,
@@ -135,19 +136,19 @@ extern "C" __global__ void generate_RBGA(
 		float          ** gpu_clt,            // [NUM_CAMS] ->[TILESY][TILESX][NUM_COLORS][DTT_SIZE*DTT_SIZE]
 		// TODO: use geometry_correction rXY !
 		struct gc       * gpu_geometry_correction,
-		float           * gpu_port_offsets,       // relative ports x,y offsets - just to scale differences, may be approximate
 		int               colors,             // number of colors (3/1)
 		int               is_lwir,            // do not perform shot correction
+		float             params[5],          // mitigating CUDA_ERROR_INVALID_PTX
+		/*
 			float             min_shot,           // 10.0
 			float             scale_shot,         // 3.0
 			float             diff_sigma,         // pixel value/pixel change
 			float             diff_threshold,     // pixel value/pixel change
 			float             min_agree,          // minimal number of channels to agree on a point (real number to work with fuzzy averages)
-		float             weight0,            // scale for R
-		float             weight1,            // scale for B
-		float             weight2,            // scale for G
+		 */
+		float             weights[3],         // scale for R,B,G
 		int               dust_remove,        // Do not reduce average weight when only one image differs much from the average
 		int               keep_weights,       // return channel weights after A in RGBA (was removed)
 		const size_t      texture_rbga_stride,     // in floats
 		float           * gpu_texture_tiles);  // (number of colors +1 + ?)*16*16 rgba texture tiles
-*/
+