working on GPU code

f04bcc82 · Andrey Filippov · cd9b6096 · f04bcc82 · f04bcc82 · f04bcc82
Commit f04bcc82 authored Sep 20, 2018 by Andrey Filippov
5 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,5 @@
 NC393I
 attic
 *.log
-FOCUS-PSF*
\ No newline at end of file
+FOCUS-PSF*
+src/main/resources/trained_model
\ No newline at end of file
--- a/src/main/java/GPUTileProcessor.java
+++ b/src/main/java/GPUTileProcessor.java
@@ -117,7 +117,7 @@ public class GPUTileProcessor {
        copyH2D.WidthInBytes =    width_in_bytes;
        copyH2D.Height =          height; // /4;

-// for copying results back to host
+// for copying results to host
        CUDA_MEMCPY2D copyD2H =   new CUDA_MEMCPY2D();
        copyD2H.srcMemoryType =   CUmemorytype.CU_MEMORYTYPE_DEVICE;
        copyD2H.srcDevice =       dst_dpointer; // ((test & 1) ==0) ? src_dpointer : dst_dpointer; // copy same data
@@ -130,8 +130,7 @@ public class GPUTileProcessor {
        copyD2H.WidthInBytes =    width_in_bytes;
        copyD2H.Height =          height; // /2;

-        // Set up the kernel parameters: A pointer to an array
-        // of pointers which point to the actual values.
+        // kernel parameters: pointer to pointers
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dst_dpointer),
            Pointer.to(src_dpointer),
@@ -152,16 +151,16 @@ public class GPUTileProcessor {
    			0, null,                 // Shared memory size and stream (shared - only dynamic, static is in code)
    			kernelParameters, null);   // Kernel- and extra parameters

-        // Copy the data from the device back to the host
+        // Copy the data from the device to the host
        cuMemcpy2D(copyD2H);
        // clean up
        cuMemFree(src_dpointer);
        cuMemFree(dst_dpointer);
    }

-    public int setup() throws IOException // String arg, ImagePlus imagePlus)
+    public int setup() throws IOException
    {
-
+    	// From code by Marco Hutter - http://www.jcuda.org
        // Enable exceptions and omit all subsequent error checks
        JCudaDriver.setExceptionsEnabled(true);
        JNvrtc.setExceptionsEnabled(true);
@@ -174,7 +173,9 @@ public class GPUTileProcessor {
        cuCtxCreate(context, 0, device);

        // Obtain the CUDA source code from the CUDA file
-
+        // Get the absolute path to the file in the resource folder, then read it as a normal file.
+        // When using just Eclipse resources, Eclipse does not notice that the file
+        // was edited (happens frequently during kernel development).
        ClassLoader classLoader = getClass().getClassLoader();
        File file = new File(classLoader.getResource(GPU_KERNEL_FILE).getFile());
        System.out.println(file.getAbsolutePath());
@@ -196,9 +197,7 @@ public class GPUTileProcessor {
    }

    /**
-     * Create the CUDA function object for the kernel function with the
-     * given name that is contained in the given source code
-     *
+     * Create the kernel function by its name in the source code
     * @param sourceCode The source code
     * @param kernelName The kernel function name
     * @return

--- a/src/main/java/ImageDtt.java
+++ b/src/main/java/ImageDtt.java
--- a/src/main/java/TwoQuadCLT.java
+++ b/src/main/java/TwoQuadCLT.java
@@ -20,6 +20,7 @@
 ** -----------------------------------------------------------------------------**
 **
 */
+import java.io.DataOutputStream;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
@@ -907,7 +908,80 @@ public class TwoQuadCLT {
 		return results;
 	}

+	public void saveFloatKernels(String file_prefix, // dumps kernel/image/port data as raw floats; every output path is file_prefix+"_chn"+<index>+<suffix>
+			                     double [][][][][][] clt_kernels, // indexed [chn][col][ty][tx][p][]: p=0..3 go to ".kernel" (64 values each), p=4 goes to ".kernel_offsets"; null - skip
+			                     double [][][]       image_data, // indexed [chn][component][i]: components 0..2 are summed into ".bayer"; null - skip
+			                     double [][][]       port_xy, // indexed [i][chn][0 or 1], written as x,y pairs to ".portsxy"; null - skip
+			                     boolean transpose) throws IOException { // transpose: reorders each 64-value kernel array (see index math below) and adds "_transposed" to both kernel file names
+		if (clt_kernels != null) {
+			for (int chn = 0; chn < clt_kernels.length; chn++) {
+				String kern_path = file_prefix+"_chn"+chn+(transpose?"_transposed":"")+".kernel";
+				String offs_path = file_prefix+"_chn"+chn+(transpose?"_transposed":"")+".kernel_offsets";
+				FileOutputStream fos = new FileOutputStream(kern_path); // NOTE(review): never closed if a write below throws - consider try-with-resources
+				DataOutputStream dos = new DataOutputStream(fos);
+				for (int ty = 0; ty <  clt_kernels[chn][0].length; ty++) { // ty/tx ranges are taken from the col == 0 entry
+					for (int tx = 0; tx <  clt_kernels[chn][0][ty].length; tx++) {
+						for (int col = 0; col <  clt_kernels[chn].length; col++) {
+							for (int p = 0; p < 4; p++) { // only arrays 0..3 here; array 4 is written to the offsets file below
+								double [] pa = clt_kernels[chn][col][ty][tx][p];
+								for (int i0 = 0; i0 < 64; i0++) { // exactly 64 values are read from each array
+									int i;
+									if (transpose) {
+										i = ((i0 & 7) << 3) + ((i0 >>3) & 7); // swaps the two 3-bit fields of the index (row <-> column if the array is a row-major 8x8 block)
+									} else {
+										i = i0;
+									}
+									dos.writeFloat((float)pa[i]); // double narrowed to float; DataOutputStream writes it high byte first
+								}
+							}
+						}
+					}
+				}
+				dos.close();
+				fos = new FileOutputStream(offs_path); // second file for the same channel, same ty/tx/col order
+				dos = new DataOutputStream(fos);
+

+				for (int ty = 0; ty <  clt_kernels[chn][0].length; ty++) {
+					for (int tx = 0; tx <  clt_kernels[chn][0][ty].length; tx++) {
+						for (int col = 0; col <  clt_kernels[chn].length; col++) {
+							double [] pa = clt_kernels[chn][col][ty][tx][4]; // written in full (pa.length values), never reordered even when transpose is set
+							for (int i = 0; i < pa.length; i++) {
+								dos.writeFloat((float)pa[i]);
+							}
+						}
+					}
+				}

+				dos.close();
+			}
+		}
+
+		if (image_data != null) {
+			for (int chn = 0; chn < image_data.length; chn++) {
+				String img_path =  file_prefix+"_chn"+chn+".bayer"; // no "_transposed" suffix: transpose does not affect this output
+				FileOutputStream fos = new FileOutputStream(img_path);
+				DataOutputStream dos = new DataOutputStream(fos);
+				for (int i = 0; i <  image_data[chn][0].length; i++) { // length of component 0 is used for all three components
+					dos.writeFloat((float) (image_data[chn][0][i] + image_data[chn][1][i] + image_data[chn][2][i])); // one float per index: sum of components 0, 1 and 2
+				}
+				dos.close();
+			}
+		}
+		if (port_xy != null) {
+			for (int chn = 0; chn < port_xy[0].length; chn++) { // channel count comes from entry 0 - NOTE(review): assumes port_xy[0] and every port_xy[i][chn] are non-null; confirm with caller
+				String img_path =  file_prefix+"_chn"+chn+".portsxy"; // one file per channel
+				FileOutputStream fos = new FileOutputStream(img_path);
+				DataOutputStream dos = new DataOutputStream(fos);
+				for (int i = 0; i <  port_xy.length; i++) { // one x,y pair per first-level entry of port_xy
+					dos.writeFloat((float) (port_xy[i][chn][0])); // x-offset
+					dos.writeFloat((float) (port_xy[i][chn][1])); // y-offset
+				}
+				dos.close();
+			}
+
+		}
+
+	}

 	public ImagePlus [] processCLTQuadCorrPairGpu(
 			GPUTileProcessor                               gPUTileProcessor,
@@ -1003,6 +1077,8 @@ public class TwoQuadCLT {
 		double [][][][][][] clt_kernels_main = quadCLT_main.getCLTKernels(); // [4][3][123][164]{[64],[64],[64],[64],[8]}
 		double [][][][][][] clt_kernels_aux =  quadCLT_aux.getCLTKernels();

+
+		//[4][3][123][164][5][]
 		double [][] dbg_kern = clt_kernels_main[0][0][0][0];
 		// here all data is ready (images, kernels) to try GPU code

@@ -1033,13 +1109,12 @@ public class TwoQuadCLT {
 				"converted",
 				dbg_titles);

-		if (debugLevel < 1000) {
-			return null;
-		}

+		double [][][]       port_xy_main_dbg = new double [tilesX*tilesY][][];
+		double [][][]       port_xy_aux_dbg = new double [tilesX*tilesY][][];

 		final double [][][][][][][] clt_bidata = // new double[2][quad][nChn][tilesY][tilesX][][]; // first index - main/aux
-				image_dtt.clt_bi_quad (
+				image_dtt.clt_bi_quad_dbg (
 						clt_parameters,                       // final EyesisCorrectionParameters.CLTParameters       clt_parameters,
 						clt_parameters.fat_zero,              // final double              fatzero,         // May use correlation fat zero from 2 different parameters - fat_zero and rig.ml_fatzero
 						notch_mode,                           //  final boolean             notch_mode,      // use notch filter for inter-camera correlation to detect poles
@@ -1070,8 +1145,43 @@ public class TwoQuadCLT {
 //						woi_tops,                             // final int [][]            woi_tops,
 						ers_delay,                            // final double [][][]       ers_delay,        // if not null - fill with tile center acquisition delay
 						threadsMax,                           // final int                 threadsMax,  // maximal number of threads to launch
-						debugLevel);                          // final int                 globalDebugLevel);
+						debugLevel,                           // final int                 globalDebugLevel);
+						port_xy_main_dbg,                     // final double [][][]       port_xy_main_dbg, // for each tile/port save x,y pixel coordinates (gpu code development)
+						port_xy_aux_dbg);                     // final double [][][]       port_xy_aux_dbg) // for each tile/port save x,y pixel coordinates (gpu code development)
+
+		String kernel_dir = "/home/eyesis/workspace-python3/nvidia_dct8x8/clt/";
+		boolean [][] what_to_save = {{false,false,true}, {false,false,true}};
+		try {
+			saveFloatKernels(
+					kernel_dir +"main", // String file_prefix,
+					(what_to_save[0][0]?clt_kernels_main:null), // double [][][][][][] clt_kernels, // null
+					(what_to_save[0][1]?quadCLT_main.image_data:null),
+					(what_to_save[0][2]?port_xy_main_dbg:null), // double [][][]       port_xy,
+					true);
+		} catch (IOException e) {
+			System.out.println("Failed to save flattened kernels tp "+kernel_dir);
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		} // boolean transpose);

+		try {
+			saveFloatKernels(
+					kernel_dir +"aux", // String file_prefix,
+					(what_to_save[1][0]?clt_kernels_aux:null), // double [][][][][][] clt_kernels, // null
+					(what_to_save[1][1]?quadCLT_aux.image_data:null),
+					(what_to_save[1][2]?port_xy_aux_dbg:null), // double [][][]       port_xy,
+			        true);
+		} catch (IOException e) {
+			System.out.println("Failed to save flattened kernels tp "+kernel_dir);
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		} // boolean transpose);
+
+
+
+		if (debugLevel < 1000) {
+			return null;
+		}

 		if (ers_delay !=null) {
 			showERSDelay(ers_delay);

--- a/src/main/resources/dtt8x8.cuh
+++ b/src/main/resources/dtt8x8.cuh
@@ -43,6 +43,7 @@
 #define DTTTEST_BLK_STRIDE     (DTTTEST_BLOCK_WIDTH+1)
 #define DTT_SIZE                      8

+//#define CUDART_INF_F            __int_as_float(0x7f800000)
 /*
 Python code to generate constant coefficients:
 def dct_constants():