Commit 963e5a90 authored by Oleg Dzhimiev

kind of works, needs more testing

parent 0f343bbb
@@ -40,7 +40,8 @@ import jcuda.runtime.JCuda;
import jcuda.runtime.cudaError;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
@@ -86,10 +87,15 @@ public class tfhello{
int cuSize = size*Sizeof.INT;
int[] px_in = new int[size];
float[] px_in_float = new float[size];
byte[] px_in_byte = new byte[size];
int[] px_out = new int[size];
for(int i = 0; i < px_in.length; i++) {
px_in[i] = i+1;
px_in_float[i] = i+1;
//px_in_byte[i] = (byte)(i+1); // int has no byteValue(); cast instead
}
JCudaDriver.setExceptionsEnabled(true);
@@ -167,19 +173,31 @@ public class tfhello{
.build();
Operation x = g.opBuilder("Placeholder", "array_tensor_in")
.setAttr("dtype",DataType.INT32)
//.setAttr("dtype",DataType.INT32)
.setAttr("dtype",DataType.FLOAT)
//.setAttr("dtype",DataType.UINT8)
//.setAttr("dtype",DataType.INT64)
.build();
Operation z = g.opBuilder("Identity", "array_tensor_out")
.addInput(x.output(0))
.build();
Tensor t = Tensor.create(px_in);
System.out.println("Is CUDA tensor? "+String.valueOf(t.elphel_isCUDATensor()));
System.out.println(t.elphelTestCUDAPointer());
// uint8
//Tensor t = Tensor.create(px_in_byte);
// int
//Tensor t = Tensor.create(px_in);
// float
long handle1;
//session.makeCallable(handle1);
ByteBuffer bbuf = ByteBuffer.allocateDirect(px_in_float.length * Float.BYTES); //4 bytes per float
bbuf.order(ByteOrder.nativeOrder());
FloatBuffer fbuf = bbuf.asFloatBuffer();
fbuf.put(px_in_float);
fbuf.position(0);
Tensor<Float> t = Tensor.create(new long[]{px_in_float.length},fbuf);
//System.out.println("Is CUDA tensor? "+String.valueOf(t.elphel_isCUDATensor()));
//System.out.println(t.elphelTestCUDAPointer());
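The float path above hands TensorFlow a direct, native-ordered buffer so the runtime can read the data without an extra on-heap copy. A minimal self-contained sketch of the same pattern (standard org.tensorflow 1.x API; class and variable names here are illustrative):
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import org.tensorflow.Tensor;
public class FloatTensorSketch {
    public static void main(String[] args) {
        float[] data = {1f, 2f, 3f, 4f};
        // Direct buffer in native byte order, as the TF runtime expects
        ByteBuffer bytes = ByteBuffer.allocateDirect(data.length * Float.BYTES)
                .order(ByteOrder.nativeOrder());
        FloatBuffer floats = bytes.asFloatBuffer();
        floats.put(data);
        floats.rewind(); // reset position so create() reads from the start
        try (Tensor<Float> t = Tensor.create(new long[]{data.length}, floats)) {
            System.out.println(t); // FLOAT tensor with shape [4]
        }
    }
}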
try (
Session s = new Session(g, config.toByteArray())
@@ -191,31 +209,27 @@ public class tfhello{
Tensor output = s.runner().fetch("array_tensor_out").feed("array_tensor_in", t).run().get(0);
System.out.println(output.numBytes());
System.out.println("Numbytes: "+output.numBytes());
int[] obuf = new int[output.numBytes()/Sizeof.INT];
//int[] obuf = new int[output.numBytes()/Sizeof.INT];
float[] obuf = new float[output.numBytes()/Sizeof.FLOAT];
output.copyTo(obuf);
System.out.println("Output from the first run: ");
System.out.println(Arrays.toString(obuf));
// Natively fetch the GPU device name to insert into CallableOptions;
// it is the same on every run.
String gpuDeviceName = s.elphelGPUDeviceName();
// that's for RunCallable() if it ever gets implemented
CallableOptions callableOpts = CallableOptions.newBuilder()
.addFetch("array_tensor_out:0")
.addFeed("array_tensor_in:0")
.putFeedDevices("array_tensor_in:0", gpuDeviceName)
.build();
System.out.println(callableOpts);
// GPU allocation:
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256},DataType.INT32);
// GPU allocation: dims must be power of 2?
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256},DataType.FLOAT);
//System.out.println(t2.nativeRef);
// Let's check what happened
long t3_gpuptr = t3.elphel_GetGPUTensorPointer();
System.out.println(String.format("0x%08x", t3_gpuptr));
// Print address
//System.out.println("Pointer address: "+String.format("0x%08x", t3_gpuptr));
CUdeviceptr ptr3 = longToCUdeviceptr(t3_gpuptr);
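longToCUdeviceptr() is not shown in this hunk, and JCuda exposes no public setter for a device pointer's raw address, so a plausible implementation (an assumption, mirroring the reflection read done on cudaPointerAttributes below) writes NativePointerObject's private nativePointer field:
// Hypothetical helper, not JCuda public API. Class chain:
// CUdeviceptr -> Pointer -> NativePointerObject, whose private
// long nativePointer holds the raw device address.
static CUdeviceptr longToCUdeviceptr(long addr) {
    CUdeviceptr ptr = new CUdeviceptr();
    try {
        Field f = ptr.getClass().getSuperclass().getSuperclass().getDeclaredField("nativePointer");
        f.setAccessible(true);
        f.setLong(ptr, addr);
    } catch (ReflectiveOperationException e) {
        throw new RuntimeException("Cannot set raw CUDA device pointer", e);
    }
    return ptr;
}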
@@ -228,13 +242,49 @@ public class tfhello{
System.out.println("Not a CUDA device");
}
System.out.println("cuda pointer attributes?! "+res);
System.out.println(attrs.toString());
if (attrs.device==0) {
Field f = attrs.devicePointer.getClass().getSuperclass().getDeclaredField("nativePointer");
f.setAccessible(true);
long addr = f.getLong(attrs.devicePointer);
//System.out.println(String.format("0x%08x",addr));
if (addr!=0) {
System.err.println("\nTensor is allocated in CUDA: "+attrs.toString()+"\n");
}
}
cuMemcpyHtoD(ptr3, Pointer.to(px_in), cuSize);
cuMemcpyDtoH(Pointer.to(px_out), ptr3, cuSize);
System.out.println(Arrays.toString(px_out));
// check that it is a GPU pointer
// initialize the tensor with values
int dsize = 256;
int cuDsize = dsize*Sizeof.FLOAT;
float[] din = new float[dsize];
float[] dout = new float[dsize];
for(int i = 0; i < din.length; i++) {
din[i] = i+1;
}
cuMemcpyHtoD(ptr3, Pointer.to(din), cuDsize);
cuMemcpyDtoH(Pointer.to(dout), ptr3, cuDsize);
System.out.println(Arrays.toString(dout));
// CallableOptions for RunCallable, exercised below via MakeCallable()
CallableOptions callableOpts = CallableOptions.newBuilder()
.addFetch("array_tensor_out:0")
.addFeed("array_tensor_in:0")
.putFeedDevices("array_tensor_in:0", gpuDeviceName)
.build();
System.out.println(callableOpts);
// callable handle
long feed_gpu_fetch_cpu = s.MakeCallable(callableOpts.toByteArray());
Tensor<?> t3out = s.runner().fetch("array_tensor_out").feed("array_tensor_in",t3).runElphelCallable(feed_gpu_fetch_cpu).get(0);
System.out.println(t3);
System.out.println(t3out);
float[] obuf2 = new float[t3out.numBytes()/Sizeof.FLOAT];
t3out.copyTo(obuf2);
System.out.println("Output from the second run: ");
System.out.println(Arrays.toString(obuf2));
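If the Identity graph and the GPU feed both work, obuf2 must echo din exactly; a quick check at this point in the try-block (assuming the arrays above) makes a silent regression obvious:
if (!Arrays.equals(din, obuf2)) {
    System.err.println("GPU round trip MISMATCH");
}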
}
......
@@ -82,11 +82,16 @@ public final class Session implements AutoCloseable {
}
public String elphelGPUDeviceName(){
//return "CHECKPOINT";
return elphelGetGPUDeviceName(this.nativeHandle);
}
public native String elphelGetGPUDeviceName(long handle);
private native String elphelGetGPUDeviceName(long handle);
public long MakeCallable(byte[] config){
return elphelMakeCallable(this.nativeHandle, config);
}
private native long elphelMakeCallable(long nativeHandle, byte[] config);
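Put together, the intended call sequence for these new methods, as exercised by tfhello (a sketch; callableOpts and gpuTensor come from the caller):
// 1. Build a native callable from serialized CallableOptions
long callable = s.MakeCallable(callableOpts.toByteArray());
// 2. Feed a GPU-resident tensor and run via Session::RunCallable
List<Tensor<?>> out = s.runner()
        .fetch("array_tensor_out")
        .feed("array_tensor_in", gpuTensor)
        .runElphelCallable(callable);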
/**
* Release resources associated with the Session.
@@ -125,6 +130,7 @@ public final class Session implements AutoCloseable {
* #feed(String,int,Tensor)}.
*/
public final class Runner {
/**
* Avoid evaluating {@code operation} and substitute {@code t} for the value it produces.
*
@@ -136,6 +142,8 @@ public final class Session implements AutoCloseable {
* SavedModelBundle#metaGraphDef()}.
*/
public Runner feed(String operation, Tensor<?> t) {
//debug
System.out.println("Adding feed to operation: "+operation);
return feed(parseOutput(operation), t);
}
@@ -165,6 +173,14 @@ public final class Session implements AutoCloseable {
return this;
}
/**
 * Feed for RunCallable: a bare tensor, matched positionally to the feeds
 * declared in CallableOptions (no operation name needed).
 */
public Runner feed(Tensor<?> t) {
inputTensors.add(t);
return this;
}
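A hedged usage note: since runElphelCallableHelper() below consumes inputTensors positionally, this bare feed would pair with a callable whose CallableOptions already name the feed, e.g. s.runner().feed(gpuTensor).fetch("array_tensor_out").runElphelCallable(handle).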
/**
* Make {@link #run()} return the output of {@code operation}.
*
@@ -295,6 +311,64 @@ public final class Session implements AutoCloseable {
return runHelper(true);
}
public List<Tensor<?>> runElphelCallable(long handle) {
return runElphelCallableHelper(handle).outputs;
}
// Mirrors runHelper(), but drives the native RunCallable path
private Run runElphelCallableHelper(long handle) {
long[] inputTensorHandles = new long[inputTensors.size()];
long[] outputTensorHandles = new long[outputs.size()];
System.out.println("Number of input handles: "+inputTensors.size());
System.out.println("Number of output handles: "+outputs.size());
// Collect the native handles of the feed tensors; they stay valid because
// inputTensors still references them.
int idx = 0;
for (Tensor<?> t : inputTensors) {
inputTensorHandles[idx++] = t.getNativeHandle();
}
Reference runRef = new Reference();
byte[] metadata = null;
try {
System.out.println("About to run RunCallable\n");
metadata =
Session.elphelRunCallable(
nativeHandle,
handle,
inputTensorHandles,
outputTensorHandles);
System.out.println("Ready to process output\n");
} finally {
runRef.close();
}
System.out.println("Processing output\n");
// Wrap the returned native handles into Java Tensor objects
List<Tensor<?>> outputs = new ArrayList<Tensor<?>>();
for (long h : outputTensorHandles) {
try {
outputs.add(Tensor.fromHandle(h));
} catch (Exception e) {
for (Tensor<?> t : outputs) {
t.close();
}
outputs.clear();
throw e;
}
}
Run ret = new Run();
ret.outputs = outputs;
ret.metadata = metadata;
return ret;
}
private Run runHelper(boolean wantMetadata) {
long[] inputTensorHandles = new long[inputTensors.size()];
long[] inputOpHandles = new long[inputs.size()];
@@ -495,4 +569,14 @@ public final class Session implements AutoCloseable {
long[] targetOpHandles,
boolean wantRunMetadata,
long[] outputTensorHandles);
/**
 * Run a Callable previously created by MakeCallable(): feeds are passed as
 * native tensor handles, and the resulting output handles are written into
 * outputTensorHandles.
 */
private static native byte[] elphelRunCallable(
long sessionHandle,
long callableHandle,
long[] inputTensorHandles,
long[] outputTensorHandles);
}
@@ -164,7 +164,7 @@ public final class Tensor<T> implements AutoCloseable {
long nativeHandle;
nativeHandle = elphelAllocateGPUTensor(t.shapeCopy,t.dtype.c());
t.nativeRef = new NativeReference(nativeHandle);
System.out.println(t.nativeRef);
//System.out.println(t.nativeRef);
return t;
}
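Usage as in tfhello: allocate on the GPU, recover the raw device address, and fill it with JCuda (a condensed sketch; longToCUdeviceptr and hostData are the caller's):
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256}, DataType.FLOAT);
long devAddr = t3.elphel_GetGPUTensorPointer(); // raw CUDA device address
CUdeviceptr ptr = longToCUdeviceptr(devAddr);
cuMemcpyHtoD(ptr, Pointer.to(hostData), hostData.length * Sizeof.FLOAT);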
@@ -544,11 +544,12 @@ public final class Tensor<T> implements AutoCloseable {
return String.format("%s tensor with shape %s", dtype.toString(), Arrays.toString(shape()));
}
/*
public int elphel_isCUDATensor() {
int result = elphelIsCUDATensor(getNativeHandle());
return result;
}
*/
public long elphel_GetGPUTensorPointer(){
return elphelGetGPUTensorPointer(getNativeHandle());
@@ -862,9 +863,9 @@ public final class Tensor<T> implements AutoCloseable {
private static native void readNDArray(long handle, Object value);
private static native int elphelIsCUDATensor(long handle);
//private static native int elphelIsCUDATensor(long handle);
public static native int elphelTestCUDAPointer();
//public static native int elphelTestCUDAPointer();
static {
TensorFlow.init();
......
@@ -208,6 +208,102 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
return ret;
}
// Create an empty tensor of type 'dtype'. 'shape' can be arbitrary, but has to
// result in a zero-sized tensor.
static TF_Tensor* EmptyTensor(TF_DataType dtype,
const tensorflow::TensorShape& shape) {
static char empty;
tensorflow::int64 nelems = 1;
std::vector<tensorflow::int64> dims;
for (int i = 0; i < shape.dims(); ++i) {
dims.push_back(shape.dim_size(i));
nelems *= shape.dim_size(i);
}
CHECK_EQ(nelems, 0);
static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
"64-bit int types should match in size");
return TF_NewTensor(
dtype, reinterpret_cast<const int64_t*>(dims.data()), shape.dims(),
reinterpret_cast<void*>(&empty), 0, [](void*, size_t, void*) {}, nullptr);
}
JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_elphelRunCallable(
JNIEnv* env, jclass clazz,
jlong session_handle, jlong callable_handle,
jlongArray input_tensor_handles,
jlongArray output_tensor_handles) {
//printf("Running Callable\n");
TF_Session* session = requireHandle(env, session_handle);
using namespace tensorflow;
Session::CallableHandle feed_gpu_fetch_cpu = static_cast<Session::CallableHandle>(callable_handle); // CallableHandle is an integer type, so a plain static_cast suffices
const jint ninputs = env->GetArrayLength(input_tensor_handles);
const jint noutputs = env->GetArrayLength(output_tensor_handles);
//printf("ninputs: %d, noutputs: %d\n",ninputs, noutputs);
std::unique_ptr<TF_Tensor* []> output_values(new TF_Tensor*[noutputs]);
std::unique_ptr<TF_Tensor* []> input_values(new TF_Tensor*[ninputs]);
// Resolve the Java handles into TF_Tensor*, then convert to tensorflow::Tensor inputs
resolveHandles(env, "input Tensors", input_tensor_handles, input_values.get(), ninputs);
std::vector<Tensor> inputs(ninputs);
for (int i=0; i<ninputs; ++i) {
TF_TensorToTensor(input_values[i],&inputs[i]);
}
// RunCallable fills in the outputs itself; just preallocate the vector
std::vector<Tensor> outputs(noutputs);
auto runStatus = session->session->RunCallable(feed_gpu_fetch_cpu, {inputs}, &outputs, nullptr);
if (!runStatus.ok()){
printf("It is with a heavy heart I inform you that RunCallable has failed. Here's the error message:\n");
printf("%s\n", runStatus.error_message().c_str());
return nullptr;
}
// Write the native handles of the output tensors back into the Java array
jlong* t = env->GetLongArrayElements(output_tensor_handles, nullptr);
TF_Status* status = TF_NewStatus();
for (int i = 0; i < noutputs; ++i) {
//outputs[i] = inputz[i];
const Tensor& src = outputs[i];
/*
std::cout << src.DebugString() << std::endl;
// print values:
std::cout << "Output tensor (printing from session_jni.cc):";
auto tmap = src.tensor<float, 1>();
for (int d = 0; d < 256; d++){
std::cout << (int) tmap(d);
if (d!=255) std::cout << ", ";
}
*/
//output_values[i]->tensor = outputs[i];
if (!src.IsInitialized() || src.NumElements() == 0) {
output_values[i] = EmptyTensor(static_cast<TF_DataType>(src.dtype()), src.shape());
continue;
}
output_values[i] = TF_TensorFromTensor(src, status);
// status must not be nullptr here: TF_TensorFromTensor writes into it
//output_values[i] = TF_TensorFromTensor(src, nullptr);
t[i] = reinterpret_cast<jlong>(output_values[i]);
}
// This copies the updated array back so it can be accessed on the Java side
env->ReleaseLongArrayElements(output_tensor_handles, t, 0);
jbyteArray ret = nullptr;
return ret;
}
JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(JNIEnv* env,
jclass clazz,
jlong handle) {
@@ -228,3 +324,43 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(JNI
return env->NewStringUTF("");
}
JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_elphelMakeCallable(
JNIEnv* env, jclass clazz, jlong session_handle, jbyteArray config){
TF_Session* session = requireHandle(env, session_handle);
using namespace tensorflow;
CallableOptions opts;
jbyte* cconfig = nullptr;
if (config != nullptr) {
cconfig = env->GetByteArrayElements(config, nullptr);
opts.ParseFromArray(cconfig, static_cast<size_t>(env->GetArrayLength(config)));
}
Session::CallableHandle feed_gpu_fetch_cpu;
auto runStatus = session->session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok()){
printf("It is with a heavy heart I inform you that MakeCallable has failed. Here's the error message:\n");
printf("%s\n", runStatus.error_message().c_str());
return -1;
}else{
/*
jlong* t = env->GetLongArrayElements(callable_handle, nullptr);
t[0] = reinterpret_cast<jlong>((long) feed_gpu_fetch_cpu);
env->ReleaseLongArrayElements(callable_handle, t, 0);
*/
return static_cast<jlong>(feed_gpu_fetch_cpu);
}
}
@@ -58,7 +58,13 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(
JNIEnv* env, jclass clazz, jlong handle);
JNIEnv*, jclass, jlong handle);
JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_elphelMakeCallable(
JNIEnv*, jclass, jlong, jbyteArray);
JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_elphelRunCallable(
JNIEnv*, jclass, jlong, jlong, jlongArray, jlongArray);
#ifdef __cplusplus
} // extern "C"
......
@@ -384,7 +384,30 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_elphelAllocateGPUTensor(JNIEn
TF_Tensor* t;
//t->tensor
tensorflow::TensorShape shapex = tensorflow::TensorShape({256});
using namespace tensorflow;
DataType dt_dtype = static_cast<DataType>(dtype);
// Actually, don't need TF_*
//TF_DataType tf_dtype = static_cast<TF_DataType>(dtype);
//size_t tf_dtype_size = TF_DataTypeSize(tf_dtype);
const int num_dims = static_cast<int>(env->GetArrayLength(shape));
//int64_t* dims = new int64_t[num_dims];
std::vector<tensorflow::int64> dims(num_dims);
int64_t num_elements = 1;
{
jlong* jdims = env->GetLongArrayElements(shape, nullptr);
for (int i = 0; i < num_dims; ++i) {
dims[i] = static_cast<int64>(jdims[i]);
num_elements *= dims[i];
}
// JNI_ABORT: release without copy-back, since jdims was only read
env->ReleaseLongArrayElements(shape, jdims, JNI_ABORT);
}
TensorShape ts_shape = tensorflow::TensorShape(dims);
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
@@ -393,15 +416,19 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_elphelAllocateGPUTensor(JNIEn
platform_gpu_id, false, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shapex.num_elements() * sizeof(tensorflow::DT_UINT8), "GPU_0_bfc");
new tensorflow::GPUBFCAllocator(sub_allocator, num_elements * DataTypeSize(dt_dtype), "GPU_0_bfc"); // sizeof(dt_dtype) would be the enum's size, not the element size
Tensor t_cuda = Tensor(allocator, tensorflow::DT_UINT8, shapex);
Tensor t_cuda = Tensor(allocator, dt_dtype, ts_shape);
//TODO:
// Maybe check tensor pointer here - CUDA or not CUDA?
//t->tensor = t_cuda;
TF_Status* status = TF_NewStatus();
// TODO: Check what exactly this function does...
t = TF_TensorFromTensor(t_cuda,status);
printf("Allocating in GPU!");
//printf("Allocated in GPU!");
return reinterpret_cast<jlong>(t);
}
@@ -677,6 +704,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv* env,
static_cast<jarray>(value));
}
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv* env,
jclass clazz,
jlong handle) {
@@ -712,11 +740,13 @@ JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv* env,
#endif
}
*/
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelTestCUDAPointer(JNIEnv* env,
jclass clazz){
return 0x3;
}
*/
......
@@ -156,14 +156,15 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Tensor_scalarBytes(JNIEnv *,
*/
JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv *, jclass,
jlong, jobject);
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv *,
jclass,
jlong);
*/
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelTestCUDAPointer(JNIEnv *,
jclass);
*/
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
......