Commit 963e5a90 authored by Oleg Dzhimiev

kind of works, needs more testing

parent 0f343bbb
@@ -40,7 +40,8 @@ import jcuda.runtime.JCuda;
import jcuda.runtime.cudaError;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
@@ -86,10 +87,15 @@ public class tfhello{
int cuSize = size*Sizeof.INT;
int[] px_in = new int[size];
float[] px_in_float = new float[size];
byte[] px_in_byte = new byte[size];
int[] px_out = new int[size];
for(int i = 0; i < px_in.length; i++) {
px_in[i] = i+1;
px_in_float[i] = i+1;
//px_in_byte[i] = (byte)(i+1); // int has no byteValue(); cast instead
}
JCudaDriver.setExceptionsEnabled(true);
@@ -167,19 +173,31 @@ public class tfhello{
.build();
Operation x = g.opBuilder("Placeholder", "array_tensor_in")
.setAttr("dtype",DataType.INT32)
//.setAttr("dtype",DataType.INT32)
.setAttr("dtype",DataType.FLOAT)
//.setAttr("dtype",DataType.UINT8)
//.setAttr("dtype",DataType.INT64)
.build();
Operation z = g.opBuilder("Identity", "array_tensor_out")
.addInput(x.output(0))
.build();
Tensor t = Tensor.create(px_in);
System.out.println("Is CUDA tensor? "+String.valueOf(t.elphel_isCUDATensor()));
System.out.println(t.elphelTestCUDAPointer());
// uint8
//Tensor t = Tensor.create(px_in_byte);
// int
//Tensor t = Tensor.create(px_in);
// float
long handle1;
//session.makeCallable(handle1);
ByteBuffer bbuf = ByteBuffer.allocateDirect(px_in_float.length * Float.BYTES); //4 bytes per float
bbuf.order(ByteOrder.nativeOrder());
FloatBuffer fbuf = bbuf.asFloatBuffer();
fbuf.put(px_in_float);
fbuf.position(0);
Tensor<Float> t = Tensor.create(new long[]{px_in_float.length},fbuf);
//System.out.println("Is CUDA tensor? "+String.valueOf(t.elphel_isCUDATensor()));
//System.out.println(t.elphelTestCUDAPointer());
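The float path above hands TensorFlow a direct, native-ordered buffer so the runtime can read the data without an extra on-heap copy. A minimal self-contained sketch of the same pattern (standard org.tensorflow 1.x API; class and variable names here are illustrative):
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.FloatBuffer;
import org.tensorflow.Tensor;
public class FloatTensorSketch {
    public static void main(String[] args) {
        float[] data = {1f, 2f, 3f, 4f};
        // Direct buffer in native byte order, as the TF runtime expects
        ByteBuffer bytes = ByteBuffer.allocateDirect(data.length * Float.BYTES)
                .order(ByteOrder.nativeOrder());
        FloatBuffer floats = bytes.asFloatBuffer();
        floats.put(data);
        floats.rewind(); // reset position so create() reads from the start
        try (Tensor<Float> t = Tensor.create(new long[]{data.length}, floats)) {
            System.out.println(t); // FLOAT tensor with shape [4]
        }
    }
}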
try (
Session s = new Session(g, config.toByteArray())
@@ -191,31 +209,27 @@ public class tfhello{
Tensor output = s.runner().fetch("array_tensor_out").feed("array_tensor_in", t).run().get(0);
System.out.println(output.numBytes());
System.out.println("Numbytes: "+output.numBytes());
int[] obuf = new int[output.numBytes()/Sizeof.INT];
//int[] obuf = new int[output.numBytes()/Sizeof.INT];
float[] obuf = new float[output.numBytes()/Sizeof.FLOAT];
output.copyTo(obuf);
System.out.println("Output from the first run: ");
System.out.println(Arrays.toString(obuf));
// Natively fetch the GPU device name to insert into CallableOptions;
// it is the same on every run.
String gpuDeviceName = s.elphelGPUDeviceName();
// that's for RunCallable() if it ever gets implemented
CallableOptions callableOpts = CallableOptions.newBuilder()
.addFetch("array_tensor_out:0")
.addFeed("array_tensor_in:0")
.putFeedDevices("array_tensor_in:0", gpuDeviceName)
.build();
System.out.println(callableOpts);
// GPU allocation:
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256},DataType.INT32);
// GPU allocation: dims must be power of 2?
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256},DataType.FLOAT);
//System.out.println(t2.nativeRef);
// Let's check what happened
long t3_gpuptr = t3.elphel_GetGPUTensorPointer();
System.out.println(String.format("0x%08x", t3_gpuptr));
// Print address
//System.out.println("Pointer address: "+String.format("0x%08x", t3_gpuptr));
CUdeviceptr ptr3 = longToCUdeviceptr(t3_gpuptr);
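longToCUdeviceptr() is not shown in this hunk, and JCuda exposes no public setter for a device pointer's raw address, so a plausible implementation (an assumption, mirroring the reflection read done on cudaPointerAttributes below) writes NativePointerObject's private nativePointer field:
// Hypothetical helper, not JCuda public API. Class chain:
// CUdeviceptr -> Pointer -> NativePointerObject, whose private
// long nativePointer holds the raw device address.
static CUdeviceptr longToCUdeviceptr(long addr) {
    CUdeviceptr ptr = new CUdeviceptr();
    try {
        Field f = ptr.getClass().getSuperclass().getSuperclass().getDeclaredField("nativePointer");
        f.setAccessible(true);
        f.setLong(ptr, addr);
    } catch (ReflectiveOperationException e) {
        throw new RuntimeException("Cannot set raw CUDA device pointer", e);
    }
    return ptr;
}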
@@ -228,13 +242,49 @@ public class tfhello{
System.out.println("Not a CUDA device");
}
System.out.println("cuda pointer attributes?! "+res);
System.out.println(attrs.toString());
if (attrs.device==0) {
Field f = attrs.devicePointer.getClass().getSuperclass().getDeclaredField("nativePointer");
f.setAccessible(true);
long addr = f.getLong(attrs.devicePointer);
//System.out.println(String.format("0x%08x",addr));
if (addr!=0) {
System.err.println("\nTensor is allocated in CUDA: "+attrs.toString()+"\n");
}
}
cuMemcpyHtoD(ptr3, Pointer.to(px_in), cuSize);
cuMemcpyDtoH(Pointer.to(px_out), ptr3, cuSize);
System.out.println(Arrays.toString(px_out));
// check that it is a GPU pointer
// initialize the tensor with values
int dsize = 256;
int cuDsize = dsize*Sizeof.FLOAT;
float[] din = new float[dsize];
float[] dout = new float[dsize];
for(int i = 0; i < din.length; i++) {
din[i] = i+1;
}
cuMemcpyHtoD(ptr3, Pointer.to(din), cuDsize);
cuMemcpyDtoH(Pointer.to(dout), ptr3, cuDsize);
System.out.println(Arrays.toString(dout));
// CallableOptions for RunCallable, exercised below via MakeCallable()
CallableOptions callableOpts = CallableOptions.newBuilder()
.addFetch("array_tensor_out:0")
.addFeed("array_tensor_in:0")
.putFeedDevices("array_tensor_in:0", gpuDeviceName)
.build();
System.out.println(callableOpts);
// callable handle
long feed_gpu_fetch_cpu = s.MakeCallable(callableOpts.toByteArray());
Tensor<?> t3out = s.runner().fetch("array_tensor_out").feed("array_tensor_in",t3).runElphelCallable(feed_gpu_fetch_cpu).get(0);
System.out.println(t3);
System.out.println(t3out);
float[] obuf2 = new float[t3out.numBytes()/Sizeof.FLOAT];
t3out.copyTo(obuf2);
System.out.println("Output from the second run: ");
System.out.println(Arrays.toString(obuf2));
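If the Identity graph and the GPU feed both work, obuf2 must echo din exactly; a quick check at this point in the try-block (assuming the arrays above) makes a silent regression obvious:
if (!Arrays.equals(din, obuf2)) {
    System.err.println("GPU round trip MISMATCH");
}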
}
......
@@ -82,11 +82,16 @@ public final class Session implements AutoCloseable {
}
public String elphelGPUDeviceName(){
//return "CHECKPOINT";
return elphelGetGPUDeviceName(this.nativeHandle);
}
public native String elphelGetGPUDeviceName(long handle);
private native String elphelGetGPUDeviceName(long handle);
public long MakeCallable(byte[] config){
return elphelMakeCallable(this.nativeHandle, config);
}
private native long elphelMakeCallable(long nativeHandle, byte[] config);
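Put together, the intended call sequence for these new methods, as exercised by tfhello (a sketch; callableOpts and gpuTensor come from the caller):
// 1. Build a native callable from serialized CallableOptions
long callable = s.MakeCallable(callableOpts.toByteArray());
// 2. Feed a GPU-resident tensor and run via Session::RunCallable
List<Tensor<?>> out = s.runner()
        .fetch("array_tensor_out")
        .feed("array_tensor_in", gpuTensor)
        .runElphelCallable(callable);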
/**
* Release resources associated with the Session.
@@ -125,6 +130,7 @@ public final class Session implements AutoCloseable {
* #feed(String,int,Tensor)}.
*/
public final class Runner {
/**
* Avoid evaluating {@code operation} and substitute {@code t} for the value it produces.
*
@@ -136,6 +142,8 @@ public final class Session implements AutoCloseable {
* SavedModelBundle#metaGraphDef()}.
*/
public Runner feed(String operation, Tensor<?> t) {
//debug
System.out.println("Adding feed to operation: "+operation);
return feed(parseOutput(operation), t);
}
@@ -165,6 +173,14 @@ public final class Session implements AutoCloseable {
return this;
}
/**
 * Feed for RunCallable: a bare tensor, matched positionally to the feeds
 * declared in CallableOptions (no operation name needed).
 */
public Runner feed(Tensor<?> t) {
inputTensors.add(t);
return this;
}
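A hedged usage note: since runElphelCallableHelper() below consumes inputTensors positionally, this bare feed would pair with a callable whose CallableOptions already name the feed, e.g. s.runner().feed(gpuTensor).fetch("array_tensor_out").runElphelCallable(handle).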
/**
* Make {@link #run()} return the output of {@code operation}.
*
@@ -295,6 +311,64 @@ public final class Session implements AutoCloseable {
return runHelper(true);
}
public List<Tensor<?>> runElphelCallable(long handle) {
return runElphelCallableHelper(handle).outputs;
}
// Mirrors runHelper(), but drives the native RunCallable path
private Run runElphelCallableHelper(long handle) {
long[] inputTensorHandles = new long[inputTensors.size()];
long[] outputTensorHandles = new long[outputs.size()];
System.out.println("Number of input handles: "+inputTensors.size());
System.out.println("Number of output handles: "+outputs.size());
// Collect the native handles of the feed tensors; they stay valid because
// inputTensors still references them.
int idx = 0;
for (Tensor<?> t : inputTensors) {
inputTensorHandles[idx++] = t.getNativeHandle();
}
Reference runRef = new Reference();
byte[] metadata = null;
try {
System.out.println("About to run RunCallable\n");
metadata =
Session.elphelRunCallable(
nativeHandle,
handle,
inputTensorHandles,
outputTensorHandles);
System.out.println("Ready to process output\n");
} finally {
runRef.close();
}
System.out.println("Processing output\n");
// Wrap the returned native handles into Java Tensor objects
List<Tensor<?>> outputs = new ArrayList<Tensor<?>>();
for (long h : outputTensorHandles) {
try {
outputs.add(Tensor.fromHandle(h));
} catch (Exception e) {
for (Tensor<?> t : outputs) {
t.close();
}
outputs.clear();
throw e;
}
}
Run ret = new Run();
ret.outputs = outputs;
ret.metadata = metadata;
return ret;
}
private Run runHelper(boolean wantMetadata) {
long[] inputTensorHandles = new long[inputTensors.size()];
long[] inputOpHandles = new long[inputs.size()];
@@ -495,4 +569,14 @@ public final class Session implements AutoCloseable {
long[] targetOpHandles,
boolean wantRunMetadata,
long[] outputTensorHandles);
/**
 * Run a Callable previously created by MakeCallable(): feeds are passed as
 * native tensor handles, and the resulting output handles are written into
 * outputTensorHandles.
 */
private static native byte[] elphelRunCallable(
long sessionHandle,
long callableHandle,
long[] inputTensorHandles,
long[] outputTensorHandles);
}
@@ -164,7 +164,7 @@ public final class Tensor<T> implements AutoCloseable {
long nativeHandle;
nativeHandle = elphelAllocateGPUTensor(t.shapeCopy,t.dtype.c());
t.nativeRef = new NativeReference(nativeHandle);
System.out.println(t.nativeRef);
//System.out.println(t.nativeRef);
return t;
}
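Usage as in tfhello: allocate on the GPU, recover the raw device address, and fill it with JCuda (a condensed sketch; longToCUdeviceptr and hostData are the caller's):
Tensor t3 = Tensor.elphelCreateGPUTensor(new long[]{256}, DataType.FLOAT);
long devAddr = t3.elphel_GetGPUTensorPointer(); // raw CUDA device address
CUdeviceptr ptr = longToCUdeviceptr(devAddr);
cuMemcpyHtoD(ptr, Pointer.to(hostData), hostData.length * Sizeof.FLOAT);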
@@ -544,11 +544,12 @@ public final class Tensor<T> implements AutoCloseable {
return String.format("%s tensor with shape %s", dtype.toString(), Arrays.toString(shape()));
}
/*
public int elphel_isCUDATensor() {
int result = elphelIsCUDATensor(getNativeHandle());
return result;
}
*/
public long elphel_GetGPUTensorPointer(){
return elphelGetGPUTensorPointer(getNativeHandle());
@@ -862,9 +863,9 @@ public final class Tensor<T> implements AutoCloseable {
private static native void readNDArray(long handle, Object value);
private static native int elphelIsCUDATensor(long handle);
//private static native int elphelIsCUDATensor(long handle);
public static native int elphelTestCUDAPointer();
//public static native int elphelTestCUDAPointer();
static {
TensorFlow.init();
......
@@ -208,6 +208,102 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
return ret;
}
// Create an empty tensor of type 'dtype'. 'shape' can be arbitrary, but has to
// result in a zero-sized tensor.
static TF_Tensor* EmptyTensor(TF_DataType dtype,
const tensorflow::TensorShape& shape) {
static char empty;
tensorflow::int64 nelems = 1;
std::vector<tensorflow::int64> dims;
for (int i = 0; i < shape.dims(); ++i) {
dims.push_back(shape.dim_size(i));
nelems *= shape.dim_size(i);
}
CHECK_EQ(nelems, 0);
static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
"64-bit int types should match in size");
return TF_NewTensor(
dtype, reinterpret_cast<const int64_t*>(dims.data()), shape.dims(),
reinterpret_cast<void*>(&empty), 0, [](void*, size_t, void*) {}, nullptr);
}
JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_elphelRunCallable(
JNIEnv* env, jclass clazz,
jlong session_handle, jlong callable_handle,
jlongArray input_tensor_handles,
jlongArray output_tensor_handles) {
//printf("Running Callable\n");
TF_Session* session = requireHandle(env, session_handle);
using namespace tensorflow;
Session::CallableHandle feed_gpu_fetch_cpu = static_cast<Session::CallableHandle>(callable_handle); // CallableHandle is an integer type, so a plain static_cast suffices
const jint ninputs = env->GetArrayLength(input_tensor_handles);
const jint noutputs = env->GetArrayLength(output_tensor_handles);
//printf("ninputs: %d, noutputs: %d\n",ninputs, noutputs);
std::unique_ptr<TF_Tensor* []> output_values(new TF_Tensor*[noutputs]);
std::unique_ptr<TF_Tensor* []> input_values(new TF_Tensor*[ninputs]);
// Resolve the Java handles into TF_Tensor*, then convert to tensorflow::Tensor inputs
resolveHandles(env, "input Tensors", input_tensor_handles, input_values.get(), ninputs);
std::vector<Tensor> inputs(ninputs);
for (int i=0; i<ninputs; ++i) {
TF_TensorToTensor(input_values[i],&inputs[i]);
}
// RunCallable fills in the outputs itself; just preallocate the vector
std::vector<Tensor> outputs(noutputs);
auto runStatus = session->session->RunCallable(feed_gpu_fetch_cpu, {inputs}, &outputs, nullptr);
if (!runStatus.ok()){
printf("It is with a heavy heart I inform you that RunCallable has failed. Here's the error message:\n");
printf("%s\n", runStatus.error_message().c_str());
return nullptr;
}
// Write the native handles of the output tensors back into the Java array
jlong* t = env->GetLongArrayElements(output_tensor_handles, nullptr);
TF_Status* status = TF_NewStatus();
for (int i = 0; i < noutputs; ++i) {
//outputs[i] = inputz[i];
const Tensor& src = outputs[i];
/*
std::cout << src.DebugString() << std::endl;
// print values:
std::cout << "Output tensor (printing from session_jni.cc):";
auto tmap = src.tensor<float, 1>();
for (int d = 0; d < 256; d++){
std::cout << (int) tmap(d);
if (d!=255) std::cout << ", ";
}
*/
//output_values[i]->tensor = outputs[i];
if (!src.IsInitialized() || src.NumElements() == 0) {
output_values[i] = EmptyTensor(static_cast<TF_DataType>(src.dtype()), src.shape());
continue;
}
output_values[i] = TF_TensorFromTensor(src, status);
// status must not be nullptr here: TF_TensorFromTensor writes into it
//output_values[i] = TF_TensorFromTensor(src, nullptr);
t[i] = reinterpret_cast<jlong>(output_values[i]);
}
// This copies the updated array back so it can be accessed on the Java side
env->ReleaseLongArrayElements(output_tensor_handles, t, 0);
jbyteArray ret = nullptr;
return ret;
}
JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(JNIEnv* env,
jclass clazz,
jlong handle) {
@@ -228,3 +324,43 @@ JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(JNI
return env->NewStringUTF("");
}
JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_elphelMakeCallable(
JNIEnv* env, jclass clazz, jlong session_handle, jbyteArray config){
TF_Session* session = requireHandle(env, session_handle);
using namespace tensorflow;
CallableOptions opts;
jbyte* cconfig = nullptr;
if (config != nullptr) {
cconfig = env->GetByteArrayElements(config, nullptr);
opts.ParseFromArray(cconfig, static_cast<size_t>(env->GetArrayLength(config)));
}
Session::CallableHandle feed_gpu_fetch_cpu;
auto runStatus = session->session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok()){
printf("It is with a heavy heart I inform you that MakeCallable has failed. Here's the error message:\n");
printf("%s\n", runStatus.error_message().c_str());
return -1;
}else{
/*
jlong* t = env->GetLongArrayElements(callable_handle, nullptr);
t[0] = reinterpret_cast<jlong>((long) feed_gpu_fetch_cpu);
env->ReleaseLongArrayElements(callable_handle, t, 0);
*/
return static_cast<jlong>(feed_gpu_fetch_cpu);
}
}
@@ -58,7 +58,13 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_run(
JNIEXPORT jstring JNICALL Java_org_tensorflow_Session_elphelGetGPUDeviceName(
JNIEnv* env, jclass clazz, jlong handle);
JNIEnv*, jclass, jlong handle);
JNIEXPORT jlong JNICALL Java_org_tensorflow_Session_elphelMakeCallable(
JNIEnv*, jclass, jlong, jbyteArray);
JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Session_elphelRunCallable(
JNIEnv*, jclass, jlong, jlong, jlongArray, jlongArray);
#ifdef __cplusplus
} // extern "C"
......
@@ -384,7 +384,30 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_elphelAllocateGPUTensor(JNIEn
TF_Tensor* t;
//t->tensor
tensorflow::TensorShape shapex = tensorflow::TensorShape({256});
using namespace tensorflow;
DataType dt_dtype = static_cast<DataType>(dtype);
// Actually, don't need TF_*
//TF_DataType tf_dtype = static_cast<TF_DataType>(dtype);
//size_t tf_dtype_size = TF_DataTypeSize(tf_dtype);
const int num_dims = static_cast<int>(env->GetArrayLength(shape));
//int64_t* dims = new int64_t[num_dims];
std::vector<tensorflow::int64> dims(num_dims);
int64_t num_elements = 1;
{
jlong* jdims = env->GetLongArrayElements(shape, nullptr);
for (int i = 0; i < num_dims; ++i) {
dims[i] = static_cast<int64>(jdims[i]);
num_elements *= dims[i];
}
// JNI_ABORT: release without copy-back, since jdims was only read
env->ReleaseLongArrayElements(shape, jdims, JNI_ABORT);
}
TensorShape ts_shape = tensorflow::TensorShape(dims);
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
@@ -393,15 +416,19 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_Tensor_elphelAllocateGPUTensor(JNIEn
platform_gpu_id, false, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shapex.num_elements() * sizeof(tensorflow::DT_UINT8), "GPU_0_bfc");
new tensorflow::GPUBFCAllocator(sub_allocator, num_elements * DataTypeSize(dt_dtype), "GPU_0_bfc"); // sizeof(dt_dtype) would be the enum's size, not the element size
Tensor t_cuda = Tensor(allocator, tensorflow::DT_UINT8, shapex);
Tensor t_cuda = Tensor(allocator, dt_dtype, ts_shape);
//TODO:
// Maybe check tensor pointer here - CUDA or not CUDA?
//t->tensor = t_cuda;
TF_Status* status = TF_NewStatus();
// TODO: Check what exactly this function does...
t = TF_TensorFromTensor(t_cuda,status);
printf("Allocating in GPU!");
//printf("Allocated in GPU!");
return reinterpret_cast<jlong>(t);
}
@@ -677,6 +704,7 @@ JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv* env,
static_cast<jarray>(value));
}
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv* env,
jclass clazz,
jlong handle) {
@@ -712,11 +740,13 @@ JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv* env,
#endif
}
*/
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelTestCUDAPointer(JNIEnv* env,
jclass clazz){
return 0x3;
}
*/
......
@@ -156,14 +156,15 @@ JNIEXPORT jbyteArray JNICALL Java_org_tensorflow_Tensor_scalarBytes(JNIEnv *,
*/
JNIEXPORT void JNICALL Java_org_tensorflow_Tensor_readNDArray(JNIEnv *, jclass,
jlong, jobject);
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelIsCUDATensor(JNIEnv *,
jclass,
jlong);
*/
/*
JNIEXPORT int JNICALL Java_org_tensorflow_Tensor_elphelTestCUDAPointer(JNIEnv *,
jclass);
*/
#ifdef __cplusplus
} // extern "C"
#endif // __cplusplus
......