cleanup

4e45d987 · Oleg Dzhimiev · e35a9afe · 4e45d987 · 4e45d987
Commit 4e45d987 authored Feb 20, 2020 by Oleg Dzhimiev
Show whitespace changes
Inline Side-by-side

Showing 2 changed files with 42 additions and 38 deletions

array.cu +1 -1

main.cpp +41 -37

No files found.
--- a/array.cu
+++ b/array.cu
@@ -70,7 +70,7 @@ int myCreateCUDAArray(uint8_t *tf_ptr){
    // check if the GPU did the same as the CPU
    bool workedCorrectly = true;
-    printf("CUDA kernel incrementing test:\n");
+    printf("CUDA kernel simple incrementing test:\n");
    for(int i=0;i<numberOfNumbers;i++)
    {
        if (numbers1[i] != numbers2[i])

--- a/main.cpp
+++ b/main.cpp
@@ -32,27 +32,19 @@ using tensorflow::Status;
 using tensorflow::Tensor;
-Status loadGraph(unique_ptr<tensorflow::Session> *session){
+Status createGraphAndSession(unique_ptr<tensorflow::Session> *session){
    tensorflow::GraphDef graph_def;
    using namespace tensorflow;
-    using namespace tensorflow::ops;
    auto scope = Scope::NewRootScope();
    // TF likes power of 2
    tensorflow::TensorShape shape = tensorflow::TensorShape({256});
-    auto a  = Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, Placeholder::Shape(shape));
+    auto a  = ops::Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, ops::Placeholder::Shape(shape));
+    auto b  = ops::Identity(scope.WithOpName("array_tensor_out"), a);
-    //auto c0 = Const(scope,{256});
-    //auto c1 = Const(scope, (uint8_t)1, {});
-    //auto c2  = Fill(scope, c0, c1);
-    auto b  = Identity(scope.WithOpName("array_tensor_out"), a);
-    //auto b  = Identity(scope.WithOpName("array_tensor_out"), c2);
-    //auto b = Add(scope.WithOpName("array_tensor_out"),c2,a);
    TF_CHECK_OK(scope.ToGraphDef(&graph_def));
@@ -65,7 +57,7 @@ Status loadGraph(unique_ptr<tensorflow::Session> *session){
    session->reset(tensorflow::NewSession(session_options));
    Status session_create_status = (*session)->Create(graph_def);
    if (!session_create_status.ok()){
-        LOG(ERROR) << "loadGraph(): ERROR" << session_create_status;
+        LOG(ERROR) << "createGraphAndSession(): ERROR" << session_create_status;
    }
    return Status::OK();
 }
@@ -109,7 +101,7 @@ int main(int, char**) {
        LOG(INFO) << "Tensorflow built with CUDA, keep running" << endl;
    }
-    // check and init CUDA drivers and libs
+    // check and init CUDA drivers and libs?
    void *hHandleDriver = nullptr;
    CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
    if (cuda_res != CUDA_SUCCESS)
@@ -123,14 +115,17 @@ int main(int, char**) {
    }
    LOG(INFO) << "\033[1;32m" << "CUDA init: ok" << "\033[0m";
+    // construct graph and create TF session
    std::unique_ptr<tensorflow::Session> session;
+    createGraphAndSession(&session);
-    loadGraph(&session);
+    // do the opts and allocate tensor in GPU
+    // NOTE: must match with graph names
    const string inputLayer  = "array_tensor_in:0";
    const string outputLayer = "array_tensor_out:0";
-    // do the opts
    CallableOptions opts;
    Session::CallableHandle feed_gpu_fetch_cpu;
@@ -150,7 +145,6 @@ int main(int, char**) {
    // TF likes power of 2 and 256s
    tensorflow::TensorShape shape = tensorflow::TensorShape({256});
-    // allocate tensor on the GPU
    tensorflow::PlatformGpuId platform_gpu_id(0);
    tensorflow::GPUMemAllocator *sub_allocator =
@@ -165,32 +159,42 @@ int main(int, char**) {
    LOG(INFO) << "\033[1;37m" << "Is CUDA Tensor? " << (IsCUDATensor(inputTensor)?"\033[1;32myes":"\033[1;31mno") << "\033[0m";
+    // pointer to tensor data
    tensorflow::uint8 *p = inputTensor.flat<tensorflow::uint8>().data();
+    // run and check
+    vector<Tensor> outputs;
+    for(int i=0;i<2;i++){
+        cout << endl;
+        if (i==0){
+            LOG(INFO) << "\033[1;32m" << "RunCallable()... No feeding (zeroes)" << "\033[0m";
+        }
+        if (i==1){
+            LOG(INFO) << "\033[1;32m" << "RunCallable()... Feeding from CUDA kernel"<< "\033[0m";
            // CUDA kernel call
+            // NOTE: do not allocate memory for p inside the kernel
            myCreateCUDAArray(p);
+        }
-    vector<Tensor> outputs;
-    LOG(INFO) << "RunCallable()...";
        runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
-    if (!runStatus.ok())
+        if (!runStatus.ok()){
-    {
            LOG(ERROR) << "Running model failed: " << runStatus;
            return -1;
        }
        LOG(INFO) << "RunCallable() output:";
        LOG(INFO) << outputs[0].DebugString();
        auto tmap = outputs[0].tensor<uint8_t, 1>();
        cout << "\033[1;37m";
-    for (int d = 0; d < 256; d++) {
+        for (int d = 0; d < 256; d++){
            cout << (int) tmap(d);
            if (d!=255) cout << ", ";
        }
        cout << "\033[0m" << endl;
+    }
    session->ReleaseCallable(feed_gpu_fetch_cpu);