working on

763c5251 · Oleg Dzhimiev · 4b0ba7bb · 763c5251 · 763c5251 · 763c5251
Commit 763c5251 authored Feb 19, 2020 by Oleg Dzhimiev
Show whitespace changes
Inline Side-by-side

Showing with 98 additions and 22 deletions

CMakeLists.txt CMakeLists.txt +3 -3

array.cu array.cu +27 -7

array.h array.h +1 -1

main.cpp main.cpp +67 -11

No files found.
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
 cmake_minimum_required(VERSION 3.16)
+set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
 project(tf-gpu-feed LANGUAGES CXX CUDA)
 cmake_policy(SET CMP0074 OLD)
 set(CMAKE_CXX_STANDARD 11)
-set(CUDACXX /usr/local/cuda/bin/nvcc)
+find_package(CUDA 10.0 REQUIRED)
-find_package(CUDA 9.0 REQUIRED)
 set(SOURCE_FILES
    main.cpp

--- a/array.cu
+++ b/array.cu
@@ -9,7 +9,7 @@
 // a scheme has to be utilized in order to assign each thread a
 // unique number
-__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
+__global__ void incrementArrayViaCUDAdevice(uint8_t *numberArray, int N)
 {
    // this is the assignment of a unique identifier.
    // blockIdx.x is the unique number of the block, in which the
@@ -26,7 +26,7 @@ __global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
 // this is the "normal" function to be run on the CPU
 // it does the exact same thing as the CUDA function above
-void incrementArray(int *numberArray, int N){
+void incrementArray(uint8_t *numberArray, int N){
    // go through every number in the array consecutively
    // and increment it
@@ -36,17 +36,21 @@ void incrementArray(int *numberArray, int N){
    }
 }
-int myCreateCUDAArray(int *tf_ptr){
+int myCreateCUDAArray(uint8_t *tf_ptr){
    // some arbitrary array length
    int numberOfNumbers = 100;
    // declare some arrays for storing numbers
-    int *numbers1, *numbers2;
+    uint8_t *numbers1;
+    uint8_t *numbers2;
-    numbers1 = tf_ptr;
    // reserve (allocate) some working space for the numbers in device memory
-    cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
-    cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
+    // do not malloc for tf_ptr
+    //cudaMalloc(&tf_ptr,sizeof(uint8_t)*numberOfNumbers);
+    cudaMallocManaged(&numbers1, sizeof(uint8_t)*numberOfNumbers);
+    cudaMallocManaged(&numbers2, sizeof(uint8_t)*numberOfNumbers);
    // fill the input array with some numbers
    for(int i=0;i<numberOfNumbers;i++)
@@ -66,6 +70,7 @@ int myCreateCUDAArray(int *tf_ptr){
    // check if the GPU did the same as the CPU
    bool workedCorrectly = true;
+    printf("CUDA kernel incrementing test:\n");
    for(int i=0;i<numberOfNumbers;i++)
    {
        if (numbers1[i] != numbers2[i])
@@ -80,6 +85,21 @@ int myCreateCUDAArray(int *tf_ptr){
    else
        printf("Something went wrong. The output numbers are not what was to be expected...\n");
+    // copy the results from numbers1 into the caller-provided tf_ptr buffer
+    cudaMemcpy(tf_ptr,numbers1,numberOfNumbers,cudaMemcpyDeviceToDevice);
+    /*
+    uint8_t *numbers3;
+    cudaMallocManaged(&numbers3, sizeof(uint8_t)*numberOfNumbers);
+    cudaMemcpy(numbers3,numbers1,numberOfNumbers,cudaMemcpyDeviceToDevice);
+    cudaDeviceSynchronize();
+    for(int i=0;i<numberOfNumbers;i++){
+        printf("%d|",numbers3[i]);
+    }
+    printf("\n");
+    */
    // free the space that has been used by our arrays so that
    // other programs might use it
    cudaFree(numbers1);

--- a/array.h
+++ b/array.h
-int myCreateCUDAArray(int *tf_ptr);
+int myCreateCUDAArray(uint8_t *tf_ptr);
--- a/main.cpp
+++ b/main.cpp
@@ -13,6 +13,7 @@
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/client/client_session.h"
 // CUDA includes. Order matters
 #include "dynlink_nvcuvid.h"
@@ -41,10 +42,18 @@ Status loadGraph(unique_ptr<tensorflow::Session> *session){
    auto scope = Scope::NewRootScope();
    // TF likes power of 2
-    tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
+    tensorflow::TensorShape shape = tensorflow::TensorShape({256});
    auto a  = Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, Placeholder::Shape(shape));
+    //auto c0 = Const(scope,{256});
+    //auto c1 = Const(scope, (uint8_t)1, {});
+    //auto c2  = Fill(scope, c0, c1);
    auto b  = Identity(scope.WithOpName("array_tensor_out"), a);
+    //auto b  = Identity(scope.WithOpName("array_tensor_out"), c2);
+    //auto b = Add(scope.WithOpName("array_tensor_out"),c2,a);
    TF_CHECK_OK(scope.ToGraphDef(&graph_def));
    //tensorflow::WriteTextProto(Env::Default(), "mygraph.pbtxt", graph_def);
@@ -139,8 +148,8 @@ int main(int, char**) {
        LOG(ERROR) << "\033[1;31m" << "Failed to make callable" << "\033[0m";
    }
-    // TF likes power of 2
+    // TF likes power of 2 and 256s
-    tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
+    tensorflow::TensorShape shape = tensorflow::TensorShape({256});
    // allocate tensor on the GPU
    tensorflow::PlatformGpuId platform_gpu_id(0);
@@ -150,7 +159,7 @@ int main(int, char**) {
            platform_gpu_id, false, {}, {});
    tensorflow::GPUBFCAllocator *allocator =
-        new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
+        new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::DT_UINT8), "GPU_0_bfc");
    auto inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
@@ -158,12 +167,59 @@ int main(int, char**) {
    tensorflow::uint8 *p = inputTensor.flat<tensorflow::uint8>().data();
-    unsigned char* ptr;
+    // CUDA kernel call
-    //ptr = p;
+    myCreateCUDAArray(p);
-    // Testing array initialization
-    //myCreateCUDAArray(ptr);
+    vector<Tensor> outputs;
-    //check tensor data here
+    LOG(INFO) << "RunCallable()...";
+    runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
+    if (!runStatus.ok())
+    {
+        LOG(ERROR) << "Running model failed: " << runStatus;
+        return -1;
+    }
+    LOG(INFO) << "RunCallable() output:";
+    LOG(INFO) << outputs[0].DebugString();
+    auto tmap = outputs[0].tensor<uint8_t, 1>();
+    cout << "\033[1;37m";
+    for (int d = 0; d < 256; d++) {
+        cout << (int) tmap(d);
+        if (d!=255) cout << ", ";
+    }
+    cout << "\033[0m" << endl;
    session->ReleaseCallable(feed_gpu_fetch_cpu);
+    /*
+    using namespace tensorflow;
+    using namespace tensorflow::ops;
+    auto root = Scope::NewRootScope();
+    //auto A = Const(root, {1,16,16,1},{8,8});
+    //auto B = Const(root, 5,{});
+    auto A = Placeholder(root.WithOpName("A"), DT_INT32, Placeholder::Shape({2,2}));
+    auto v = Identity(root.WithOpName("v"), A);
+    //auto v = Fill(root.WithOpName("v"), A, B);
+    // Vector b = [3 5]
+    //auto b = Const(root, {{3, 5}});
+    // v = Ab^T
+    //auto v = MatMul(root.WithOpName("v"), A, b, MatMul::TransposeB(true));
+    ClientSession session2(root);
+    // Run and fetch v
+    int aa[2][2] = {{1,2},{3,4}};
+    TF_CHECK_OK(session2.Run({ {A,{{1,2},{3,4}}} },{v}, &outputs));
+    // Expect outputs[0] == [[1, 2], [3, 4]] (v is Identity of the fed A)
+    LOG(INFO) << outputs[0].DebugString();
+    cout << outputs[0].matrix<int>() << endl;
+    */
    return 0;
 }