Commit 763c5251 authored by Oleg Dzhimiev

working on

parent 4b0ba7bb
 cmake_minimum_required(VERSION 3.16)
+set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
 project(tf-gpu-feed LANGUAGES CXX CUDA)
 cmake_policy(SET CMP0074 OLD)
 set(CMAKE_CXX_STANDARD 11)
-set(CUDACXX /usr/local/cuda/bin/nvcc)
-find_package(CUDA 9.0 REQUIRED)
+find_package(CUDA 10.0 REQUIRED)
 set(SOURCE_FILES
   main.cpp
......
@@ -9,7 +9,7 @@
 // a scheme has to be utilized in order to assign each thread a
 // unique number
-__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
+__global__ void incrementArrayViaCUDAdevice(uint8_t *numberArray, int N)
 {
 // this is the assignment of a unique identifier.
 // blockIdx.x is the unique number of the block, in which the
@@ -26,7 +26,7 @@ __global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
 // this is the "normal" function to be run on the CPU
 // it does the exact same thing as the CUDA function above
-void incrementArray(int *numberArray, int N){
+void incrementArray(uint8_t *numberArray, int N){
 // go through every number in the array consecutively
 // and increment it
@@ -36,17 +36,21 @@ void incrementArray(int *numberArray, int N){
 }
 }
-int myCreateCUDAArray(int *tf_ptr){
+int myCreateCUDAArray(uint8_t *tf_ptr){
 // some arbitrary array length
 int numberOfNumbers = 100;
 // declare some arrays for storing numbers
-int *numbers1, *numbers2;
+uint8_t *numbers1;
+uint8_t *numbers2;
+numbers1 = tf_ptr;
 // reserve (allocate) some working space for the numbers in device memory
-cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
-cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
+// do not malloc for tf_ptr
+//cudaMalloc(&tf_ptr,sizeof(uint8_t)*numberOfNumbers);
+cudaMallocManaged(&numbers1, sizeof(uint8_t)*numberOfNumbers);
+cudaMallocManaged(&numbers2, sizeof(uint8_t)*numberOfNumbers);
 // fill the input array with some numbers
 for(int i=0;i<numberOfNumbers;i++)
@@ -66,6 +70,7 @@ int myCreateCUDAArray(int *tf_ptr){
 // check if the GPU did the same as the CPU
 bool workedCorrectly = true;
+printf("CUDA kernel incrementing test:\n");
 for(int i=0;i<numberOfNumbers;i++)
 {
 if (numbers1[i] != numbers2[i])
@@ -80,6 +85,21 @@ int myCreateCUDAArray(int *tf_ptr){
 else
 printf("Something went wrong. The output numbers are not what was to be expected...\n");
+// copy stuff
+cudaMemcpy(tf_ptr,numbers1,numberOfNumbers,cudaMemcpyDeviceToDevice);
+/*
+uint8_t *numbers3;
+cudaMallocManaged(&numbers3, sizeof(uint8_t)*numberOfNumbers);
+cudaMemcpy(numbers3,numbers1,numberOfNumbers,cudaMemcpyDeviceToDevice);
+cudaDeviceSynchronize();
+for(int i=0;i<numberOfNumbers;i++){
+printf("%d|",numbers3[i]);
+}
+printf("\n");
+*/
 // free the space that has been used by our arrays so that
 // other programs might use it
 cudaFree(numbers1);
......
-int myCreateCUDAArray(int *tf_ptr);
+int myCreateCUDAArray(uint8_t *tf_ptr);
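The CUDA changes above switch the kernel, the CPU reference loop, and myCreateCUDAArray() from int to uint8_t so the buffer matches a DT_UINT8 TensorFlow tensor, and the result is now handed to the caller-supplied device pointer tf_ptr with a device-to-device copy. For reference, a minimal self-contained sketch of the same increment, verify, copy-out pattern; function and variable names here are illustrative and not from this commit:

    // Minimal sketch of the pattern above; names are illustrative only.
    // Assumes a CUDA-capable device; error checking is omitted for brevity.
    #include <cstdint>
    #include <cuda_runtime.h>

    __global__ void incrementOnDevice(uint8_t *a, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;   // unique index per thread
        if (i < n) a[i]++;
    }

    int fillIncrementAndExport(uint8_t *dst_device, int n) {
        uint8_t *buf;
        cudaMallocManaged(&buf, n * sizeof(uint8_t));    // unified memory, visible to host and device
        for (int i = 0; i < n; i++) buf[i] = (uint8_t)i; // fill the input on the host
        incrementOnDevice<<<(n + 255) / 256, 256>>>(buf, n);
        cudaDeviceSynchronize();                         // wait for the kernel to finish
        // hand the result to the caller's GPU buffer, like tf_ptr above
        cudaMemcpy(dst_device, buf, n * sizeof(uint8_t), cudaMemcpyDeviceToDevice);
        cudaFree(buf);
        return 0;
    }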
@@ -13,6 +13,7 @@
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/cc/client/client_session.h"
 // CUDA includes. Order matters
 #include "dynlink_nvcuvid.h"
@@ -41,10 +42,18 @@ Status loadGraph(unique_ptr<tensorflow::Session> *session){
 auto scope = Scope::NewRootScope();
 // TF likes power of 2
-tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
+tensorflow::TensorShape shape = tensorflow::TensorShape({256});
 auto a = Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, Placeholder::Shape(shape));
+//auto c0 = Const(scope,{256});
+//auto c1 = Const(scope, (uint8_t)1, {});
+//auto c2 = Fill(scope, c0, c1);
 auto b = Identity(scope.WithOpName("array_tensor_out"), a);
+//auto b = Identity(scope.WithOpName("array_tensor_out"), c2);
+//auto b = Add(scope.WithOpName("array_tensor_out"),c2,a);
 TF_CHECK_OK(scope.ToGraphDef(&graph_def));
 //tensorflow::WriteTextProto(Env::Default(), "mygraph.pbtxt", graph_def);
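The commented-out lines in the hunk above experiment with generating the test vector inside the graph instead of feeding it. A minimal sketch of that variant, used in place of the Identity node and assuming the same scope and the Placeholder a from above; the explicit types are my guess at the intent and are not part of this commit:

    // Sketch of the commented-out in-graph variant (assumption, not committed code):
    // build a constant vector of 256 uint8 ones with Fill and add it to the fed placeholder.
    auto dims = Const(scope, {256});                    // 1-D shape: 256 elements
    auto one  = Const(scope, (tensorflow::uint8)1, {}); // scalar uint8 value
    auto ones = Fill(scope, dims, one);                 // uint8 tensor of 256 ones
    auto sum  = Add(scope.WithOpName("array_tensor_out"), ones, a);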
@@ -139,8 +148,8 @@ int main(int, char**) {
 LOG(ERROR) << "\033[1;31m" << "Failed to make callable" << "\033[0m";
 }
-// TF likes power of 2
-tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
+// TF likes power of 2 and 256s
+tensorflow::TensorShape shape = tensorflow::TensorShape({256});
 // allocate tensor on the GPU
 tensorflow::PlatformGpuId platform_gpu_id(0);
@@ -150,7 +159,7 @@ int main(int, char**) {
 platform_gpu_id, false, {}, {});
 tensorflow::GPUBFCAllocator *allocator =
-new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
+new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::DT_UINT8), "GPU_0_bfc");
 auto inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
@@ -158,12 +167,59 @@ int main(int, char**) {
 tensorflow::uint8 *p = inputTensor.flat<tensorflow::uint8>().data();
-unsigned char* ptr;
-//ptr = p;
-// Testing array initialization
-//myCreateCUDAArray(ptr);
-//check tensor data here
+// CUDA kernel call
+myCreateCUDAArray(p);
+vector<Tensor> outputs;
+LOG(INFO) << "RunCallable()...";
+runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
+if (!runStatus.ok())
+{
+LOG(ERROR) << "Running model failed: " << runStatus;
+return -1;
+}
+LOG(INFO) << "RunCallable() output:";
+LOG(INFO) << outputs[0].DebugString();
+auto tmap = outputs[0].tensor<uint8_t, 1>();
+cout << "\033[1;37m";
+for (int d = 0; d < 256; d++) {
+cout << (int) tmap(d);
+if (d!=255) cout << ", ";
+}
+cout << "\033[0m" << endl;
 session->ReleaseCallable(feed_gpu_fetch_cpu);
+/*
+using namespace tensorflow;
+using namespace tensorflow::ops;
+auto root = Scope::NewRootScope();
+//auto A = Const(root, {1,16,16,1},{8,8});
+//auto B = Const(root, 5,{});
+auto A = Placeholder(root.WithOpName("A"), DT_INT32, Placeholder::Shape({2,2}));
+auto v = Identity(root.WithOpName("v"), A);
+//auto v = Fill(root.WithOpName("v"), A, B);
+// Vector b = [3 5]
+//auto b = Const(root, {{3, 5}});
+// v = Ab^T
+//auto v = MatMul(root.WithOpName("v"), A, b, MatMul::TransposeB(true));
+ClientSession session2(root);
+// Run and fetch v
+int aa[2][2] = {{1,2},{3,4}};
+TF_CHECK_OK(session2.Run({ {A,{{1,2},{3,4}}} },{v}, &outputs));
+// Expect outputs[0] == [19; -3]
+LOG(INFO) << outputs[0].DebugString();
+cout << outputs[0].matrix<int>() << endl;
+*/
 return 0;
 }
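The main() hunk feeds the GPU-resident inputTensor through a pre-built callable handle, feed_gpu_fetch_cpu, whose construction lies outside the hunks shown in this diff. For reference, a minimal sketch of how such a callable is typically created with CallableOptions, declaring that the fed tensor already lives on GPU 0 so the session consumes the device memory in place instead of copying it from the host; the node names come from the graph above, everything else is an assumption about code not shown here:

    // Sketch only -- not the exact code from this repository.
    // Assumes a tensorflow::Session* session created from the graph above.
    tensorflow::CallableOptions opts;
    opts.add_feed("array_tensor_in");    // placeholder defined in loadGraph()
    opts.add_fetch("array_tensor_out");  // identity output defined in loadGraph()
    // declare the feed as GPU-resident so TF uses the device memory directly
    (*opts.mutable_feed_devices())["array_tensor_in"] = "/device:GPU:0";

    tensorflow::Session::CallableHandle feed_gpu_fetch_cpu;
    tensorflow::Status status = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
    if (!status.ok()) {
        LOG(ERROR) << "MakeCallable failed: " << status;
    }
    // then: session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);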