Elphel / tensorflow-feed-from-gpu · Commits
Commit 763c5251 authored Feb 19, 2020 by Oleg Dzhimiev
working on
parent 4b0ba7bb
Showing 4 changed files with 98 additions and 22 deletions
CMakeLists.txt  +3 -3
array.cu        +27 -7
array.h         +1 -1
main.cpp        +67 -11
CMakeLists.txt
 cmake_minimum_required(VERSION 3.16)
 set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
 project(tf-gpu-feed LANGUAGES CXX CUDA)
 cmake_policy(SET CMP0074 OLD)
 set(CMAKE_CXX_STANDARD 11)
 set(CUDACXX /usr/local/cuda/bin/nvcc)
-find_package(CUDA 9.0 REQUIRED)
+find_package(CUDA 10.0 REQUIRED)
 set(SOURCE_FILES
     main.cpp
...
array.cu
...
@@ -9,7 +9,7 @@
 // a scheme has to be utilized in order to assign each thread a
 // unique number
-__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
+__global__ void incrementArrayViaCUDAdevice(uint8_t *numberArray, int N)
 {
 	// this is the assignment of a unique identifier.
 	// blockIdx.x is the unique number of the block, in which the
...
@@ -26,7 +26,7 @@ __global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
 // this is the "normal" function to be run on the CPU
 // it does the exact same thing as the CUDA function above
-void incrementArray(int *numberArray, int N){
+void incrementArray(uint8_t *numberArray, int N){
 	// go through every number in the array consecutively
 	// and increment it
...
@@ -36,17 +36,21 @@ void incrementArray(int *numberArray, int N){
 	}
 }
-int myCreateCUDAArray(int *tf_ptr){
+int myCreateCUDAArray(uint8_t *tf_ptr){
 	// some arbitrary array length
 	int numberOfNumbers = 100;
 	// declare some arrays for storing numbers
-	int *numbers1, *numbers2;
+	uint8_t *numbers1;
+	uint8_t *numbers2;
+	numbers1 = tf_ptr;
 	// reserve (allocate) some working space for the numbers in device memory
-	cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
-	cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
+	// do not malloc for tf_ptr
+	//cudaMalloc(&tf_ptr, sizeof(uint8_t)*numberOfNumbers);
+	cudaMallocManaged(&numbers1, sizeof(uint8_t)*numberOfNumbers);
+	cudaMallocManaged(&numbers2, sizeof(uint8_t)*numberOfNumbers);
 	// fill the input array with some numbers
 	for(int i = 0; i < numberOfNumbers; i++)
...
@@ -66,6 +70,7 @@ int myCreateCUDAArray(int *tf_ptr){
 	// check if the GPU did the same as the CPU
 	bool workedCorrectly = true;
+	printf("CUDA kernel incrementing test:\n");
 	for(int i = 0; i < numberOfNumbers; i++)
 	{
 		if (numbers1[i] != numbers2[i])
...
@@ -80,6 +85,21 @@ int myCreateCUDAArray(int *tf_ptr){
 	else
 		printf("Something went wrong. The output numbers are not what was to be expected...\n");
+	// copy stuff
+	cudaMemcpy(tf_ptr, numbers1, numberOfNumbers, cudaMemcpyDeviceToDevice);
+	/*
+	uint8_t *numbers3;
+	cudaMallocManaged(&numbers3, sizeof(uint8_t)*numberOfNumbers);
+	cudaMemcpy(numbers3, numbers1, numberOfNumbers, cudaMemcpyDeviceToDevice);
+	cudaDeviceSynchronize();
+	for(int i = 0; i < numberOfNumbers; i++){
+		printf("%d|", numbers3[i]);
+	}
+	printf("\n");
+	*/
 	// free the space that has been used by our arrays so that
 	// other programs might use it
 	cudaFree(numbers1);
...
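For reference, the kernel body (elided in the hunks above) follows the standard CUDA indexing pattern the surviving comments describe: each thread derives a unique element index from its block and thread IDs and increments that element. A minimal sketch of what such a body typically looks like after the uint8_t change (the bounds guard is an assumption; the commit only shows the signature and comments):

__global__ void incrementArrayViaCUDAdevice(uint8_t *numberArray, int N)
{
	// unique element index: block number times threads-per-block,
	// plus this thread's index within its block
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	// the grid may contain more threads than array elements
	if (idx < N)
		numberArray[idx]++;
}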
array.h
-int myCreateCUDAArray(int *tf_ptr);
+int myCreateCUDAArray(uint8_t *tf_ptr);
main.cpp
...
@@ -13,6 +13,7 @@
 #include "tensorflow/cc/framework/scope.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/cc/client/client_session.h"
+// CUDA includes. Order matters
 #include "dynlink_nvcuvid.h"
...
@@ -41,10 +42,18 @@ Status loadGraph(unique_ptr<tensorflow::Session> *session){
 	auto scope = Scope::NewRootScope();
-	// TF likes power of 2
-	tensorflow::TensorShape shape = tensorflow::TensorShape({1, 32, 32, 1});
+	tensorflow::TensorShape shape = tensorflow::TensorShape({256});
 	auto a = Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, Placeholder::Shape(shape));
+	//auto c0 = Const(scope, {256});
+	//auto c1 = Const(scope, (uint8_t)1, {});
+	//auto c2 = Fill(scope, c0, c1);
 	auto b = Identity(scope.WithOpName("array_tensor_out"), a);
+	//auto b = Identity(scope.WithOpName("array_tensor_out"), c2);
+	//auto b = Add(scope.WithOpName("array_tensor_out"), c2, a);
 	TF_CHECK_OK(scope.ToGraphDef(&graph_def));
 	//tensorflow::WriteTextProto(Env::Default(), "mygraph.pbtxt", graph_def);
...
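The remainder of loadGraph() is not shown in this hunk. Presumably it instantiates the session from the GraphDef assembled above; a minimal sketch of that step with default options, using the standard TF 1.x C++ session API (the exact options used by the project are not visible in this diff):

	// create a session and load the graph into it
	session->reset(tensorflow::NewSession(tensorflow::SessionOptions()));
	return (*session)->Create(graph_def);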
@@ -139,8 +148,8 @@ int main(int, char**) {
 		LOG(ERROR) << "\033[1;31m" << "Failed to make callable" << "\033[0m";
 	}
-	// TF likes power of 2
-	tensorflow::TensorShape shape = tensorflow::TensorShape({1, 32, 32, 1});
+	// TF likes power of 2 and 256s
+	tensorflow::TensorShape shape = tensorflow::TensorShape({256});
 	// allocate tensor on the GPU
 	tensorflow::PlatformGpuId platform_gpu_id(0);
...
@@ -150,7 +159,7 @@ int main(int, char**) {
 		platform_gpu_id, false, {}, {});
 	tensorflow::GPUBFCAllocator *allocator =
-		new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
+		new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::DT_UINT8), "GPU_0_bfc");
 	auto inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
...
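The start of the sub_allocator construction is truncated above; the visible trailing arguments (platform_gpu_id, false, {}, {}) match the constructor of TF 1.x's internal GPUMemAllocator, so the full statement is presumably along these lines (internal API, reconstructed by assumption):

	// raw GPU memory sub-allocator that the BFC allocator carves up
	tensorflow::GPUMemAllocator *sub_allocator = new tensorflow::GPUMemAllocator(
		tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
		platform_gpu_id,
		false,  // use_unified_memory
		{},     // alloc_visitors
		{});    // free_visitors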
@@ -158,12 +167,59 @@ int main(int, char**) {
 	tensorflow::uint8 *p = inputTensor.flat<tensorflow::uint8>().data();
 	unsigned char *ptr;
 	//ptr = p;
-	// Testing array initialization
-	//myCreateCUDAArray(ptr);
-	//check tensor data here
+	// CUDA kernel call
+	myCreateCUDAArray(p);
 	vector<Tensor> outputs;
+	LOG(INFO) << "RunCallable()...";
 	runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
 	if (!runStatus.ok()) {
 		LOG(ERROR) << "Running model failed: " << runStatus;
 		return -1;
 	}
+	LOG(INFO) << "RunCallable() output:";
+	LOG(INFO) << outputs[0].DebugString();
+	auto tmap = outputs[0].tensor<uint8_t, 1>();
+	cout << "\033[1;37m";
+	for (int d = 0; d < 256; d++){
+		cout << (int)tmap(d);
+		if (d != 255) cout << ", ";
+	}
+	cout << "\033[0m" << endl;
 	session->ReleaseCallable(feed_gpu_fetch_cpu);
+	/*
+	using namespace tensorflow;
+	using namespace tensorflow::ops;
+	auto root = Scope::NewRootScope();
+	//auto A = Const(root, {1,16,16,1}, {8,8});
+	//auto B = Const(root, 5, {});
+	auto A = Placeholder(root.WithOpName("A"), DT_INT32, Placeholder::Shape({2,2}));
+	auto v = Identity(root.WithOpName("v"), A);
+	//auto v = Fill(root.WithOpName("v"), A, B);
+	// Vector b = [3 5]
+	//auto b = Const(root, {{3, 5}});
+	// v = Ab^T
+	//auto v = MatMul(root.WithOpName("v"), A, b, MatMul::TransposeB(true));
+	ClientSession session2(root);
+	// Run and fetch v
+	int aa[2][2] = {{1,2},{3,4}};
+	TF_CHECK_OK(session2.Run({ {A, {{1,2},{3,4}}} }, {v}, &outputs));
+	// Expect outputs[0] == [19; -3]
+	LOG(INFO) << outputs[0].DebugString();
+	cout << outputs[0].matrix<int>() << endl;
+	*/
 	return 0;
 }
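The feed_gpu_fetch_cpu handle used above is created earlier in main() and is not part of this diff. With Session::MakeCallable, feeding a tensor whose buffer already resides in GPU memory while fetching the result back to the CPU is typically configured as below (a sketch assuming the node names from loadGraph(); the device string is an assumption):

	tensorflow::CallableOptions opts;
	opts.add_feed("array_tensor_in:0");
	opts.add_fetch("array_tensor_out:0");
	// tell TF that the fed tensor's memory already lives on GPU 0
	(*opts.mutable_feed_devices())["array_tensor_in:0"] =
		"/job:localhost/replica:0/task:0/device:GPU:0";
	tensorflow::Session::CallableHandle feed_gpu_fetch_cpu;
	runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);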