Commit 773648ca authored by Oleg Dzhimiev

standalone code

parent d8533f78
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <cassert>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <iostream>
#include <iterator>
#include <cstring>
#include <string>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
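// fill the caller-visible batch from the file-backed buffer, refilling it
// via update() whenever the buffer is exhausted (mFileBatchPos == mDims.n())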
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
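// fast path: when whole file buffers map exactly onto whole batches,
// skipping reduces to advancing the file counter; otherwise fall back to
// reading and discarding batches via next()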
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
if(file)
{
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
}
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
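// transpose interleaved HWC (PPM layout) to planar CHW while normalizing
// each byte from [0, 255] to [-1, 1]: y = (2 / 255) * x - 1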
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
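The calibrator above is handed to the TensorRT builder, which pulls calibration batches through `getBatch()` while building an INT8 engine. Below is a minimal sketch of that wiring, not part of this commit: the batch counts and table name are placeholders, and the TensorRT 5.x `setInt8Mode`/`setInt8Calibrator` builder calls are assumed for the 19.02 container this repo targets.

```
#include "NvInfer.h"

// Configure an existing builder for INT8; builder and calibrator are owned
// by the caller and must outlive engine creation.
void enableInt8Calibration(nvinfer1::IBuilder* builder,
                           Int8EntropyCalibrator* calibrator)
{
    builder->setInt8Mode(true);             // request INT8 kernels
    builder->setInt8Calibrator(calibrator); // batches supplied via getBatch()
}

// Typical call site:
//   BatchStream stream(/*batchSize=*/8, /*maxBatches=*/10);
//   Int8EntropyCalibrator calibrator(stream, /*firstBatch=*/0, "CalibrationTable");
//   enableInt8Calibration(builder, &calibrator);
```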
cmake_minimum_required(VERSION 3.16)
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
project(tf-gpu-feed LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
set(CUDACXX /usr/local/cuda/bin/nvcc)
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
channel_first.cu
array.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
# OpenCV libs
find_package(OpenCV REQUIRED)
add_executable(tf-gpu-feed ${SOURCE_FILES})
set_target_properties(tf-gpu-feed PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
#include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
#include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf-gpu-feed
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# tensorflow-feed-from-gpu
Simple TF test
## Setup in Eclipse
From **Eclipse (2019-12)**:
* File > Open Projects from File System...
* Directory... > navigate to project's root > Finish
Importing may take a few attempts; the indexer sometimes fails to pick up the project.
## Run
```
mkdir build
cd build
cmake ..
make
./tf-gpu-feed
```
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If return value is true, execution can continue, otherwise program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
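A short usage sketch for `parseArgs` follows; the `argsParser.h` include name is an assumption (use whatever name this header is saved under):

```
#include <iostream>
#include "argsParser.h" // assumed file name for the header above

int main(int argc, char* argv[])
{
    samplesCommon::Args args;
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        // false is returned for --help and for malformed options
        std::cout << "usage: sample [-h] [-d DIR] [-i] [-u N]" << std::endl;
        return args.help ? 0 : 1;
    }
    std::cout << "int8: " << args.runInInt8 << ", DLA core: " << args.useDLA << std::endl;
    for (const auto& dir : args.dataDirs)
        std::cout << "data dir: " << dir << std::endl;
    return 0;
}
```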
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
// this is the program that is to be run on the device for a
// large number of threads, in our example 100
// each thread takes care of one entry in the number array,
// so in order for the thread to know which number to manipulate,
// a scheme has to be utilized in order to assign each thread a
// unique number
__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
{
// this is the assignment of a unique identifier.
// blockIdx.x is the unique number of the block, in which the
// thread is positioned, blockDim.x holds the number of threads
// for each block and threadIdx.x is the number of the thread in
// this block.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// this tells the thread to manipulate the assigned number in
// the array stored in device memory and increment it
if (idx<N)
numberArray[idx] = numberArray[idx] + 1;
}
// this is the "normal" function to be run on the CPU
// it does the exact same thing as the CUDA function above
void incrementArray(int *numberArray, int N){
// go through every number in the array consecutively
// and increment it
for(int i=0; i<N; ++i)
{
numberArray[i] = numberArray[i] + 1;
}
}
int myCreateCUDAArray(int *tf_ptr){
// some arbitrary array length
int numberOfNumbers = 100;
// declare some arrays for storing numbers
int *numbers1, *numbers2;
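// note: the tf_ptr assignment below is immediately overwritten by
// cudaMallocManaged, so the caller's pointer is effectively unused here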
numbers1 = tf_ptr;
// reserve (allocate) some working space for the numbers in device memory
cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
// fill the input array with some numbers
for(int i=0;i<numberOfNumbers;i++)
{
numbers1[i] = i; // this will be manipulated by the CUDA device (GPU)
numbers2[i] = i; // this will be manipulated by the CPU (as any standard C program would do)
}
// launch the kernel: one block of numberOfNumbers threads; each thread increments one element
incrementArrayViaCUDAdevice<<<1, numberOfNumbers>>>(numbers1, numberOfNumbers);
// wait for the device to finish working
cudaDeviceSynchronize();
// compute the same function "normally" on the CPU
incrementArray(numbers2, numberOfNumbers);
// check if the GPU did the same as the CPU
bool workedCorrectly = true;
for(int i=0;i<numberOfNumbers;i++)
{
if (numbers1[i] != numbers2[i])
workedCorrectly = false;
printf(" %d vs %d |",numbers1[i],numbers2[i]);
}
printf("\n");
if (workedCorrectly)
printf("The device performed well!\n");
else
printf("Something went wrong. The output numbers are not what was to be expected...\n");
// free the space that has been used by our arrays so that
// other programs might use it
cudaFree(numbers1);
cudaFree(numbers2);
return 0;
}
int myCreateCUDAArray(int *tf_ptr);
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
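A hedged sketch of driving `channelFirst` from an OpenCV `GpuMat`: the BGR8 frame and the device allocation of `dest` are assumptions, and `GpuMat::step` supplies the padded row size in bytes that the kernel skips over.

```
#include <opencv2/core/cuda.hpp>

// Convert a device-resident, possibly row-padded HWC uchar frame into a
// packed CHW float tensor scaled to [-1, 1]. dest must point to device
// memory holding frame.rows * frame.cols * frame.channels() floats.
void toChannelFirst(const cv::cuda::GpuMat& frame, float* dest)
{
    const int channelsNum = frame.channels();         // e.g. 3 for BGR
    const int channelSize = frame.rows * frame.cols;  // pixels per channel
    const int rowElems = frame.cols * channelsNum;    // unpadded elements per row
    const int rowSize = static_cast<int>(frame.step); // padded row size in bytes
    channelFirst(frame.data, dest, channelSize, channelsNum, rowElems, rowSize);
}
```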
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
FROM fierval/tensorrt:19.02-py3
# nvcuvid
ADD nvcuvid/* /usr/local/cuda/targets/x86_64-linux/include/
# opencv
RUN apt-get update
RUN apt-get install -y git libgtk2.0-dev curl pkg-config autoconf automake libtool libavcodec-dev \
libavformat-dev libswscale-dev python-dev python-numpy libtbb2 libtbb-dev \
libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev unzip libcurl4-gnutls-dev zlib1g-dev
RUN apt-get install -y wget
RUN apt-get install -y vim
## CMAKE
ADD https://cmake.org/files/v3.13/cmake-3.13.0.tar.gz /
RUN tar xzvf /cmake-3.13.0.tar.gz -C / \
&& cd /cmake-3.13.0 \
&& ./bootstrap \
&& make -j15 \
&& make install
# Second: get and build protobuf 3.6.1, then OpenCV 3.3.1
#
ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz /
RUN tar xzvf /protobuf-cpp-3.6.1.tar.gz -C /
RUN cd /protobuf-3.6.1 \
&& ./configure \
&& make -j15 \
&& make install \
&& ldconfig
ADD https://github.com/opencv/opencv_contrib/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
ADD https://github.com/opencv/opencv/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
RUN cd /workspace/opencv-3.3.1 \
&& mkdir build \
&& cd build \
&& cmake -DBUILD_TIFF=ON \
-DBUILD_opencv_java=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_CUDA=ON \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_TESTS=OFF \
-DBUILD_opencv_cudacodec=ON \
# -DENABLE_FAST_MATH=1 \
# -DCUDA_FAST_MATH=1 \
-DWITH_CUBLAS=1 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
-DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-3.3.1/modules/ \
##
-DCUDA_ARCH_BIN='7.0' \
-DCUDA_ARCH_PTX="" \
##
## AVX in dispatch because not all machines have it
-DCPU_DISPATCH=AVX,AVX2 \
-DENABLE_PRECOMPILED_HEADERS=OFF \
-DWITH_OPENGL=OFF \
-DWITH_OPENCL=OFF \
-DWITH_QT=OFF \
-DWITH_NVCUVID=ON \
-DWITH_IPP=ON \
-DWITH_TBB=ON \
-DFORCE_VTK=ON \
-DWITH_EIGEN=ON \
-DWITH_V4L=ON \
-DWITH_XINE=ON \
-DWITH_GDAL=ON \
-DWITH_1394=OFF \
-DWITH_FFMPEG=OFF \
-DBUILD_PROTOBUF=ON \
-DBUILD_opencv_xfeatures2d=OFF \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_INSTALL_PREFIX=/usr/local \
.. \
&& make -j15 \
&& make install \
&& rm /3.3.1.zip \
&& rm /cmake-3.13.0.tar.gz
RUN mkdir fast_od
RUN mkdir /home/boris
# tensorflow libraries
ADD tensorflow.tar /
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __cuda_h__
#define __cuda_h__
/**
* CUDA API version support
*/
#include "dynlink_cuda_cuda.h"
#endif //__cuda_h__
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
LOG(ERROR) << "ReadClassLabels returned non-zero\n";
return -1;
}
LOG(INFO) << "CUDA INIT DONE\n";
/*
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);