Commit a935f1f0 authored by fierval

fst commit

# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
build/
debug/
.vs/
.vscode/
ssd/
*.tar
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <string>
#include <cassert>
#include <cstring>
#include <algorithm>
#include <iterator>
#include <iomanip>
#include <iostream>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
// if the calibration list cannot be opened there is nothing to read
if (!file)
return false;
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
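// interleaved HWC uchar in [0, 255] -> planar CHW float in [-1, 1]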
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
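A minimal sketch of how this calibrator could be wired into a TensorRT 5-era builder. The `builder`/`network` objects, batch sizes, and table name below are assumptions for illustration, not part of this commit; `setInt8Mode`/`setInt8Calibrator` are the pre-`IBuilderConfig` API matching the `IInt8EntropyCalibrator` interface used here:

```cpp
// Sketch only: assumes an existing nvinfer1::IBuilder* builder
// and an already-parsed network definition.
BatchStream calibrationStream(/*batchSize=*/8, /*maxBatches=*/10);
Int8EntropyCalibrator calibrator(calibrationStream, /*firstBatch=*/0,
                                 "CalibrationTable"); // example table name
builder->setInt8Mode(true);
builder->setInt8Calibrator(&calibrator);
// buildCudaEngine() drives calibration: it calls getBatch() until the
// stream is exhausted, then persists the scales via writeCalibrationCache().
nvinfer1::ICudaEngine* engine = builder->buildCudaEngine(*network);
```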
cmake_minimum_required(VERSION 3.8)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
inference_trt.cpp
channel_first.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Optimized Video Object Detection
The completed application runs any [Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md)-style TensorFlow object detector in TensorFlow mode (default); in TensorRT mode (`-t`) it runs an Inception V2 SSD detector converted from a TensorFlow graph to the UFF format recognized by TensorRT.
## Building the app
* Clone the [repo](https://github.com/fierval/fast_od).
* Get the frozen graph and the class labels files for Tensorflow from [here](https://github.com/fierval/tensorflow-object-detection-cpp/tree/master/demo/ssd_inception_v2)
* Get the [frozen graph for TensorRT](https://www.dropbox.com/s/nc3tzm95ip356i5/sample_ssd_relu6.uff?dl=0). The class labels file should already be in the `/usr/src/tensorrt/data/ssd` directory.
* Build:
```sh
mkdir build
cd build
cmake ..   # or: cmake -DCMAKE_BUILD_TYPE=Debug ..
make
```
## Running
Command line options are described in [`main.cpp`](https://github.com/fierval/fast_od/blob/master/main.cpp):
```cpp
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
```
Examples are in `run_*.sh` files in the sources directory. Worth mentioning:
```
-d=0   run without UX, print out framerate only; -d=2 runs with UX
-t     use the TensorRT graph
-t -i  use the TensorRT graph with INT8 precision
```
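For example, the TensorRT INT8 script bundled with the sources runs:

```sh
./build/tf_detector_example \
  -d=$1 \
  -i \
  -t \
  -v=/home/boris/Videos/ride_2.mp4 \
  -graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
  -labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
```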
## Slowdown due to UX
The application uses a bare-bones OpenCV UI (`imshow`) for visual feedback, and that causes a significant performance hit. To measure actual throughput, run with `-d=0`, which suppresses the UI.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If return value is true, execution can continue, otherwise program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
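A minimal, hypothetical driver for `parseArgs` (the program name and usage text are illustrative; the commit itself ships this parser header-only):

```cpp
// Sketch: assumes the argsParser header above has been included.
#include <iostream>

int main(int argc, char* argv[])
{
    samplesCommon::Args args;
    // parseArgs returns false on -h, an unknown option, or a missing argument
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        std::cout << "usage: sample [-h] [--datadir DIR] [--int8] [--useDLA N]\n";
        return args.help ? 0 : 1;
    }
    if (args.runInInt8)
        std::cout << "running in INT8 mode\n";
    return 0;
}
```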
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
// threads per block for the conversion kernel
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// the grid is rounded up to a whole block: guard stray threads
if (idx >= channelSize * channelsNum)
return;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what the row would be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
// interleaved uchar in [0, 255] -> planar float in [-1, 1]
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0f / 255.0f) - 1.0f;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
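A hypothetical call site (not in this commit) showing how `channelFirst` lines up with an OpenCV `GpuMat`, whose rows may be padded:

```cpp
#include <opencv2/core/cuda.hpp>
#include <cuda_runtime.h>

// Sketch: convert an 8UC3 device frame into the planar,
// [-1, 1]-normalized float buffer a detector consumes.
// d_dest must hold rows * cols * channels floats on the device.
void toPlanar(const cv::cuda::GpuMat& d_frame, float* d_dest)
{
    int channels = d_frame.channels();              // 3 for a BGR frame
    int channelSize = d_frame.rows * d_frame.cols;  // pixels per channel
    channelFirst(d_frame.data, d_dest,
                 channelSize, channels,
                 d_frame.cols * channels,           // elements per row, unpadded
                 static_cast<int>(d_frame.step));   // bytes (== uchar elements) per padded row
}
```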
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include "cuda_runtime_api.h"
#include "dynlink_nvcuvid.h"
tcuvidCreateVideoSource *cuvidCreateVideoSource;
tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
tcuvidCreateVideoParser *cuvidCreateVideoParser;
tcuvidParseVideoData *cuvidParseVideoData;
tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
tcuvidCreateDecoder *cuvidCreateDecoder;
tcuvidDestroyDecoder *cuvidDestroyDecoder;
tcuvidDecodePicture *cuvidDecodePicture;
tcuvidMapVideoFrame *cuvidMapVideoFrame;
tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
//tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
tcuvidCtxLockCreate *cuvidCtxLockCreate;
tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
tcuvidCtxLock *cuvidCtxLock;
tcuvidCtxUnlock *cuvidCtxUnlock;
// Auto-lock helper for C++ applications
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
: m_ctx(ctx)
{
cuvidCtxLock(m_ctx, 0);
}
CCtxAutoLock::~CCtxAutoLock()
{
cuvidCtxUnlock(m_ctx, 0);
}
#define STRINGIFY(X) #X
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <Windows.h>
#ifdef UNICODE
static LPCWSTR __DriverLibName = L"nvcuvid.dll";
#else
static LPCSTR __DriverLibName = "nvcuvid.dll";
#endif
typedef HMODULE DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = LoadLibrary(__DriverLibName);
if (*pInstance == NULL)
{
printf("LoadLibrary \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
static char __DriverLibName[] = "libnvcuvid.so";
typedef void *DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = dlopen(__DriverLibName, RTLD_NOW);
if (*pInstance == NULL)
{
printf("dlopen \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
CUresult CUDAAPI cuvidInit(unsigned int Flags)
{
DLLDRIVER DriverLib;
CHECKED_CALL(LOAD_LIBRARY(&DriverLib));
// fetch all function pointers
GET_PROC(cuvidCreateVideoSource);
GET_PROC(cuvidCreateVideoSourceW);
GET_PROC(cuvidDestroyVideoSource);
GET_PROC(cuvidSetVideoSourceState);
GET_PROC(cuvidGetVideoSourceState);
GET_PROC(cuvidGetSourceVideoFormat);
GET_PROC(cuvidGetSourceAudioFormat);
GET_PROC(cuvidCreateVideoParser);
GET_PROC(cuvidParseVideoData);
GET_PROC(cuvidDestroyVideoParser);
GET_PROC(cuvidCreateDecoder);
GET_PROC(cuvidDestroyDecoder);
GET_PROC(cuvidDecodePicture);
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
GET_PROC(cuvidMapVideoFrame64);
GET_PROC(cuvidUnmapVideoFrame64);
cuvidMapVideoFrame = cuvidMapVideoFrame64;
cuvidUnmapVideoFrame = cuvidUnmapVideoFrame64;
#else
GET_PROC(cuvidMapVideoFrame);
GET_PROC(cuvidUnmapVideoFrame);
#endif
// GET_PROC(cuvidGetVideoFrameSurface);
GET_PROC(cuvidCtxLockCreate);
GET_PROC(cuvidCtxLockDestroy);
GET_PROC(cuvidCtxLock);
GET_PROC(cuvidCtxUnlock);
return CUDA_SUCCESS;
}
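The loader must run before any of the `cuvid*` pointers above are dereferenced; `InferenceBase::InitCuda` below does exactly this. A minimal sketch:

```cpp
// Sketch: cuvidInit(0) fills in every function pointer declared above;
// calling any cuvid* API before it would dereference a null pointer.
if (cuvidInit(0) != CUDA_SUCCESS)
{
    printf("failed to load the nvcuvid library\n");
    return 1;   // e.g. from main()
}
```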
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw runtime_error("cuInit() failed");
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw runtime_error("cuvidInit() failed");
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
return -1;
}
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);
if (d_reader == nullptr)
{
LOG(ERROR) << "Could not create video stream";
throw exception();
}
// save off frame dimensions
auto formatStruct = d_reader->format();
width = formatStruct.width;
height = formatStruct.height;
isInitialized = true;
return 0;
}
void InferenceBase::RunInferenceOnStream()
{
if (!isInitialized)
{
LOG(ERROR) << "Video streaming not initialized";
return;
}
cuda::GpuMat d_frame;
int iFrame = 0, nFrames = 30;
double fps = 0., infer_tf_ms = 0.;
high_resolution_clock::time_point start = high_resolution_clock::now();
high_resolution_clock::time_point end;
double duration = 0.;
for (;;)
{
start = high_resolution_clock::now();
if (!d_reader->nextFrame(d_frame))
{
break;
}
if (doInference(d_frame) != 0)
{
LOG(ERROR) << "Inference failed";
return;
}
end = high_resolution_clock::now();
duration += (double) duration_cast<milliseconds>(end - start).count();
visualize(d_frame, fps);
if (++iFrame % nFrames == 0)
{
fps = 1. * nFrames / duration * 1000.;
duration = 0.;
}
if (iFrame % 100 == 0)
{
LOG(INFO) << "Speed: " << to_string(fps).substr(0, 5);
}
}
}
#pragma once
#include "utils.h"
using namespace std;
class InferenceBase
{
private:
bool isCudaInited;
cv::Ptr<cv::cudacodec::VideoReader> GetVideoReader(string video_file)
{return cv::cudacodec::createVideoReader(video_file);}
protected:
string labelsFile;
string graphFile;
map<int, string> labelsMap;
virtual int ReadClassLabels();
virtual int ReadGraph() = 0;
void InitCuda();
cv::Ptr<cv::cudacodec::VideoReader> d_reader;
double thresholdScore;
double thresholdIOU;
// frame width and height
int height;
int width;
int debug;
bool isInitialized;
public:
InferenceBase(const string &labelsFile, const string &graphFile, double threshScore, double threshIOU, int dbg)
: labelsFile(labelsFile)
, graphFile(graphFile)
, isCudaInited(false)
, thresholdScore(threshScore)
, thresholdIOU(threshIOU)
, isInitialized(false)
, labelsMap()
, debug(dbg)
{}
virtual ~InferenceBase() {}
void RunInferenceOnStream();
virtual int doInference(cv::cuda::GpuMat&) = 0;
virtual void visualize(cv::cuda::GpuMat&, double) = 0;
virtual int Init(string video_stream);
map<int, string> get_labels_map() {return labelsMap;}
void set_debug(int dbg) {debug = dbg;}
};
#include "inference_tf.h"
using tensorflow::Status;
using tensorflow::Tensor;
using namespace cv;
using tensorflow::int32;
int InferenceTensorflow::ReadGraph()
{
LOG(INFO) << "graphFile:" << graphFile;
Status loadGraphStatus = loadGraph(graphFile, &session);
if (!loadGraphStatus.ok())
{
LOG(ERROR) << "loadGraph(): ERROR" << loadGraphStatus;
return -1;
}
else
LOG(INFO) << "loadGraph(): frozen graph loaded" << endl;
return 0;
}
// allocate input tensor
int InferenceTensorflow::Init(string videoStream)
{
if (InferenceBase::Init(videoStream) != 0)
{
return -1;
}
// configure callable options
opts.add_feed(inputLayer);
for (auto const &value : outputLayer)
{
opts.add_fetch(value);
}
const string gpu_device_name = GPUDeviceName(session.get());
opts.clear_fetch_devices();
opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});
auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok())
{
LOG(ERROR) << "Failed to make callable";
}
// allocate tensor on the GPU
tensorflow::TensorShape shape = tensorflow::TensorShape({1, height, width, 3});
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
new tensorflow::GPUMemAllocator(
tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
platform_gpu_id, false /*use_unified_memory*/, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
LOG(INFO) << "Is Cuda Tensor: " << IsCUDATensor(inputTensor);
return 0;
}
int InferenceTensorflow::doInference(cv::cuda::GpuMat &d_frame)
{
Status runStatus;
readTensorFromGpuMat(d_frame, inputTensor);
runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
if (!runStatus.ok())
{
LOG(ERROR) << "Running model failed: " << runStatus;
return -1;
}
return 0;
}
void InferenceTensorflow::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
// Extract results from the outputs vector
tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
tensorflow::TTypes<float>::Flat numDetections = outputs[3].flat<float>();
tensorflow::TTypes<float, 3>::Tensor boxes = outputs[0].flat_outer_dims<float, 3>();
vector<size_t> goodIdxs = filterBoxes(scores, boxes, thresholdIOU, thresholdScore);
if (debug & 0x1)
{
for (size_t i = 0; i < goodIdxs.size(); i++)
LOG(INFO) << "score:" << scores(goodIdxs.at(i)) << ",class:" << labelsMap[classes(goodIdxs.at(i))]
<< " (" << classes(goodIdxs.at(i)) << "), box:"
<< "," << boxes(0, goodIdxs.at(i), 0) << ","
<< boxes(0, goodIdxs.at(i), 1) << "," << boxes(0, goodIdxs.at(i), 2) << ","
<< boxes(0, goodIdxs.at(i), 3);
}
// Draw bboxes and captions
if (debug & 0x2)
{
Mat frame;
d_frame.download(frame);
drawBoundingBoxesOnImage(frame, scores, classes, boxes, labelsMap, goodIdxs);
auto color = Scalar(255, 0, 255);
drawFrameworkSignature(frame, fps, "Tensorflow", color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
using tensorflow::CallableOptions;
using tensorflow::Tensor;
using tensorflow::Session;
class InferenceTensorflow : public InferenceBase
{
private:
const string inputLayer = "image_tensor:0";
const vector<string> outputLayer = {"detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0"};
CallableOptions opts;
std::unique_ptr<tensorflow::Session> session;
Session::CallableHandle feed_gpu_fetch_cpu;
// Allocate input tensor on the gpu
Tensor inputTensor;
vector<Tensor> outputs;
protected:
int ReadGraph() override;
int doInference(cv::cuda::GpuMat& d_frame) override;
void visualize(cv::cuda::GpuMat &d_frame, double) override;
public:
InferenceTensorflow(const string &labelsFile, const string &graphFile, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0)
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, opts()
{ }
int Init(string videoStream) override;
virtual ~InferenceTensorflow() { if (session) session->ReleaseCallable(feed_gpu_fetch_cpu); }
};
#include "inference_trt.h"
using namespace cv;
using namespace std;
int InferenceTensorRT::ReadGraph()
{
auto runtimeEngineContext = CreateTrtEngineAndContext(graphFile, isInt8);
runtime = std::get<0>(runtimeEngineContext);
engine = std::get<1>(runtimeEngineContext);
context = std::get<2>(runtimeEngineContext);
return 0;
}
int InferenceTensorRT::ReadClassLabels()
{
populateClassLabels(labelsVector, labelsFile);
return 0;
}
int InferenceTensorRT::doInference(cv::cuda::GpuMat &d_frame)
{
auto inferenceTuple = doInferenceWithTrt(d_frame, context, labelsVector);
detections = std::get<0>(inferenceTuple);
numDetections = std::get<1>(inferenceTuple);
return 0;
}
void InferenceTensorRT::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
Mat img;
d_frame.download(img);
for (int p = 0; p < N; ++p)
{
for (int i = 0; i < numDetections[p]; ++i)
{
float *det = &detections[0] + (p * detectionOutputParam.keepTopK + i) * 7;
if (det[2] < visualizeThreshold)
continue;
// Output format for each detection is stored in the below order
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
assert((int)det[1] < OUTPUT_CLS_SIZE);
std::string storeName = outFileRoot + labelsVector[(int)det[1]] + "-" + std::to_string(det[2]) + ".jpg";
if (debug & 0x2)
{
// det array idxs: (4, 3) = (y0, x0), (6, 5) = (y1, x1)
// coordinates are normalized: 0 <= pt <= 1
drawBoundingBoxOnImage(img, det[4], det[3], det[6], det[5], det[2], labelsVector[(int)det[1]]);
}
}
}
if (debug & 0x2)
{
string framework("TensorRT");
if (isInt8)
{
framework += " (INT8)";
}
auto color = Scalar(0, 255, 255);
drawFrameworkSignature(img, fps, framework, color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
class InferenceTensorRT : public InferenceBase
{
private:
IRuntime *runtime{nullptr};
ICudaEngine *engine{nullptr};
IExecutionContext *context{nullptr};
bool isInt8;
//batch size
const int N = 1;
const float visualizeThreshold = 0.5;
vector<string> labelsVector;
vector<int> numDetections;
vector<float> detections;
string outFileRoot;
protected:
int ReadGraph() override;
int ReadClassLabels() override;
int doInference(cv::cuda::GpuMat &d_frame) override;
void visualize(cv::cuda::GpuMat&, double) override;
public:
InferenceTensorRT(const string &labelsFile, const string &graphFile, bool isInt8, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0, string outFile="")
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, labelsVector()
, numDetections(N)
, detections(N * detectionOutputParam.keepTopK * 7)
, outFileRoot(outFile)
, isInt8(isInt8)
{
}
virtual ~InferenceTensorRT()
{
if(context != nullptr)
{
context->destroy();
}
if(engine != nullptr)
{
engine->destroy();
}
if(runtime != nullptr)
{
runtime->destroy();
}
}
};
#include "inference_base.h"
#include "inference_tf.h"
#include "inference_trt.h"
#include <cuda_profiler_api.h>
using tensorflow::CallableOptions;
using tensorflow::int32;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::Tensor;
using namespace std;
using namespace cv;
using namespace std::chrono;
int main(int argc, char *argv[])
{
if (!tensorflow::IsGoogleCudaEnabled())
{
LOG(ERROR) << "Tensorflow built without CUDA. Rebuild with -c opt --config=cuda";
return -1;
}
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
// Set dirs variables
string ROOTDIR = "";
CommandLineParser parser(argc, argv, keys);
int showWindow = parser.get<int>("d");
String video_file = parser.get<String>("v");
bool is_tensor_rt = parser.get<bool>("t");
bool is_int8 = parser.get<bool>("i");
String LABELS = parser.get<String>("labels");
String GRAPH = parser.get<String>("graph");
unique_ptr<InferenceBase> infer(is_tensor_rt ?
(InferenceBase *) new InferenceTensorRT(LABELS, GRAPH, is_int8)
: (InferenceBase *) new InferenceTensorflow(LABELS, GRAPH));
infer->set_debug(showWindow);
if (infer->Init(video_file) != 0)
{
return -1;
}
infer->RunInferenceOnStream();
return 0;
}
./build/tf_detector_example -d=$1 \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/home/boris/model/frozen_inference_graph.pb \
-labels=/home/boris/model/mscoco_label_map.pbtxt
./build/tf_detector_example \
-d=$1 \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
./build/tf_detector_example \
-d=$1 \
-i \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
#ifndef TF_DETECTOR_EXAMPLE_UTILS_H
#define TF_DETECTOR_EXAMPLE_UTILS_H
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <map>
#include <unordered_map>
#include <math.h>
#include <regex>
#include <tuple>
#include <cassert>
#include <cublas_v2.h>
#include <cudnn.h>
#include <sstream>
#include <time.h>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "common.h"
#include "NvInferPlugin.h"
// Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
// Direct session
#include "tensorflow/core/common_runtime/direct_session.h"
#include <cv.hpp>
#include <opencv2/cudacodec.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
// CUDA includes. Order matters
#include <dynlink_nvcuvid.h>
#include "cuda_runtime_api.h"
using namespace std;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::Session;
using namespace nvinfer1;
using namespace nvuffparser;
string type2str(int type);
Status readLabelsMapFile(const string &fileName, std::map<int, string> &labelsMap);
Status loadGraph(const string &graph_file_name,
std::unique_ptr<tensorflow::Session> *session);
Status readTensorFromMat(const cv::Mat &mat, Tensor &outTensor);
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor);
void drawBoundingBoxOnImage(cv::Mat &image, double xMin, double yMin, double xMax, double yMax, double score, std::string label, bool scaled = true);
void drawBoundingBoxesOnImage(cv::Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
std::map<int, string> &labelsMap,
std::vector<size_t> &idxs);
void drawFrameworkSignature(cv::Mat& image, double fps, string signature, cv::Scalar& color);
double IOU(cv::Rect box1, cv::Rect box2);
std::vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore);
bool IsCUDATensor(const Tensor &t);
string GPUDeviceName(Session* session);
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat& img, IExecutionContext * context, vector<std::string>& CLASSES);
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8);
extern DetectionOutputParameters detectionOutputParam;
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName);
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElements, int rowSize);
extern const int OUTPUT_CLS_SIZE;
extern const int OUTPUT_BBOX_SIZE;
#endif //TF_DETECTOR_EXAMPLE_UTILS_H