Commit a935f1f0 authored by fierval

fst commit

# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
build/
debug/
.vs/
.vscode/
ssd/
*.tar
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
// cannot calibrate without the list of image files
if (!file)
return false;
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
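// scale each pixel from [0, 255] to [-1, 1] and convert interleaved PPM (HWC) data to planar (CHW) layout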
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
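A minimal sketch of how `BatchStream` and `Int8EntropyCalibrator` are typically wired into a TensorRT INT8 build (TensorRT 5-era `IBuilder` API; the header name, batch counts, and builder/network setup below are illustrative assumptions, not code from this commit):

```cpp
#include "BatchStreamPPM.h"  // assumed header name for the classes above
#include "NvInfer.h"

nvinfer1::ICudaEngine* buildInt8Engine(nvinfer1::IBuilder* builder,
                                       nvinfer1::INetworkDefinition* network)
{
    const int kCalibBatchSize = 10;   // assumption: calibration batch size
    const int kNbCalibBatches = 10;   // assumption: number of calibration batches

    BatchStream calibStream(kCalibBatchSize, kNbCalibBatches);
    Int8EntropyCalibrator calibrator(calibStream, 0, "CalibrationTableSSD");

    builder->setMaxBatchSize(kCalibBatchSize);
    builder->setInt8Mode(true);               // TensorRT 5.x-style INT8 switch
    builder->setInt8Calibrator(&calibrator);  // getBatch()/readCalibrationCache() are called during the build

    return builder->buildCudaEngine(*network);
}
```

TensorRT drives the calibration loop itself: during `buildCudaEngine` it repeatedly calls the calibrator's `getBatch()` and consults `readCalibrationCache()` / `writeCalibrationCache()`.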
cmake_minimum_required(VERSION 3.8)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
inference_trt.cpp
channel_first.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Optimized Video Object Detection
The completed application runs any [Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) style object detector in Tensorflow mode (default), and an Inception V2 SSD detector converted from a Tensorflow graph to the UFF format recognized by TensorRT in TensorRT mode (`-t`).
## Building the app
* Clone the [repo](https://github.com/fierval/fast_od).
* Get the frozen graph and the class labels files for Tensorflow from [here](https://github.com/fierval/tensorflow-object-detection-cpp/tree/master/demo/ssd_inception_v2)
* Get the [frozen graph for TensorRT](https://www.dropbox.com/s/nc3tzm95ip356i5/sample_ssd_relu6.uff?dl=0). The class labels file should be available in the `/usr/src/tensorrt/data/ssd` directory.
* Build:
```sh
mkdir build
cd build
cmake .. # cmake -DCMAKE_BUILD_TYPE=Debug for a debug build
make
```
## Running
Command line options are described in [`main.cpp`](https://github.com/fierval/fast_od/blob/master/main.cpp):
```cpp
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
```
Examples are in the `run_*.sh` files in the source directory. Worth mentioning:
```
-d=0 - run without UX, print out framerate only; -d=2 - run with UX
-t - TensorRT graph
-t -i - TensorRT graph with INT8 precision.
```
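For example, a headless TensorRT INT8 run might look like `./tf_detector_example -t -i -d=0 --video=<video file> --graph=<frozen graph> --labels=<labels file>` (the file paths here are placeholders).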
## Slowdown due to UX
The application uses a bare-bones OpenCV UI (`imshow`) for visual feedback, which causes a significant performance hit, so actual performance is measured by running with `-d=0`, which suppresses the UI.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return bool Returns true if execution can continue; otherwise the program should exit.
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
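For reference, a minimal (hypothetical) `main` that uses `parseArgs` following the contract described above, exiting when it returns false; the header file name is an assumption:

```cpp
#include <iostream>
#include "argsParser.h"  // assumed file name for the header above

int main(int argc, char* argv[])
{
    samplesCommon::Args args;
    // parseArgs returns false when --help was requested or an argument was invalid.
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        std::cout << "Usage: sample [-h] [-d <dir>] [-i] [--useDLA <n>]" << std::endl;
        return args.help ? 0 : 1;
    }
    std::cout << "INT8: " << std::boolalpha << args.runInInt8 << std::endl;
    return 0;
}
```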
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// the grid is rounded up to whole blocks, so guard against out-of-range threads
if (idx >= channelSize * channelsNum)
return;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
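A usage sketch for `channelFirst` with an OpenCV `GpuMat` frame; the mapping of `GpuMat` fields to the kernel parameters reflects my reading of the kernel above (`rowSize` is the padded row pitch in bytes, which equals elements for an 8-bit image), and `toChannelFirst`/`trtInput` are illustrative names:

```cpp
#include <opencv2/core/cuda.hpp>
#include <cuda_runtime.h>

void channelFirst(unsigned char* source, float* dest, int channelSize,
                  int channelsNum, int rowElems, int rowSize);

void toChannelFirst(const cv::cuda::GpuMat& frame, float* trtInput)
{
    int channels    = frame.channels();              // e.g. 3 for a BGR frame
    int channelSize = frame.rows * frame.cols;       // elements per plane
    int rowElems    = frame.cols * channels;         // payload elements per row
    int rowSize     = static_cast<int>(frame.step);  // bytes per row, including padding

    // frame.data already lives on the device; trtInput must hold channels * channelSize floats
    channelFirst(frame.data, trtInput, channelSize, channels, rowElems, rowSize);
}
```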
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include "cuda_runtime_api.h"
#include "dynlink_nvcuvid.h"
tcuvidCreateVideoSource *cuvidCreateVideoSource;
tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
tcuvidCreateVideoParser *cuvidCreateVideoParser;
tcuvidParseVideoData *cuvidParseVideoData;
tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
tcuvidCreateDecoder *cuvidCreateDecoder;
tcuvidDestroyDecoder *cuvidDestroyDecoder;
tcuvidDecodePicture *cuvidDecodePicture;
tcuvidMapVideoFrame *cuvidMapVideoFrame;
tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
//tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
tcuvidCtxLockCreate *cuvidCtxLockCreate;
tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
tcuvidCtxLock *cuvidCtxLock;
tcuvidCtxUnlock *cuvidCtxUnlock;
// Auto-lock helper for C++ applications
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
: m_ctx(ctx)
{
cuvidCtxLock(m_ctx, 0);
}
CCtxAutoLock::~CCtxAutoLock()
{
cuvidCtxUnlock(m_ctx, 0);
}
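// Usage sketch (illustrative, not part of this file): create the lock once with
// cuvidCtxLockCreate(), then guard decoder calls with a scoped CCtxAutoLock:
//
//     CCtxAutoLock lck(m_ctxLock);                 // locks in the constructor
//     cuvidDecodePicture(m_decoder, &picParams);   // work while holding the lock
//     // unlocked automatically when lck goes out of scope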
#define STRINGIFY(X) #X
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <Windows.h>
#ifdef UNICODE
static LPCWSTR __DriverLibName = L"nvcuvid.dll";
#else
static LPCSTR __DriverLibName = "nvcuvid.dll";
#endif
typedef HMODULE DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = LoadLibrary(__DriverLibName);
if (*pInstance == NULL)
{
printf("LoadLibrary \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
static char __DriverLibName[] = "libnvcuvid.so";
typedef void *DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = dlopen(__DriverLibName, RTLD_NOW);
if (*pInstance == NULL)
{
printf("dlopen \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
CUresult CUDAAPI cuvidInit(unsigned int Flags)
{
DLLDRIVER DriverLib;
CHECKED_CALL(LOAD_LIBRARY(&DriverLib));
// fetch all function pointers
GET_PROC(cuvidCreateVideoSource);
GET_PROC(cuvidCreateVideoSourceW);
GET_PROC(cuvidDestroyVideoSource);
GET_PROC(cuvidSetVideoSourceState);
GET_PROC(cuvidGetVideoSourceState);
GET_PROC(cuvidGetSourceVideoFormat);
GET_PROC(cuvidGetSourceAudioFormat);
GET_PROC(cuvidCreateVideoParser);
GET_PROC(cuvidParseVideoData);
GET_PROC(cuvidDestroyVideoParser);
GET_PROC(cuvidCreateDecoder);
GET_PROC(cuvidDestroyDecoder);
GET_PROC(cuvidDecodePicture);
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
GET_PROC(cuvidMapVideoFrame64);
GET_PROC(cuvidUnmapVideoFrame64);
cuvidMapVideoFrame = cuvidMapVideoFrame64;
cuvidUnmapVideoFrame = cuvidUnmapVideoFrame64;
#else
GET_PROC(cuvidMapVideoFrame);
GET_PROC(cuvidUnmapVideoFrame);
#endif
// GET_PROC(cuvidGetVideoFrameSurface);
GET_PROC(cuvidCtxLockCreate);
GET_PROC(cuvidCtxLockDestroy);
GET_PROC(cuvidCtxLock);
GET_PROC(cuvidCtxUnlock);
return CUDA_SUCCESS;
}
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{