Commit a935f1f0 authored by fierval

fst commit

# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
build/
debug/
.vs/
.vscode/
ssd/
*.tar
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
// cannot calibrate without the list of image files
if (!file)
return false;
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
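// scale each pixel from [0, 255] to [-1, 1] and convert interleaved PPM (HWC) data to planar (CHW) layout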
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
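A minimal sketch of how `BatchStream` and `Int8EntropyCalibrator` are typically wired into a TensorRT INT8 build (TensorRT 5-era `IBuilder` API; the header name, batch counts, and builder/network setup below are illustrative assumptions, not code from this commit):

```cpp
#include "BatchStreamPPM.h"  // assumed header name for the classes above
#include "NvInfer.h"

nvinfer1::ICudaEngine* buildInt8Engine(nvinfer1::IBuilder* builder,
                                       nvinfer1::INetworkDefinition* network)
{
    const int kCalibBatchSize = 10;   // assumption: calibration batch size
    const int kNbCalibBatches = 10;   // assumption: number of calibration batches

    BatchStream calibStream(kCalibBatchSize, kNbCalibBatches);
    Int8EntropyCalibrator calibrator(calibStream, 0, "CalibrationTableSSD");

    builder->setMaxBatchSize(kCalibBatchSize);
    builder->setInt8Mode(true);               // TensorRT 5.x-style INT8 switch
    builder->setInt8Calibrator(&calibrator);  // getBatch()/readCalibrationCache() are called during the build

    return builder->buildCudaEngine(*network);
}
```

TensorRT drives the calibration loop itself: during `buildCudaEngine` it repeatedly calls the calibrator's `getBatch()` and consults `readCalibrationCache()` / `writeCalibrationCache()`.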
cmake_minimum_required(VERSION 3.8)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
inference_trt.cpp
channel_first.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Optimized Video Object Detection
The completed application runs any [Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) style object detector in Tensorflow mode (default), and an Inception V2 SSD detector converted from a Tensorflow graph to the UFF format recognized by TensorRT in TensorRT mode (`-t`).
## Building the app
* Clone the [repo](https://github.com/fierval/fast_od).
* Get the frozen graph and the class labels files for Tensorflow from [here](https://github.com/fierval/tensorflow-object-detection-cpp/tree/master/demo/ssd_inception_v2)
* Get the [frozen graph for TensorRT](https://www.dropbox.com/s/nc3tzm95ip356i5/sample_ssd_relu6.uff?dl=0). The class labels file should be available in the `/usr/src/tensorrt/data/ssd` directory.
* Build:
```sh
mkdir build
cd build
cmake .. # cmake -DCMAKE_BUILD_TYPE=Debug for a debug build
make
```
## Running
Command line options are described in [`main.cpp`](https://github.com/fierval/fast_od/blob/master/main.cpp):
```cpp
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
```
Examples are in the `run_*.sh` files in the source directory. Worth mentioning:
```
-d=0 - run without UX, print out framerate only; -d=2 - run with UX
-t - TensorRT graph
-t -i - TensorRT graph with INT8 precision.
```
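For example, a headless TensorRT INT8 run might look like `./tf_detector_example -t -i -d=0 --video=<video file> --graph=<frozen graph> --labels=<labels file>` (the file paths here are placeholders).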
## Slowdown due to UX
The application uses a bare-bones OpenCV UI (`imshow`) for visual feedback, which causes a significant performance hit, so actual performance is measured by running with `-d=0`, which suppresses the UI.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return bool Returns true if execution can continue; otherwise the program should exit.
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
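For reference, a minimal (hypothetical) `main` that uses `parseArgs` following the contract described above, exiting when it returns false; the header file name is an assumption:

```cpp
#include <iostream>
#include "argsParser.h"  // assumed file name for the header above

int main(int argc, char* argv[])
{
    samplesCommon::Args args;
    // parseArgs returns false when --help was requested or an argument was invalid.
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        std::cout << "Usage: sample [-h] [-d <dir>] [-i] [--useDLA <n>]" << std::endl;
        return args.help ? 0 : 1;
    }
    std::cout << "INT8: " << std::boolalpha << args.runInInt8 << std::endl;
    return 0;
}
```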
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// the grid is rounded up to whole blocks, so guard against out-of-range threads
if (idx >= channelSize * channelsNum)
return;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
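A usage sketch for `channelFirst` with an OpenCV `GpuMat` frame; the mapping of `GpuMat` fields to the kernel parameters reflects my reading of the kernel above (`rowSize` is the padded row pitch in bytes, which equals elements for an 8-bit image), and `toChannelFirst`/`trtInput` are illustrative names:

```cpp
#include <opencv2/core/cuda.hpp>
#include <cuda_runtime.h>

void channelFirst(unsigned char* source, float* dest, int channelSize,
                  int channelsNum, int rowElems, int rowSize);

void toChannelFirst(const cv::cuda::GpuMat& frame, float* trtInput)
{
    int channels    = frame.channels();              // e.g. 3 for a BGR frame
    int channelSize = frame.rows * frame.cols;       // elements per plane
    int rowElems    = frame.cols * channels;         // payload elements per row
    int rowSize     = static_cast<int>(frame.step);  // bytes per row, including padding

    // frame.data already lives on the device; trtInput must hold channels * channelSize floats
    channelFirst(frame.data, trtInput, channelSize, channels, rowElems, rowSize);
}
```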
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include "cuda_runtime_api.h"
#include "dynlink_nvcuvid.h"
tcuvidCreateVideoSource *cuvidCreateVideoSource;
tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
tcuvidCreateVideoParser *cuvidCreateVideoParser;
tcuvidParseVideoData *cuvidParseVideoData;
tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
tcuvidCreateDecoder *cuvidCreateDecoder;
tcuvidDestroyDecoder *cuvidDestroyDecoder;
tcuvidDecodePicture *cuvidDecodePicture;
tcuvidMapVideoFrame *cuvidMapVideoFrame;
tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
//tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
tcuvidCtxLockCreate *cuvidCtxLockCreate;
tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
tcuvidCtxLock *cuvidCtxLock;
tcuvidCtxUnlock *cuvidCtxUnlock;
// Auto-lock helper for C++ applications
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
: m_ctx(ctx)
{
cuvidCtxLock(m_ctx, 0);
}
CCtxAutoLock::~CCtxAutoLock()
{
cuvidCtxUnlock(m_ctx, 0);
}
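// Usage sketch (illustrative, not part of this file): create the lock once with
// cuvidCtxLockCreate(), then guard decoder calls with a scoped CCtxAutoLock:
//
//     CCtxAutoLock lck(m_ctxLock);                 // locks in the constructor
//     cuvidDecodePicture(m_decoder, &picParams);   // work while holding the lock
//     // unlocked automatically when lck goes out of scope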
#define STRINGIFY(X) #X
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <Windows.h>
#ifdef UNICODE
static LPCWSTR __DriverLibName = L"nvcuvid.dll";
#else
static LPCSTR __DriverLibName = "nvcuvid.dll";
#endif
typedef HMODULE DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = LoadLibrary(__DriverLibName);
if (*pInstance == NULL)
{
printf("LoadLibrary \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
static char __DriverLibName[] = "libnvcuvid.so";
typedef void *DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = dlopen(__DriverLibName, RTLD_NOW);
if (*pInstance == NULL)
{
printf("dlopen \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
CUresult CUDAAPI cuvidInit(unsigned int Flags)
{
DLLDRIVER DriverLib;
CHECKED_CALL(LOAD_LIBRARY(&DriverLib));
// fetch all function pointers
GET_PROC(cuvidCreateVideoSource);
GET_PROC(cuvidCreateVideoSourceW);
GET_PROC(cuvidDestroyVideoSource);
GET_PROC(cuvidSetVideoSourceState);
GET_PROC(cuvidGetVideoSourceState);
GET_PROC(cuvidGetSourceVideoFormat);
GET_PROC(cuvidGetSourceAudioFormat);
GET_PROC(cuvidCreateVideoParser);
GET_PROC(cuvidParseVideoData);
GET_PROC(cuvidDestroyVideoParser);
GET_PROC(cuvidCreateDecoder);
GET_PROC(cuvidDestroyDecoder);
GET_PROC(cuvidDecodePicture);
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
GET_PROC(cuvidMapVideoFrame64);
GET_PROC(cuvidUnmapVideoFrame64);
cuvidMapVideoFrame = cuvidMapVideoFrame64;
cuvidUnmapVideoFrame = cuvidUnmapVideoFrame64;
#else
GET_PROC(cuvidMapVideoFrame);
GET_PROC(cuvidUnmapVideoFrame);
#endif
// GET_PROC(cuvidGetVideoFrameSurface);
GET_PROC(cuvidCtxLockCreate);
GET_PROC(cuvidCtxLockDestroy);
GET_PROC(cuvidCtxLock);
GET_PROC(cuvidCtxUnlock);
return CUDA_SUCCESS;
}
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{