Commit a935f1f0 authored by fierval

fst commit
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
build/
debug/
.vs/
.vscode/
ssd/
*.tar
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
if(file)
{
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
}
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
cmake_minimum_required(VERSION 3.8)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
inference_trt.cpp
channel_first.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
1
(Unnamed ITensor* 9): 3d418f1e
Input: 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d205fca
(Unnamed ITensor* 225): 3d368720
(Unnamed ITensor* 412): 3d418f1e
(Unnamed ITensor* 195): 3dafce6e
(Unnamed ITensor* 138): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_3a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 463): 3d418f1e
(Unnamed ITensor* 75): 3d2dcb21
(Unnamed ITensor* 157): 3d418f1e
BoxPredictor_3/ClassPredictor/BiasAdd: 3c8c8ef8
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_2a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 61): 3d418f1e
(Unnamed ITensor* 462): 3d3d75f1
(Unnamed ITensor* 156): 3d618943
(Unnamed ITensor* 24): 3d913052
(Unnamed ITensor* 32): 3d6533f9
(Unnamed ITensor* 83): 3d3ca52c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 118): 3d4aef99
(Unnamed ITensor* 485): 3d1d4f1e
BoxPredictor_4/BoxEncodingPredictor/BiasAdd: 3ca49bb9
(Unnamed ITensor* 84): 3d418f1e
(Unnamed ITensor* 160): 3d418f1e
BoxPredictor_5/ClassPredictor/BiasAdd: 3c773985
(Unnamed ITensor* 316): 3d63dc8a
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise: 3de7428e
(Unnamed ITensor* 90): 3d73f085
(Unnamed ITensor* 91): 3d418f1e
(Unnamed ITensor* 419): 3d418f1e
(Unnamed ITensor* 374): 3d59dbf2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3c8d1a
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_5_1x1_64/Relu6: 3d17eae6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 507): 3d418f1e
(Unnamed ITensor* 2): 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 112): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 126): 3d20913a
(Unnamed ITensor* 104): 3d80ab32
(Unnamed ITensor* 134): 3d8dd320
(Unnamed ITensor* 324): 3d418f1e
(Unnamed ITensor* 135): 3d418f1e
(Unnamed ITensor* 628): 3d9d9605
(Unnamed ITensor* 449): 3d418f1e
(Unnamed ITensor* 119): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 513): 3d5e275c
(Unnamed ITensor* 164): 3d946ceb
Squeeze_2: 3cc8bb82
(Unnamed ITensor* 167): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu6: 3d2d4927
(Unnamed ITensor* 541): 3d37a99c
(Unnamed ITensor* 143): 3d418f1e
(Unnamed ITensor* 240): 3d418f1e
(Unnamed ITensor* 150): 3d418f1e
(Unnamed ITensor* 165): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 310): 3d418f1e
(Unnamed ITensor* 260): 3d60aac4
(Unnamed ITensor* 405): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 105): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 382): 3d1e3cff
(Unnamed ITensor* 550): 3d418f1e
(Unnamed ITensor* 391): 3d418f1e
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_2_1x1_256/Relu6: 3d37a347
(Unnamed ITensor* 448): 3d6ab083
(Unnamed ITensor* 142): 3dd08cf3
(Unnamed ITensor* 595): 3d418f1e
BoxPredictor_1/ClassPredictor/BiasAdd: 3e194e24
concat_box_conf: 3e1bb222
(Unnamed ITensor* 594): 3d4ff643
(Unnamed ITensor* 602): 3d418f1e
BoxPredictor_5/Reshape_1: 3c773985
concat_box_loc: 3de14ea0
BoxPredictor_4/ClassPredictor/BiasAdd: 3ca5201c
Squeeze_4: 3ca49bb9
(Unnamed ITensor* 621): 3d418f1e
(Unnamed ITensor* 624): 3d17eae6
BoxPredictor_2/ClassPredictor/BiasAdd: 3e1ec6c2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/Relu6: 3d156ede
(Unnamed ITensor* 33): 3d418f1e
(Unnamed ITensor* 500): 3d418f1e
BoxPredictor_2/Reshape_1: 3e1ec6c2
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/Relu6: 3d418f1e
BoxPredictor_5/BoxEncodingPredictor/BiasAdd: 3cdbc092
GridAnchor_1: 3a500341
(Unnamed ITensor* 569): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 620): 3d17eae6
(Unnamed ITensor* 418): 3d91976a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 111): 3d85a99e
(Unnamed ITensor* 575): 3dc8e55f
(Unnamed ITensor* 601): 3d8b91c4
BoxPredictor_1/Reshape_1: 3e194e24
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu6: 3d433d97
(Unnamed ITensor* 545): 3d37a347
BoxPredictor_3/Reshape_1: 3c8c8ef8
(Unnamed ITensor* 347): 3d418f1e
(Unnamed ITensor* 568): 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 471): 3d418f1e
(Unnamed ITensor* 455): 3d500012
(Unnamed ITensor* 303): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/Relu6: 3d20913a
BoxPredictor_4/Reshape_1: 3ca5201c
GridAnchor_4 copy: 3c3aa18a
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_3_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/Relu6: 3d1e3cff
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
GridAnchor_5 copy: 3c2b37e3
(Unnamed ITensor* 331): 3d3ca1fe
NMS_1: 1
BoxPredictor_3/BoxEncodingPredictor/BiasAdd: 3cafbf65
(Unnamed ITensor* 188): 3dc61b5c
(Unnamed ITensor* 196): 3d418f1e
(Unnamed ITensor* 209): 3dc05776
GridAnchor_2 copy: 3c2c4ae8
(Unnamed ITensor* 367): 3d7bf53d
(Unnamed ITensor* 361): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/Relu6: 3d3ceddc
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3772e6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/Relu6: 3d31060c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 411): 3d836c20
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 18): 3d418f1e
(Unnamed ITensor* 390): 3d9a604f
(Unnamed ITensor* 346): 3d67b7ae
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
BoxPredictor_2/BoxEncodingPredictor/BiasAdd: 3cc8bb82
(Unnamed ITensor* 217): 3d4cf10e
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_2_3x3_s2_512/Relu6: 3d418f1e
(Unnamed ITensor* 233): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 542): 3d418f1e
(Unnamed ITensor* 67): 3d8e8123
(Unnamed ITensor* 247): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d1d88d2
(Unnamed ITensor* 302): 3daf4176
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 239): 3d4f00df
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 514): 3d418f1e
(Unnamed ITensor* 435): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 317): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 289): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 478): 3d3244f6
(Unnamed ITensor* 549): 3dbeda4a
(Unnamed ITensor* 261): 3d418f1e
(Unnamed ITensor* 492): 3d9e1645
(Unnamed ITensor* 441): 3d15c098
(Unnamed ITensor* 479): 3d418f1e
(Unnamed ITensor* 493): 3d418f1e
BoxPredictor_0/Reshape_1: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d258e36
(Unnamed ITensor* 339): 3d5f2411
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_4_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
Squeeze_1: 3d2f0384
GridAnchor: 3a4f5b62
(Unnamed ITensor* 368): 3d418f1e
Squeeze: 3df34968
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 375): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d18b9fa
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 253): 3d92390f
(Unnamed ITensor* 210): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu6: 3d4af27d
Squeeze_3: 3cafbf65
(Unnamed ITensor* 340): 3d418f1e
(Unnamed ITensor* 11): 3d418f1e
(Unnamed ITensor* 295): 3d9c64d4
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_3_1x1_128/Relu6: 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu6: 3d15c098
(Unnamed ITensor* 323): 3d5d9fd1
GridAnchor_4: 3c3aa18a
(Unnamed ITensor* 360): 3d88c0ec
(Unnamed ITensor* 25): 3d418f1e
(Unnamed ITensor* 288): 3d6b9ef7
(Unnamed ITensor* 226): 3d418f1e
(Unnamed ITensor* 456): 3d418f1e
(Unnamed ITensor* 46): 3d86ba82
BoxPredictor_0/BoxEncodingPredictor/BiasAdd: 3df34968
(Unnamed ITensor* 232): 3ddb36a3
(Unnamed ITensor* 521): 3cb42ac7
GridAnchor_3 copy: 3c348982
(Unnamed ITensor* 296): 3d418f1e
BoxPredictor_0/ClassPredictor/BiasAdd: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/Relu6: 3d2dcb21
(Unnamed ITensor* 202): 3d87a00a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 269): 3d418f1e
GridAnchor_3: 3c348982
(Unnamed ITensor* 218): 3d418f1e
(Unnamed ITensor* 203): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 486): 3d418f1e
(Unnamed ITensor* 268): 3d0e4f64
Squeeze_5: 3cdbc092
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 254): 3d418f1e
(Unnamed ITensor* 182): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/AvgPool_0a_3x3/AvgPool: 3cb90e57
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 175): 3d418f1e
(Unnamed ITensor* 98): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/AvgPool_0a_3x3/AvgPool: 3d04ebdf
(Unnamed ITensor* 354): 3d418f1e
(Unnamed ITensor* 181): 3d8ef349
(Unnamed ITensor* 353): 3d3ce1d6
(Unnamed ITensor* 174): 3d5b5745
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
GridAnchor_1 copy: 3a500341
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu6: 3cb42ac7
(Unnamed ITensor* 149): 3d869442
(Unnamed ITensor* 68): 3d418f1e
(Unnamed ITensor* 17): 3d9d3367
(Unnamed ITensor* 404): 3d9d92ab
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 309): 3d8ac690
BoxPredictor_1/BoxEncodingPredictor/BiasAdd: 3d2f0384
(Unnamed ITensor* 60): 3d74b08e
(Unnamed ITensor* 189): 3d418f1e
(Unnamed ITensor* 97): 3d3f7d2c
(Unnamed ITensor* 53): 3d7e3945
(Unnamed ITensor* 8): 3e350553
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
GridAnchor_5: 3c2b37e3
(Unnamed ITensor* 76): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 522): 3d418f1e
(Unnamed ITensor* 39): 3da0973e
(Unnamed ITensor* 127): 3d418f1e
(Unnamed ITensor* 54): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 576): 3d418f1e
(Unnamed ITensor* 332): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2b_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 47): 3d418f1e
(Unnamed ITensor* 40): 3d418f1e
(Unnamed ITensor* 246): 3d7bd2d9
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/Relu6: 3d418f1e
(Unnamed ITensor* 398): 3d418f1e
(Unnamed ITensor* 383): 3d418f1e
(Unnamed ITensor* 427): 3d541a3f
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 397): 3d523857
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu6: 3d333265
(Unnamed ITensor* 442): 3d418f1e
(Unnamed ITensor* 470): 3d71a2ed
(Unnamed ITensor* 499): 3d2d4927
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu6: 3d3d75f1
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu6: 3d1d4f1e
(Unnamed ITensor* 434): 3d439787
(Unnamed ITensor* 629): 3d418f1e
(Unnamed ITensor* 506): 3d74b7dd
(Unnamed ITensor* 428): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
GridAnchor_2: 3c2c4ae8
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
NMS: 3da1a245
GridAnchor copy: 3a4f5b62
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_4_1x1_128/Relu6: 3d418f1e
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Optimized Video Object Detection
The completed application runs any [Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md)-style object detector in TensorFlow mode (the default), and, in TensorRT mode (`-t`), an Inception V2 SSD detector converted from a TensorFlow graph to the UFF format recognized by TensorRT.
## Building the app
* Clone the [repo](https://github.com/fierval/fast_od).
* Get the frozen graph and class labels files for TensorFlow from [here](https://github.com/fierval/tensorflow-object-detection-cpp/tree/master/demo/ssd_inception_v2).
* Get the [frozen graph for TensorRT](https://www.dropbox.com/s/nc3tzm95ip356i5/sample_ssd_relu6.uff?dl=0). The class labels file should be available in the `/usr/src/tensorrt/data/ssd` directory.
* Build:
```sh
mkdir build
cd build
cmake ..   # or: cmake -DCMAKE_BUILD_TYPE=Debug ..
make
```
## Running
Command line options are described in [`main.cpp`](https://github.com/fierval/fast_od/blob/master/main.cpp):
```cpp
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
```
Examples are in the `run_*.sh` files in the source directory. Worth mentioning:
```
-d=0    run without UX, print out the framerate only (-d=2 runs with UX)
-t      TensorRT graph
-t -i   TensorRT graph with INT8 precision
```
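A typical TensorRT-mode invocation (a sketch only; the video path and the labels file name below are placeholders, substitute your own):
```sh
./tf_detector_example -t -d=0 \
  -v=/path/to/video.mp4 \
  --graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
  --labels=/usr/src/tensorrt/data/ssd/labels.txt
```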
## Slowdown due to UX
The application uses a bare-bones OpenCV UI (`imshow`) for visual feedback, which causes a significant performance hit; to measure actual performance, run with `-d=0`, which suppresses the UI.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If the return value is true, execution can continue; otherwise the program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
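// Illustrative call site (a sketch, not part of the original sources): parse the
// arguments once at startup and bail out on --help or on a parse failure.
//
// samplesCommon::Args args;
// if (!samplesCommon::parseArgs(args, argc, argv) || args.help)
// {
// // print usage and exit
// }
// bool runInInt8 = args.runInInt8;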
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// guard the partial last block: without this, out-of-range threads write past the end of dest
if (idx >= channelSize * channelsNum)
return;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding (rowSize is the pitched row length)
int sourceIdx = row * rowSize + col;
// scale to [-1, 1] while transposing HWC -> CHW
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0f / 255.0f) - 1.0f;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
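// Illustrative host-side call (a sketch, not part of the original sources), assuming
// `frame` is a cv::cuda::GpuMat holding an 8UC3 image already resident on the device
// and `deviceInput` is a float buffer of 3 * rows * cols elements:
//
// int channelSize = frame.rows * frame.cols; // pixels per channel
// int channelsNum = frame.channels(); // 3 for BGR
// int rowElems = frame.cols * channelsNum; // payload bytes per row, no padding
// int rowSize = static_cast<int>(frame.step); // pitched bytes per row, with padding
// channelFirst(frame.data, deviceInput, channelSize, channelsNum, rowElems, rowSize);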
#ifndef TENSORRT_COMMON_H
#define TENSORRT_COMMON_H
#include "NvInfer.h"
#include "NvInferPlugin.h"
// ONNX is not supported on Windows
#ifndef _MSC_VER
#include "NvOnnxConfig.h"
#include "NvOnnxParser.h"
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstring>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ratio>
#include <string>
#include <utility>
#include <vector>
using namespace std;
using namespace nvinfer1;
using namespace plugin;
#define CHECK_TRT(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cout << "Cuda failure: " << ret; \
abort(); \
} \
} while (0)
constexpr long double operator"" _GB(long double val)
{
return val * (1 << 30);
}
constexpr long double operator"" _MB(long double val) { return val * (1 << 20); }
constexpr long double operator"" _KB(long double val) { return val * (1 << 10); }
// These are necessary if we want to be able to write 1_GB instead of 1.0_GB.
// Since the return type is signed, -1_GB will work as expected.
constexpr long long int operator"" _GB(long long unsigned int val) { return val * (1 << 30); }
constexpr long long int operator"" _MB(long long unsigned int val) { return val * (1 << 20); }
constexpr long long int operator"" _KB(long long unsigned int val) { return val * (1 << 10); }
// Logger for TensorRT info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity)
{
}
void log(Severity severity, const char* msg) override
{
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity)
{
case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
case Severity::kERROR: std::cerr << "ERROR: "; break;
case Severity::kWARNING: std::cerr << "WARNING: "; break;
case Severity::kINFO: std::cerr << "INFO: "; break;
default: std::cerr << "UNKNOWN: "; break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
struct SimpleProfiler : public nvinfer1::IProfiler
{
struct Record
{
float time{0};
int count{0};
};
virtual void reportLayerTime(const char* layerName, float ms)
{
mProfile[layerName].count++;
mProfile[layerName].time += ms;
}
SimpleProfiler(
const char* name,
const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
: mName(name)
{
for (const auto& srcProfiler : srcProfilers)
{
for (const auto& rec : srcProfiler.mProfile)
{
auto it = mProfile.find(rec.first);
if (it == mProfile.end())
{
mProfile.insert(rec);
}
else
{
it->second.time += rec.second.time;
it->second.count += rec.second.count;
}
}
}
}
friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
{
out << "========== " << value.mName << " profile ==========" << std::endl;
float totalTime = 0;
std::string layerNameStr = "TensorRT layer name";
int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
for (const auto& elem : value.mProfile)
{
totalTime += elem.second.time;
maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
}
auto old_settings = out.flags();
auto old_precision = out.precision();
// Output header
{
out << std::setw(maxLayerNameLength) << layerNameStr << " ";
out << std::setw(12) << "Runtime, "
<< "%"
<< " ";
out << std::setw(12) << "Invocations"
<< " ";
out << std::setw(12) << "Runtime, ms" << std::endl;
}
for (const auto& elem : value.mProfile)
{
out << std::setw(maxLayerNameLength) << elem.first << " ";
out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.second.time * 100.0F / totalTime) << "%"
<< " ";
out << std::setw(12) << elem.second.count << " ";
out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << std::endl;
}
out.flags(old_settings);
out.precision(old_precision);
out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl;
return out;
}
private:
std::string mName;
std::map<std::string, Record> mProfile;
};
// Locate path to file, given its filename or filepath suffix and possible dirs it might lie in
// Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path
inline std::string locateFile(const std::string& filepathSuffix, const std::vector<std::string>& directories)
{
const int MAX_DEPTH{10};
bool found{false};
std::string filepath;
for (auto& dir : directories)
{
filepath = dir + filepathSuffix;
for (int i = 0; i < MAX_DEPTH && !found; i++)
{
std::ifstream checkFile(filepath);
found = checkFile.is_open();
if (found)
break;
filepath = "../" + filepath; // Try again in parent dir
}
if (found)
{
break;
}
filepath.clear();
}
if (filepath.empty())
{
std::string directoryList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
[](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << directoryList << std::endl;
exit(EXIT_FAILURE);
}
return filepath;
}
inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW)
{
std::ifstream infile(fileName, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
std::string magic, h, w, max;
infile >> magic >> h >> w >> max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(buffer), inH * inW);
}
namespace samplesCommon
{
inline void* safeCudaMalloc(size_t memSize)
{
void* deviceMem;
CHECK_TRT(cudaMalloc(&deviceMem, memSize));
if (deviceMem == nullptr)
{
std::cerr << "Out of memory" << std::endl;
exit(1);
}
return deviceMem;
}
inline bool isDebug()
{
return (std::getenv("TENSORRT_DEBUG") ? true : false);
}
struct InferDeleter
{
template <typename T>
void operator()(T* obj) const
{
if (obj)
{
obj->destroy();
}
}
};
template <typename T>
inline std::shared_ptr<T> infer_object(T* obj)
{
if (!obj)
{
throw std::runtime_error("Failed to create object");
}
return std::shared_ptr<T>(obj, InferDeleter());
}
template <class Iter>
inline std::vector<size_t> argsort(Iter begin, Iter end, bool reverse = false)
{
std::vector<size_t> inds(end - begin);
std::iota(inds.begin(), inds.end(), 0);
if (reverse)
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i2] < begin[i1];
});
}
else
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i1] < begin[i2];
});
}
return inds;
}
inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << endl;
return false;
}
std::string line;
while (std::getline(infile, line))
{
if (line.empty())
continue;
refVector.push_back(line);
}
infile.close();
return true;
}
template <typename result_vector_t>
inline std::vector<std::string> classify(const vector<string>& refVector, const result_vector_t& output, const size_t topK)
{
auto inds = samplesCommon::argsort(output.cbegin(), output.cend(), true);
std::vector<std::string> result;
for (size_t k = 0; k < topK; ++k)
{
result.push_back(refVector[inds[k]]);
}
return result;
}
//...LG returns top K indices, not values.
template <typename T>
inline vector<size_t> topK(const vector<T> inp, const size_t k)
{
vector<size_t> result;
std::vector<size_t> inds = samplesCommon::argsort(inp.cbegin(), inp.cend(), true);
result.assign(inds.begin(), inds.begin() + k);
return result;
}
template <typename T>
inline bool readASCIIFile(const string& fileName, const size_t size, vector<T>& out)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << endl;
return false;
}
out.clear();
out.reserve(size);
out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
infile.close();
return true;
}
template <typename T>
inline bool writeASCIIFile(const string& fileName, const vector<T>& in)
{
std::ofstream outfile(fileName);
if (!outfile.is_open())
{
cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << endl;
return false;
}
for (auto fn : in)
{
outfile << fn << " ";
}
outfile.close();
return true;
}
inline void print_version()
{
//... This can be only done after statically linking this support into parserONNX.library
#if 0
std::cout << "Parser built against:" << std::endl;
std::cout << " ONNX IR version: " << nvonnxparser::onnx_ir_version_string(onnx::IR_VERSION) << std::endl;
#endif
std::cout << " TensorRT version: "
<< NV_TENSORRT_MAJOR << "."
<< NV_TENSORRT_MINOR << "."
<< NV_TENSORRT_PATCH << "."
<< NV_TENSORRT_BUILD << std::endl;
}
inline string getFileType(const string& filepath)
{
return filepath.substr(filepath.find_last_of(".") + 1);
}
inline string toLower(const string& inp)
{
string out = inp;
std::transform(out.begin(), out.end(), out.begin(), ::tolower);
return out;
}
inline void enableDLA(IBuilder* b, int useDLACore)
{
if (useDLACore >= 0)
{
b->allowGPUFallback(true);
b->setFp16Mode(true);
b->setDefaultDeviceType(DeviceType::kDLA);
b->setDLACore(useDLACore);
}
}
inline int parseDLA(int argc, char** argv)
{
for (int i = 1; i < argc; i++)
{
std::string arg(argv[i]);
if (strncmp(argv[i], "--useDLACore=", 13) == 0)
return stoi(argv[i] + 13);
}
return -1;
}
inline unsigned int getElementSize(nvinfer1::DataType t)
{
switch (t)
{
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT8: return 1;
}
throw std::runtime_error("Invalid DataType.");
return 0;
}
inline int64_t volume(const nvinfer1::Dims& d)
{
return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
}
template <int C, int H, int W>
struct PPM
{
std::string magic, fileName;
int h, w, max;
uint8_t buffer[C * H * W];
};
struct BBox
{
float x1, y1, x2, y2;
};
template <int C, int H, int W>
inline void readPPMFile(const std::string& filename, samplesCommon::PPM<C, H, W>& ppm)
{
ppm.fileName = filename;
std::ifstream infile(filename, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
template <int C, int H, int W>
inline void writePPMFileWithBBox(const std::string& filename, PPM<C, H, W>& ppm, const BBox& bbox)
{
std::ofstream outfile("./" + filename, std::ofstream::binary);
assert(!outfile.fail());
outfile << "P6"
<< "\n"
<< ppm.w << " " << ppm.h << "\n"
<< ppm.max << "\n";
auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); };
const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1);
const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1);
const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1);
const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1);
for (int x = x1; x <= x2; ++x)
{
// bbox top border
ppm.buffer[(y1 * ppm.w + x) * 3] = 255;
ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0;
// bbox bottom border
ppm.buffer[(y2 * ppm.w + x) * 3] = 255;
ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0;
}
for (int y = y1; y <= y2; ++y)
{
// bbox left border
ppm.buffer[(y * ppm.w + x1) * 3] = 255;
ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0;
// bbox right border
ppm.buffer[(y * ppm.w + x2) * 3] = 255;
ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0;
}
outfile.write(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
class TimerBase
{
public:
virtual void start() {}
virtual void stop() {}
float microseconds() const noexcept { return mMs * 1000.f; }
float milliseconds() const noexcept { return mMs; }
float seconds() const noexcept { return mMs / 1000.f; }
void reset() noexcept { mMs = 0.f; }
protected:
float mMs{0.0f};
};
class GpuTimer : public TimerBase
{
public:
GpuTimer(cudaStream_t stream)
: mStream(stream)
{
CHECK_TRT(cudaEventCreate(&mStart));
CHECK_TRT(cudaEventCreate(&mStop));
}
~GpuTimer()
{
CHECK_TRT(cudaEventDestroy(mStart));
CHECK_TRT(cudaEventDestroy(mStop));
}
void start() { CHECK_TRT(cudaEventRecord(mStart, mStream)); }
void stop()
{
CHECK_TRT(cudaEventRecord(mStop, mStream));
float ms{0.0f};
CHECK_TRT(cudaEventSynchronize(mStop));
CHECK_TRT(cudaEventElapsedTime(&ms, mStart, mStop));
mMs += ms;
}
private:
cudaEvent_t mStart, mStop;
cudaStream_t mStream;
}; // class GpuTimer
template <typename Clock>
class CpuTimer : public TimerBase
{
public:
using clock_type = Clock;
void start() { mStart = Clock::now(); }
void stop()
{
mStop = Clock::now();
mMs += std::chrono::duration<float, std::milli>{mStop - mStart}.count();
}
private:
std::chrono::time_point<Clock> mStart, mStop;
}; // class CpuTimer
using PreciseCpuTimer = CpuTimer<std::chrono::high_resolution_clock>;
} // namespace samplesCommon
#endif // TENSORRT_COMMON_H
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include "cuda_runtime_api.h"
#include "dynlink_nvcuvid.h"
tcuvidCreateVideoSource *cuvidCreateVideoSource;
tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
tcuvidCreateVideoParser *cuvidCreateVideoParser;
tcuvidParseVideoData *cuvidParseVideoData;
tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
tcuvidCreateDecoder *cuvidCreateDecoder;
tcuvidDestroyDecoder *cuvidDestroyDecoder;
tcuvidDecodePicture *cuvidDecodePicture;
tcuvidMapVideoFrame *cuvidMapVideoFrame;
tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
//tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
tcuvidCtxLockCreate *cuvidCtxLockCreate;
tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
tcuvidCtxLock *cuvidCtxLock;
tcuvidCtxUnlock *cuvidCtxUnlock;
// Auto-lock helper for C++ applications
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
: m_ctx(ctx)
{
cuvidCtxLock(m_ctx, 0);
}
CCtxAutoLock::~CCtxAutoLock()
{
cuvidCtxUnlock(m_ctx, 0);
}
#define STRINGIFY(X) #X
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <Windows.h>
#ifdef UNICODE
static LPCWSTR __DriverLibName = L"nvcuvid.dll";
#else
static LPCSTR __DriverLibName = "nvcuvid.dll";
#endif
typedef HMODULE DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = LoadLibrary(__DriverLibName);
if (*pInstance == NULL)
{
printf("LoadLibrary \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
static char __DriverLibName[] = "libnvcuvid.so";
typedef void *DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = dlopen(__DriverLibName, RTLD_NOW);
if (*pInstance == NULL)
{
printf("dlopen \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
CUresult CUDAAPI cuvidInit(unsigned int Flags)
{
DLLDRIVER DriverLib;
CHECKED_CALL(LOAD_LIBRARY(&DriverLib));
// fetch all function pointers
GET_PROC(cuvidCreateVideoSource);
GET_PROC(cuvidCreateVideoSourceW);
GET_PROC(cuvidDestroyVideoSource);
GET_PROC(cuvidSetVideoSourceState);
GET_PROC(cuvidGetVideoSourceState);
GET_PROC(cuvidGetSourceVideoFormat);
GET_PROC(cuvidGetSourceAudioFormat);
GET_PROC(cuvidCreateVideoParser);
GET_PROC(cuvidParseVideoData);
GET_PROC(cuvidDestroyVideoParser);
GET_PROC(cuvidCreateDecoder);
GET_PROC(cuvidDestroyDecoder);
GET_PROC(cuvidDecodePicture);
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
GET_PROC(cuvidMapVideoFrame64);
GET_PROC(cuvidUnmapVideoFrame64);
cuvidMapVideoFrame = cuvidMapVideoFrame64;
cuvidUnmapVideoFrame = cuvidUnmapVideoFrame64;
#else
GET_PROC(cuvidMapVideoFrame);
GET_PROC(cuvidUnmapVideoFrame);
#endif
// GET_PROC(cuvidGetVideoFrameSurface);
GET_PROC(cuvidCtxLockCreate);
GET_PROC(cuvidCtxLockDestroy);
GET_PROC(cuvidCtxLock);
GET_PROC(cuvidCtxUnlock);
return CUDA_SUCCESS;
}
#include "utils.h"
using namespace cv::cuda;
const char *INPUT_BLOB_NAME = "Input";
static Logger gLogger;
// TODO: refactor once done
static bool globalRunInInt8 = false;
#define RETURN_AND_LOG(ret, severity, message) \
do \
{ \
std::string error_message = "sample_uff_ssd: " + std::string(message); \
gLogger.log(ILogger::Severity::k##severity, error_message.c_str()); \
return (ret); \
} while (0)
const int OUTPUT_CLS_SIZE = 91;
const int OUTPUT_BBOX_SIZE = OUTPUT_CLS_SIZE * 4;
const char *OUTPUT_BLOB_NAME0 = "NMS";
//INT8 Calibration, currently set to calibrate over 500 images (CAL_BATCH_SIZE * NB_CAL_BATCHES = 50 * 10)
static constexpr int CAL_BATCH_SIZE = 50;
static constexpr int FIRST_CAL_BATCH = 0, NB_CAL_BATCHES = 10;
// Concat layers
// mbox_priorbox, mbox_loc, mbox_conf
const int concatAxis[2] = {1, 1};
const bool ignoreBatch[2] = {false, false};
DetectionOutputParameters detectionOutputParam{true, false, 0, OUTPUT_CLS_SIZE, 100, 100, 0.5, 0.6, CodeTypeSSD::TF_CENTER, {0, 2, 1}, true, true};
// Visualization
const float visualizeThreshold = 0.5;
void printOutput(int64_t eltCount, DataType dtype, void *buffer)
{
std::cout << eltCount << " eltCount" << std::endl;
assert(samplesCommon::getElementSize(dtype) == sizeof(float));
std::cout << "--- OUTPUT ---" << std::endl;
size_t memSize = eltCount * samplesCommon::getElementSize(dtype);
float *outputs = new float[eltCount];
CHECK_TRT(cudaMemcpyAsync(outputs, buffer, memSize, cudaMemcpyDeviceToHost));
int maxIdx = std::distance(outputs, std::max_element(outputs, outputs + eltCount));
for (int64_t eltIdx = 0; eltIdx < eltCount; ++eltIdx)
{
std::cout << eltIdx << " => " << outputs[eltIdx] << "\t : ";
if (eltIdx == maxIdx)
std::cout << "***";
std::cout << "\n";
}
std::cout << std::endl;
delete[] outputs;
}
std::string locateFile(const std::string &input)
{
std::vector<std::string> dirs{"data/ssd/",
"data/ssd/VOC2007/",
"data/ssd/VOC2007/PPMImages/",
"data/samples/ssd/",
"data/samples/ssd/VOC2007/",
"data/samples/ssd/VOC2007/PPMImages/"};
return locateFile(input, dirs);
}
void populateTFInputData(float *data)
{
auto graphFileName = locateFile("inp_bus.txt");
std::ifstream labelFile(graphFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
istringstream iss(line);
float num;
iss >> num;
data[id++] = num;
}
return;
}
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName)
{
std::ifstream labelFile(labelFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
CLASSES.push_back(line);
}
return;
}
std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine &engine, int nbBindings, int batchSize)
{
std::vector<std::pair<int64_t, DataType>> sizes;
for (int i = 0; i < nbBindings; ++i)
{
Dims dims = engine.getBindingDimensions(i);
DataType dtype = engine.getBindingDataType(i);
int64_t eltCount = samplesCommon::volume(dims) * batchSize;
sizes.push_back(std::make_pair(eltCount, dtype));
}
return sizes;
}
ICudaEngine *loadModelAndCreateEngine(const char *uffFile, int maxBatchSize,
IUffParser *parser, IInt8Calibrator *calibrator, IHostMemory *&trtModelStream, bool isInt8)
{
// Create the builder
IBuilder *builder = createInferBuilder(gLogger);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition *network = builder->createNetwork();
std::cout << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
std::cout << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(1_GB); // We need about 1GB of scratch space for the plugin layer for batch size 5.
builder->setHalf2Mode(false);
if (isInt8)
{
builder->setInt8Mode(true);
builder->setInt8Calibrator(calibrator);
}
std::cout << "Begin building engine..." << std::endl;
ICudaEngine *engine = builder->buildCudaEngine(*network);
if (!engine)
RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");
std::cout << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
// Serialize the engine, then close everything down.
trtModelStream = engine->serialize();
builder->destroy();
shutdownProtobufLibrary();
return engine;
}
void doInference(IExecutionContext &context, float *inputData, float *detectionOut, int *keepCount, int batchSize)
{
const ICudaEngine &engine = context.getEngine();
// Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings() of these,
// but in this case we know that there is exactly 1 input and 2 outputs.
int nbBindings = engine.getNbBindings();
std::vector<void *> buffers(nbBindings);
std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings().
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);
for (int i = 0; i < nbBindings; ++i)
{
// inputData is already allocated on the device
if (i == inputIndex)
{
continue;
}
auto bufferSizesOutput = buffersSizes[i];
buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
}
cudaStream_t stream;
CHECK_TRT(cudaStreamCreate(&stream));
// make sure the data we are about to use is allocated on the GPU
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, inputData);
#if CUDART_VERSION >= 10000
assert(err != cudaErrorInvalidValue && attributes.type == cudaMemoryTypeDevice);
#else
assert(err != cudaErrorInvalidValue && attributes.memoryType == cudaMemoryTypeDevice);
#endif
buffers[inputIndex] = inputData;
auto t_start = std::chrono::high_resolution_clock::now();
context.execute(batchSize, &buffers[0]);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
//std::cout << "Time taken for inference is " << total << " ms." << std::endl;
for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
{
if (engine.bindingIsInput(bindingIdx))
continue;
#ifdef SSD_INT8_DEBUG
auto bufferSizesOutput = buffersSizes[bindingIdx];
printOutput(bufferSizesOutput.first, bufferSizesOutput.second,
buffers[bindingIdx]);
#endif
}
CHECK_TRT(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_TRT(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
CHECK_TRT(cudaFree(buffers[inputIndex]));
CHECK_TRT(cudaFree(buffers[outputIndex0]));
CHECK_TRT(cudaFree(buffers[outputIndex1]));
}
class FlattenConcat : public IPluginV2
{
public:
FlattenConcat(int concatAxis, bool ignoreBatch)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
{
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
}
//clone constructor
FlattenConcat(int concatAxis, bool ignoreBatch, int numInputs, int outputConcatAxis, int* inputConcatAxis)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
, mOutputConcatAxis(outputConcatAxis)
, mNumInputs(numInputs)
{
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
for (int i = 0; i < mNumInputs; ++i)
mInputConcatAxis[i] = inputConcatAxis[i];
}
FlattenConcat(const void* data, size_t length)
{
const char *d = reinterpret_cast<const char*>(data), *a = d;
mIgnoreBatch = read<bool>(d);
mConcatAxisID = read<int>(d);
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
mOutputConcatAxis = read<int>(d);
mNumInputs = read<int>(d);
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
CHECK_TRT(cudaMallocHost((void**) &mCopySize, mNumInputs * sizeof(size_t)));
std::for_each(mInputConcatAxis, mInputConcatAxis + mNumInputs, [&](int& inp) { inp = read<int>(d); });
mCHW = read<nvinfer1::DimsCHW>(d);
std::for_each(mCopySize, mCopySize + mNumInputs, [&](size_t& inp) { inp = read<size_t>(d); });
assert(d == a + length);
}
~FlattenConcat()
{
if (mInputConcatAxis)
CHECK_TRT(cudaFreeHost(mInputConcatAxis));
if (mCopySize)
CHECK_TRT(cudaFreeHost(mCopySize));
}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims >= 1);
assert(index == 0);
mNumInputs = nbInputDims;
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
mOutputConcatAxis = 0;
#ifdef SSD_INT8_DEBUG
std::cout << " Concat nbInputs " << nbInputDims << "\n";
std::cout << " Concat axis " << mConcatAxisID << "\n";
for (int i = 0; i < 6; ++i)
for (int j = 0; j < 3; ++j)
std::cout << " Concat InputDims[" << i << "]"
<< "d[" << j << " is " << inputs[i].d[j] << "\n";
#endif
for (int i = 0; i < nbInputDims; ++i)
{
int flattenInput = 0;
assert(inputs[i].nbDims == 3);
if (mConcatAxisID != 1)
assert(inputs[i].d[0] == inputs[0].d[0]);
if (mConcatAxisID != 2)
assert(inputs[i].d[1] == inputs[0].d[1]);
if (mConcatAxisID != 3)
assert(inputs[i].d[2] == inputs[0].d[2]);
flattenInput = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mInputConcatAxis[i] = flattenInput;
mOutputConcatAxis += mInputConcatAxis[i];
}
return DimsCHW(mConcatAxisID == 1 ? mOutputConcatAxis : 1,
mConcatAxisID == 2 ? mOutputConcatAxis : 1,
mConcatAxisID == 3 ? mOutputConcatAxis : 1);
}
int initialize() override
{
CHECK_TRT(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
CHECK_TRT(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
int numConcats = 1;
assert(mConcatAxisID != 0);
numConcats = std::accumulate(mCHW.d, mCHW.d + mConcatAxisID - 1, 1, std::multiplies<int>());
if (!mIgnoreBatch)
numConcats *= batchSize;
float* output = reinterpret_cast<float*>(outputs[0]);
int offset = 0;
for (int i = 0; i < mNumInputs; ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
float* inputTemp;
CHECK_TRT(cudaMalloc(&inputTemp, mCopySize[i] * batchSize));
CHECK_TRT(cudaMemcpyAsync(inputTemp, input, mCopySize[i] * batchSize, cudaMemcpyDeviceToDevice, stream));
for (int n = 0; n < numConcats; ++n)
{
CHECK_TRT(cublasScopy(mCublas, mInputConcatAxis[i],
inputTemp + n * mInputConcatAxis[i], 1,
output + (n * mOutputConcatAxis + offset), 1));
}
CHECK_TRT(cudaFree(inputTemp));
offset += mInputConcatAxis[i];
}
return 0;
}
size_t getSerializationSize() const override
{
return sizeof(bool) + sizeof(int) * (3 + mNumInputs) + sizeof(nvinfer1::Dims) + (sizeof(size_t) * mNumInputs);
}
void serialize(void* buffer) const override
{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mIgnoreBatch);
write(d, mConcatAxisID);
write(d, mOutputConcatAxis);
write(d, mNumInputs);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mInputConcatAxis[i]);
}
write(d, mCHW);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mCopySize[i]);
}
assert(d == a + getSerializationSize());
}
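// Descriptive note (added): the byte layout produced by serialize() and consumed by the
// deserializing constructor is, in order:
//   bool   mIgnoreBatch
//   int    mConcatAxisID, mOutputConcatAxis, mNumInputs
//   int    mInputConcatAxis[mNumInputs]
//   Dims   mCHW
//   size_t mCopySize[mNumInputs]
// getSerializationSize() must stay in sync with this layout.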
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
mCHW = inputs[0];
assert(inputs[0].nbDims == 3);
CHECK_TRT(cudaMallocHost((void**) &mCopySize, nbInputs * sizeof(size_t)));
for (int i = 0; i < nbInputs; ++i)
{
mCopySize[i] = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2] * sizeof(float);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return "FlattenConcat_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const override
{
return new FlattenConcat(mConcatAxisID, mIgnoreBatch, mNumInputs, mOutputConcatAxis, mInputConcatAxis);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
size_t* mCopySize = nullptr;
bool mIgnoreBatch{false};
int mConcatAxisID{0}, mOutputConcatAxis{0}, mNumInputs{0};
int* mInputConcatAxis = nullptr;
nvinfer1::Dims mCHW;
cublasHandle_t mCublas;
std::string mNamespace;
};
namespace
{
const char *FLATTENCONCAT_PLUGIN_VERSION{"1"};
const char *FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
} // namespace
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("axis", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("ignoreBatch", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "axis"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mConcatAxisID = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "ignoreBatch"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mIgnoreBatch = *(static_cast<const int*>(fields[i].data)) != 0;
}
}
return new FlattenConcat(mConcatAxisID, mIgnoreBatch);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//This object will be deleted when the network is destroyed, which will
//call FlattenConcat::destroy()
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
static PluginFieldCollection mFC;
bool mIgnoreBatch{false};
int mConcatAxisID;
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);
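// Illustrative sketch (not part of the original sample): once REGISTER_TENSORRT_PLUGIN has run,
// the creator can be looked up by name/version in the global plugin registry and used to build
// a FlattenConcat instance, e.g. when assembling a network by hand. The field values below
// (axis = 1, ignoreBatch = 0) are assumptions for illustration only.
static IPluginV2* createFlattenConcatFromRegistry()
{
int axis = 1;
int ignoreBatch = 0;
std::vector<PluginField> fields{
PluginField{"axis", &axis, PluginFieldType::kINT32, 1},
PluginField{"ignoreBatch", &ignoreBatch, PluginFieldType::kINT32, 1}};
PluginFieldCollection fc;
fc.nbFields = static_cast<int>(fields.size());
fc.fields = fields.data();
auto creator = getPluginRegistry()->getPluginCreator(FLATTENCONCAT_PLUGIN_NAME, FLATTENCONCAT_PLUGIN_VERSION);
return creator ? creator->createPlugin("FlattenConcat_TRT", &fc) : nullptr;
}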
// 1. convert image to the right size
// 2. convert to float
// 3. normalize for inception
// 4. convert to flat vector, channels first
float * normalize_for_trt(const cv::cuda::GpuMat &img)
{
cv::Size size(INPUT_W, INPUT_H);
cv::cuda::GpuMat resizedMat;
cv::cuda::resize(img, resizedMat, size, 0, 0, cv::INTER_LINEAR);
cv::cuda::cvtColor(resizedMat, resizedMat, cv::COLOR_BGRA2RGB);
unsigned volChl = INPUT_H * INPUT_W;
float * data = (float *)samplesCommon::safeCudaMalloc(INPUT_C * volChl * sizeof(float));
// we treat the memory as if it's a one-channel, one row image
int rowSize = (int)resizedMat.step / (int)resizedMat.elemSize1();
// CUDA kernel to reshape the non-continuous GPU Mat structure and make it channel-first continuous
channelFirst(resizedMat.ptr<uint8_t>(), data, volChl, INPUT_C, INPUT_W * INPUT_C, rowSize);
return data;
}
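// Hedged CPU reference (added for documentation; the real channelFirst is a CUDA kernel defined
// elsewhere in this repo). It shows the assumed layout transform: a padded, interleaved HWC
// uint8 image (rowSize elements per GpuMat row, of which rowElements carry pixel data) is
// rewritten as planar CHW floats, using the same (2/255)*x - 1 scaling as BatchStream::update().
// Treat this as a sketch of intent, not the shipped kernel.
static void channelFirstCpuReference(const unsigned char* source, float* dest,
int channelSize, int channelsNum, int rowElements, int rowSize)
{
int rows = channelSize * channelsNum / rowElements; // INPUT_H
int cols = rowElements / channelsNum;               // INPUT_W
for (int y = 0; y < rows; ++y)
for (int x = 0; x < cols; ++x)
for (int c = 0; c < channelsNum; ++c)
{
unsigned char v = source[y * rowSize + x * channelsNum + c];
dest[c * channelSize + y * cols + x] = (2.0f / 255.0f) * v - 1.0f;
}
}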
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8)
{
initLibNvInferPlugins(&gLogger, "");
const int N = 10;
std::cout << graphFileName << std::endl;
auto parser = createUffParser();
BatchStream calibrationStream(CAL_BATCH_SIZE, NB_CAL_BATCHES);
parser->registerInput("Input", DimsCHW(INPUT_C, INPUT_H, INPUT_W), UffInputOrder::kNCHW);
parser->registerOutput("MarkOutput_0");
IHostMemory *trtModelStream{nullptr};
Int8EntropyCalibrator calibrator(calibrationStream, FIRST_CAL_BATCH, "CalibrationTableSSD");
ICudaEngine *tmpEngine = loadModelAndCreateEngine(graphFileName.c_str(), N, parser, &calibrator, trtModelStream, isInt8);
assert(tmpEngine != nullptr);
assert(trtModelStream != nullptr);
tmpEngine->destroy();
// Read a random sample image.
srand(unsigned(time(nullptr)));
// Deserialize the engine.
std::cout << "*** deserializing" << std::endl;
IRuntime *runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
return std::make_tuple(runtime, engine, context);
}
// mat representation of the image,
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat &img, IExecutionContext * context, vector<std::string>& CLASSES)
{
const int N = 1;
float * data = normalize_for_trt(img);
const std::string outFileRoot = "/home/borisk/images/";
// Host memory for outputs.
vector<float> detectionOut(N * detectionOutputParam.keepTopK * 7);
vector<int> keepCount(N);
// Run inference. This will also free the "data" pointer
doInference(*context, data, &detectionOut[0], &keepCount[0], N);
return std::make_tuple(detectionOut, keepCount);
}
\ No newline at end of file
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
return -1;
}
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);
if (d_reader == nullptr)
{
LOG(ERROR) << "Could not create video stream";
throw exception();
}
// save off frame dimensions
auto formatStruct = d_reader->format();
width = formatStruct.width;
height = formatStruct.height;
isInitialized = true;
return 0;
}
void InferenceBase::RunInferenceOnStream()
{
if (!isInitialized)
{
LOG(ERROR) << "Video streaming not initialized";
return;
}
cuda::GpuMat d_frame;
int iFrame = 0, nFrames = 30;
double fps = 0., infer_tf_ms = 0.;
high_resolution_clock::time_point start = high_resolution_clock::now();
high_resolution_clock::time_point end;
double duration = 0.;
for (;;)
{
start = high_resolution_clock::now();
if (!d_reader->nextFrame(d_frame))
{
break;
}
if (doInference(d_frame) != 0)
{
LOG(ERROR) << "Inference failed";
return;
}
end = high_resolution_clock::now();
duration += (double) duration_cast<milliseconds>(end - start).count();
visualize(d_frame, fps);
if (++iFrame % nFrames == 0)
{
fps = 1. * nFrames / duration * 1000.;
duration = 0.;
}
if (iFrame % 100 == 0)
{
LOG(INFO) << "Speed: " << to_string(fps).substr(0, 5);
}
}
}
\ No newline at end of file
#pragma once
#include "utils.h"
using namespace std;
class InferenceBase
{
private:
bool isCudaInited;
cv::Ptr<cv::cudacodec::VideoReader> GetVideoReader(string video_file)
{return cv::cudacodec::createVideoReader(video_file);}
protected:
string labelsFile;
string graphFile;
map<int, string> labelsMap;
virtual int ReadClassLabels();
virtual int ReadGraph() = 0;
void InitCuda();
cv::Ptr<cv::cudacodec::VideoReader> d_reader;
double thresholdScore;
double thresholdIOU;
// frame width and height
int height;
int width;
int debug;
bool isInitialized;
public:
InferenceBase(const string &labelsFile, const string &graphFile, double threshScore, double threshIOU, int dbg)
: labelsFile(labelsFile)
, graphFile(graphFile)
, isCudaInited(false)
, thresholdScore(threshScore)
, thresholdIOU(threshIOU)
, isInitialized(false)
, labelsMap()
, debug(dbg)
{}
virtual ~InferenceBase() {}
void RunInferenceOnStream();
virtual int doInference(cv::cuda::GpuMat&) = 0;
virtual void visualize(cv::cuda::GpuMat&, double) = 0;
virtual int Init(string video_stream);
map<int, string> get_labels_map() {return labelsMap;}
void set_debug(int dbg) {debug = dbg;}
};
#include "inference_tf.h"
using tensorflow::Status;
using tensorflow::Tensor;
using namespace cv;
using tensorflow::int32;
int InferenceTensorflow::ReadGraph()
{
LOG(INFO) << "graphFile:" << graphFile;
Status loadGraphStatus = loadGraph(graphFile, &session);
if (!loadGraphStatus.ok())
{
LOG(ERROR) << "loadGraph(): ERROR" << loadGraphStatus;
return -1;
}
else
LOG(INFO) << "loadGraph(): frozen graph loaded" << endl;
return 0;
}
// allocate input tensor
int InferenceTensorflow::Init(string videoStream)
{
if (InferenceBase::Init(videoStream) != 0)
{
return -1;
}
// configure callable options
opts.add_feed(inputLayer);
for (auto const &value : outputLayer)
{
opts.add_fetch(value);
}
const string gpu_device_name = GPUDeviceName(session.get());
opts.clear_fetch_devices();
opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});
auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok())
{
LOG(ERROR) << "Failed to make callable";
}
// allocate tensor on the GPU
tensorflow::TensorShape shape = tensorflow::TensorShape({1, height, width, 3});
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
new tensorflow::GPUMemAllocator(
tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
platform_gpu_id, false /*use_unified_memory*/, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
LOG(INFO) << "Is Cuda Tensor: " << IsCUDATensor(inputTensor);
return 0;
}
int InferenceTensorflow::doInference(cv::cuda::GpuMat &d_frame)
{
Status runStatus;
readTensorFromGpuMat(d_frame, inputTensor);
runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
if (!runStatus.ok())
{
LOG(ERROR) << "Running model failed: " << runStatus;
return -1;
}
return 0;
}
void InferenceTensorflow::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
// Extract results from the outputs vector
tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
tensorflow::TTypes<float>::Flat numDetections = outputs[3].flat<float>();
tensorflow::TTypes<float, 3>::Tensor boxes = outputs[0].flat_outer_dims<float, 3>();
vector<size_t> goodIdxs = filterBoxes(scores, boxes, thresholdIOU, thresholdScore);
if (debug & 0x1)
{
for (size_t i = 0; i < goodIdxs.size(); i++)
LOG(INFO) << "score:" << scores(goodIdxs.at(i)) << ",class:" << labelsMap[classes(goodIdxs.at(i))]
<< " (" << classes(goodIdxs.at(i)) << "), box:"
<< "," << boxes(0, goodIdxs.at(i), 0) << ","
<< boxes(0, goodIdxs.at(i), 1) << "," << boxes(0, goodIdxs.at(i), 2) << ","
<< boxes(0, goodIdxs.at(i), 3);
}
// Draw bboxes and captions
if (debug & 0x2)
{
Mat frame;
d_frame.download(frame);
drawBoundingBoxesOnImage(frame, scores, classes, boxes, labelsMap, goodIdxs);
auto color = Scalar(255, 0, 255);
drawFrameworkSignature(frame, fps, "Tensorflow", color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
using tensorflow::CallableOptions;
using tensorflow::Tensor;
using tensorflow::Session;
class InferenceTensorflow : public InferenceBase
{
private:
const string inputLayer = "image_tensor:0";
const vector<string> outputLayer = {"detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0"};
CallableOptions opts;
std::unique_ptr<tensorflow::Session> session;
Session::CallableHandle feed_gpu_fetch_cpu;
// Allocate input tensor on the gpu
Tensor inputTensor;
vector<Tensor> outputs;
protected:
int ReadGraph() override;
int doInference(cv::cuda::GpuMat& d_frame) override;
void visualize(cv::cuda::GpuMat &d_frame, double) override;
public:
InferenceTensorflow(const string &labelsFile, const string &graphFile, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0)
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, opts()
{ }
int Init(string videoStream) override;
virtual ~InferenceTensorflow() { session->ReleaseCallable(feed_gpu_fetch_cpu);}
};
\ No newline at end of file
#include "inference_trt.h"
using namespace cv;
using namespace std;
int InferenceTensorRT::ReadGraph()
{
auto runtimeEngineContext = CreateTrtEngineAndContext(graphFile, isInt8);
runtime = std::get<0>(runtimeEngineContext);
engine = std::get<1>(runtimeEngineContext);
context = std::get<2>(runtimeEngineContext);
return 0;
}
int InferenceTensorRT::ReadClassLabels()
{
populateClassLabels(labelsVector, labelsFile);
return 0;
}
int InferenceTensorRT::doInference(cv::cuda::GpuMat &d_frame)
{
auto inferenceTuple = doInferenceWithTrt(d_frame, context, labelsVector);
detections = std::get<0>(inferenceTuple);
numDetections = std::get<1>(inferenceTuple);
return 0;
}
void InferenceTensorRT::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
Mat img;
d_frame.download(img);
for (int p = 0; p < N; ++p)
{
for (int i = 0; i < numDetections[p]; ++i)
{
float *det = &detections[0] + (p * detectionOutputParam.keepTopK + i) * 7;
if (det[2] < visualizeThreshold)
continue;
// Output format for each detection is stored in the below order
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
assert((int)det[1] < OUTPUT_CLS_SIZE);
std::string storeName = outFileRoot + labelsVector[(int)det[1]] + "-" + std::to_string(det[2]) + ".jpg";
if (debug & 0x2)
{
// det array idxs: (4, 3) = (y0, x0), (6, 5) = (y1, x1)
// dets are in normalized coordinates: 0 <= pt <= 1
drawBoundingBoxOnImage(img, det[4], det[3], det[6], det[5], det[2], labelsVector[(int)det[1]]);
}
}
}
if (debug & 0x2)
{
string framework("TensorRT");
if (isInt8)
{
framework += " (INT8)";
}
auto color = Scalar(0, 255, 255);
drawFrameworkSignature(img, fps, framework, color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
class InferenceTensorRT : public InferenceBase
{
private:
IRuntime *runtime = nullptr;
ICudaEngine *engine = nullptr;
IExecutionContext *context = nullptr;
bool isInt8;
//batch size
const int N = 1;
const float visualizeThreshold = 0.5;
vector<string> labelsVector;
vector<int> numDetections;
vector<float> detections;
string outFileRoot;
protected:
int ReadGraph() override;
int ReadClassLabels() override;
int doInference(cv::cuda::GpuMat &d_frame) override;
void visualize(cv::cuda::GpuMat&, double) override;
public:
InferenceTensorRT(const string &labelsFile, const string &graphFile, bool isInt8, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0, string outFile="")
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, labelsVector()
, numDetections(N)
, detections(N * detectionOutputParam.keepTopK * 7)
, outFileRoot(outFile)
, isInt8(isInt8)
{
}
virtual ~InferenceTensorRT()
{
if(context != nullptr)
{
context->destroy();
}
if(engine != nullptr)
{
engine->destroy();
}
if(runtime != nullptr)
{
runtime->destroy();
}
}
};
\ No newline at end of file
#include "inference_base.h"
#include "inference_tf.h"
#include "inference_trt.h"
#include <cuda_profiler_api.h>
using tensorflow::CallableOptions;
using tensorflow::int32;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::Tensor;
using namespace std;
using namespace cv;
using namespace std::chrono;
int main(int argc, char *argv[])
{
if (!tensorflow::IsGoogleCudaEnabled())
{
LOG(ERROR) << "Tensorflow built without CUDA. Rebuild with -c opt --config=cuda";
return -1;
}
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8 |false | use INT8 (requires calibration)}"
"{v video | | video for detection}"
"{graph || frozen graph location}"
"{labels || trained labels file location}";
// Set dirs variables
string ROOTDIR = "";
CommandLineParser parser(argc, argv, keys);
int showWindow = parser.get<int>("d");
String video_file = parser.get<String>("v");
bool is_tensor_rt = parser.get<bool>("t");
bool is_int8 = parser.get<bool>("i");
String LABELS = parser.get<String>("labels");
String GRAPH = parser.get<String>("graph");
unique_ptr<InferenceBase> infer(is_tensor_rt ?
(InferenceBase *) new InferenceTensorRT(LABELS, GRAPH, is_int8)
: (InferenceBase *) new InferenceTensorflow(LABELS, GRAPH));
infer->set_debug(showWindow);
infer->Init(video_file);
infer->RunInferenceOnStream();
return 0;
}
\ No newline at end of file
./build/tf_detector_example -d=$1 \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/home/boris/model/frozen_inference_graph.pb \
-labels=/home/boris/model/mscoco_label_map.pbtxt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-i \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
#include "utils.h"
using namespace std;
using namespace cv;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;
using tensorflow::DeviceAttributes;
/** Read a model graph definition (xxx.pb) from disk and create a session object you can use to run it.
 */
Status loadGraph(const string &graph_file_name,
unique_ptr<tensorflow::Session> *session) {
tensorflow::GraphDef graph_def;
Status load_graph_status =
ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
if (!load_graph_status.ok()) {
return tensorflow::errors::NotFound("Failed to load compute graph at '",
graph_file_name, "'");
}
tensorflow::SessionOptions session_options;
session_options.config.mutable_gpu_options()->set_allow_growth(true);
session->reset(tensorflow::NewSession(session_options));
Status session_create_status = (*session)->Create(graph_def);
if (!session_create_status.ok()) {
return session_create_status;
}
return Status::OK();
}
/** Read a labels map file (xxx.pbtxt) from disk to translate class numbers into human-readable labels.
*/
Status readLabelsMapFile(const string &fileName, map<int, string> &labelsMap) {
// Read file into a string
ifstream t(fileName);
if (!t.good())
return tensorflow::errors::NotFound("Failed to load labels map at '", fileName, "'");
stringstream buffer;
buffer << t.rdbuf();
string fileString = buffer.str();
// Search entry patterns of type 'item { ... }' and parse each of them
smatch matcherEntry;
smatch matcherId;
smatch matcherName;
const regex reEntry("item \\{([\\S\\s]*?)\\}");
const regex reId("id: [0-9]+");
const regex reDisplayName("display_name: (\"|\').+(\"|\')");
const regex reName("name: (\"|\').+(\"|\')");
string entry;
const string namePrefix = "name: \"";
const string display_name = "display_name: \"";
const size_t idOffset = string("id: ").length();
size_t nameOffset = display_name.length();
// we first try to parse "display_name"
// and fall back if it does not exist
bool isParsingName = false;
auto stringBegin = sregex_iterator(fileString.begin(), fileString.end(), reEntry);
auto stringEnd = sregex_iterator();
int id;
string name;
for (sregex_iterator i = stringBegin; i != stringEnd; i++) {
matcherEntry = *i;
entry = matcherEntry.str();
regex_search(entry, matcherId, reId);
if (!matcherId.empty())
id = stoi(matcherId[0].str().substr(idOffset, matcherId[0].str().length() - idOffset));
else
continue;
if(!isParsingName)
{
regex_search(entry, matcherName, reDisplayName);
if(matcherName.empty())
{
isParsingName = true;
nameOffset = namePrefix.length();
}
}
if(isParsingName)
{
regex_search(entry, matcherName, reName);
}
if (!matcherName.empty())
name = matcherName[0].str().substr(nameOffset, matcherName[0].str().length() - nameOffset - 1);
else
continue;
labelsMap.insert(pair<int, string>(id, name));
}
return Status::OK();
}
/** Convert a Mat image into a tensor of shape (1, height, width, 3), where the last three dims match the original image.
 */
Status readTensorFromMat(const Mat &mat, Tensor &outTensor) {
// Trick from https://github.com/tensorflow/tensorflow/issues/8033
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
Mat fakeMat(mat.rows, mat.cols, CV_8UC3, p);
cv::cvtColor(mat, fakeMat, COLOR_BGR2RGB);
return Status::OK();
}
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor) {
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
cv::cuda::GpuMat fakeMat(g_mat.rows, g_mat.cols, CV_8UC3, p);
// comes in with 4 channels -> 3 channels
cv::cuda::cvtColor(g_mat, fakeMat, COLOR_BGRA2RGB);
return Status::OK();
}
/** Draw bounding box and add caption to the image.
* Boolean flag _scaled_ shows if the passed coordinates are in relative units (true by default in tensorflow detection)
*/
void drawBoundingBoxOnImage(Mat &image, double yMin, double xMin, double yMax, double xMax, double score, string label, bool scaled) {
cv::Point tl, br;
if (scaled) {
tl = cv::Point((int) (xMin * image.cols), (int) (yMin * image.rows));
br = cv::Point((int) (xMax * image.cols), (int) (yMax * image.rows));
} else {
tl = cv::Point((int) xMin, (int) yMin);
br = cv::Point((int) xMax, (int) yMax);
}
cv::rectangle(image, tl, br, cv::Scalar(0, 255, 255), 1);
// Truncate the score to 3 decimal places
float scoreRounded = floorf(score * 1000) / 1000;
string scoreString = to_string(scoreRounded).substr(0, 5);
string caption = label + " (" + scoreString + ")";
// Adding caption of type "LABEL (X.XXX)" to the top-left corner of the bounding box
int fontCoeff = 12;
cv::Point brRect = cv::Point(tl.x + caption.length() * fontCoeff / 1.6, tl.y + fontCoeff);
cv::rectangle(image, tl, brRect, cv::Scalar(0, 255, 255), -1);
cv::Point textCorner = cv::Point(tl.x, tl.y + fontCoeff * 0.9);
cv::putText(image, caption, textCorner, FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 0, 0));
}
/** Draw the framework signature and current FPS in the bottom-left corner of the image and display the frame.
 */
void drawFrameworkSignature(Mat& image, double fps, string signature, Scalar& color)
{
putText(image, "TensorFlow", Point(0, image.rows - 30), FONT_HERSHEY_SIMPLEX, 0.7, color, 2);
putText(image, to_string(fps).substr(0, 5), Point(0, image.rows - 5), FONT_HERSHEY_SIMPLEX, 0.7, Scalar(255, 255, 255), 2);
imshow("stream", image);
waitKey(1);
}
void drawBoundingBoxesOnImage(Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
map<int, string> &labelsMap,
vector<size_t> &idxs) {
for (int j = 0; j < idxs.size(); j++)
drawBoundingBoxOnImage(image,
boxes(0,idxs.at(j),0), boxes(0,idxs.at(j),1),
boxes(0,idxs.at(j),2), boxes(0,idxs.at(j),3),
scores(idxs.at(j)), labelsMap[classes(idxs.at(j))]);
}
/** Calculate intersection-over-union (IOU) for two given bbox Rects.
*/
double IOU(Rect2f box1, Rect2f box2) {
float xA = max(box1.tl().x, box2.tl().x);
float yA = max(box1.tl().y, box2.tl().y);
float xB = min(box1.br().x, box2.br().x);
float yB = min(box1.br().y, box2.br().y);
// clamp to zero so non-overlapping boxes yield zero intersection
float intersectArea = max(0.f, xB - xA) * max(0.f, yB - yA);
float unionArea = box1.area() + box2.area() - intersectArea;
return 1. * intersectArea / unionArea;
}
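// Quick sanity check of the formula (illustrative numbers, not from the original code):
// box1 = (0, 0, 2, 2) and box2 = (1, 1, 2, 2) overlap in a 1x1 square, so
// IOU = 1 / (4 + 4 - 1) = 1/7 ≈ 0.143.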
/** Return idxs of good boxes (ones with highest confidence score (>= thresholdScore)
* and IOU <= thresholdIOU with others).
*/
vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore) {
vector<size_t> sortIdxs(scores.size());
iota(sortIdxs.begin(), sortIdxs.end(), 0);
// Create set of "bad" idxs
set<size_t> badIdxs = set<size_t>();
size_t i = 0;
while (i < sortIdxs.size()) {
if (scores(sortIdxs.at(i)) < thresholdScore)
badIdxs.insert(sortIdxs[i]);
if (badIdxs.find(sortIdxs.at(i)) != badIdxs.end()) {
i++;
continue;
}
Rect2f box1 = Rect2f(Point2f(boxes(0, sortIdxs.at(i), 1), boxes(0, sortIdxs.at(i), 0)),
Point2f(boxes(0, sortIdxs.at(i), 3), boxes(0, sortIdxs.at(i), 2)));
for (size_t j = i + 1; j < sortIdxs.size(); j++) {
if (scores(sortIdxs.at(j)) < thresholdScore) {
badIdxs.insert(sortIdxs[j]);
continue;
}
Rect2f box2 = Rect2f(Point2f(boxes(0, sortIdxs.at(j), 1), boxes(0, sortIdxs.at(j), 0)),
Point2f(boxes(0, sortIdxs.at(j), 3), boxes(0, sortIdxs.at(j), 2)));
if (IOU(box1, box2) > thresholdIOU)
badIdxs.insert(sortIdxs[j]);
}
i++;
}
// Prepare "good" idxs for return
vector<size_t> goodIdxs = vector<size_t>();
for (auto it = sortIdxs.begin(); it != sortIdxs.end(); it++)
if (badIdxs.find(sortIdxs.at(*it)) == badIdxs.end())
goodIdxs.push_back(*it);
return goodIdxs;
}
string type2str(int type) {
string r;
uchar depth = type & CV_MAT_DEPTH_MASK;
uchar chans = 1 + (type >> CV_CN_SHIFT);
switch ( depth ) {
case CV_8U: r = "8U"; break;
case CV_8S: r = "8S"; break;
case CV_16U: r = "16U"; break;
case CV_16S: r = "16S"; break;
case CV_32S: r = "32S"; break;
case CV_32F: r = "32F"; break;
case CV_64F: r = "64F"; break;
default: r = "User"; break;
}
r += "C";
r += (chans+'0');
return r;
}
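// Example (illustrative): type2str(CV_8UC3) returns "8UC3".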
bool IsCUDATensor(const Tensor &t)
{
cudaPointerAttributes attributes;
cudaError_t err =
cudaPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == cudaErrorInvalidValue)
return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
#if CUDART_VERSION >= 10000
return (attributes.type == cudaMemoryTypeDevice);
#else
return (attributes.memoryType == cudaMemoryTypeDevice);
#endif
}
string GPUDeviceName(Session* session) {
std::vector<DeviceAttributes> devices;
TF_CHECK_OK(session->ListDevices(&devices));
for (const DeviceAttributes& d : devices) {
LOG(INFO) << "Device: " << d.name();
if (d.device_type() == "GPU" || d.device_type() == "gpu") {
return d.name();
}
}
return "";
}
\ No newline at end of file
#ifndef TF_DETECTOR_EXAMPLE_UTILS_H
#define TF_DETECTOR_EXAMPLE_UTILS_H
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <map>
#include <unordered_map>
#include <math.h>
#include <regex>
#include <tuple>
#include <cassert>
#include <cublas_v2.h>
#include <cudnn.h>
#include <sstream>
#include <time.h>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "common.h"
#include "NvInferPlugin.h"
// Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
// Direct session
#include "tensorflow/core/common_runtime/direct_session.h"
#include <cv.hpp>
#include <opencv2/cudacodec.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
// CUDA includes. Order matters
#include <dynlink_nvcuvid.h>
#include "cuda_runtime_api.h"
using namespace std;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::Session;
using namespace nvinfer1;
using namespace nvuffparser;
string type2str(int type);
Status readLabelsMapFile(const string &fileName, std::map<int, string> &labelsMap);
Status loadGraph(const string &graph_file_name,
std::unique_ptr<tensorflow::Session> *session);
Status readTensorFromMat(const cv::Mat &mat, Tensor &outTensor);
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor);
void drawBoundingBoxOnImage(cv::Mat &image, double yMin, double xMin, double yMax, double xMax, double score, std::string label, bool scaled = true);
void drawBoundingBoxesOnImage(cv::Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
std::map<int, string> &labelsMap,
std::vector<size_t> &idxs);
void drawFrameworkSignature(cv::Mat& image, double fps, string signature, cv::Scalar& color);
double IOU(cv::Rect2f box1, cv::Rect2f box2);
std::vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore);
bool IsCUDATensor(const Tensor &t);
string GPUDeviceName(Session* session);
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat& img, IExecutionContext * context, vector<std::string>& CLASSES);
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8);
extern DetectionOutputParameters detectionOutputParam;
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName);
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElements, int rowSize);
extern const int OUTPUT_CLS_SIZE;
extern const int OUTPUT_BBOX_SIZE;
#endif //TF_DETECTOR_EXAMPLE_UTILS_H