Commit 773648ca authored by Oleg Dzhimiev

standalone code

parent d8533f78
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <cassert>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include <iostream>
#include <iterator>
#include <cstring>
#include <string>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
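// fill the caller-visible batch from the file-backed buffer, refilling it
// via update() whenever the buffer is exhausted (mFileBatchPos == mDims.n())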
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
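// fast path: when whole file buffers map exactly onto whole batches,
// skipping reduces to advancing the file counter; otherwise fall back to
// reading and discarding batches via next()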
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
if(file)
{
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
}
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
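// transpose interleaved HWC (PPM layout) to planar CHW while normalizing
// each byte from [0, 255] to [-1, 1]: y = (2 / 255) * x - 1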
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
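The calibrator above is handed to the TensorRT builder, which pulls calibration batches through `getBatch()` while building an INT8 engine. Below is a minimal sketch of that wiring, not part of this commit: the batch counts and table name are placeholders, and the TensorRT 5.x `setInt8Mode`/`setInt8Calibrator` builder calls are assumed for the 19.02 container this repo targets.

```
#include "NvInfer.h"

// Configure an existing builder for INT8; builder and calibrator are owned
// by the caller and must outlive engine creation.
void enableInt8Calibration(nvinfer1::IBuilder* builder,
                           Int8EntropyCalibrator* calibrator)
{
    builder->setInt8Mode(true);             // request INT8 kernels
    builder->setInt8Calibrator(calibrator); // batches supplied via getBatch()
}

// Typical call site:
//   BatchStream stream(/*batchSize=*/8, /*maxBatches=*/10);
//   Int8EntropyCalibrator calibrator(stream, /*firstBatch=*/0, "CalibrationTable");
//   enableInt8Calibration(builder, &calibrator);
```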
cmake_minimum_required(VERSION 3.16)
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
project(tf-gpu-feed LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
set(CUDACXX /usr/local/cuda/bin/nvcc)
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
channel_first.cu
array.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
# OpenCV libs
find_package(OpenCV REQUIRED)
add_executable(tf-gpu-feed ${SOURCE_FILES})
set_target_properties(tf-gpu-feed PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
#include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
#include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf-gpu-feed
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# tensorflow-feed-from-gpu
Simple TF test
## Setup in Eclipse
From **Eclipse (2019-12)**:
* File > Open Projects from File System...
* Directory... > navigate to project's root > Finish
Importing may take a few attempts; the indexer sometimes fails to pick up the project.
## Run
```
mkdir build
cd build
cmake ..
make
./tf-gpu-feed
```
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If return value is true, execution can continue, otherwise program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
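A short usage sketch for `parseArgs` follows; the `argsParser.h` include name is an assumption (use whatever name this header is saved under):

```
#include <iostream>
#include "argsParser.h" // assumed file name for the header above

int main(int argc, char* argv[])
{
    samplesCommon::Args args;
    if (!samplesCommon::parseArgs(args, argc, argv))
    {
        // false is returned for --help and for malformed options
        std::cout << "usage: sample [-h] [-d DIR] [-i] [-u N]" << std::endl;
        return args.help ? 0 : 1;
    }
    std::cout << "int8: " << args.runInInt8 << ", DLA core: " << args.useDLA << std::endl;
    for (const auto& dir : args.dataDirs)
        std::cout << "data dir: " << dir << std::endl;
    return 0;
}
```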
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
// this is the program that is to be run on the device for a
// large number of threads, in our example 100
// each thread takes care of one entry in the number array,
// so in order for the thread to know which number to manipulate,
// a scheme has to be utilized in order to assign each thread a
// unique number
__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
{
// this is the assignment of a unique identifier.
// blockIdx.x is the unique number of the block, in which the
// thread is positioned, blockDim.x holds the number of threads
// for each block and threadIdx.x is the number of the thread in
// this block.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// this tells the thread to manipulate the assigned number in
// the array stored in device memory and increment it
if (idx<N)
numberArray[idx] = numberArray[idx] + 1;
}
// this is the "normal" function to be run on the CPU
// it does the exact same thing as the CUDA function above
void incrementArray(int *numberArray, int N){
// go through every number in the array consecutively
// and increment it
for(int i=0; i<N; ++i)
{
numberArray[i] = numberArray[i] + 1;
}
}
int myCreateCUDAArray(int *tf_ptr){
// some arbitrary array length
int numberOfNumbers = 100;
// declare some arrays for storing numbers
int *numbers1, *numbers2;
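// note: the tf_ptr assignment below is immediately overwritten by
// cudaMallocManaged, so the caller's pointer is effectively unused here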
numbers1 = tf_ptr;
// reserve (allocate) some working space for the numbers in device memory
cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
// fill the input array with some numbers
for(int i=0;i<numberOfNumbers;i++)
{
numbers1[i] = i; // this will be manipulated by the CUDA device (GPU)
numbers2[i] = i; // this will be manipulated by the CPU (as any standard C program would do)
}
// launch the kernel: one block of numberOfNumbers threads; each thread increments one element
incrementArrayViaCUDAdevice<<<1, numberOfNumbers>>>(numbers1, numberOfNumbers);
// wait for the device to finish working
cudaDeviceSynchronize();
// compute the same function "normally" on the CPU
incrementArray(numbers2, numberOfNumbers);
// check if the GPU did the same as the CPU
bool workedCorrectly = true;
for(int i=0;i<numberOfNumbers;i++)
{
if (numbers1[i] != numbers2[i])
workedCorrectly = false;
printf(" %d vs %d |",numbers1[i],numbers2[i]);
}
printf("\n");
if (workedCorrectly)
printf("The device performed well!\n");
else
printf("Something went wrong. The output numbers are not what was to be expected...\n");
// free the space that has been used by our arrays so that
// other programs might use it
cudaFree(numbers1);
cudaFree(numbers2);
return 0;
}
int myCreateCUDAArray(int *tf_ptr);
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
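A hedged sketch of driving `channelFirst` from an OpenCV `GpuMat`: the BGR8 frame and the device allocation of `dest` are assumptions, and `GpuMat::step` supplies the padded row size in bytes that the kernel skips over.

```
#include <opencv2/core/cuda.hpp>

// Convert a device-resident, possibly row-padded HWC uchar frame into a
// packed CHW float tensor scaled to [-1, 1]. dest must point to device
// memory holding frame.rows * frame.cols * frame.channels() floats.
void toChannelFirst(const cv::cuda::GpuMat& frame, float* dest)
{
    const int channelsNum = frame.channels();         // e.g. 3 for BGR
    const int channelSize = frame.rows * frame.cols;  // pixels per channel
    const int rowElems = frame.cols * channelsNum;    // unpadded elements per row
    const int rowSize = static_cast<int>(frame.step); // padded row size in bytes
    channelFirst(frame.data, dest, channelSize, channelsNum, rowElems, rowSize);
}
```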
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
FROM fierval/tensorrt:19.02-py3
# nvcuvid
ADD nvcuvid/* /usr/local/cuda/targets/x86_64-linux/include/
# opencv
RUN apt-get update
RUN apt-get install -y git libgtk2.0-dev curl pkg-config autoconf automake libtool libavcodec-dev \
libavformat-dev libswscale-dev python-dev python-numpy libtbb2 libtbb-dev \
libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev unzip libcurl4-gnutls-dev zlib1g-dev
RUN apt-get install -y wget
RUN apt-get install -y vim
## CMAKE
ADD https://cmake.org/files/v3.13/cmake-3.13.0.tar.gz /
RUN tar xzvf /cmake-3.13.0.tar.gz -C / \
&& cd /cmake-3.13.0 \
&& ./bootstrap \
&& make -j15 \
&& make install
# Second: get and build protobuf 3.6.1, then OpenCV 3.3.1
#
ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz /
RUN tar xzvf /protobuf-cpp-3.6.1.tar.gz -C /
RUN cd /protobuf-3.6.1 \
&& ./configure \
&& make -j15 \
&& make install \
&& ldconfig
ADD https://github.com/opencv/opencv_contrib/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
ADD https://github.com/opencv/opencv/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
RUN cd /workspace/opencv-3.3.1 \
&& mkdir build \
&& cd build \
&& cmake -DBUILD_TIFF=ON \
-DBUILD_opencv_java=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_CUDA=ON \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_TESTS=OFF \
-DBUILD_opencv_cudacodec=ON \
# -DENABLE_FAST_MATH=1 \
# -DCUDA_FAST_MATH=1 \
-DWITH_CUBLAS=1 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
-DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-3.3.1/modules/ \
##
-DCUDA_ARCH_BIN='7.0' \
-DCUDA_ARCH_PTX="" \
##
## AVX in dispatch because not all machines have it
-DCPU_DISPATCH=AVX,AVX2 \
-DENABLE_PRECOMPILED_HEADERS=OFF \
-DWITH_OPENGL=OFF \
-DWITH_OPENCL=OFF \
-DWITH_QT=OFF \
-DWITH_NVCUVID=ON \
-DWITH_IPP=ON \
-DWITH_TBB=ON \
-DFORCE_VTK=ON \
-DWITH_EIGEN=ON \
-DWITH_V4L=ON \
-DWITH_XINE=ON \
-DWITH_GDAL=ON \
-DWITH_1394=OFF \
-DWITH_FFMPEG=OFF \
-DBUILD_PROTOBUF=ON \
-DBUILD_opencv_xfeatures2d=OFF \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_INSTALL_PREFIX=/usr/local \
.. \
&& make -j15 \
&& make install \
&& rm /3.3.1.zip \
&& rm /cmake-3.13.0.tar.gz
RUN mkdir fast_od
RUN mkdir /home/boris
# tensorflow libraries
ADD tensorflow.tar /
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __cuda_h__
#define __cuda_h__
/**
* CUDA API version support
*/
#include "dynlink_cuda_cuda.h"
#endif //__cuda_h__
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
LOG(ERROR) << "ReadClassLabels returned non-zero\n";
return -1;
}
LOG(INFO) << "CUDA INIT DONE\n";
/*
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);