Commit 773648ca authored by Oleg Dzhimiev's avatar Oleg Dzhimiev

standalone code

parent d8533f78
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
//! Streams calibration batches built from the PPM images named in "list.txt".
//!
//! Every line of list.txt is an image base name; ".ppm" is appended and the file is
//! resolved through locateFile(). Pixel values are rescaled with (2/255) * v - 1 and
//! moved from the interleaved PPM buffer (index j * C + c) to a planar layout
//! (index c * H * W + j) before being handed out by next().
class BatchStream
{
public:
    //! \param batchSize  number of images per batch (also used as the N dimension)
    //! \param maxBatches next() reports exhaustion once this many batches were served
    BatchStream(int batchSize, int maxBatches)
        : mBatchSize(batchSize)
        , mMaxBatches(maxBatches)
    {
        // Use the shared INPUT_* constants so the dims cannot drift from the
        // PPM template arguments used in update().
        mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
        mImageSize = mDims.c() * mDims.h() * mDims.w();
        mBatch.resize(mBatchSize * mImageSize, 0);
        mLabels.resize(mBatchSize, 0);
        mFileBatch.resize(mDims.n() * mImageSize, 0);
        mFileLabels.resize(mDims.n(), 0);
        reset(0);
    }

    //! Clears the batch/file counters, marks the file buffer as consumed and skips \p firstBatch batches.
    //! NOTE(review): the read offset into list.txt (mCurPos) is not rewound here — confirm that is intended.
    void reset(int firstBatch)
    {
        mBatchCount = 0;
        mFileCount = 0;
        mFileBatchPos = mDims.n();
        skip(firstBatch);
    }

    //! Fills the batch buffer with the next mBatchSize images.
    //! \return false once mMaxBatches batches were served or no further data could be loaded
    bool next()
    {
        if (mBatchCount == mMaxBatches)
            return false;
        for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
        {
            assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
            // File buffer fully consumed: load the next group of images.
            if (mFileBatchPos == mDims.n() && !update())
                return false;
            // copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
            csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
            std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
        }
        mBatchCount++;
        return true;
    }

    //! Skips \p skipCount batches without counting them as read.
    void skip(int skipCount)
    {
        if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
        {
            // NOTE(review): this fast path only advances mFileCount; update() positions
            // the list file with mCurPos, which is left untouched here — verify.
            mFileCount += skipCount * mBatchSize / mDims.n();
            return;
        }
        // Slow path: read and discard, then restore the served-batch counter.
        int x = mBatchCount;
        for (int i = 0; i < skipCount; i++)
            next();
        mBatchCount = x;
    }

    float *getBatch() { return mBatch.data(); }
    float *getLabels() { return mLabels.data(); }
    int getBatchesRead() const { return mBatchCount; }
    int getBatchSize() const { return mBatchSize; }
    nvinfer1::DimsNCHW getDims() const { return mDims; }

private:
    float* getFileBatch() { return mFileBatch.data(); }
    float* getFileLabels() { return mFileLabels.data(); }

    //! Loads the next mBatchSize images named in list.txt into the file buffer.
    //! \return false when list.txt cannot be opened or holds fewer than mBatchSize
    //!         further names (a trailing partial batch is dropped); true otherwise
    bool update()
    {
        std::ifstream file(locateFile("list.txt"));
        if (!file)
        {
            std::cerr << "Could not open list.txt" << std::endl;
            return false;
        }
        std::cout << "Batch #" << mFileCount << "\n";
        // Resume where the previous call stopped reading.
        file.seekg(mCurPos);

        std::vector<std::string> fNames;
        fNames.reserve(mBatchSize);
        for (int i = 0; i < mBatchSize; i++)
        {
            std::string sName;
            // Stop instead of fabricating a bare ".ppm" name once the list is exhausted.
            if (!std::getline(file, sName) || sName.empty())
                return false;
            sName = sName + ".ppm";
            std::cout << "Calibrating with file " << sName << std::endl;
            fNames.emplace_back(sName);
        }
        mCurPos = file.tellg();
        mFileCount++;

        std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
        for (uint32_t i = 0; i < fNames.size(); ++i)
        {
            readPPMFile(locateFile(fNames[i]), ppms[i]);
        }

        std::vector<float> data(samplesCommon::volume(mDims));
        long int volChl = mDims.h() * mDims.w();
        for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
        {
            for (int c = 0; c < mDims.c(); ++c)
            {
                for (int j = 0; j < volChl; ++j)
                {
                    // interleaved source (j * C + c) -> planar destination (c * H * W + j), rescaled by (2/255) * v - 1
                    data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
                }
            }
        }
        std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
        mFileBatchPos = 0;
        return true;
    }

    int mBatchSize{0};
    int mMaxBatches{0};
    int mBatchCount{0};
    int mFileCount{0}, mFileBatchPos{0};
    int mImageSize{0};
    int mCurPos{0}; //!< read offset into list.txt, carried between update() calls
    nvinfer1::DimsNCHW mDims;
    std::vector<float> mBatch;
    std::vector<float> mLabels;
    std::vector<float> mFileBatch;
    std::vector<float> mFileLabels;
};
//! INT8 entropy calibrator that serves batches from a BatchStream and caches
//! the calibration table in a file.
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
    //! \param stream               batch source (stored by value in this object)
    //! \param firstBatch           batch index passed to the stored stream's reset()
    //! \param calibrationTableName path of the calibration cache file
    //! \param readCache            when false, readCalibrationCache() never loads the file
    Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
        : mStream(stream)
        , mCalibrationTableName(std::move(calibrationTableName))
        , mReadCache(readCache)
    {
        const nvinfer1::DimsNCHW streamDims = mStream.getDims();
        mInputCount = samplesCommon::volume(streamDims);
        CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
        mStream.reset(firstBatch);
    }

    virtual ~Int8EntropyCalibrator()
    {
        CHECK_TRT(cudaFree(mDeviceInput));
    }

    int getBatchSize() const override
    {
        return mStream.getBatchSize();
    }

    //! Copies the next host batch to the device buffer and exposes it as binding 0.
    //! \return false when the stream has no further batch
    bool getBatch(void* bindings[], const char* names[], int nbBindings) override
    {
        const bool haveBatch = mStream.next();
        if (haveBatch)
        {
            CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
            assert(!strcmp(names[0], INPUT_BLOB_NAME));
            bindings[0] = mDeviceInput;
        }
        return haveBatch;
    }

    //! Loads the cached calibration table, if reading is enabled and the file opens.
    //! \param length receives the number of cached bytes (0 when nothing was loaded)
    //! \return pointer to the cached bytes, or nullptr when the cache is empty
    const void* readCalibrationCache(size_t& length) override
    {
        mCalibrationCache.clear();
        std::ifstream cacheFile(mCalibrationTableName, std::ios::binary);
        // keep whitespace bytes: the table is read as raw characters
        cacheFile >> std::noskipws;
        if (mReadCache && cacheFile.good())
        {
            std::istream_iterator<char> first(cacheFile);
            std::istream_iterator<char> last;
            std::copy(first, last, std::back_inserter(mCalibrationCache));
        }
        length = mCalibrationCache.size();
        if (length == 0)
        {
            return nullptr;
        }
        return mCalibrationCache.data();
    }

    //! Writes \p length bytes of \p cache to the calibration table file.
    void writeCalibrationCache(const void* cache, size_t length) override
    {
        std::ofstream cacheFile(mCalibrationTableName, std::ios::binary);
        const char* bytes = reinterpret_cast<const char*>(cache);
        cacheFile.write(bytes, length);
    }

private:
    BatchStream mStream;
    std::string mCalibrationTableName;
    bool mReadCache{true};
    size_t mInputCount;
    void* mDeviceInput{nullptr};
    std::vector<char> mCalibrationCache;
};
#endif
# Build script for the TensorFlow / TensorRT GPU-feed example.
# NOTE(review): this text looks like removed and added diff lines merged together
# (two project() calls, two `set(SOURCE_FILES` openers, two `target_link_libraries(`
# openers, two add_executable() targets) — confirm against the real CMakeLists.txt.
cmake_minimum_required(VERSION 3.16)
# Export the nvcc path through the CUDACXX environment variable before project() enables CUDA.
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc)
project(tf_detector_example LANGUAGES CXX CUDA)
project(tf-gpu-feed LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
set(CUDACXX /usr/local/cuda/bin/nvcc)
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
channel_first.cu
dynlink_nvcuvid.cpp
array.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
add_executable(tf-gpu-feed ${SOURCE_FILES})
set_target_properties(tf-gpu-feed PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
include_directories(${CUDA_INCLUDE_DIRS})
# TensorFlow headers
# NOTE(review): the second path below is spelled `third-party`; TensorFlow's source tree
# uses `third_party` — confirm which directory actually exists on the build machine.
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
#include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
#include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# Link against CUDA, TensorRT (nvinfer*, nvparsers, nvonnxparser), OpenCV and TensorFlow.
target_link_libraries(tf_detector_example
target_link_libraries(tf-gpu-feed
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
This diff is collapsed.
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# tensorflow-feed-from-gpu
Simple TF test
## Setup in Eclipse
From **Eclipse (2019-12)**:
* File > Open Projects from File System...
* Directory... > navigate to project's root > Finish
Tried importing a few times - indexer does not work sometimes.
# Run
```
mkdir build
cd build
cmake ..
make
./tf-gpu-feed
```
\ No newline at end of file
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
    int batchSize; //!< Number of inputs in a batch
    int dlaID;
    std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
    std::vector<std::string> inputTensorNames;
    std::vector<std::string> outputTensorNames;
};

//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
    std::string prototxtFileName; //!< Filename of prototxt design file of a network
    std::string weightsFileName;  //!< Filename of trained weights file of a network
};

//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
    bool runInInt8{false};
    bool help{false};
    int useDLA{-1};
    std::vector<std::string> dataDirs;
};

//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! Recognised options: -h/--help, -d/--datadir <dir> (repeatable), -i/--int8,
//! -u/--useDLA <n>.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If return value is true, execution can continue, otherwise program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
    static const struct option long_options[] = {
        {"help", no_argument, nullptr, 'h'},
        {"datadir", required_argument, nullptr, 'd'},
        {"int8", no_argument, nullptr, 'i'},
        {"useDLA", required_argument, nullptr, 'u'},
        {nullptr, 0, nullptr, 0}};

    while (true)
    {
        int option_index = 0;
        // "u:" — the short form must take an argument just like --useDLA does;
        // without the colon `-u 2` left optarg null and the value was silently dropped.
        const int arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
        if (arg == -1)
            break;

        switch (arg)
        {
        case 'h':
            args.help = true;
            return false;
        case 'd':
            if (optarg)
                args.dataDirs.push_back(optarg);
            else
            {
                std::cerr << "ERROR: --datadir requires option argument" << std::endl;
                return false;
            }
            break;
        case 'i':
            args.runInInt8 = true;
            break;
        case 'u':
            // std::stoi throws on a non-numeric value (see \throw above).
            if (optarg)
                args.useDLA = std::stoi(optarg);
            break;
        default:
            // unknown option or missing option argument (getopt_long already reported it)
            return false;
        }
    }
    return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
// Device kernel: every thread increments at most one element of numberArray.
// The element index is derived from the block index, the block size and the
// thread's index inside its block, so each launched thread gets its own slot.
__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
{
    // global slot of this thread = block offset + position inside the block
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;

    // threads whose slot lies past the end of the array do nothing
    if (slot >= N)
        return;

    numberArray[slot] += 1;
}
// Host-side counterpart of the CUDA kernel: adds one to each of the
// first N elements of numberArray, walking the array front to back.
void incrementArray(int *numberArray, int N){
    int pos = 0;
    while (pos < N)
    {
        numberArray[pos] += 1;
        ++pos;
    }
}
// Self-test: fills two managed arrays with 0..99, increments one on the GPU and
// the other on the CPU, prints every pair and reports whether they agree.
//
// tf_ptr is currently unused: the previous code stored it in a local pointer that
// was overwritten by the allocation right after, so the caller's buffer was never touched.
//
// Returns 0 once the comparison has run (whatever its outcome) and -1 when a
// CUDA call fails.
int myCreateCUDAArray(int *tf_ptr){
    (void)tf_ptr;

    // some arbitrary array length
    const int numberOfNumbers = 100;

    int *numbers1 = nullptr; // manipulated by the CUDA device (GPU)
    int *numbers2 = nullptr; // manipulated by the CPU

    // reserve managed memory for both arrays; bail out instead of dereferencing
    // a null pointer when an allocation fails
    cudaError_t err = cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
    if (err != cudaSuccess)
    {
        printf("cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        return -1;
    }
    err = cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
    if (err != cudaSuccess)
    {
        printf("cudaMallocManaged failed: %s\n", cudaGetErrorString(err));
        cudaFree(numbers1);
        return -1;
    }

    // fill the input arrays with some numbers
    for(int i=0;i<numberOfNumbers;i++)
    {
        numbers1[i] = i;
        numbers2[i] = i;
    }

    // one block, one thread per element
    incrementArrayViaCUDAdevice<<<1, numberOfNumbers>>>(numbers1, numberOfNumbers);

    // pick up launch errors first, then wait for the device to finish working
    err = cudaGetLastError();
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        printf("CUDA kernel failed: %s\n", cudaGetErrorString(err));
        cudaFree(numbers1);
        cudaFree(numbers2);
        return -1;
    }

    // compute the same function "normally" on the CPU
    incrementArray(numbers2, numberOfNumbers);

    // check if the GPU did the same as the CPU
    bool workedCorrectly = true;
    for(int i=0;i<numberOfNumbers;i++)
    {
        if (numbers1[i] != numbers2[i])
            workedCorrectly = false;
        printf(" %d vs %d |",numbers1[i],numbers2[i]);
    }
    printf("\n");

    if (workedCorrectly)
        printf("The device performed well!\n");
    else
        printf("Something went wrong. The output numbers are not what was to be expected...\n");

    // free the space that has been used by our arrays so that
    // other programs might use it
    cudaFree(numbers1);
    cudaFree(numbers2);
    return 0;
}
int myCreateCUDAArray(int *tf_ptr);
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
const int BLOCK_SIZE = 1024;
#include <cuda_runtime.h>
// Converts one interleaved 8-bit element per thread into the channel-first float
// destination, rescaling it with v * (2/255) - 1.
//   channelSize - number of elements in one destination channel plane
//   channelsNum - number of interleaved channels
//   rowElems    - number of elements in one source row, padding excluded
//   rowSize     - distance between source row starts, padding included
//                 (presumably the GpuMat step in bytes — confirm with the caller)
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // The launch rounds the grid up to a whole number of blocks, so the last block
    // has threads past the final element; they must not touch memory.
    if (idx >= channelSize * channelsNum)
        return;

    int offset = idx / channelsNum;
    int channel = idx % channelsNum;

    // what would the row be if we didn't have any padding
    int row = idx / rowElems;
    int col = idx % rowElems;

    // actual element - skip padding
    int sourceIdx = row * rowSize + col;

    dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// Host wrapper around channelFirstKernel. Both buffers are expected to live on
// the device already, so nothing is allocated or copied here; the call blocks
// until the kernel has finished.
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
    // one thread per element, rounded up to a whole number of blocks
    const int totalElems = channelSize * channelsNum;
    const int gridSize = (totalElems + BLOCK_SIZE - 1) / BLOCK_SIZE;

    channelFirstKernel<<<gridSize, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);

    cudaDeviceSynchronize();
}
This diff is collapsed.
# Copies the TensorFlow headers and shared libraries from a local source/bazel
# build tree (~/git/tensorflow) into /usr/local/tensorflow.
# Abort if the checkout is missing, otherwise every cp below would run from the wrong directory.
cd ~/git/tensorflow || exit 1
# -p: creating the directories must not fail when the script is run a second time
sudo mkdir -p /usr/local/tensorflow
sudo mkdir -p /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir -p /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir -p /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
\ No newline at end of file
# Build image: TensorRT base + CMake 3.13, protobuf 3.6.1 and a CUDA-enabled OpenCV 3.3.1.
FROM fierval/tensorrt:19.02-py3
# nvcuvid
ADD nvcuvid/* /usr/local/cuda/targets/x86_64-linux/include/
# opencv build prerequisites
# update and install share one layer so a cached, stale package index is never reused
RUN apt-get update && apt-get install -y git libgtk2.0-dev curl pkg-config autoconf automake libtool libavcodec-dev \
libavformat-dev libswscale-dev python-dev python-numpy libtbb2 libtbb-dev \
libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev unzip libcurl4-gnutls-dev zlib1g-dev
RUN apt-get install -y wget
RUN apt-get install -y vim
## CMAKE
ADD https://cmake.org/files/v3.13/cmake-3.13.0.tar.gz /
RUN tar xzvf /cmake-3.13.0.tar.gz -C / \
&& cd /cmake-3.13.0 \
&& ./bootstrap \
&& make -j15 \
&& make install
# protobuf 3.6.1
#
ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz /
RUN tar xzvf /protobuf-cpp-3.6.1.tar.gz -C /
RUN cd /protobuf-3.6.1 \
&& ./configure \
&& make -j15 \
&& make install \
&& ldconfig
# OpenCV 3.3.1: both archives are downloaded as /3.3.1.zip, so each one is
# unpacked before the next ADD overwrites it.
ADD https://github.com/opencv/opencv_contrib/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
ADD https://github.com/opencv/opencv/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
RUN cd /workspace/opencv-3.3.1 \
&& mkdir build \
&& cd build \
&& cmake -DBUILD_TIFF=ON \
-DBUILD_opencv_java=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_CUDA=ON \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_TESTS=OFF \
-DBUILD_opencv_cudacodec=ON \
# -DENABLE_FAST_MATH=1 \
# -DCUDA_FAST_MATH=1 \
-DWITH_CUBLAS=1 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
-DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-3.3.1/modules/ \
##
-DCUDA_ARCH_BIN='7.0' \
-DCUDA_ARCH_PTX="" \
##
## AVX in dispatch because not all machines have it
-DCPU_DISPATCH=AVX,AVX2 \
-DENABLE_PRECOMPILED_HEADERS=OFF \
-DWITH_OPENGL=OFF \
-DWITH_OPENCL=OFF \
-DWITH_QT=OFF \
-DWITH_NVCUVID=ON \
-DWITH_IPP=ON \
-DWITH_TBB=ON \
-DFORCE_VTK=ON \
-DWITH_EIGEN=ON \
-DWITH_V4L=ON \
-DWITH_XINE=ON \
-DWITH_GDAL=ON \
-DWITH_1394=OFF \
-DWITH_FFMPEG=OFF \
-DBUILD_PROTOBUF=ON \
-DBUILD_TESTS=OFF \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_opencv_xfeatures2d=OFF \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_INSTALL_PREFIX=/usr/local \
.. \
&& make -j15 \
&& make install \
&& rm /3.3.1.zip \
&& rm /cmake-3.13.0.tar.gz
RUN mkdir fast_od
RUN mkdir /home/boris
# tensorflow libraries
ADD tensorflow.tar /
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __cuda_h__
#define __cuda_h__
/**
* CUDA API version support
*/
#include "dynlink_cuda_cuda.h"
#endif //__cuda_h__
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
// Prepares the object for inference: initialises CUDA on first use and loads
// the class labels. Returns 0 on success, -1 when the labels cannot be read.
// NOTE(review): graph loading and video-reader creation are commented out below,
// so videoStream is unused and d_reader is not created by this function.
int InferenceBase::Init(string videoStream)
{
    // CUDA is initialised only once per object
    if (!isCudaInited)
        InitCuda();

    const int labelsResult = ReadClassLabels();
    if (labelsResult != 0)
    {
        LOG(ERROR) << "ReadClassLabels returned non-zero\n";
        return -1;
    }

    LOG(INFO) << "CUDA INIT DONE\n";
    /*
    if (ReadGraph() != 0)
    {
    LOG(ERROR) << "Could not load inference graph";
    return -1;
    }
    LOG(INFO) << "Inference graph loaded";
    // create video stream
    d_reader = GetVideoReader(videoStream);
    if (d_reader == nullptr)
    {
    LOG(ERROR) << "Could not create video stream";
    throw exception();
    }
    // save off frame dimensions
    auto formatStruct = d_reader->format();
    width = formatStruct.width;
    height = formatStruct.height;
    */

    isInitialized = true;
    return 0;
}
void InferenceBase::RunInferenceOnStream()
{
if (!isInitialized)
{
LOG(ERROR) << "Video streaming not initialized";
return;
}
cuda::GpuMat d_frame;
int iFrame = 0, nFrames = 30;
double fps = 0., infer_tf_ms = 0.;
high_resolution_clock::time_point start = high_resolution_clock::now();
high_resolution_clock::time_point end;
double duration = 0.;
for (;;)
{
start = high_resolution_clock::now();
if (!d_reader->nextFrame(d_frame))
{
break;
}
if (doInference(d_frame) != 0)
{
LOG(ERROR) << "Inference failed";
return;
}
end = high_resolution_clock::now();
duration += (double) duration_cast<milliseconds>(end - start).count();
visualize(d_frame, fps);
if (++iFrame % nFrames == 0)
{
fps = 1. * nFrames / duration * 1000.;
duration = 0.;
}
if (iFrame % 100 == 0)
{
LOG(INFO) << "Speed: " << to_string(fps).substr(0, 5);
}
}
}
#pragma once
#include "utils.h"
using namespace std;
// Common base for the inference back-ends: owns the label map, the video reader
// handle, the detection thresholds and the CUDA / initialisation flags.
// Derived classes supply graph loading, per-frame inference and visualisation.
class InferenceBase
{
private:
    bool isCudaInited;

    // Creates a cudacodec video reader for the given file.
    cv::Ptr<cv::cudacodec::VideoReader> GetVideoReader(string video_file)
    {return cv::cudacodec::createVideoReader(video_file);}

protected:
    string labelsFile;
    string graphFile;
    map<int, string> labelsMap;

    virtual int ReadClassLabels();
    virtual int ReadGraph() = 0;
    void InitCuda();

    cv::Ptr<cv::cudacodec::VideoReader> d_reader;
    double thresholdScore;
    double thresholdIOU;
    // frame width and height
    int height;
    int width;
    int debug;
    bool isInitialized;

public:
    // Initialisers are listed in member declaration order, which is the order
    // the language actually runs them in (avoids -Wreorder warnings).
    InferenceBase(const string &labelsFile, const string &graphFile, double threshScore, double threshIOU, int dbg)
        : isCudaInited(false)
        , labelsFile(labelsFile)
        , graphFile(graphFile)
        , labelsMap()
        , thresholdScore(threshScore)
        , thresholdIOU(threshIOU)
        , height(720)
        , width(1280)
        , debug(dbg)
        , isInitialized(false)
    {}

    virtual ~InferenceBase() {}

    void RunInferenceOnStream();
    virtual int doInference(cv::cuda::GpuMat&) = 0;
    virtual void visualize(cv::cuda::GpuMat&, double) = 0;
    virtual int Init(string video_stream);

    // Returns a copy of the label map; const so it can be called on const objects.
    map<int, string> get_labels_map() const {return labelsMap;}
    void set_debug(int dbg) {debug = dbg;}
};
#include "inference_tf.h"
using tensorflow::Status;
using tensorflow::Tensor;
using namespace cv;
using tensorflow::int32;
int InferenceTensorflow::ReadGraph()
{
LOG(INFO) << "graphFile:" << graphFile;
Status loadGraphStatus = loadGraph(graphFile, &session);
if (!loadGraphStatus.ok())
{
LOG(ERROR) << "loadGraph(): ERROR" << loadGraphStatus;
return -1;
}
else
LOG(INFO) << "loadGraph(): frozen graph loaded" << endl;
return 0;
}
// Runs the base initialisation, builds a callable that feeds the input layer
// from the GPU and fetches the output layers, and allocates the input tensor
// through a GPU allocator.
// Returns 0 on success; -1 when the base Init fails, when no session exists,
// or when the callable cannot be created.
int InferenceTensorflow::Init(string videoStream)
{
    if (InferenceBase::Init(videoStream) != 0)
    {
        LOG(INFO) << "Init(videostream) exit non-zero (aka huge fail)";
        return -1;
    }
    LOG(INFO) << "Init(videostream): PASS\n";
    LOG(INFO) << "The session must exist at this point, see loadGraph() in inference_base.cpp";

    // Everything below dereferences the session; stop cleanly if the graph was never loaded.
    if (!session)
    {
        LOG(ERROR) << "No TensorFlow session: the inference graph has not been loaded";
        return -1;
    }

    // configure callable options
    opts.add_feed(inputLayer);
    for (auto const &value : outputLayer)
    {
        opts.add_fetch(value);
    }
    const string gpu_device_name = GPUDeviceName(session.get());
    opts.clear_fetch_devices();
    opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});

    auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
    if (!runStatus.ok())
    {
        // Without a callable doInference() cannot work, so report failure
        // instead of continuing with an invalid handle.
        LOG(ERROR) << "Failed to make callable: " << runStatus;
        return -1;
    }

    LOG(INFO) << "Shape of the GPU tensor: (1, " << height << ", " << width << ", 3)\n";

    // allocate tensor on the GPU
    tensorflow::TensorShape shape = tensorflow::TensorShape({1, height, width, 3});
    tensorflow::PlatformGpuId platform_gpu_id(0);
    // NOTE(review): both allocators are created with new and never deleted in this
    // block; presumably they must outlive inputTensor — confirm ownership.
    tensorflow::GPUMemAllocator *sub_allocator =
        new tensorflow::GPUMemAllocator(
            tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
            platform_gpu_id, false /*use_unified_memory*/, {}, {});
    tensorflow::GPUBFCAllocator *allocator =
        new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");

    inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
    LOG(INFO) << "Is Cuda Tensor: " << IsCUDATensor(inputTensor);
    return 0;
}
// Copies the frame into inputTensor and executes the prepared callable,
// leaving the results in `outputs`.
// Returns 0 on success, -1 (after logging the status) when the run fails.
int InferenceTensorflow::doInference(cv::cuda::GpuMat &d_frame)
{
    readTensorFromGpuMat(d_frame, inputTensor);

    Status runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
    if (runStatus.ok())
    {
        return 0;
    }

    LOG(ERROR) << "Running model failed: " << runStatus;
    return -1;
}
// Reads the latest results from `outputs` (boxes, scores, classes, detection
// count), filters the boxes, and depending on the debug bit mask logs the kept
// detections (bit 0x1) and/or draws them on a downloaded copy of the frame (bit 0x2).
void InferenceTensorflow::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
    // Extract results from the outputs vector
    tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
    tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
    tensorflow::TTypes<float>::Flat numDetections = outputs[3].flat<float>();
    tensorflow::TTypes<float, 3>::Tensor boxes = outputs[0].flat_outer_dims<float, 3>();

    vector<size_t> goodIdxs = filterBoxes(scores, boxes, thresholdIOU, thresholdScore);

    const bool logDetections = (debug & 0x1) != 0;
    const bool drawDetections = (debug & 0x2) != 0;

    if (logDetections)
    {
        for (const size_t idx : goodIdxs)
        {
            LOG(INFO) << "score:" << scores(idx) << ",class:" << labelsMap[classes(idx)]
                      << " (" << classes(idx) << "), box:"
                      << "," << boxes(0, idx, 0) << ","
                      << boxes(0, idx, 1) << "," << boxes(0, idx, 2) << ","
                      << boxes(0, idx, 3);
        }
    }

    // Draw bboxes and captions
    if (drawDetections)
    {
        Mat frame;
        d_frame.download(frame);
        drawBoundingBoxesOnImage(frame, scores, classes, boxes, labelsMap, goodIdxs);

        Scalar color(255, 0, 255);
        drawFrameworkSignature(frame, fps, "Tensorflow", color);
    }
}
#pragma once
#include "inference_base.h"
using namespace std;
using tensorflow::CallableOptions;
using tensorflow::Tensor;
using tensorflow::Session;
// TensorFlow back-end: feeds frames to the graph's input layer through a
// callable and fetches the four detection outputs.
class InferenceTensorflow : public InferenceBase
{
private:
    const string inputLayer = "image_tensor:0";
    // The order fixes the indices used when reading `outputs`.
    const vector<string> outputLayer = {"detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0"};

    CallableOptions opts;
    std::unique_ptr<tensorflow::Session> session;
    Session::CallableHandle feed_gpu_fetch_cpu;

    // Allocate input tensor on the gpu
    Tensor inputTensor;
    vector<Tensor> outputs;

protected:
    int ReadGraph() override;
    int doInference(cv::cuda::GpuMat& d_frame) override;
    void visualize(cv::cuda::GpuMat &d_frame, double) override;

public:
    InferenceTensorflow(const string &labelsFile, const string &graphFile, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0)
        : InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
        , opts()
    { }

    int Init(string videoStream) override;

    // The session is only created by ReadGraph(); an object destroyed before that
    // (or after a failed load) has a null session, which must not be dereferenced.
    virtual ~InferenceTensorflow()
    {
        if (session)
        {
            session->ReleaseCallable(feed_gpu_fetch_cpu);
        }
    }
};
\ No newline at end of file
This diff is collapsed.
# Runs the detector on the sample video with the frozen inference graph.
# $1 is forwarded as the value of -d; it is quoted so a value containing
# spaces or glob characters stays a single argument.
./build/tf_detector_example -d="$1" \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/home/boris/model/frozen_inference_graph.pb \
-labels=/home/boris/model/mscoco_label_map.pbtxt
\ No newline at end of file
# Runs the detector on the sample video with the SSD .uff model and the -t flag
# (presumably the TensorRT path — confirm against the argument parser).
# $1 is forwarded as the value of -d; it is quoted so a value containing
# spaces or glob characters stays a single argument.
./build/tf_detector_example \
-d="$1" \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
# Runs the detector on the sample video with the SSD .uff model and the -i and -t
# flags (presumably INT8 + TensorRT — confirm against the argument parser).
# $1 is forwarded as the value of -d; it is quoted so a value containing
# spaces or glob characters stays a single argument.
./build/tf_detector_example \
-d="$1" \
-i \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
This diff is collapsed.
// The macro guard below is closed right after its #define, so on its own it
// does not stop the rest of this header from being processed twice.
// #pragma once provides the actual protection; the macro stays defined for any
// code that tests it.
#pragma once
#ifndef TF_DETECTOR_EXAMPLE_UTILS_H
#define TF_DETECTOR_EXAMPLE_UTILS_H
#endif //TF_DETECTOR_EXAMPLE_UTILS_H
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <map>
#include <unordered_map>
#include <math.h>
#include <regex>
#include <tuple>
#include <cassert>
#include <cublas_v2.h>
#include <cudnn.h>
#include <sstream>
#include <time.h>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "common.h"
#include "NvInferPlugin.h"
// Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
// Direct session
#include "tensorflow/core/common_runtime/direct_session.h"
#include <cv.hpp>
#include <opencv2/cudacodec.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
// CUDA includes. Order matters
#include <dynlink_nvcuvid.h>
#include "cuda_runtime_api.h"
using namespace std;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::Session;
using namespace nvinfer1;
using namespace nvuffparser;
string type2str(int type);
Status readLabelsMapFile(const string &fileName, std::map<int, string> &labelsMap);
Status loadGraph(const string &graph_file_name,
std::unique_ptr<tensorflow::Session> *session);
Status readTensorFromMat(const cv::Mat &mat, Tensor &outTensor);
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor);
void drawBoundingBoxOnImage(cv::Mat &image, double xMin, double yMin, double xMax, double yMax, double score, std::string label, bool scaled = true);
void drawBoundingBoxesOnImage(cv::Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
std::map<int, string> &labelsMap,
std::vector<size_t> &idxs);
void drawFrameworkSignature(cv::Mat& image, double fps, string signature, cv::Scalar& color);
double IOU(cv::Rect box1, cv::Rect box2);
std::vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore);
bool IsCUDATensor(const Tensor &t);
string GPUDeviceName(Session* session);
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat& img, IExecutionContext * context, vector<std::string>& CLASSES);
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8);
extern DetectionOutputParameters detectionOutputParam;
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName);
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElements, int rowSize);
extern const int OUTPUT_CLS_SIZE;
extern const int OUTPUT_BBOX_SIZE;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment