Commit a935f1f0 authored by fierval

fst commit
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
build/
debug/
.vs/
.vscode/
ssd/
*.tar
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, INPUT_C, INPUT_H, INPUT_W};
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
if(file)
{
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
}
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
cmake_minimum_required(VERSION 3.8)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops
find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES
main.cpp
utils.cpp
utils.h
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
inference_trt.cpp
channel_first.cu
)
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path
# This is Azure DLVM (not sure if DSVM is the same)
include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES}
cuda
cublas
nvinfer
nvToolsExt
nvparsers
nvinfer_plugin
nvonnxparser
${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS})
1
(Unnamed ITensor* 9): 3d418f1e
Input: 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d205fca
(Unnamed ITensor* 225): 3d368720
(Unnamed ITensor* 412): 3d418f1e
(Unnamed ITensor* 195): 3dafce6e
(Unnamed ITensor* 138): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_3a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 463): 3d418f1e
(Unnamed ITensor* 75): 3d2dcb21
(Unnamed ITensor* 157): 3d418f1e
BoxPredictor_3/ClassPredictor/BiasAdd: 3c8c8ef8
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_2a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 61): 3d418f1e
(Unnamed ITensor* 462): 3d3d75f1
(Unnamed ITensor* 156): 3d618943
(Unnamed ITensor* 24): 3d913052
(Unnamed ITensor* 32): 3d6533f9
(Unnamed ITensor* 83): 3d3ca52c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 118): 3d4aef99
(Unnamed ITensor* 485): 3d1d4f1e
BoxPredictor_4/BoxEncodingPredictor/BiasAdd: 3ca49bb9
(Unnamed ITensor* 84): 3d418f1e
(Unnamed ITensor* 160): 3d418f1e
BoxPredictor_5/ClassPredictor/BiasAdd: 3c773985
(Unnamed ITensor* 316): 3d63dc8a
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise: 3de7428e
(Unnamed ITensor* 90): 3d73f085
(Unnamed ITensor* 91): 3d418f1e
(Unnamed ITensor* 419): 3d418f1e
(Unnamed ITensor* 374): 3d59dbf2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3c8d1a
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_5_1x1_64/Relu6: 3d17eae6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 507): 3d418f1e
(Unnamed ITensor* 2): 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 112): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 126): 3d20913a
(Unnamed ITensor* 104): 3d80ab32
(Unnamed ITensor* 134): 3d8dd320
(Unnamed ITensor* 324): 3d418f1e
(Unnamed ITensor* 135): 3d418f1e
(Unnamed ITensor* 628): 3d9d9605
(Unnamed ITensor* 449): 3d418f1e
(Unnamed ITensor* 119): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 513): 3d5e275c
(Unnamed ITensor* 164): 3d946ceb
Squeeze_2: 3cc8bb82
(Unnamed ITensor* 167): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu6: 3d2d4927
(Unnamed ITensor* 541): 3d37a99c
(Unnamed ITensor* 143): 3d418f1e
(Unnamed ITensor* 240): 3d418f1e
(Unnamed ITensor* 150): 3d418f1e
(Unnamed ITensor* 165): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 310): 3d418f1e
(Unnamed ITensor* 260): 3d60aac4
(Unnamed ITensor* 405): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 105): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 382): 3d1e3cff
(Unnamed ITensor* 550): 3d418f1e
(Unnamed ITensor* 391): 3d418f1e
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_2_1x1_256/Relu6: 3d37a347
(Unnamed ITensor* 448): 3d6ab083
(Unnamed ITensor* 142): 3dd08cf3
(Unnamed ITensor* 595): 3d418f1e
BoxPredictor_1/ClassPredictor/BiasAdd: 3e194e24
concat_box_conf: 3e1bb222
(Unnamed ITensor* 594): 3d4ff643
(Unnamed ITensor* 602): 3d418f1e
BoxPredictor_5/Reshape_1: 3c773985
concat_box_loc: 3de14ea0
BoxPredictor_4/ClassPredictor/BiasAdd: 3ca5201c
Squeeze_4: 3ca49bb9
(Unnamed ITensor* 621): 3d418f1e
(Unnamed ITensor* 624): 3d17eae6
BoxPredictor_2/ClassPredictor/BiasAdd: 3e1ec6c2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/Relu6: 3d156ede
(Unnamed ITensor* 33): 3d418f1e
(Unnamed ITensor* 500): 3d418f1e
BoxPredictor_2/Reshape_1: 3e1ec6c2
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/Relu6: 3d418f1e
BoxPredictor_5/BoxEncodingPredictor/BiasAdd: 3cdbc092
GridAnchor_1: 3a500341
(Unnamed ITensor* 569): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 620): 3d17eae6
(Unnamed ITensor* 418): 3d91976a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 111): 3d85a99e
(Unnamed ITensor* 575): 3dc8e55f
(Unnamed ITensor* 601): 3d8b91c4
BoxPredictor_1/Reshape_1: 3e194e24
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu6: 3d433d97
(Unnamed ITensor* 545): 3d37a347
BoxPredictor_3/Reshape_1: 3c8c8ef8
(Unnamed ITensor* 347): 3d418f1e
(Unnamed ITensor* 568): 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 471): 3d418f1e
(Unnamed ITensor* 455): 3d500012
(Unnamed ITensor* 303): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/Relu6: 3d20913a
BoxPredictor_4/Reshape_1: 3ca5201c
GridAnchor_4 copy: 3c3aa18a
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_3_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/Relu6: 3d1e3cff
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
GridAnchor_5 copy: 3c2b37e3
(Unnamed ITensor* 331): 3d3ca1fe
NMS_1: 1
BoxPredictor_3/BoxEncodingPredictor/BiasAdd: 3cafbf65
(Unnamed ITensor* 188): 3dc61b5c
(Unnamed ITensor* 196): 3d418f1e
(Unnamed ITensor* 209): 3dc05776
GridAnchor_2 copy: 3c2c4ae8
(Unnamed ITensor* 367): 3d7bf53d
(Unnamed ITensor* 361): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/Relu6: 3d3ceddc
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3772e6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/Relu6: 3d31060c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 411): 3d836c20
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 18): 3d418f1e
(Unnamed ITensor* 390): 3d9a604f
(Unnamed ITensor* 346): 3d67b7ae
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
BoxPredictor_2/BoxEncodingPredictor/BiasAdd: 3cc8bb82
(Unnamed ITensor* 217): 3d4cf10e
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_2_3x3_s2_512/Relu6: 3d418f1e
(Unnamed ITensor* 233): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 542): 3d418f1e
(Unnamed ITensor* 67): 3d8e8123
(Unnamed ITensor* 247): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d1d88d2
(Unnamed ITensor* 302): 3daf4176
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 239): 3d4f00df
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 514): 3d418f1e
(Unnamed ITensor* 435): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 317): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 289): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 478): 3d3244f6
(Unnamed ITensor* 549): 3dbeda4a
(Unnamed ITensor* 261): 3d418f1e
(Unnamed ITensor* 492): 3d9e1645
(Unnamed ITensor* 441): 3d15c098
(Unnamed ITensor* 479): 3d418f1e
(Unnamed ITensor* 493): 3d418f1e
BoxPredictor_0/Reshape_1: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d258e36
(Unnamed ITensor* 339): 3d5f2411
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_4_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
Squeeze_1: 3d2f0384
GridAnchor: 3a4f5b62
(Unnamed ITensor* 368): 3d418f1e
Squeeze: 3df34968
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 375): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d18b9fa
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 253): 3d92390f
(Unnamed ITensor* 210): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu6: 3d4af27d
Squeeze_3: 3cafbf65
(Unnamed ITensor* 340): 3d418f1e
(Unnamed ITensor* 11): 3d418f1e
(Unnamed ITensor* 295): 3d9c64d4
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_3_1x1_128/Relu6: 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu6: 3d15c098
(Unnamed ITensor* 323): 3d5d9fd1
GridAnchor_4: 3c3aa18a
(Unnamed ITensor* 360): 3d88c0ec
(Unnamed ITensor* 25): 3d418f1e
(Unnamed ITensor* 288): 3d6b9ef7
(Unnamed ITensor* 226): 3d418f1e
(Unnamed ITensor* 456): 3d418f1e
(Unnamed ITensor* 46): 3d86ba82
BoxPredictor_0/BoxEncodingPredictor/BiasAdd: 3df34968
(Unnamed ITensor* 232): 3ddb36a3
(Unnamed ITensor* 521): 3cb42ac7
GridAnchor_3 copy: 3c348982
(Unnamed ITensor* 296): 3d418f1e
BoxPredictor_0/ClassPredictor/BiasAdd: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/Relu6: 3d2dcb21
(Unnamed ITensor* 202): 3d87a00a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 269): 3d418f1e
GridAnchor_3: 3c348982
(Unnamed ITensor* 218): 3d418f1e
(Unnamed ITensor* 203): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 486): 3d418f1e
(Unnamed ITensor* 268): 3d0e4f64
Squeeze_5: 3cdbc092
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 254): 3d418f1e
(Unnamed ITensor* 182): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/AvgPool_0a_3x3/AvgPool: 3cb90e57
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 175): 3d418f1e
(Unnamed ITensor* 98): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/AvgPool_0a_3x3/AvgPool: 3d04ebdf
(Unnamed ITensor* 354): 3d418f1e
(Unnamed ITensor* 181): 3d8ef349
(Unnamed ITensor* 353): 3d3ce1d6
(Unnamed ITensor* 174): 3d5b5745
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
GridAnchor_1 copy: 3a500341
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu6: 3cb42ac7
(Unnamed ITensor* 149): 3d869442
(Unnamed ITensor* 68): 3d418f1e
(Unnamed ITensor* 17): 3d9d3367
(Unnamed ITensor* 404): 3d9d92ab
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 309): 3d8ac690
BoxPredictor_1/BoxEncodingPredictor/BiasAdd: 3d2f0384
(Unnamed ITensor* 60): 3d74b08e
(Unnamed ITensor* 189): 3d418f1e
(Unnamed ITensor* 97): 3d3f7d2c
(Unnamed ITensor* 53): 3d7e3945
(Unnamed ITensor* 8): 3e350553
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
GridAnchor_5: 3c2b37e3
(Unnamed ITensor* 76): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 522): 3d418f1e
(Unnamed ITensor* 39): 3da0973e
(Unnamed ITensor* 127): 3d418f1e
(Unnamed ITensor* 54): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 576): 3d418f1e
(Unnamed ITensor* 332): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2b_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 47): 3d418f1e
(Unnamed ITensor* 40): 3d418f1e
(Unnamed ITensor* 246): 3d7bd2d9
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/Relu6: 3d418f1e
(Unnamed ITensor* 398): 3d418f1e
(Unnamed ITensor* 383): 3d418f1e
(Unnamed ITensor* 427): 3d541a3f
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 397): 3d523857
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu6: 3d333265
(Unnamed ITensor* 442): 3d418f1e
(Unnamed ITensor* 470): 3d71a2ed
(Unnamed ITensor* 499): 3d2d4927
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu6: 3d3d75f1
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu6: 3d1d4f1e
(Unnamed ITensor* 434): 3d439787
(Unnamed ITensor* 629): 3d418f1e
(Unnamed ITensor* 506): 3d74b7dd
(Unnamed ITensor* 428): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
GridAnchor_2: 3c2c4ae8
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
NMS: 3da1a245
GridAnchor copy: 3a4f5b62
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_4_1x1_128/Relu6: 3d418f1e
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Optimized Video Object Detection
The completed application runs any [Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md)-style object detector in TensorFlow mode (the default), and, in TensorRT mode (`-t`), an Inception V2 SSD detector converted from a TensorFlow graph to the UFF format recognized by TensorRT.
## Building the app
* Clone the [repo](https://github.com/fierval/fast_od).
* Get the frozen graph and class labels files for TensorFlow from [here](https://github.com/fierval/tensorflow-object-detection-cpp/tree/master/demo/ssd_inception_v2).
* Get the [frozen graph for TensorRT](https://www.dropbox.com/s/nc3tzm95ip356i5/sample_ssd_relu6.uff?dl=0). The class labels file should be available in the `/usr/src/tensorrt/data/ssd` directory.
* Build:
```sh
mkdir build
cd build
cmake ..   # or: cmake -DCMAKE_BUILD_TYPE=Debug ..
make
```
## Running
Command line options are described in [`main.cpp`](https://github.com/fierval/fast_od/blob/master/main.cpp):
```cpp
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8|false| use INT8 (requires callibration)}"
"{v video | | video for detection}"
"{graph ||frozen graph location}"
"{labels ||trained labels filelocation}";
```
Examples are in the `run_*.sh` files in the source directory. Worth mentioning:
```
-d=0    run without UX, print out the framerate only (-d=2 runs with UX)
-t      TensorRT graph
-t -i   TensorRT graph with INT8 precision
```
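A typical TensorRT-mode invocation (a sketch only; the video path and the labels file name below are placeholders, substitute your own):
```sh
./tf_detector_example -t -d=0 \
  -v=/path/to/video.mp4 \
  --graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
  --labels=/usr/src/tensorrt/data/ssd/labels.txt
```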
## Slowdown due to UX
The application uses a bare-bones OpenCV UI (`imshow`) for visual feedback, which causes a significant performance hit; to measure actual performance, run with `-d=0`, which suppresses the UI.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! \brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If the return value is true, execution can continue; otherwise the program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu:", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
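// Illustrative call site (a sketch, not part of the original sources): parse the
// arguments once at startup and bail out on --help or on a parse failure.
//
// samplesCommon::Args args;
// if (!samplesCommon::parseArgs(args, argc, argv) || args.help)
// {
// // print usage and exit
// }
// bool runInInt8 = args.runInInt8;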
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
#include <cuda_runtime.h>
const int BLOCK_SIZE = 1024;
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
// guard the partial last block: without this, out-of-range threads write past the end of dest
if (idx >= channelSize * channelsNum)
return;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding (rowSize is the pitched row length)
int sourceIdx = row * rowSize + col;
// scale to [-1, 1] while transposing HWC -> CHW
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0f / 255.0f) - 1.0f;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
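// Illustrative host-side call (a sketch, not part of the original sources), assuming
// `frame` is a cv::cuda::GpuMat holding an 8UC3 image already resident on the device
// and `deviceInput` is a float buffer of 3 * rows * cols elements:
//
// int channelSize = frame.rows * frame.cols; // pixels per channel
// int channelsNum = frame.channels(); // 3 for BGR
// int rowElems = frame.cols * channelsNum; // payload bytes per row, no padding
// int rowSize = static_cast<int>(frame.step); // pitched bytes per row, with padding
// channelFirst(frame.data, deviceInput, channelSize, channelsNum, rowElems, rowSize);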
#ifndef TENSORRT_COMMON_H
#define TENSORRT_COMMON_H
#include "NvInfer.h"
#include "NvInferPlugin.h"
// ONNX is not supported on Windows
#ifndef _MSC_VER
#include "NvOnnxConfig.h"
#include "NvOnnxParser.h"
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstring>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ratio>
#include <string>
#include <utility>
#include <vector>
using namespace std;
using namespace nvinfer1;
using namespace plugin;
#define CHECK_TRT(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cout << "Cuda failure: " << ret; \
abort(); \
} \
} while (0)
constexpr long double operator"" _GB(long double val)
{
return val * (1 << 30);
}
constexpr long double operator"" _MB(long double val) { return val * (1 << 20); }
constexpr long double operator"" _KB(long double val) { return val * (1 << 10); }
// These are necessary if we want to be able to write 1_GB instead of 1.0_GB.
// Since the return type is signed, -1_GB will work as expected.
constexpr long long int operator"" _GB(long long unsigned int val) { return val * (1 << 30); }
constexpr long long int operator"" _MB(long long unsigned int val) { return val * (1 << 20); }
constexpr long long int operator"" _KB(long long unsigned int val) { return val * (1 << 10); }
// Logger for TensorRT info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity)
{
}
void log(Severity severity, const char* msg) override
{
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity)
{
case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
case Severity::kERROR: std::cerr << "ERROR: "; break;
case Severity::kWARNING: std::cerr << "WARNING: "; break;
case Severity::kINFO: std::cerr << "INFO: "; break;
default: std::cerr << "UNKNOWN: "; break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
struct SimpleProfiler : public nvinfer1::IProfiler
{
struct Record
{
float time{0};
int count{0};
};
virtual void reportLayerTime(const char* layerName, float ms)
{
mProfile[layerName].count++;
mProfile[layerName].time += ms;
}
SimpleProfiler(
const char* name,
const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
: mName(name)
{
for (const auto& srcProfiler : srcProfilers)
{
for (const auto& rec : srcProfiler.mProfile)
{
auto it = mProfile.find(rec.first);
if (it == mProfile.end())
{
mProfile.insert(rec);
}
else
{
it->second.time += rec.second.time;
it->second.count += rec.second.count;
}
}
}
}
friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
{
out << "========== " << value.mName << " profile ==========" << std::endl;
float totalTime = 0;
std::string layerNameStr = "TensorRT layer name";
int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
for (const auto& elem : value.mProfile)
{
totalTime += elem.second.time;
maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
}
auto old_settings = out.flags();
auto old_precision = out.precision();
// Output header
{
out << std::setw(maxLayerNameLength) << layerNameStr << " ";
out << std::setw(12) << "Runtime, "
<< "%"
<< " ";
out << std::setw(12) << "Invocations"
<< " ";
out << std::setw(12) << "Runtime, ms" << std::endl;
}
for (const auto& elem : value.mProfile)
{
out << std::setw(maxLayerNameLength) << elem.first << " ";
out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.second.time * 100.0F / totalTime) << "%"
<< " ";
out << std::setw(12) << elem.second.count << " ";
out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << std::endl;
}
out.flags(old_settings);
out.precision(old_precision);
out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl;
return out;
}
private:
std::string mName;
std::map<std::string, Record> mProfile;
};
// Locate path to file, given its filename or filepath suffix and possible dirs it might lie in
// Function will also walk back MAX_DEPTH dirs from CWD to check for such a file path
inline std::string locateFile(const std::string& filepathSuffix, const std::vector<std::string>& directories)
{
const int MAX_DEPTH{10};
bool found{false};
std::string filepath;
for (auto& dir : directories)
{
filepath = dir + filepathSuffix;
for (int i = 0; i < MAX_DEPTH && !found; i++)
{
std::ifstream checkFile(filepath);
found = checkFile.is_open();
if (found)
break;
filepath = "../" + filepath; // Try again in parent dir
}
if (found)
{
break;
}
filepath.clear();
}
if (filepath.empty())
{
std::string directoryList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
[](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << directoryList << std::endl;
exit(EXIT_FAILURE);
}
return filepath;
}
inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW)
{
std::ifstream infile(fileName, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
std::string magic, h, w, max;
infile >> magic >> h >> w >> max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(buffer), inH * inW);
}
namespace samplesCommon
{
inline void* safeCudaMalloc(size_t memSize)
{
void* deviceMem;
CHECK_TRT(cudaMalloc(&deviceMem, memSize));
if (deviceMem == nullptr)
{
std::cerr << "Out of memory" << std::endl;
exit(1);
}
return deviceMem;
}
inline bool isDebug()
{
return (std::getenv("TENSORRT_DEBUG") ? true : false);
}
struct InferDeleter
{
template <typename T>
void operator()(T* obj) const
{
if (obj)
{
obj->destroy();
}
}
};
template <typename T>
inline std::shared_ptr<T> infer_object(T* obj)
{
if (!obj)
{
throw std::runtime_error("Failed to create object");
}
return std::shared_ptr<T>(obj, InferDeleter());
}
template <class Iter>
inline std::vector<size_t> argsort(Iter begin, Iter end, bool reverse = false)
{
std::vector<size_t> inds(end - begin);
std::iota(inds.begin(), inds.end(), 0);
if (reverse)
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i2] < begin[i1];
});
}
else
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i1] < begin[i2];
});
}
return inds;
}
inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << endl;
return false;
}
std::string line;
while (std::getline(infile, line))
{
if (line.empty())
continue;
refVector.push_back(line);
}
infile.close();
return true;
}
template <typename result_vector_t>
inline std::vector<std::string> classify(const vector<string>& refVector, const result_vector_t& output, const size_t topK)
{
auto inds = samplesCommon::argsort(output.cbegin(), output.cend(), true);
std::vector<std::string> result;
for (size_t k = 0; k < topK; ++k)
{
result.push_back(refVector[inds[k]]);
}
return result;
}
//...LG returns top K indices, not values.
template <typename T>
inline vector<size_t> topK(const vector<T> inp, const size_t k)
{
vector<size_t> result;
std::vector<size_t> inds = samplesCommon::argsort(inp.cbegin(), inp.cend(), true);
result.assign(inds.begin(), inds.begin() + k);
return result;
}
template <typename T>
inline bool readASCIIFile(const string& fileName, const size_t size, vector<T>& out)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << endl;
return false;
}
out.clear();
out.reserve(size);
out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
infile.close();
return true;
}
template <typename T>
inline bool writeASCIIFile(const string& fileName, const vector<T>& in)
{
std::ofstream outfile(fileName);
if (!outfile.is_open())
{
cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << endl;
return false;
}
for (auto fn : in)
{
outfile << fn << " ";
}
outfile.close();
return true;
}
inline void print_version()
{
//... This can be only done after statically linking this support into parserONNX.library
#if 0
std::cout << "Parser built against:" << std::endl;
std::cout << " ONNX IR version: " << nvonnxparser::onnx_ir_version_string(onnx::IR_VERSION) << std::endl;
#endif
std::cout << " TensorRT version: "
<< NV_TENSORRT_MAJOR << "."
<< NV_TENSORRT_MINOR << "."
<< NV_TENSORRT_PATCH << "."
<< NV_TENSORRT_BUILD << std::endl;
}
inline string getFileType(const string& filepath)
{
return filepath.substr(filepath.find_last_of(".") + 1);
}
inline string toLower(const string& inp)
{
string out = inp;
std::transform(out.begin(), out.end(), out.begin(), ::tolower);
return out;
}
inline void enableDLA(IBuilder* b, int useDLACore)
{
if (useDLACore >= 0)
{
b->allowGPUFallback(true);
b->setFp16Mode(true);
b->setDefaultDeviceType(DeviceType::kDLA);
b->setDLACore(useDLACore);
}
}
inline int parseDLA(int argc, char** argv)
{
for (int i = 1; i < argc; i++)
{
std::string arg(argv[i]);
if (strncmp(argv[i], "--useDLACore=", 13) == 0)
return stoi(argv[i] + 13);
}
return -1;
}
inline unsigned int getElementSize(nvinfer1::DataType t)
{
switch (t)
{
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT8: return 1;
}
throw std::runtime_error("Invalid DataType.");
return 0;
}
inline int64_t volume(const nvinfer1::Dims& d)
{
return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
}
template <int C, int H, int W>
struct PPM
{
std::string magic, fileName;
int h, w, max;
uint8_t buffer[C * H * W];
};
struct BBox
{
float x1, y1, x2, y2;
};
template <int C, int H, int W>
inline void readPPMFile(const std::string& filename, samplesCommon::PPM<C, H, W>& ppm)
{
ppm.fileName = filename;
std::ifstream infile(filename, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
template <int C, int H, int W>
inline void writePPMFileWithBBox(const std::string& filename, PPM<C, H, W>& ppm, const BBox& bbox)
{
std::ofstream outfile("./" + filename, std::ofstream::binary);
assert(!outfile.fail());
outfile << "P6"
<< "\n"
<< ppm.w << " " << ppm.h << "\n"
<< ppm.max << "\n";
auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); };
const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1);
const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1);
const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1);
const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1);
for (int x = x1; x <= x2; ++x)
{
// bbox top border
ppm.buffer[(y1 * ppm.w + x) * 3] = 255;
ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0;
// bbox bottom border
ppm.buffer[(y2 * ppm.w + x) * 3] = 255;
ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0;
}
for (int y = y1; y <= y2; ++y)
{
// bbox left border
ppm.buffer[(y * ppm.w + x1) * 3] = 255;
ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0;
// bbox right border
ppm.buffer[(y * ppm.w + x2) * 3] = 255;
ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0;
}
outfile.write(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
class TimerBase
{
public:
virtual void start() {}
virtual void stop() {}
float microseconds() const noexcept { return mMs * 1000.f; }
float milliseconds() const noexcept { return mMs; }
float seconds() const noexcept { return mMs / 1000.f; }
void reset() noexcept { mMs = 0.f; }
protected:
float mMs{0.0f};
};
class GpuTimer : public TimerBase
{
public:
GpuTimer(cudaStream_t stream)
: mStream(stream)
{
CHECK_TRT(cudaEventCreate(&mStart));
CHECK_TRT(cudaEventCreate(&mStop));
}
~GpuTimer()
{
CHECK_TRT(cudaEventDestroy(mStart));
CHECK_TRT(cudaEventDestroy(mStop));
}
void start() { CHECK_TRT(cudaEventRecord(mStart, mStream)); }
void stop()
{
CHECK_TRT(cudaEventRecord(mStop, mStream));
float ms{0.0f};
CHECK_TRT(cudaEventSynchronize(mStop));
CHECK_TRT(cudaEventElapsedTime(&ms, mStart, mStop));
mMs += ms;
}
private:
cudaEvent_t mStart, mStop;
cudaStream_t mStream;
}; // class GpuTimer
template <typename Clock>
class CpuTimer : public TimerBase
{
public:
using clock_type = Clock;
void start() { mStart = Clock::now(); }
void stop()
{
mStop = Clock::now();
mMs += std::chrono::duration<float, std::milli>{mStop - mStart}.count();
}
private:
std::chrono::time_point<Clock> mStart, mStop;
}; // class CpuTimer
using PreciseCpuTimer = CpuTimer<std::chrono::high_resolution_clock>;
} // namespace samplesCommon
#endif // TENSORRT_COMMON_H
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
/*
* Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <stdio.h>
#include "cuda_runtime_api.h"
#include "dynlink_nvcuvid.h"
tcuvidCreateVideoSource *cuvidCreateVideoSource;
tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
tcuvidCreateVideoParser *cuvidCreateVideoParser;
tcuvidParseVideoData *cuvidParseVideoData;
tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
tcuvidCreateDecoder *cuvidCreateDecoder;
tcuvidDestroyDecoder *cuvidDestroyDecoder;
tcuvidDecodePicture *cuvidDecodePicture;
tcuvidMapVideoFrame *cuvidMapVideoFrame;
tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
//tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
tcuvidCtxLockCreate *cuvidCtxLockCreate;
tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
tcuvidCtxLock *cuvidCtxLock;
tcuvidCtxUnlock *cuvidCtxUnlock;
// Auto-lock helper for C++ applications
CCtxAutoLock::CCtxAutoLock(CUvideoctxlock ctx)
: m_ctx(ctx)
{
cuvidCtxLock(m_ctx, 0);
}
CCtxAutoLock::~CCtxAutoLock()
{
cuvidCtxUnlock(m_ctx, 0);
}
#define STRINGIFY(X) #X
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#include <Windows.h>
#ifdef UNICODE
static LPCWSTR __DriverLibName = L"nvcuvid.dll";
#else
static LPCSTR __DriverLibName = "nvcuvid.dll";
#endif
typedef HMODULE DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = LoadLibrary(__DriverLibName);
if (*pInstance == NULL)
{
printf("LoadLibrary \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)GetProcAddress(DriverLib, STRINGIFY(name##_v2));\
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#elif defined(__unix__) || defined(__APPLE__) || defined(__MACOSX)
#include <dlfcn.h>
static char __DriverLibName[] = "libnvcuvid.so";
typedef void *DLLDRIVER;
static CUresult LOAD_LIBRARY(DLLDRIVER *pInstance)
{
*pInstance = dlopen(__DriverLibName, RTLD_NOW);
if (*pInstance == NULL)
{
printf("dlopen \"%s\" failed!\n", __DriverLibName);
return CUDA_ERROR_UNKNOWN;
}
return CUDA_SUCCESS;
}
#define GET_PROC_EX(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, #name); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
#name, __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#define GET_PROC_EX_V2(name, alias, required) \
alias = (t##name *)dlsym(DriverLib, STRINGIFY(name##_v2)); \
if (alias == NULL && required) { \
printf("Failed to find required function \"%s\" in %s\n", \
STRINGIFY(name##_v2), __DriverLibName); \
return CUDA_ERROR_UNKNOWN; \
}
#else
#error unsupported platform
#endif
#define CHECKED_CALL(call) \
do { \
CUresult result = (call); \
if (CUDA_SUCCESS != result) { \
return result; \
} \
} while(0)
#define GET_PROC_REQUIRED(name) GET_PROC_EX(name,name,1)
#define GET_PROC_OPTIONAL(name) GET_PROC_EX(name,name,0)
#define GET_PROC(name) GET_PROC_REQUIRED(name)
#define GET_PROC_V2(name) GET_PROC_EX_V2(name,name,1)
CUresult CUDAAPI cuvidInit(unsigned int Flags)
{
DLLDRIVER DriverLib;
CHECKED_CALL(LOAD_LIBRARY(&DriverLib));
// fetch all function pointers
GET_PROC(cuvidCreateVideoSource);
GET_PROC(cuvidCreateVideoSourceW);
GET_PROC(cuvidDestroyVideoSource);
GET_PROC(cuvidSetVideoSourceState);
GET_PROC(cuvidGetVideoSourceState);
GET_PROC(cuvidGetSourceVideoFormat);
GET_PROC(cuvidGetSourceAudioFormat);
GET_PROC(cuvidCreateVideoParser);
GET_PROC(cuvidParseVideoData);
GET_PROC(cuvidDestroyVideoParser);
GET_PROC(cuvidCreateDecoder);
GET_PROC(cuvidDestroyDecoder);
GET_PROC(cuvidDecodePicture);
#if defined(WIN64) || defined(_WIN64) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
GET_PROC(cuvidMapVideoFrame64);
GET_PROC(cuvidUnmapVideoFrame64);
cuvidMapVideoFrame = cuvidMapVideoFrame64;
cuvidUnmapVideoFrame = cuvidUnmapVideoFrame64;
#else
GET_PROC(cuvidMapVideoFrame);
GET_PROC(cuvidUnmapVideoFrame);
#endif
// GET_PROC(cuvidGetVideoFrameSurface);
GET_PROC(cuvidCtxLockCreate);
GET_PROC(cuvidCtxLockDestroy);
GET_PROC(cuvidCtxLock);
GET_PROC(cuvidCtxUnlock);
return CUDA_SUCCESS;
}
#include "utils.h"
using namespace cv::cuda;
const char *INPUT_BLOB_NAME = "Input";
static Logger gLogger;
// TODO: refactor once done
static bool globalRunInInt8 = false;
#define RETURN_AND_LOG(ret, severity, message) \
do \
{ \
std::string error_message = "sample_uff_ssd: " + std::string(message); \
gLogger.log(ILogger::Severity::k##severity, error_message.c_str()); \
return (ret); \
} while (0)
const int OUTPUT_CLS_SIZE = 91;
const int OUTPUT_BBOX_SIZE = OUTPUT_CLS_SIZE * 4;
const char *OUTPUT_BLOB_NAME0 = "NMS";
//INT8 Calibration, currently set to calibrate over 500 images (CAL_BATCH_SIZE * NB_CAL_BATCHES = 50 * 10)
static constexpr int CAL_BATCH_SIZE = 50;
static constexpr int FIRST_CAL_BATCH = 0, NB_CAL_BATCHES = 10;
// Concat layers
// mbox_priorbox, mbox_loc, mbox_conf
const int concatAxis[2] = {1, 1};
const bool ignoreBatch[2] = {false, false};
DetectionOutputParameters detectionOutputParam{true, false, 0, OUTPUT_CLS_SIZE, 100, 100, 0.5, 0.6, CodeTypeSSD::TF_CENTER, {0, 2, 1}, true, true};
// Visualization
const float visualizeThreshold = 0.5;
void printOutput(int64_t eltCount, DataType dtype, void *buffer)
{
std::cout << eltCount << " eltCount" << std::endl;
assert(samplesCommon::getElementSize(dtype) == sizeof(float));
std::cout << "--- OUTPUT ---" << std::endl;
size_t memSize = eltCount * samplesCommon::getElementSize(dtype);
float *outputs = new float[eltCount];
CHECK_TRT(cudaMemcpyAsync(outputs, buffer, memSize, cudaMemcpyDeviceToHost));
int maxIdx = std::distance(outputs, std::max_element(outputs, outputs + eltCount));
for (int64_t eltIdx = 0; eltIdx < eltCount; ++eltIdx)
{
std::cout << eltIdx << " => " << outputs[eltIdx] << "\t : ";
if (eltIdx == maxIdx)
std::cout << "***";
std::cout << "\n";
}
std::cout << std::endl;
delete[] outputs;
}
std::string locateFile(const std::string &input)
{
std::vector<std::string> dirs{"data/ssd/",
"data/ssd/VOC2007/",
"data/ssd/VOC2007/PPMImages/",
"data/samples/ssd/",
"data/samples/ssd/VOC2007/",
"data/samples/ssd/VOC2007/PPMImages/"};
return locateFile(input, dirs);
}
void populateTFInputData(float *data)
{
auto graphFileName = locateFile("inp_bus.txt");
std::ifstream labelFile(graphFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
istringstream iss(line);
float num;
iss >> num;
data[id++] = num;
}
return;
}
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName)
{
std::ifstream labelFile(labelFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
CLASSES.push_back(line);
}
return;
}
std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine &engine, int nbBindings, int batchSize)
{
std::vector<std::pair<int64_t, DataType>> sizes;
for (int i = 0; i < nbBindings; ++i)
{
Dims dims = engine.getBindingDimensions(i);
DataType dtype = engine.getBindingDataType(i);
int64_t eltCount = samplesCommon::volume(dims) * batchSize;
sizes.push_back(std::make_pair(eltCount, dtype));
}
return sizes;
}
ICudaEngine *loadModelAndCreateEngine(const char *uffFile, int maxBatchSize,
IUffParser *parser, IInt8Calibrator *calibrator, IHostMemory *&trtModelStream, bool isInt8)
{
// Create the builder
IBuilder *builder = createInferBuilder(gLogger);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition *network = builder->createNetwork();
std::cout << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
std::cout << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(1_GB); // We need about 1GB of scratch space for the plugin layer for batch size 5.
builder->setHalf2Mode(false);
if (isInt8)
{
builder->setInt8Mode(true);
builder->setInt8Calibrator(calibrator);
}
std::cout << "Begin building engine..." << std::endl;
ICudaEngine *engine = builder->buildCudaEngine(*network);
if (!engine)
RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");
std::cout << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
// Serialize the engine, then close everything down.
trtModelStream = engine->serialize();
builder->destroy();
shutdownProtobufLibrary();
return engine;
}
void doInference(IExecutionContext &context, float *inputData, float *detectionOut, int *keepCount, int batchSize)
{
const ICudaEngine &engine = context.getEngine();
// Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings() of these,
// but in this case we know that there is exactly 1 input and 2 outputs.
int nbBindings = engine.getNbBindings();
std::vector<void *> buffers(nbBindings);
std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings().
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);
for (int i = 0; i < nbBindings; ++i)
{
// inputData is already allocated on the device
if (i == inputIndex)
{
continue;
}
auto bufferSizesOutput = buffersSizes[i];
buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
}
cudaStream_t stream;
CHECK_TRT(cudaStreamCreate(&stream));
// make sure the data we are about to use is allocated on the GPU
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, inputData);
#if CUDART_VERSION >= 10000
assert(err != cudaErrorInvalidValue && attributes.type == cudaMemoryTypeDevice);
#else
assert(err != cudaErrorInvalidValue && attributes.memoryType == cudaMemoryTypeDevice);
#endif
buffers[inputIndex] = inputData;
auto t_start = std::chrono::high_resolution_clock::now();
context.execute(batchSize, &buffers[0]);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
//std::cout << "Time taken for inference is " << total << " ms." << std::endl;
for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
{
if (engine.bindingIsInput(bindingIdx))
continue;
#ifdef SSD_INT8_DEBUG
auto bufferSizesOutput = buffersSizes[bindingIdx];
printOutput(bufferSizesOutput.first, bufferSizesOutput.second,
buffers[bindingIdx]);
#endif
}
CHECK_TRT(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_TRT(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
CHECK_TRT(cudaFree(buffers[inputIndex]));
CHECK_TRT(cudaFree(buffers[outputIndex0]));
CHECK_TRT(cudaFree(buffers[outputIndex1]));
}
class FlattenConcat : public IPluginV2
{
public:
FlattenConcat(int concatAxis, bool ignoreBatch)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
{
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
}
//clone constructor
FlattenConcat(int concatAxis, bool ignoreBatch, int numInputs, int outputConcatAxis, int* inputConcatAxis)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
, mOutputConcatAxis(outputConcatAxis)
, mNumInputs(numInputs)
{
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
for (int i = 0; i < mNumInputs; ++i)
mInputConcatAxis[i] = inputConcatAxis[i];
}
FlattenConcat(const void* data, size_t length)
{
const char *d = reinterpret_cast<const char*>(data), *a = d;
mIgnoreBatch = read<bool>(d);
mConcatAxisID = read<int>(d);
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
mOutputConcatAxis = read<int>(d);
mNumInputs = read<int>(d);
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
CHECK_TRT(cudaMallocHost((void**) &mCopySize, mNumInputs * sizeof(size_t)));
std::for_each(mInputConcatAxis, mInputConcatAxis + mNumInputs, [&](int& inp) { inp = read<int>(d); });
mCHW = read<nvinfer1::DimsCHW>(d);
std::for_each(mCopySize, mCopySize + mNumInputs, [&](size_t& inp) { inp = read<size_t>(d); });
assert(d == a + length);
}
~FlattenConcat()
{
if (mInputConcatAxis)
CHECK_TRT(cudaFreeHost(mInputConcatAxis));
if (mCopySize)
CHECK_TRT(cudaFreeHost(mCopySize));
}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims >= 1);
assert(index == 0);
mNumInputs = nbInputDims;
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
mOutputConcatAxis = 0;
#ifdef SSD_INT8_DEBUG
std::cout << " Concat nbInputs " << nbInputDims << "\n";
std::cout << " Concat axis " << mConcatAxisID << "\n";
for (int i = 0; i < 6; ++i)
for (int j = 0; j < 3; ++j)
std::cout << " Concat InputDims[" << i << "]"
<< "d[" << j << " is " << inputs[i].d[j] << "\n";
#endif
for (int i = 0; i < nbInputDims; ++i)
{
int flattenInput = 0;
assert(inputs[i].nbDims == 3);
if (mConcatAxisID != 1)
assert(inputs[i].d[0] == inputs[0].d[0]);
if (mConcatAxisID != 2)
assert(inputs[i].d[1] == inputs[0].d[1]);
if (mConcatAxisID != 3)
assert(inputs[i].d[2] == inputs[0].d[2]);
flattenInput = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mInputConcatAxis[i] = flattenInput;
mOutputConcatAxis += mInputConcatAxis[i];
}
return DimsCHW(mConcatAxisID == 1 ? mOutputConcatAxis : 1,
mConcatAxisID == 2 ? mOutputConcatAxis : 1,
mConcatAxisID == 3 ? mOutputConcatAxis : 1);
}
int initialize() override
{
CHECK_TRT(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
CHECK_TRT(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
int numConcats = 1;
assert(mConcatAxisID != 0);
numConcats = std::accumulate(mCHW.d, mCHW.d + mConcatAxisID - 1, 1, std::multiplies<int>());
if (!mIgnoreBatch)
numConcats *= batchSize;
float* output = reinterpret_cast<float*>(outputs[0]);
int offset = 0;
for (int i = 0; i < mNumInputs; ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
float* inputTemp;
CHECK_TRT(cudaMalloc(&inputTemp, mCopySize[i] * batchSize));
CHECK_TRT(cudaMemcpyAsync(inputTemp, input, mCopySize[i] * batchSize, cudaMemcpyDeviceToDevice, stream));
for (int n = 0; n < numConcats; ++n)
{
CHECK_TRT(cublasScopy(mCublas, mInputConcatAxis[i],
inputTemp + n * mInputConcatAxis[i], 1,
output + (n * mOutputConcatAxis + offset), 1));
}
CHECK_TRT(cudaFree(inputTemp));
offset += mInputConcatAxis[i];
}
return 0;
}
size_t getSerializationSize() const override
{
return sizeof(bool) + sizeof(int) * (3 + mNumInputs) + sizeof(nvinfer1::Dims) + (sizeof(size_t) * mNumInputs);
}
void serialize(void* buffer) const override
{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mIgnoreBatch);
write(d, mConcatAxisID);
write(d, mOutputConcatAxis);
write(d, mNumInputs);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mInputConcatAxis[i]);
}
write(d, mCHW);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mCopySize[i]);
}
assert(d == a + getSerializationSize());
}
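// Descriptive note (added): the byte layout produced by serialize() and consumed by the
// deserializing constructor is, in order:
//   bool   mIgnoreBatch
//   int    mConcatAxisID, mOutputConcatAxis, mNumInputs
//   int    mInputConcatAxis[mNumInputs]
//   Dims   mCHW
//   size_t mCopySize[mNumInputs]
// getSerializationSize() must stay in sync with this layout.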
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
mCHW = inputs[0];
assert(inputs[0].nbDims == 3);
CHECK_TRT(cudaMallocHost((void**) &mCopySize, nbInputs * sizeof(size_t)));
for (int i = 0; i < nbInputs; ++i)
{
mCopySize[i] = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2] * sizeof(float);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return "FlattenConcat_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const override
{
return new FlattenConcat(mConcatAxisID, mIgnoreBatch, mNumInputs, mOutputConcatAxis, mInputConcatAxis);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
size_t* mCopySize = nullptr;
bool mIgnoreBatch{false};
int mConcatAxisID{0}, mOutputConcatAxis{0}, mNumInputs{0};
int* mInputConcatAxis = nullptr;
nvinfer1::Dims mCHW;
cublasHandle_t mCublas;
std::string mNamespace;
};
namespace
{
const char *FLATTENCONCAT_PLUGIN_VERSION{"1"};
const char *FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
} // namespace
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("axis", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("ignoreBatch", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "axis"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mConcatAxisID = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "ignoreBatch"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mIgnoreBatch = *(static_cast<const int*>(fields[i].data)) != 0;
}
}
return new FlattenConcat(mConcatAxisID, mIgnoreBatch);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//This object will be deleted when the network is destroyed, which will
//call FlattenConcat::destroy()
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
static PluginFieldCollection mFC;
bool mIgnoreBatch{false};
int mConcatAxisID;
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);
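// Illustrative sketch (not part of the original sample): once REGISTER_TENSORRT_PLUGIN has run,
// the creator can be looked up by name/version in the global plugin registry and used to build
// a FlattenConcat instance, e.g. when assembling a network by hand. The field values below
// (axis = 1, ignoreBatch = 0) are assumptions for illustration only.
static IPluginV2* createFlattenConcatFromRegistry()
{
int axis = 1;
int ignoreBatch = 0;
std::vector<PluginField> fields{
PluginField{"axis", &axis, PluginFieldType::kINT32, 1},
PluginField{"ignoreBatch", &ignoreBatch, PluginFieldType::kINT32, 1}};
PluginFieldCollection fc;
fc.nbFields = static_cast<int>(fields.size());
fc.fields = fields.data();
auto creator = getPluginRegistry()->getPluginCreator(FLATTENCONCAT_PLUGIN_NAME, FLATTENCONCAT_PLUGIN_VERSION);
return creator ? creator->createPlugin("FlattenConcat_TRT", &fc) : nullptr;
}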
// 1. convert image to the right size
// 2. convert to float
// 3. normalize for inception
// 4. convert to flat vector, channels first
float * normalize_for_trt(const cv::cuda::GpuMat &img)
{
cv::Size size(INPUT_W, INPUT_H);
cv::cuda::GpuMat resizedMat;
cv::cuda::resize(img, resizedMat, size, 0, 0, cv::INTER_LINEAR);
cv::cuda::cvtColor(resizedMat, resizedMat, cv::COLOR_BGRA2RGB);
unsigned volChl = INPUT_H * INPUT_W;
float * data = (float *)samplesCommon::safeCudaMalloc(INPUT_C * volChl * sizeof(float));
// we treat the memory as if it's a one-channel, one row image
int rowSize = (int)resizedMat.step / (int)resizedMat.elemSize1();
// CUDA kernel to reshape the non-continuous GPU Mat structure and make it channel-first continuous
channelFirst(resizedMat.ptr<uint8_t>(), data, volChl, INPUT_C, INPUT_W * INPUT_C, rowSize);
return data;
}
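// Hedged CPU reference (added for documentation; the real channelFirst is a CUDA kernel defined
// elsewhere in this repo). It shows the assumed layout transform: a padded, interleaved HWC
// uint8 image (rowSize elements per GpuMat row, of which rowElements carry pixel data) is
// rewritten as planar CHW floats, using the same (2/255)*x - 1 scaling as BatchStream::update().
// Treat this as a sketch of intent, not the shipped kernel.
static void channelFirstCpuReference(const unsigned char* source, float* dest,
int channelSize, int channelsNum, int rowElements, int rowSize)
{
int rows = channelSize * channelsNum / rowElements; // INPUT_H
int cols = rowElements / channelsNum;               // INPUT_W
for (int y = 0; y < rows; ++y)
for (int x = 0; x < cols; ++x)
for (int c = 0; c < channelsNum; ++c)
{
unsigned char v = source[y * rowSize + x * channelsNum + c];
dest[c * channelSize + y * cols + x] = (2.0f / 255.0f) * v - 1.0f;
}
}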
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8)
{
initLibNvInferPlugins(&gLogger, "");
const int N = 10;
std::cout << graphFileName << std::endl;
auto parser = createUffParser();
BatchStream calibrationStream(CAL_BATCH_SIZE, NB_CAL_BATCHES);
parser->registerInput("Input", DimsCHW(INPUT_C, INPUT_H, INPUT_W), UffInputOrder::kNCHW);
parser->registerOutput("MarkOutput_0");
IHostMemory *trtModelStream{nullptr};
Int8EntropyCalibrator calibrator(calibrationStream, FIRST_CAL_BATCH, "CalibrationTableSSD");
ICudaEngine *tmpEngine = loadModelAndCreateEngine(graphFileName.c_str(), N, parser, &calibrator, trtModelStream, isInt8);
assert(tmpEngine != nullptr);
assert(trtModelStream != nullptr);
tmpEngine->destroy();
// Read a random sample image.
srand(unsigned(time(nullptr)));
// Deserialize the engine.
std::cout << "*** deserializing" << std::endl;
IRuntime *runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
return std::make_tuple(runtime, engine, context);
}
// mat representation of the image,
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat &img, IExecutionContext * context, vector<std::string>& CLASSES)
{
const int N = 1;
float * data = normalize_for_trt(img);
const std::string outFileRoot = "/home/borisk/images/";
// Host memory for outputs.
vector<float> detectionOut(N * detectionOutputParam.keepTopK * 7);
vector<int> keepCount(N);
// Run inference. This will also free the "data" pointer
doInference(*context, data, &detectionOut[0], &keepCount[0], N);
return std::make_tuple(detectionOut, keepCount);
}
\ No newline at end of file
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
return -1;
}
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);
if (d_reader == nullptr)
{
LOG(ERROR) << "Could not create video stream";
throw exception();
}
// save off frame dimensions
auto formatStruct = d_reader->format();
width = formatStruct.width;
height = formatStruct.height;
isInitialized = true;
return 0;
}
void InferenceBase::RunInferenceOnStream()
{
if (!isInitialized)
{
LOG(ERROR) << "Video streaming not initialized";
return;
}
cuda::GpuMat d_frame;
int iFrame = 0, nFrames = 30;
double fps = 0., infer_tf_ms = 0.;
high_resolution_clock::time_point start = high_resolution_clock::now();
high_resolution_clock::time_point end;
double duration = 0.;
for (;;)
{
start = high_resolution_clock::now();
if (!d_reader->nextFrame(d_frame))
{
break;
}
if (doInference(d_frame) != 0)
{
LOG(ERROR) << "Inference failed";
return;
}
end = high_resolution_clock::now();
duration += (double) duration_cast<milliseconds>(end - start).count();
visualize(d_frame, fps);
if (++iFrame % nFrames == 0)
{
fps = 1. * nFrames / duration * 1000.;
duration = 0.;
}
if (iFrame % 100 == 0)
{
LOG(INFO) << "Speed: " << to_string(fps).substr(0, 5);
}
}
}
\ No newline at end of file
#pragma once
#include "utils.h"
using namespace std;
class InferenceBase
{
private:
bool isCudaInited;
cv::Ptr<cv::cudacodec::VideoReader> GetVideoReader(string video_file)
{return cv::cudacodec::createVideoReader(video_file);}
protected:
string labelsFile;
string graphFile;
map<int, string> labelsMap;
virtual int ReadClassLabels();
virtual int ReadGraph() = 0;
void InitCuda();
cv::Ptr<cv::cudacodec::VideoReader> d_reader;
double thresholdScore;
double thresholdIOU;
// frame width and height
int height;
int width;
int debug;
bool isInitialized;
public:
InferenceBase(const string &labelsFile, const string &graphFile, double threshScore, double threshIOU, int dbg)
: labelsFile(labelsFile)
, graphFile(graphFile)
, isCudaInited(false)
, thresholdScore(threshScore)
, thresholdIOU(threshIOU)
, isInitialized(false)
, labelsMap()
, debug(dbg)
{}
virtual ~InferenceBase() {}
void RunInferenceOnStream();
virtual int doInference(cv::cuda::GpuMat&) = 0;
virtual void visualize(cv::cuda::GpuMat&, double) = 0;
virtual int Init(string video_stream);
map<int, string> get_labels_map() {return labelsMap;}
void set_debug(int dbg) {debug = dbg;}
};
#include "inference_tf.h"
using tensorflow::Status;
using tensorflow::Tensor;
using namespace cv;
using tensorflow::int32;
int InferenceTensorflow::ReadGraph()
{
LOG(INFO) << "graphFile:" << graphFile;
Status loadGraphStatus = loadGraph(graphFile, &session);
if (!loadGraphStatus.ok())
{
LOG(ERROR) << "loadGraph(): ERROR" << loadGraphStatus;
return -1;
}
else
LOG(INFO) << "loadGraph(): frozen graph loaded" << endl;
return 0;
}
// allocate input tensor
int InferenceTensorflow::Init(string videoStream)
{
if (InferenceBase::Init(videoStream) != 0)
{
return -1;
}
// configure callable options
opts.add_feed(inputLayer);
for (auto const &value : outputLayer)
{
opts.add_fetch(value);
}
const string gpu_device_name = GPUDeviceName(session.get());
opts.clear_fetch_devices();
opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});
auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok())
{
LOG(ERROR) << "Failed to make callable";
}
// allocate tensor on the GPU
tensorflow::TensorShape shape = tensorflow::TensorShape({1, height, width, 3});
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
new tensorflow::GPUMemAllocator(
tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
platform_gpu_id, false /*use_unified_memory*/, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
LOG(INFO) << "Is Cuda Tensor: " << IsCUDATensor(inputTensor);
return 0;
}
int InferenceTensorflow::doInference(cv::cuda::GpuMat &d_frame)
{
Status runStatus;
readTensorFromGpuMat(d_frame, inputTensor);
runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
if (!runStatus.ok())
{
LOG(ERROR) << "Running model failed: " << runStatus;
return -1;
}
return 0;
}
void InferenceTensorflow::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
// Extract results from the outputs vector
tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
tensorflow::TTypes<float>::Flat numDetections = outputs[3].flat<float>();
tensorflow::TTypes<float, 3>::Tensor boxes = outputs[0].flat_outer_dims<float, 3>();
vector<size_t> goodIdxs = filterBoxes(scores, boxes, thresholdIOU, thresholdScore);
if (debug & 0x1)
{
for (size_t i = 0; i < goodIdxs.size(); i++)
LOG(INFO) << "score:" << scores(goodIdxs.at(i)) << ",class:" << labelsMap[classes(goodIdxs.at(i))]
<< " (" << classes(goodIdxs.at(i)) << "), box:"
<< "," << boxes(0, goodIdxs.at(i), 0) << ","
<< boxes(0, goodIdxs.at(i), 1) << "," << boxes(0, goodIdxs.at(i), 2) << ","
<< boxes(0, goodIdxs.at(i), 3);
}
// Draw bboxes and captions
if (debug & 0x2)
{
Mat frame;
d_frame.download(frame);
drawBoundingBoxesOnImage(frame, scores, classes, boxes, labelsMap, goodIdxs);
auto color = Scalar(255, 0, 255);
drawFrameworkSignature(frame, fps, "Tensorflow", color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
using tensorflow::CallableOptions;
using tensorflow::Tensor;
using tensorflow::Session;
class InferenceTensorflow : public InferenceBase
{
private:
const string inputLayer = "image_tensor:0";
const vector<string> outputLayer = {"detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0"};
CallableOptions opts;
std::unique_ptr<tensorflow::Session> session;
Session::CallableHandle feed_gpu_fetch_cpu;
// Allocate input tensor on the gpu
Tensor inputTensor;
vector<Tensor> outputs;
protected:
int ReadGraph() override;
int doInference(cv::cuda::GpuMat& d_frame) override;
void visualize(cv::cuda::GpuMat &d_frame, double) override;
public:
InferenceTensorflow(const string &labelsFile, const string &graphFile, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0)
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, opts()
{ }
int Init(string videoStream) override;
virtual ~InferenceTensorflow() { session->ReleaseCallable(feed_gpu_fetch_cpu);}
};
\ No newline at end of file
#include "inference_trt.h"
using namespace cv;
using namespace std;
int InferenceTensorRT::ReadGraph()
{
auto runtimeEngineContext = CreateTrtEngineAndContext(graphFile, isInt8);
runtime = std::get<0>(runtimeEngineContext);
engine = std::get<1>(runtimeEngineContext);
context = std::get<2>(runtimeEngineContext);
return 0;
}
int InferenceTensorRT::ReadClassLabels()
{
populateClassLabels(labelsVector, labelsFile);
return 0;
}
int InferenceTensorRT::doInference(cv::cuda::GpuMat &d_frame)
{
auto inferenceTuple = doInferenceWithTrt(d_frame, context, labelsVector);
detections = std::get<0>(inferenceTuple);
numDetections = std::get<1>(inferenceTuple);
return 0;
}
void InferenceTensorRT::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
Mat img;
d_frame.download(img);
for (int p = 0; p < N; ++p)
{
for (int i = 0; i < numDetections[p]; ++i)
{
float *det = &detections[0] + (p * detectionOutputParam.keepTopK + i) * 7;
if (det[2] < visualizeThreshold)
continue;
// Output format for each detection is stored in the below order
// [image_id, label, confidence, xmin, ymin, xmax, ymax]
assert((int)det[1] < OUTPUT_CLS_SIZE);
std::string storeName = outFileRoot + labelsVector[(int)det[1]] + "-" + std::to_string(det[2]) + ".jpg";
if (debug & 0x2)
{
// det array idxs: (4, 3) = (y0, x0), (6, 5) = (y1, x1)
// dets are in normalized coordinates: 0 <= pt <= 1
drawBoundingBoxOnImage(img, det[4], det[3], det[6], det[5], det[2], labelsVector[(int)det[1]]);
}
}
}
if (debug & 0x2)
{
string framework("TensorRT");
if (isInt8)
{
framework += " (INT8)";
}
auto color = Scalar(0, 255, 255);
drawFrameworkSignature(img, fps, framework, color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
class InferenceTensorRT : public InferenceBase
{
private:
IRuntime *runtime = nullptr;
ICudaEngine *engine = nullptr;
IExecutionContext *context = nullptr;
bool isInt8;
//batch size
const int N = 1;
const float visualizeThreshold = 0.5;
vector<string> labelsVector;
vector<int> numDetections;
vector<float> detections;
string outFileRoot;
protected:
int ReadGraph() override;
int ReadClassLabels() override;
int doInference(cv::cuda::GpuMat &d_frame) override;
void visualize(cv::cuda::GpuMat&, double) override;
public:
InferenceTensorRT(const string &labelsFile, const string &graphFile, bool isInt8, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0, string outFile="")
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, labelsVector()
, numDetections(N)
, detections(N * detectionOutputParam.keepTopK * 7)
, outFileRoot(outFile)
, isInt8(isInt8)
{
}
virtual ~InferenceTensorRT()
{
if(context != nullptr)
{
context->destroy();
}
if(engine != nullptr)
{
engine->destroy();
}
if(runtime != nullptr)
{
runtime->destroy();
}
}
};
\ No newline at end of file
#include "inference_base.h"
#include "inference_tf.h"
#include "inference_trt.h"
#include <cuda_profiler_api.h>
using tensorflow::CallableOptions;
using tensorflow::int32;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::Tensor;
using namespace std;
using namespace cv;
using namespace std::chrono;
int main(int argc, char *argv[])
{
if (!tensorflow::IsGoogleCudaEnabled())
{
LOG(ERROR) << "Tensorflow built without CUDA. Rebuild with -c opt --config=cuda";
return -1;
}
const String keys =
"{d display |1 | view video while objects are detected}"
"{t tensorrt|false | use tensorrt}"
"{i int8 |false | use INT8 (requires calibration)}"
"{v video | | video for detection}"
"{graph || frozen graph location}"
"{labels || trained labels file location}";
// Set dirs variables
string ROOTDIR = "";
CommandLineParser parser(argc, argv, keys);
int showWindow = parser.get<int>("d");
String video_file = parser.get<String>("v");
bool is_tensor_rt = parser.get<bool>("t");
bool is_int8 = parser.get<bool>("i");
String LABELS = parser.get<String>("labels");
String GRAPH = parser.get<String>("graph");
unique_ptr<InferenceBase> infer(is_tensor_rt ?
(InferenceBase *) new InferenceTensorRT(LABELS, GRAPH, is_int8)
: (InferenceBase *) new InferenceTensorflow(LABELS, GRAPH));
infer->set_debug(showWindow);
infer->Init(video_file);
infer->RunInferenceOnStream();
return 0;
}
\ No newline at end of file
./build/tf_detector_example -d=$1 \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/home/boris/model/frozen_inference_graph.pb \
-labels=/home/boris/model/mscoco_label_map.pbtxt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-i \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
#include "utils.h"
using namespace std;
using namespace cv;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;
using tensorflow::DeviceAttributes;
/** Read a model graph definition (xxx.pb) from disk and create a session object you can use to run it.
 */
Status loadGraph(const string &graph_file_name,
unique_ptr<tensorflow::Session> *session) {
tensorflow::GraphDef graph_def;
Status load_graph_status =
ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
if (!load_graph_status.ok()) {
return tensorflow::errors::NotFound("Failed to load compute graph at '",
graph_file_name, "'");
}
tensorflow::SessionOptions session_options;
session_options.config.mutable_gpu_options()->set_allow_growth(true);
session->reset(tensorflow::NewSession(session_options));
Status session_create_status = (*session)->Create(graph_def);
if (!session_create_status.ok()) {
return session_create_status;
}
return Status::OK();
}
/** Read a labels map file (xxx.pbtxt) from disk to translate class numbers into human-readable labels.
*/
Status readLabelsMapFile(const string &fileName, map<int, string> &labelsMap) {
// Read file into a string
ifstream t(fileName);
if (!t.good())
return tensorflow::errors::NotFound("Failed to load labels map at '", fileName, "'");
stringstream buffer;
buffer << t.rdbuf();
string fileString = buffer.str();
// Search entry patterns of type 'item { ... }' and parse each of them
smatch matcherEntry;
smatch matcherId;
smatch matcherName;
const regex reEntry("item \\{([\\S\\s]*?)\\}");
const regex reId("id: [0-9]+");
const regex reDisplayName("display_name: (\"|\').+(\"|\')");
const regex reName("name: (\"|\').+(\"|\')");
string entry;
const string namePrefix = "name: \"";
const string display_name = "display_name: \"";
const size_t idOffset = string("id: ").length();
size_t nameOffset = display_name.length();
// we first try to parse "display_name"
// and fall back if it does not exist
bool isParsingName = false;
auto stringBegin = sregex_iterator(fileString.begin(), fileString.end(), reEntry);
auto stringEnd = sregex_iterator();
int id;
string name;
for (sregex_iterator i = stringBegin; i != stringEnd; i++) {
matcherEntry = *i;
entry = matcherEntry.str();
regex_search(entry, matcherId, reId);
if (!matcherId.empty())
id = stoi(matcherId[0].str().substr(idOffset, matcherId[0].str().length() - idOffset));
else
continue;
if(!isParsingName)
{
regex_search(entry, matcherName, reDisplayName);
if(matcherName.empty())
{
isParsingName = true;
nameOffset = namePrefix.length();
}
}
if(isParsingName)
{
regex_search(entry, matcherName, reName);
}
if (!matcherName.empty())
name = matcherName[0].str().substr(nameOffset, matcherName[0].str().length() - nameOffset - 1);
else
continue;
labelsMap.insert(pair<int, string>(id, name));
}
return Status::OK();
}
/** Convert a Mat image into a tensor of shape (1, height, width, 3), where the last three dims match the original image.
 */
Status readTensorFromMat(const Mat &mat, Tensor &outTensor) {
// Trick from https://github.com/tensorflow/tensorflow/issues/8033
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
Mat fakeMat(mat.rows, mat.cols, CV_8UC3, p);
cv::cvtColor(mat, fakeMat, COLOR_BGR2RGB);
return Status::OK();
}
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor) {
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
cv::cuda::GpuMat fakeMat(g_mat.rows, g_mat.cols, CV_8UC3, p);
// comes in with 4 channels -> 3 channels
cv::cuda::cvtColor(g_mat, fakeMat, COLOR_BGRA2RGB);
return Status::OK();
}
/** Draw bounding box and add caption to the image.
* Boolean flag _scaled_ shows if the passed coordinates are in relative units (true by default in tensorflow detection)
*/
void drawBoundingBoxOnImage(Mat &image, double yMin, double xMin, double yMax, double xMax, double score, string label, bool scaled) {
cv::Point tl, br;
if (scaled) {
tl = cv::Point((int) (xMin * image.cols), (int) (yMin * image.rows));
br = cv::Point((int) (xMax * image.cols), (int) (yMax * image.rows));
} else {
tl = cv::Point((int) xMin, (int) yMin);
br = cv::Point((int) xMax, (int) yMax);
}
cv::rectangle(image, tl, br, cv::Scalar(0, 255, 255), 1);
// Truncate the score to 3 decimal places
float scoreRounded = floorf(score * 1000) / 1000;
string scoreString = to_string(scoreRounded).substr(0, 5);
string caption = label + " (" + scoreString + ")";
// Adding caption of type "LABEL (X.XXX)" to the top-left corner of the bounding box
int fontCoeff = 12;
cv::Point brRect = cv::Point(tl.x + caption.length() * fontCoeff / 1.6, tl.y + fontCoeff);
cv::rectangle(image, tl, brRect, cv::Scalar(0, 255, 255), -1);
cv::Point textCorner = cv::Point(tl.x, tl.y + fontCoeff * 0.9);
cv::putText(image, caption, textCorner, FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 0, 0));
}
/** Draw the framework signature and current FPS in the bottom-left corner of the image and display the frame.
 */
void drawFrameworkSignature(Mat& image, double fps, string signature, Scalar& color)
{
putText(image, "TensorFlow", Point(0, image.rows - 30), FONT_HERSHEY_SIMPLEX, 0.7, color, 2);
putText(image, to_string(fps).substr(0, 5), Point(0, image.rows - 5), FONT_HERSHEY_SIMPLEX, 0.7, Scalar(255, 255, 255), 2);
imshow("stream", image);
waitKey(1);
}
void drawBoundingBoxesOnImage(Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
map<int, string> &labelsMap,
vector<size_t> &idxs) {
for (int j = 0; j < idxs.size(); j++)
drawBoundingBoxOnImage(image,
boxes(0,idxs.at(j),0), boxes(0,idxs.at(j),1),
boxes(0,idxs.at(j),2), boxes(0,idxs.at(j),3),
scores(idxs.at(j)), labelsMap[classes(idxs.at(j))]);
}
/** Calculate intersection-over-union (IOU) for two given bbox Rects.
*/
double IOU(Rect2f box1, Rect2f box2) {
float xA = max(box1.tl().x, box2.tl().x);
float yA = max(box1.tl().y, box2.tl().y);
float xB = min(box1.br().x, box2.br().x);
float yB = min(box1.br().y, box2.br().y);
// clamp to zero so non-overlapping boxes yield zero intersection
float intersectArea = max(0.f, xB - xA) * max(0.f, yB - yA);
float unionArea = box1.area() + box2.area() - intersectArea;
return 1. * intersectArea / unionArea;
}
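// Quick sanity check of the formula (illustrative numbers, not from the original code):
// box1 = (0, 0, 2, 2) and box2 = (1, 1, 2, 2) overlap in a 1x1 square, so
// IOU = 1 / (4 + 4 - 1) = 1/7 ≈ 0.143.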
/** Return idxs of good boxes (ones with highest confidence score (>= thresholdScore)
* and IOU <= thresholdIOU with others).
*/
vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore) {
vector<size_t> sortIdxs(scores.size());
iota(sortIdxs.begin(), sortIdxs.end(), 0);
// Create set of "bad" idxs
set<size_t> badIdxs = set<size_t>();
size_t i = 0;
while (i < sortIdxs.size()) {
if (scores(sortIdxs.at(i)) < thresholdScore)
badIdxs.insert(sortIdxs[i]);
if (badIdxs.find(sortIdxs.at(i)) != badIdxs.end()) {
i++;
continue;
}
Rect2f box1 = Rect2f(Point2f(boxes(0, sortIdxs.at(i), 1), boxes(0, sortIdxs.at(i), 0)),
Point2f(boxes(0, sortIdxs.at(i), 3), boxes(0, sortIdxs.at(i), 2)));
for (size_t j = i + 1; j < sortIdxs.size(); j++) {
if (scores(sortIdxs.at(j)) < thresholdScore) {
badIdxs.insert(sortIdxs[j]);
continue;
}
Rect2f box2 = Rect2f(Point2f(boxes(0, sortIdxs.at(j), 1), boxes(0, sortIdxs.at(j), 0)),
Point2f(boxes(0, sortIdxs.at(j), 3), boxes(0, sortIdxs.at(j), 2)));
if (IOU(box1, box2) > thresholdIOU)
badIdxs.insert(sortIdxs[j]);
}
i++;
}
// Prepare "good" idxs for return
vector<size_t> goodIdxs = vector<size_t>();
for (auto it = sortIdxs.begin(); it != sortIdxs.end(); it++)
if (badIdxs.find(sortIdxs.at(*it)) == badIdxs.end())
goodIdxs.push_back(*it);
return goodIdxs;
}
string type2str(int type) {
string r;
uchar depth = type & CV_MAT_DEPTH_MASK;
uchar chans = 1 + (type >> CV_CN_SHIFT);
switch ( depth ) {
case CV_8U: r = "8U"; break;
case CV_8S: r = "8S"; break;
case CV_16U: r = "16U"; break;
case CV_16S: r = "16S"; break;
case CV_32S: r = "32S"; break;
case CV_32F: r = "32F"; break;
case CV_64F: r = "64F"; break;
default: r = "User"; break;
}
r += "C";
r += (chans+'0');
return r;
}
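// Example (illustrative): type2str(CV_8UC3) returns "8UC3".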
bool IsCUDATensor(const Tensor &t)
{
cudaPointerAttributes attributes;
cudaError_t err =
cudaPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == cudaErrorInvalidValue)
return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
#if CUDART_VERSION >= 10000
return (attributes.type == cudaMemoryTypeDevice);
#else
return (attributes.memoryType == cudaMemoryTypeDevice);
#endif
}
string GPUDeviceName(Session* session) {
std::vector<DeviceAttributes> devices;
TF_CHECK_OK(session->ListDevices(&devices));
for (const DeviceAttributes& d : devices) {
LOG(INFO) << "Device: " << d.name();
if (d.device_type() == "GPU" || d.device_type() == "gpu") {
return d.name();
}
}
return "";
}
\ No newline at end of file
#ifndef TF_DETECTOR_EXAMPLE_UTILS_H
#define TF_DETECTOR_EXAMPLE_UTILS_H
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <map>
#include <unordered_map>
#include <math.h>
#include <regex>
#include <tuple>
#include <cassert>
#include <cublas_v2.h>
#include <cudnn.h>
#include <sstream>
#include <time.h>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "common.h"
#include "NvInferPlugin.h"
// Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
// Direct session
#include "tensorflow/core/common_runtime/direct_session.h"
#include <cv.hpp>
#include <opencv2/cudacodec.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
// CUDA includes. Order matters
#include <dynlink_nvcuvid.h>
#include "cuda_runtime_api.h"
using namespace std;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::Session;
using namespace nvinfer1;
using namespace nvuffparser;
string type2str(int type);
Status readLabelsMapFile(const string &fileName, std::map<int, string> &labelsMap);
Status loadGraph(const string &graph_file_name,
std::unique_ptr<tensorflow::Session> *session);
Status readTensorFromMat(const cv::Mat &mat, Tensor &outTensor);
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor);
void drawBoundingBoxOnImage(cv::Mat &image, double yMin, double xMin, double yMax, double xMax, double score, std::string label, bool scaled = true);
void drawBoundingBoxesOnImage(cv::Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
std::map<int, string> &labelsMap,
std::vector<size_t> &idxs);
void drawFrameworkSignature(cv::Mat& image, double fps, string signature, cv::Scalar& color);
double IOU(cv::Rect2f box1, cv::Rect2f box2);
std::vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore);
bool IsCUDATensor(const Tensor &t);
string GPUDeviceName(Session* session);
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat& img, IExecutionContext * context, vector<std::string>& CLASSES);
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8);
extern DetectionOutputParameters detectionOutputParam;
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName);
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElements, int rowSize);
extern const int OUTPUT_CLS_SIZE;
extern const int OUTPUT_BBOX_SIZE;
#endif //TF_DETECTOR_EXAMPLE_UTILS_H