Commit 773648ca authored by Oleg Dzhimiev's avatar Oleg Dzhimiev

standalone code

parent d8533f78
#ifndef BATCH_STREAM_PPM_H
#define BATCH_STREAM_PPM_H
#include <vector>
#include <assert.h>
#include <algorithm>
#include <iomanip>
#include <fstream>
#include "NvInfer.h"
#include "common.h"
std::string locateFile(const std::string& input);
static constexpr int INPUT_C = 3;
static constexpr int INPUT_H = 300;
static constexpr int INPUT_W = 300;
extern const char* INPUT_BLOB_NAME;
class BatchStream
{
public:
BatchStream(int batchSize, int maxBatches) : mBatchSize(batchSize), mMaxBatches(maxBatches)
{
mDims = nvinfer1::DimsNCHW{batchSize, 3, 300, 300 };
mImageSize = mDims.c() * mDims.h() * mDims.w();
mBatch.resize(mBatchSize * mImageSize, 0);
mLabels.resize(mBatchSize, 0);
mFileBatch.resize(mDims.n() * mImageSize, 0);
mFileLabels.resize(mDims.n(), 0);
reset(0);
}
void reset(int firstBatch)
{
mBatchCount = 0;
mFileCount = 0;
mFileBatchPos = mDims.n();
skip(firstBatch);
}
bool next()
{
if (mBatchCount == mMaxBatches)
return false;
for (int csize = 1, batchPos = 0; batchPos < mBatchSize; batchPos += csize, mFileBatchPos += csize)
{
assert(mFileBatchPos > 0 && mFileBatchPos <= mDims.n());
if (mFileBatchPos == mDims.n() && !update())
return false;
// copy the smaller of: elements left to fulfill the request, or elements left in the file buffer.
csize = std::min(mBatchSize - batchPos, mDims.n() - mFileBatchPos);
std::copy_n(getFileBatch() + mFileBatchPos * mImageSize, csize * mImageSize, getBatch() + batchPos * mImageSize);
}
mBatchCount++;
return true;
}
void skip(int skipCount)
{
if (mBatchSize >= mDims.n() && mBatchSize % mDims.n() == 0 && mFileBatchPos == mDims.n())
{
mFileCount += skipCount * mBatchSize / mDims.n();
return;
}
int x = mBatchCount;
for (int i = 0; i < skipCount; i++)
next();
mBatchCount = x;
}
float *getBatch() { return mBatch.data(); }
float *getLabels() { return mLabels.data(); }
int getBatchesRead() const { return mBatchCount; }
int getBatchSize() const { return mBatchSize; }
nvinfer1::DimsNCHW getDims() const { return mDims; }
private:
float* getFileBatch() { return mFileBatch.data(); }
float* getFileLabels() { return mFileLabels.data(); }
bool update()
{
std::vector<std::string> fNames;
std::ifstream file(locateFile("list.txt"));
if(file)
{
std::cout << "Batch #" << mFileCount << "\n";
file.seekg(mCurPos);
}
for(int i = 1; i <= mBatchSize; i++)
{
std::string sName;
std::getline(file, sName);
sName = sName + ".ppm";
std::cout << "Calibrating with file " << sName << std::endl;
fNames.emplace_back(sName);
}
mCurPos = file.tellg();
mFileCount++;
std::vector<samplesCommon::PPM<INPUT_C, INPUT_H, INPUT_W>> ppms(fNames.size());
for (uint32_t i = 0; i < fNames.size(); ++i)
{
readPPMFile(locateFile(fNames[i]), ppms[i]);
}
std::vector<float> data(samplesCommon::volume(mDims));
long int volChl = mDims.h() * mDims.w();
for (int i = 0, volImg = mDims.c() * mDims.h() * mDims.w(); i < mBatchSize; ++i)
{
for (int c = 0; c < mDims.c(); ++c)
{
for (int j = 0; j < volChl; ++j)
{
data[i * volImg + c * volChl + j] = (2.0 / 255.0) * float(ppms[i].buffer[j * mDims.c() + c]) - 1.0;
}
}
}
std::copy_n(data.data(), mDims.n() * mImageSize, getFileBatch());
mFileBatchPos = 0;
return true;
}
int mBatchSize{0};
int mMaxBatches{0};
int mBatchCount{0};
int mFileCount{0}, mFileBatchPos{0};
int mImageSize{0};
int mCurPos{0};
nvinfer1::DimsNCHW mDims;
std::vector<float> mBatch;
std::vector<float> mLabels;
std::vector<float> mFileBatch;
std::vector<float> mFileLabels;
};
class Int8EntropyCalibrator : public nvinfer1::IInt8EntropyCalibrator
{
public:
Int8EntropyCalibrator(BatchStream& stream, int firstBatch, std::string calibrationTableName, bool readCache = true)
: mStream(stream),
mCalibrationTableName(std::move(calibrationTableName)),
mReadCache(readCache)
{
nvinfer1::DimsNCHW dims = mStream.getDims();
mInputCount = samplesCommon::volume(dims);
CHECK_TRT(cudaMalloc(&mDeviceInput, mInputCount * sizeof(float)));
mStream.reset(firstBatch);
}
virtual ~Int8EntropyCalibrator()
{
CHECK_TRT(cudaFree(mDeviceInput));
}
int getBatchSize() const override { return mStream.getBatchSize(); }
bool getBatch(void* bindings[], const char* names[], int nbBindings) override
{
if (!mStream.next())
return false;
CHECK_TRT(cudaMemcpy(mDeviceInput, mStream.getBatch(), mInputCount * sizeof(float), cudaMemcpyHostToDevice));
assert(!strcmp(names[0], INPUT_BLOB_NAME));
bindings[0] = mDeviceInput;
return true;
}
const void* readCalibrationCache(size_t& length) override
{
mCalibrationCache.clear();
std::ifstream input(mCalibrationTableName, std::ios::binary);
input >> std::noskipws;
if (mReadCache && input.good())
std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(mCalibrationCache));
length = mCalibrationCache.size();
return length ? mCalibrationCache.data() : nullptr;
}
void writeCalibrationCache(const void* cache, size_t length) override
{
std::ofstream output(mCalibrationTableName, std::ios::binary);
output.write(reinterpret_cast<const char*>(cache), length);
}
private:
BatchStream mStream;
std::string mCalibrationTableName;
bool mReadCache{true};
size_t mInputCount;
void* mDeviceInput{nullptr};
std::vector<char> mCalibrationCache;
};
#endif
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.16)
set(ENV{CUDACXX} /usr/local/cuda/bin/nvcc) project(tf-gpu-feed LANGUAGES CXX CUDA)
project(tf_detector_example LANGUAGES CXX CUDA)
cmake_policy(SET CMP0074 OLD) cmake_policy(SET CMP0074 OLD)
set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD 11)
# CUDA for cudacodec ops set(CUDACXX /usr/local/cuda/bin/nvcc)
find_package(CUDA 9.0 REQUIRED) find_package(CUDA 9.0 REQUIRED)
set(SOURCE_FILES set(SOURCE_FILES
main.cpp main.cpp
utils.cpp dynlink_nvcuvid.cpp
utils.h array.cu
dynlink_nvcuvid.cpp
infer_with_trt.cpp
inference_base.cpp
inference_tf.cpp
channel_first.cu
) )
# Tensorflow directories and libraries
set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so) set(TENSORFLOW_LIBS libtensorflow_cc.so libtensorflow_framework.so)
set(MYHOME $ENV{HOME})
message("-- Home set to: " ${MYHOME})
link_directories("/usr/local/tensorflow/lib") link_directories("/usr/local/tensorflow/lib")
add_executable(tf_detector_example ${SOURCE_FILES}) add_executable(tf-gpu-feed ${SOURCE_FILES})
set_target_properties(tf_detector_example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) set_target_properties(tf-gpu-feed PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# OpenCV libs
find_package(OpenCV REQUIRED)
include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
# ==================== PATHS TO SPECIFY! ==================== #
# TensorFlow headers # TensorFlow headers
include_directories("/usr/local/tensorflow/include/tensorflow/") include_directories("/usr/local/tensorflow/include/tensorflow/")
include_directories("/usr/local/tensorflow/include/third-party/") include_directories("/usr/local/tensorflow/include/third-party/")
include_directories("/usr/local/tensorflow/include/") include_directories("/usr/local/tensorflow/include/")
# IMPORTANT: Protobuf includes. Depends on the anaconda path target_link_libraries(tf-gpu-feed
# This is Azure DLVM (not sure if DSVM is the same)
#include_directories("/data/anaconda/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
# This is a standard install of Anaconda with p36 environment
#include_directories("${MYHOME}/anaconda3/envs/py36/lib/python3.6/site-packages/tensorflow/include/")
target_link_libraries(tf_detector_example
${CUDA_LIBRARIES} ${CUDA_LIBRARIES}
cuda cuda
cublas cublas
nvinfer nvinfer
nvToolsExt nvToolsExt
nvparsers nvparsers
nvinfer_plugin nvinfer_plugin
nvonnxparser nvonnxparser
${CMAKE_DL_LIBS} ${CMAKE_DL_LIBS}
${OpenCV_LIBS}
${TENSORFLOW_LIBS}) ${TENSORFLOW_LIBS})
1
(Unnamed ITensor* 9): 3d418f1e
Input: 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d205fca
(Unnamed ITensor* 225): 3d368720
(Unnamed ITensor* 412): 3d418f1e
(Unnamed ITensor* 195): 3dafce6e
(Unnamed ITensor* 138): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_3a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 463): 3d418f1e
(Unnamed ITensor* 75): 3d2dcb21
(Unnamed ITensor* 157): 3d418f1e
BoxPredictor_3/ClassPredictor/BiasAdd: 3c8c8ef8
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/MaxPool_2a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 61): 3d418f1e
(Unnamed ITensor* 462): 3d3d75f1
(Unnamed ITensor* 156): 3d618943
(Unnamed ITensor* 24): 3d913052
(Unnamed ITensor* 32): 3d6533f9
(Unnamed ITensor* 83): 3d3ca52c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 118): 3d4aef99
(Unnamed ITensor* 485): 3d1d4f1e
BoxPredictor_4/BoxEncodingPredictor/BiasAdd: 3ca49bb9
(Unnamed ITensor* 84): 3d418f1e
(Unnamed ITensor* 160): 3d418f1e
BoxPredictor_5/ClassPredictor/BiasAdd: 3c773985
(Unnamed ITensor* 316): 3d63dc8a
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/separable_conv2d/depthwise: 3de7428e
(Unnamed ITensor* 90): 3d73f085
(Unnamed ITensor* 91): 3d418f1e
(Unnamed ITensor* 419): 3d418f1e
(Unnamed ITensor* 374): 3d59dbf2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3c8d1a
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_5_1x1_64/Relu6: 3d17eae6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 507): 3d418f1e
(Unnamed ITensor* 2): 3c010a14
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 112): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 126): 3d20913a
(Unnamed ITensor* 104): 3d80ab32
(Unnamed ITensor* 134): 3d8dd320
(Unnamed ITensor* 324): 3d418f1e
(Unnamed ITensor* 135): 3d418f1e
(Unnamed ITensor* 628): 3d9d9605
(Unnamed ITensor* 449): 3d418f1e
(Unnamed ITensor* 119): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 513): 3d5e275c
(Unnamed ITensor* 164): 3d946ceb
Squeeze_2: 3cc8bb82
(Unnamed ITensor* 167): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0a_1x1/Relu6: 3d2d4927
(Unnamed ITensor* 541): 3d37a99c
(Unnamed ITensor* 143): 3d418f1e
(Unnamed ITensor* 240): 3d418f1e
(Unnamed ITensor* 150): 3d418f1e
(Unnamed ITensor* 165): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 310): 3d418f1e
(Unnamed ITensor* 260): 3d60aac4
(Unnamed ITensor* 405): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/Conv2d_0b_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 105): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 382): 3d1e3cff
(Unnamed ITensor* 550): 3d418f1e
(Unnamed ITensor* 391): 3d418f1e
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_2_1x1_256/Relu6: 3d37a347
(Unnamed ITensor* 448): 3d6ab083
(Unnamed ITensor* 142): 3dd08cf3
(Unnamed ITensor* 595): 3d418f1e
BoxPredictor_1/ClassPredictor/BiasAdd: 3e194e24
concat_box_conf: 3e1bb222
(Unnamed ITensor* 594): 3d4ff643
(Unnamed ITensor* 602): 3d418f1e
BoxPredictor_5/Reshape_1: 3c773985
concat_box_loc: 3de14ea0
BoxPredictor_4/ClassPredictor/BiasAdd: 3ca5201c
Squeeze_4: 3ca49bb9
(Unnamed ITensor* 621): 3d418f1e
(Unnamed ITensor* 624): 3d17eae6
BoxPredictor_2/ClassPredictor/BiasAdd: 3e1ec6c2
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/Conv2d_0b_1x1/Relu6: 3d156ede
(Unnamed ITensor* 33): 3d418f1e
(Unnamed ITensor* 500): 3d418f1e
BoxPredictor_2/Reshape_1: 3e1ec6c2
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_5_3x3_s2_128/Relu6: 3d418f1e
BoxPredictor_5/BoxEncodingPredictor/BiasAdd: 3cdbc092
GridAnchor_1: 3a500341
(Unnamed ITensor* 569): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 620): 3d17eae6
(Unnamed ITensor* 418): 3d91976a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 111): 3d85a99e
(Unnamed ITensor* 575): 3dc8e55f
(Unnamed ITensor* 601): 3d8b91c4
BoxPredictor_1/Reshape_1: 3e194e24
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0a_1x1/Relu6: 3d433d97
(Unnamed ITensor* 545): 3d37a347
BoxPredictor_3/Reshape_1: 3c8c8ef8
(Unnamed ITensor* 347): 3d418f1e
(Unnamed ITensor* 568): 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 471): 3d418f1e
(Unnamed ITensor* 455): 3d500012
(Unnamed ITensor* 303): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_3/Conv2d_0b_1x1/Relu6: 3d20913a
BoxPredictor_4/Reshape_1: 3ca5201c
GridAnchor_4 copy: 3c3aa18a
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_3_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/Conv2d_0b_1x1/Relu6: 3d1e3cff
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
GridAnchor_5 copy: 3c2b37e3
(Unnamed ITensor* 331): 3d3ca1fe
NMS_1: 1
BoxPredictor_3/BoxEncodingPredictor/BiasAdd: 3cafbf65
(Unnamed ITensor* 188): 3dc61b5c
(Unnamed ITensor* 196): 3d418f1e
(Unnamed ITensor* 209): 3dc05776
GridAnchor_2 copy: 3c2c4ae8
(Unnamed ITensor* 367): 3d7bf53d
(Unnamed ITensor* 361): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0b_3x3/Relu6: 3d3ceddc
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_0/Conv2d_0a_1x1/Relu6: 3d3772e6
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/Conv2d_0b_1x1/Relu6: 3d31060c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 411): 3d836c20
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 18): 3d418f1e
(Unnamed ITensor* 390): 3d9a604f
(Unnamed ITensor* 346): 3d67b7ae
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
BoxPredictor_2/BoxEncodingPredictor/BiasAdd: 3cc8bb82
(Unnamed ITensor* 217): 3d4cf10e
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_2_3x3_s2_512/Relu6: 3d418f1e
(Unnamed ITensor* 233): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 542): 3d418f1e
(Unnamed ITensor* 67): 3d8e8123
(Unnamed ITensor* 247): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_3/AvgPool_0a_3x3/AvgPool: 3d1d88d2
(Unnamed ITensor* 302): 3daf4176
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 239): 3d4f00df
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 514): 3d418f1e
(Unnamed ITensor* 435): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 317): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 289): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 478): 3d3244f6
(Unnamed ITensor* 549): 3dbeda4a
(Unnamed ITensor* 261): 3d418f1e
(Unnamed ITensor* 492): 3d9e1645
(Unnamed ITensor* 441): 3d15c098
(Unnamed ITensor* 479): 3d418f1e
(Unnamed ITensor* 493): 3d418f1e
BoxPredictor_0/Reshape_1: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d258e36
(Unnamed ITensor* 339): 3d5f2411
FeatureExtractor/InceptionV2/Mixed_5c_2_Conv2d_4_3x3_s2_256/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_1a_3x3/Relu6: 3d418f1e
Squeeze_1: 3d2f0384
GridAnchor: 3a4f5b62
(Unnamed ITensor* 368): 3d418f1e
Squeeze: 3df34968
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 375): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/MaxPool_0a_3x3/MaxPool: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/AvgPool_0a_3x3/AvgPool: 3d18b9fa
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_2/MaxPool_1a_3x3/MaxPool: 3d418f1e
(Unnamed ITensor* 253): 3d92390f
(Unnamed ITensor* 210): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_3/Conv2d_0b_1x1/Relu6: 3d4af27d
Squeeze_3: 3cafbf65
(Unnamed ITensor* 340): 3d418f1e
(Unnamed ITensor* 11): 3d418f1e
(Unnamed ITensor* 295): 3d9c64d4
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_3_1x1_128/Relu6: 3d1c5b35
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_1/Conv2d_0b_3x3/Relu6: 3d15c098
(Unnamed ITensor* 323): 3d5d9fd1
GridAnchor_4: 3c3aa18a
(Unnamed ITensor* 360): 3d88c0ec
(Unnamed ITensor* 25): 3d418f1e
(Unnamed ITensor* 288): 3d6b9ef7
(Unnamed ITensor* 226): 3d418f1e
(Unnamed ITensor* 456): 3d418f1e
(Unnamed ITensor* 46): 3d86ba82
BoxPredictor_0/BoxEncodingPredictor/BiasAdd: 3df34968
(Unnamed ITensor* 232): 3ddb36a3
(Unnamed ITensor* 521): 3cb42ac7
GridAnchor_3 copy: 3c348982
(Unnamed ITensor* 296): 3d418f1e
BoxPredictor_0/ClassPredictor/BiasAdd: 3e13296c
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_3/Conv2d_0b_1x1/Relu6: 3d2dcb21
(Unnamed ITensor* 202): 3d87a00a
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 269): 3d418f1e
GridAnchor_3: 3c348982
(Unnamed ITensor* 218): 3d418f1e
(Unnamed ITensor* 203): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4c/Branch_2/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 486): 3d418f1e
(Unnamed ITensor* 268): 3d0e4f64
Squeeze_5: 3cdbc092
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 254): 3d418f1e
(Unnamed ITensor* 182): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4d/Branch_3/AvgPool_0a_3x3/AvgPool: 3cb90e57
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 175): 3d418f1e
(Unnamed ITensor* 98): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_3/AvgPool_0a_3x3/AvgPool: 3d04ebdf
(Unnamed ITensor* 354): 3d418f1e
(Unnamed ITensor* 181): 3d8ef349
(Unnamed ITensor* 353): 3d3ce1d6
(Unnamed ITensor* 174): 3d5b5745
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_1/Conv2d_0b_3x3/Relu6: 3d418f1e
GridAnchor_1 copy: 3a500341
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_3/Conv2d_0b_1x1/Relu6: 3cb42ac7
(Unnamed ITensor* 149): 3d869442
(Unnamed ITensor* 68): 3d418f1e
(Unnamed ITensor* 17): 3d9d3367
(Unnamed ITensor* 404): 3d9d92ab
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4e/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 309): 3d8ac690
BoxPredictor_1/BoxEncodingPredictor/BiasAdd: 3d2f0384
(Unnamed ITensor* 60): 3d74b08e
(Unnamed ITensor* 189): 3d418f1e
(Unnamed ITensor* 97): 3d3f7d2c
(Unnamed ITensor* 53): 3d7e3945
(Unnamed ITensor* 8): 3e350553
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
GridAnchor_5: 3c2b37e3
(Unnamed ITensor* 76): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3c/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 522): 3d418f1e
(Unnamed ITensor* 39): 3da0973e
(Unnamed ITensor* 127): 3d418f1e
(Unnamed ITensor* 54): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4b/Branch_2/Conv2d_0b_3x3/Relu6: 3d418f1e
(Unnamed ITensor* 576): 3d418f1e
(Unnamed ITensor* 332): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_2b_1x1/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_3b/Branch_1/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 47): 3d418f1e
(Unnamed ITensor* 40): 3d418f1e
(Unnamed ITensor* 246): 3d7bd2d9
FeatureExtractor/InceptionV2/InceptionV2/Conv2d_1a_7x7/Relu6: 3d418f1e
(Unnamed ITensor* 398): 3d418f1e
(Unnamed ITensor* 383): 3d418f1e
(Unnamed ITensor* 427): 3d541a3f
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
(Unnamed ITensor* 397): 3d523857
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5a/Branch_0/Conv2d_1a_3x3/Relu6: 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_0/Conv2d_0a_1x1/Relu6: 3d333265
(Unnamed ITensor* 442): 3d418f1e
(Unnamed ITensor* 470): 3d71a2ed
(Unnamed ITensor* 499): 3d2d4927
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5b/Branch_2/Conv2d_0c_3x3/Relu6: 3d3d75f1
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_1/Conv2d_0a_1x1/Relu6: 3d1d4f1e
(Unnamed ITensor* 434): 3d439787
(Unnamed ITensor* 629): 3d418f1e
(Unnamed ITensor* 506): 3d74b7dd
(Unnamed ITensor* 428): 3d418f1e
FeatureExtractor/InceptionV2/InceptionV2/Mixed_5c/Branch_2/Conv2d_0c_3x3/Relu6: 3d418f1e
GridAnchor_2: 3c2c4ae8
FeatureExtractor/InceptionV2/InceptionV2/Mixed_4a/Branch_0/Conv2d_0a_1x1/Relu6: 3d418f1e
NMS: 3da1a245
GridAnchor copy: 3a4f5b62
FeatureExtractor/InceptionV2/Mixed_5c_1_Conv2d_4_1x1_128/Relu6: 3d418f1e
MIT License
Copyright (c) 2019 Boris
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# tensorflow-feed-from-gpu # Run
Simple TF test ```
mkdir build
## Setup in Eclipse cd build
cmake ..
From **Eclipse (2019-12)**: ./tf-gpu-feed
* File > Open Projects from File System... ```
* Directory... > navigate to project's root > Finish \ No newline at end of file
Tried importing a few times - indexer does not work sometimes.
#ifndef TENSORRT_ARGS_PARSER_H
#define TENSORRT_ARGS_PARSER_H
#include <vector>
#include <string>
#include <getopt.h>
#include <iostream>
namespace samplesCommon
{
//!
//! \brief The SampleParams structure groups the basic parameters required by
//! all sample networks.
//!
struct SampleParams
{
int batchSize; //!< Number of inputs in a batch
int dlaID;
std::vector<std::string> dataDirs; //!< Directory paths where sample data files are stored
std::vector<std::string> inputTensorNames;
std::vector<std::string> outputTensorNames;
};
//!
//! \brief The CaffeSampleParams structure groups the additional parameters required by
//! networks that use caffe
//!
struct CaffeSampleParams : public SampleParams
{
std::string prototxtFileName; //!< Filename of prototxt design file of a network
std::string weightsFileName; //!< Filename of trained weights file of a network
};
//!
//! /brief Struct to maintain command-line arguments.
//!
struct Args
{
bool runInInt8{false};
bool help{false};
int useDLA{-1};
std::vector<std::string> dataDirs;
};
//!
//! \brief Populates the Args struct with the provided command-line parameters.
//!
//! \throw invalid_argument if any of the arguments are not valid
//!
//! \return boolean If return value is true, execution can continue, otherwise program should exit
//!
inline bool parseArgs(Args& args, int argc, char* argv[])
{
while (1)
{
int arg;
static struct option long_options[] = {
{"help", no_argument, 0, 'h'},
{"datadir", required_argument, 0, 'd'},
{"int8", no_argument, 0, 'i'},
{"useDLA", required_argument, 0, 'u'},
{nullptr, 0, nullptr, 0}};
int option_index = 0;
arg = getopt_long(argc, argv, "hd:iu", long_options, &option_index);
if (arg == -1)
break;
switch (arg)
{
case 'h':
args.help = true;
return false;
case 'd':
if (optarg)
args.dataDirs.push_back(optarg);
else
{
std::cerr << "ERROR: --datadir requires option argument" << std::endl;
return false;
}
break;
case 'i':
args.runInInt8 = true;
break;
case 'u':
if (optarg)
args.useDLA = std::stoi(optarg);
break;
default:
return false;
}
}
return true;
}
} // namespace samplesCommon
#endif // TENSORRT_ARGS_PARSER_H
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
// this is the program that is to be run on the device for a
// large number of threads, in our example 100
// each thread takes care of one entry in the number array,
// so in order for the thread to know which number to manipulate,
// a scheme has to be utilized in order to assign each thread a
// unique number
__global__ void incrementArrayViaCUDAdevice(int *numberArray, int N)
{
// this is the assignment of a unique identifier.
// blockIdx.x is the unique number of the block, in which the
// thread is positioned, blockDim.x holds the number of threads
// for each block and threadIdx.x is the number of the thread in
// this block.
int idx = blockIdx.x*blockDim.x + threadIdx.x;
// this tells the thread to manipulate the assigned number in
// the array stored in device memory and increment it
if (idx<N)
numberArray[idx] = numberArray[idx] + 1;
}
// this is the "normal" function to be run on the CPU
// it does the exact same thing as the CUDA function above
void incrementArray(int *numberArray, int N){
// go through every number in the array consecutively
// and increment it
for(int i=0; i<N; ++i)
{
numberArray[i] = numberArray[i] + 1;
}
}
int myCreateCUDAArray(int *tf_ptr){
// some arbitrary array length
int numberOfNumbers = 100;
// declare some arrays for storing numbers
int *numbers1, *numbers2;
numbers1 = tf_ptr;
// reserve (allocate) some working space for the numbers in device memory
cudaMallocManaged(&numbers1, sizeof(int)*numberOfNumbers);
cudaMallocManaged(&numbers2, sizeof(int)*numberOfNumbers);
// fill the input array with some numbers
for(int i=0;i<numberOfNumbers;i++)
{
numbers1[i] = i; // this will be manipulated by the CUDA device (GPU)
numbers2[i] = i; // this will be manipulated by the CPU (as any standard C program would do)
}
// tell the device (GPU) to do its magic
incrementArrayViaCUDAdevice<<<1, numberOfNumbers>>>(numbers1, numberOfNumbers);
// wait for the device to finish working
cudaDeviceSynchronize();
// compute the same function "normally" on the CPU
incrementArray(numbers2, numberOfNumbers);
// check if the GPU did the same as the CPU
bool workedCorrectly = true;
for(int i=0;i<numberOfNumbers;i++)
{
if (numbers1[i] != numbers2[i])
workedCorrectly = 0;
printf(" %d vs %d |",numbers1[i],numbers2[i]);
}
printf("\n");
if (workedCorrectly == 1)
printf("The device performed well!\n");
else
printf("Something went wrong. The output numbers are not what was to be expected...\n");
// free the space that has been used by our arrays so that
// other programs might use it
cudaFree(numbers1);
cudaFree(numbers2);
return 0;
}
int myCreateCUDAArray(int *tf_ptr);
// kernel to convert from OpenCV channel representation to channel-first
// see: https://docs.opencv.org/2.4/doc/tutorials/core/how_to_scan_images/how_to_scan_images.html#how-the-image-matrix-is-stored-in-the-memory
const int BLOCK_SIZE = 1024;
#include <cuda_runtime.h>
__global__ void channelFirstKernel(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
int offset = idx / channelsNum;
int channel = idx % channelsNum;
// what would the row be if we didn't have any padding
int row = idx / rowElems;
int col = idx % rowElems;
// actual element - skip padding
int sourceIdx = row * rowSize + col;
dest[channelSize * channel + offset] = ((float) source[sourceIdx]) * (2.0/255.0) - 1.0;
}
// we expect all memory to already reside on device so no need to allocate anything
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElems, int rowSize)
{
int nBlocks = (channelSize * channelsNum + BLOCK_SIZE - 1) / BLOCK_SIZE;
channelFirstKernel<<<nBlocks, BLOCK_SIZE>>>(source, dest, channelSize, channelsNum, rowElems, rowSize);
cudaDeviceSynchronize();
}
#ifndef TENSORRT_COMMON_H
#define TENSORRT_COMMON_H
#include "NvInfer.h"
#include "NvInferPlugin.h"
// ONNX is not supported in Windows
#ifndef _MSC_VER
#include "NvOnnxConfig.h"
#include "NvOnnxParser.h"
#endif
#include <algorithm>
#include <cassert>
#include <chrono>
#include <cmath>
#include <cstring>
#include <cuda_runtime_api.h>
#include <fstream>
#include <iostream>
#include <iomanip>
#include <iterator>
#include <map>
#include <memory>
#include <new>
#include <numeric>
#include <ratio>
#include <string>
#include <utility>
#include <vector>
using namespace std;
using namespace nvinfer1;
using namespace plugin;
#define CHECK_TRT(status) \
do \
{ \
auto ret = (status); \
if (ret != 0) \
{ \
std::cout << "Cuda failure: " << ret; \
abort(); \
} \
} while (0)
constexpr long double operator"" _GB(long double val)
{
return val * (1 << 30);
}
constexpr long double operator"" _MB(long double val) { return val * (1 << 20); }
constexpr long double operator"" _KB(long double val) { return val * (1 << 10); }
// These is necessary if we want to be able to write 1_GB instead of 1.0_GB.
// Since the return type is signed, -1_GB will work as expected.
constexpr long long int operator"" _GB(long long unsigned int val) { return val * (1 << 30); }
constexpr long long int operator"" _MB(long long unsigned int val) { return val * (1 << 20); }
constexpr long long int operator"" _KB(long long unsigned int val) { return val * (1 << 10); }
// Logger for TensorRT info/warning/errors
class Logger : public nvinfer1::ILogger
{
public:
Logger(Severity severity = Severity::kWARNING)
: reportableSeverity(severity)
{
}
void log(Severity severity, const char* msg) override
{
// suppress messages with severity enum value greater than the reportable
if (severity > reportableSeverity)
return;
switch (severity)
{
case Severity::kINTERNAL_ERROR: std::cerr << "INTERNAL_ERROR: "; break;
case Severity::kERROR: std::cerr << "ERROR: "; break;
case Severity::kWARNING: std::cerr << "WARNING: "; break;
case Severity::kINFO: std::cerr << "INFO: "; break;
default: std::cerr << "UNKNOWN: "; break;
}
std::cerr << msg << std::endl;
}
Severity reportableSeverity;
};
struct SimpleProfiler : public nvinfer1::IProfiler
{
struct Record
{
float time{0};
int count{0};
};
virtual void reportLayerTime(const char* layerName, float ms)
{
mProfile[layerName].count++;
mProfile[layerName].time += ms;
}
SimpleProfiler(
const char* name,
const std::vector<SimpleProfiler>& srcProfilers = std::vector<SimpleProfiler>())
: mName(name)
{
for (const auto& srcProfiler : srcProfilers)
{
for (const auto& rec : srcProfiler.mProfile)
{
auto it = mProfile.find(rec.first);
if (it == mProfile.end())
{
mProfile.insert(rec);
}
else
{
it->second.time += rec.second.time;
it->second.count += rec.second.count;
}
}
}
}
friend std::ostream& operator<<(std::ostream& out, const SimpleProfiler& value)
{
out << "========== " << value.mName << " profile ==========" << std::endl;
float totalTime = 0;
std::string layerNameStr = "TensorRT layer name";
int maxLayerNameLength = std::max(static_cast<int>(layerNameStr.size()), 70);
for (const auto& elem : value.mProfile)
{
totalTime += elem.second.time;
maxLayerNameLength = std::max(maxLayerNameLength, static_cast<int>(elem.first.size()));
}
auto old_settings = out.flags();
auto old_precision = out.precision();
// Output header
{
out << std::setw(maxLayerNameLength) << layerNameStr << " ";
out << std::setw(12) << "Runtime, "
<< "%"
<< " ";
out << std::setw(12) << "Invocations"
<< " ";
out << std::setw(12) << "Runtime, ms" << std::endl;
}
for (const auto& elem : value.mProfile)
{
out << std::setw(maxLayerNameLength) << elem.first << " ";
out << std::setw(12) << std::fixed << std::setprecision(1) << (elem.second.time * 100.0F / totalTime) << "%"
<< " ";
out << std::setw(12) << elem.second.count << " ";
out << std::setw(12) << std::fixed << std::setprecision(2) << elem.second.time << std::endl;
}
out.flags(old_settings);
out.precision(old_precision);
out << "========== " << value.mName << " total runtime = " << totalTime << " ms ==========" << std::endl;
return out;
}
private:
std::string mName;
std::map<std::string, Record> mProfile;
};
// Locate path to file, given its filename or filepath suffix and possible dirs it might lie in
// Function will also walk back MAX_DEPTH dirs from CWD to CHECK_TRT for such a file path
inline std::string locateFile(const std::string& filepathSuffix, const std::vector<std::string>& directories)
{
const int MAX_DEPTH{10};
bool found{false};
std::string filepath;
for (auto& dir : directories)
{
filepath = dir + filepathSuffix;
for (int i = 0; i < MAX_DEPTH && !found; i++)
{
std::ifstream CHECK_TRTFile(filepath);
found = CHECK_TRTFile.is_open();
if (found)
break;
filepath = "../" + filepath; // Try again in parent dir
}
if (found)
{
break;
}
filepath.clear();
}
if (filepath.empty())
{
std::string directoryList = std::accumulate(directories.begin() + 1, directories.end(), directories.front(),
[](const std::string& a, const std::string& b) { return a + "\n\t" + b; });
std::cout << "Could not find " << filepathSuffix << " in data directories:\n\t" << directoryList << std::endl;
exit(EXIT_FAILURE);
}
return filepath;
}
inline void readPGMFile(const std::string& fileName, uint8_t* buffer, int inH, int inW)
{
std::ifstream infile(fileName, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
std::string magic, h, w, max;
infile >> magic >> h >> w >> max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(buffer), inH * inW);
}
namespace samplesCommon
{
inline void* safeCudaMalloc(size_t memSize)
{
void* deviceMem;
CHECK_TRT(cudaMalloc(&deviceMem, memSize));
if (deviceMem == nullptr)
{
std::cerr << "Out of memory" << std::endl;
exit(1);
}
return deviceMem;
}
inline bool isDebug()
{
return (std::getenv("TENSORRT_DEBUG") ? true : false);
}
struct InferDeleter
{
template <typename T>
void operator()(T* obj) const
{
if (obj)
{
obj->destroy();
}
}
};
template <typename T>
inline std::shared_ptr<T> infer_object(T* obj)
{
if (!obj)
{
throw std::runtime_error("Failed to create object");
}
return std::shared_ptr<T>(obj, InferDeleter());
}
template <class Iter>
inline std::vector<size_t> argsort(Iter begin, Iter end, bool reverse = false)
{
std::vector<size_t> inds(end - begin);
std::iota(inds.begin(), inds.end(), 0);
if (reverse)
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i2] < begin[i1];
});
}
else
{
std::sort(inds.begin(), inds.end(), [&begin](size_t i1, size_t i2) {
return begin[i1] < begin[i2];
});
}
return inds;
}
inline bool readReferenceFile(const std::string& fileName, std::vector<std::string>& refVector)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR: readReferenceFile: Attempting to read from a file that is not open." << endl;
return false;
}
std::string line;
while (std::getline(infile, line))
{
if (line.empty())
continue;
refVector.push_back(line);
}
infile.close();
return true;
}
template <typename result_vector_t>
inline std::vector<std::string> classify(const vector<string>& refVector, const result_vector_t& output, const size_t topK)
{
auto inds = samplesCommon::argsort(output.cbegin(), output.cend(), true);
std::vector<std::string> result;
for (size_t k = 0; k < topK; ++k)
{
result.push_back(refVector[inds[k]]);
}
return result;
}
//...LG returns top K indices, not values.
template <typename T>
inline vector<size_t> topK(const vector<T> inp, const size_t k)
{
vector<size_t> result;
std::vector<size_t> inds = samplesCommon::argsort(inp.cbegin(), inp.cend(), true);
result.assign(inds.begin(), inds.begin() + k);
return result;
}
template <typename T>
inline bool readASCIIFile(const string& fileName, const size_t size, vector<T>& out)
{
std::ifstream infile(fileName);
if (!infile.is_open())
{
cout << "ERROR readASCIIFile: Attempting to read from a file that is not open." << endl;
return false;
}
out.clear();
out.reserve(size);
out.assign(std::istream_iterator<T>(infile), std::istream_iterator<T>());
infile.close();
return true;
}
template <typename T>
inline bool writeASCIIFile(const string& fileName, const vector<T>& in)
{
std::ofstream outfile(fileName);
if (!outfile.is_open())
{
cout << "ERROR: writeASCIIFile: Attempting to write to a file that is not open." << endl;
return false;
}
for (auto fn : in)
{
outfile << fn << " ";
}
outfile.close();
return true;
}
inline void print_version()
{
//... This can be only done after statically linking this support into parserONNX.library
#if 0
std::cout << "Parser built against:" << std::endl;
std::cout << " ONNX IR version: " << nvonnxparser::onnx_ir_version_string(onnx::IR_VERSION) << std::endl;
#endif
std::cout << " TensorRT version: "
<< NV_TENSORRT_MAJOR << "."
<< NV_TENSORRT_MINOR << "."
<< NV_TENSORRT_PATCH << "."
<< NV_TENSORRT_BUILD << std::endl;
}
inline string getFileType(const string& filepath)
{
return filepath.substr(filepath.find_last_of(".") + 1);
}
inline string toLower(const string& inp)
{
string out = inp;
std::transform(out.begin(), out.end(), out.begin(), ::tolower);
return out;
}
inline void enableDLA(IBuilder* b, int useDLACore)
{
if (useDLACore >= 0)
{
b->allowGPUFallback(true);
b->setFp16Mode(true);
b->setDefaultDeviceType(DeviceType::kDLA);
b->setDLACore(useDLACore);
}
}
inline int parseDLA(int argc, char** argv)
{
for (int i = 1; i < argc; i++)
{
std::string arg(argv[i]);
if (strncmp(argv[i], "--useDLACore=", 13) == 0)
return stoi(argv[i] + 13);
}
return -1;
}
inline unsigned int getElementSize(nvinfer1::DataType t)
{
switch (t)
{
case nvinfer1::DataType::kINT32: return 4;
case nvinfer1::DataType::kFLOAT: return 4;
case nvinfer1::DataType::kHALF: return 2;
case nvinfer1::DataType::kINT8: return 1;
}
throw std::runtime_error("Invalid DataType.");
return 0;
}
inline int64_t volume(const nvinfer1::Dims& d)
{
return std::accumulate(d.d, d.d + d.nbDims, 1, std::multiplies<int64_t>());
}
template <int C, int H, int W>
struct PPM
{
std::string magic, fileName;
int h, w, max;
uint8_t buffer[C * H * W];
};
struct BBox
{
float x1, y1, x2, y2;
};
template <int C, int H, int W>
inline void readPPMFile(const std::string& filename, samplesCommon::PPM<C, H, W>& ppm)
{
ppm.fileName = filename;
std::ifstream infile(filename, std::ifstream::binary);
assert(infile.is_open() && "Attempting to read from a file that is not open.");
infile >> ppm.magic >> ppm.w >> ppm.h >> ppm.max;
infile.seekg(1, infile.cur);
infile.read(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
template <int C, int H, int W>
inline void writePPMFileWithBBox(const std::string& filename, PPM<C, H, W>& ppm, const BBox& bbox)
{
std::ofstream outfile("./" + filename, std::ofstream::binary);
assert(!outfile.fail());
outfile << "P6"
<< "\n"
<< ppm.w << " " << ppm.h << "\n"
<< ppm.max << "\n";
auto round = [](float x) -> int { return int(std::floor(x + 0.5f)); };
const int x1 = std::min(std::max(0, round(int(bbox.x1))), W - 1);
const int x2 = std::min(std::max(0, round(int(bbox.x2))), W - 1);
const int y1 = std::min(std::max(0, round(int(bbox.y1))), H - 1);
const int y2 = std::min(std::max(0, round(int(bbox.y2))), H - 1);
for (int x = x1; x <= x2; ++x)
{
// bbox top border
ppm.buffer[(y1 * ppm.w + x) * 3] = 255;
ppm.buffer[(y1 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y1 * ppm.w + x) * 3 + 2] = 0;
// bbox bottom border
ppm.buffer[(y2 * ppm.w + x) * 3] = 255;
ppm.buffer[(y2 * ppm.w + x) * 3 + 1] = 0;
ppm.buffer[(y2 * ppm.w + x) * 3 + 2] = 0;
}
for (int y = y1; y <= y2; ++y)
{
// bbox left border
ppm.buffer[(y * ppm.w + x1) * 3] = 255;
ppm.buffer[(y * ppm.w + x1) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x1) * 3 + 2] = 0;
// bbox right border
ppm.buffer[(y * ppm.w + x2) * 3] = 255;
ppm.buffer[(y * ppm.w + x2) * 3 + 1] = 0;
ppm.buffer[(y * ppm.w + x2) * 3 + 2] = 0;
}
outfile.write(reinterpret_cast<char*>(ppm.buffer), ppm.w * ppm.h * 3);
}
class TimerBase
{
public:
virtual void start() {}
virtual void stop() {}
float microseconds() const noexcept { return mMs * 1000.f; }
float milliseconds() const noexcept { return mMs; }
float seconds() const noexcept { return mMs / 1000.f; }
void reset() noexcept { mMs = 0.f; }
protected:
float mMs{0.0f};
};
class GpuTimer : public TimerBase
{
public:
GpuTimer(cudaStream_t stream)
: mStream(stream)
{
CHECK_TRT(cudaEventCreate(&mStart));
CHECK_TRT(cudaEventCreate(&mStop));
}
~GpuTimer()
{
CHECK_TRT(cudaEventDestroy(mStart));
CHECK_TRT(cudaEventDestroy(mStop));
}
void start() { CHECK_TRT(cudaEventRecord(mStart, mStream)); }
void stop()
{
CHECK_TRT(cudaEventRecord(mStop, mStream));
float ms{0.0f};
CHECK_TRT(cudaEventSynchronize(mStop));
CHECK_TRT(cudaEventElapsedTime(&ms, mStart, mStop));
mMs += ms;
}
private:
cudaEvent_t mStart, mStop;
cudaStream_t mStream;
}; // class GpuTimer
template <typename Clock>
class CpuTimer : public TimerBase
{
public:
using clock_type = Clock;
void start() { mStart = Clock::now(); }
void stop()
{
mStop = Clock::now();
mMs += std::chrono::duration<float, std::milli>{mStop - mStart}.count();
}
private:
std::chrono::time_point<Clock> mStart, mStop;
}; // class CpuTimer
using PreciseCpuTimer = CpuTimer<std::chrono::high_resolution_clock>;
} // namespace samplesCommon
#endif // TENSORRT_COMMON_H
cd ~/git/tensorflow
sudo mkdir /usr/local/tensorflow
sudo mkdir /usr/local/tensorflow/include
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/Eigen /usr/local/tensorflow/include/
sudo cp -r tensorflow/contrib/makefile/downloads/eigen/unsupported /usr/local/tensorflow/include/
sudo cp tensorflow/contrib/makefile/downloads/nsync/public/* /usr/local/tensorflow/include/
sudo cp -r bazel-genfiles/tensorflow /usr/local/tensorflow/include/
sudo cp -r tensorflow/cc /usr/local/tensorflow/include/tensorflow
sudo cp -r tensorflow/core /usr/local/tensorflow/include/tensorflow
sudo mkdir /usr/local/tensorflow/include/third_party
sudo cp -r third_party/eigen3 /usr/local/tensorflow/include/third_party/
sudo mkdir /usr/local/tensorflow/lib
sudo cp bazel-bin/tensorflow/libtensorflow_*.so /usr/local/tensorflow/lib
\ No newline at end of file
FROM fierval/tensorrt:19.02-py3
# nvcuvid
ADD nvcuvid/* /usr/local/cuda/targets/x86_64-linux/include/
# opencv
RUN apt-get update
RUN apt-get install -y git libgtk2.0-dev curl pkg-config autoconf automake libtool libavcodec-dev \
libavformat-dev libswscale-dev python-dev python-numpy libtbb2 libtbb-dev \
libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev unzip libcurl4-gnutls-dev zlib1g-dev
RUN apt-get install -y wget
RUN apt-get install -y vim
## CMAKE
ADD https://cmake.org/files/v3.13/cmake-3.13.0.tar.gz /
RUN tar xzvf /cmake-3.13.0.tar.gz -C / \
&& cd /cmake-3.13.0 \
&& ./bootstrap \
&& make -j15 \
&& make install
# Second: get and build OpenCV 3.3.1
#
ADD https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz /
RUN tar xzvf /protobuf-cpp-3.6.1.tar.gz -C /
RUN cd /protobuf-3.6.1 \
&& ./configure \
&& make -j15 \
&& make install \
&& ldconfig
ADD https://github.com/opencv/opencv_contrib/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
ADD https://github.com/opencv/opencv/archive/3.3.1.zip /
RUN unzip -o /3.3.1.zip
RUN cd /workspace/opencv-3.3.1 \
&& mkdir build \
&& cd build \
&& cmake -DBUILD_TIFF=ON \
-DBUILD_opencv_java=OFF \
-DBUILD_SHARED_LIBS=OFF \
-DWITH_CUDA=ON \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_TESTS=OFF \
-DBUILD_opencv_codacodec=ON \
# -DENABLE_FAST_MATH=1 \
# -DCUDA_FAST_MATH=1 \
-DWITH_CUBLAS=1 \
-DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
-DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-3.3.1/modules/ \
##
-DCUDA_ARCH_BIN='7.0' \
-DCUDA_ARCH_PTX="" \
##
## AVX in dispatch because not all machines have it
-DCPU_DISPATCH=AVX,AVX2 \
-DENABLE_PRECOMPILED_HEADERS=OFF \
-DWITH_OPENGL=OFF \
-DWITH_OPENCL=OFF \
-DWITH_QT=OFF \
-DWITH_NVCUVID=ON \
-DWITH_IPP=ON \
-DWITH_TBB=ON \
-DFORCE_VTK=ON \
-DWITH_EIGEN=ON \
-DWITH_V4L=ON \
-DWITH_XINE=ON \
-DWITH_GDAL=ON \
-DWITH_1394=OFF \
-DWITH_FFMPEG=OFF \
-DBUILD_PROTOBUF=ON \
-DBUILD_TESTS=OFF \
-DBUILD_PERF_TESTS=OFF \
-DBUILD_opencv_xfeatures2d=OFF \
-DCMAKE_BUILD_TYPE=RELEASE \
-DCMAKE_INSTALL_PREFIX=/usr/local \
.. \
&& make -j15 \
&& make install \
&& rm /3.3.1.zip \
&& rm /cmake-3.13.0.tar.gz
RUN mkdir fast_od
RUN mkdir /home/boris
# tensorflow libraries
ADD tensorflow.tar /
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __cuda_h__
#define __cuda_h__
/**
* CUDA API version support
*/
#include "dynlink_cuda_cuda.h"
#endif //__cuda_h__
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __cuda_cuda_h__
#define __cuda_cuda_h__
#include <stdlib.h>
#ifndef __CUDA_API_VERSION
#define __CUDA_API_VERSION 4000
#endif
/**
* \defgroup CUDA_DRIVER CUDA Driver API
*
* This section describes the low-level CUDA driver application programming
* interface.
*
* @{
*/
/**
* \defgroup CUDA_TYPES Data types used by CUDA driver
* @{
*/
/**
* CUDA API version number
*/
#define CUDA_VERSION 4000 /* 4.0 */
#ifdef __cplusplus
extern "C" {
#endif
/**
* CUDA device pointer
*/
#if __CUDA_API_VERSION >= 3020
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
typedef unsigned long long CUdeviceptr;
#else
typedef unsigned int CUdeviceptr;
#endif
#endif /* __CUDA_API_VERSION >= 3020 */
typedef int CUdevice; /**< CUDA device */
typedef struct CUctx_st *CUcontext; /**< CUDA context */
typedef struct CUmod_st *CUmodule; /**< CUDA module */
typedef struct CUfunc_st *CUfunction; /**< CUDA function */
typedef struct CUarray_st *CUarray; /**< CUDA array */
typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */
typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */
typedef struct CUevent_st *CUevent; /**< CUDA event */
typedef struct CUstream_st *CUstream; /**< CUDA stream */
typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
#if 0
typedef struct CUuuid_st /**< CUDA definition of UUID */
{
char bytes[16];
} CUuuid;
#endif
/**
* Context creation flags
*/
typedef enum CUctx_flags_enum
{
CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */
CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */
#if __CUDA_API_VERSION < 4000
CU_CTX_SCHED_MASK = 0x03,
CU_CTX_FLAGS_MASK = 0x1f
#else
CU_CTX_SCHED_MASK = 0x07,
CU_CTX_PRIMARY = 0x20, /**< Initialize and return the primary context */
CU_CTX_FLAGS_MASK = 0x3f
#endif
} CUctx_flags;
/**
* Event creation flags
*/
typedef enum CUevent_flags_enum
{
CU_EVENT_DEFAULT = 0, /**< Default event flag */
CU_EVENT_BLOCKING_SYNC = 1, /**< Event uses blocking synchronization */
CU_EVENT_DISABLE_TIMING = 2 /**< Event will not record timing data */
} CUevent_flags;
/**
* Array formats
*/
typedef enum CUarray_format_enum
{
CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */
CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */
CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */
CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */
} CUarray_format;
/**
* Texture reference addressing modes
*/
typedef enum CUaddress_mode_enum
{
CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */
CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */
CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */
} CUaddress_mode;
/**
* Texture reference filtering modes
*/
typedef enum CUfilter_mode_enum
{
CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */
CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */
} CUfilter_mode;
/**
* Device properties
*/
typedef enum CUdevice_attribute_enum
{
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */
CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */
CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */
CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */
CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */
CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Peak clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */
CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */
CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */
CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */
CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */
CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Maximum texture array width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Maximum texture array height */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */
CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */
CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */
CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */
CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35 /**< Device is using TCC driver model */
#if __CUDA_API_VERSION >= 4000
, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */
CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */
CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */
CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */
CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device uses shares a unified address space with the host */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */
CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43 /**< Maximum layers in a 1D layered texture */
#endif
} CUdevice_attribute;
/**
* Legacy device properties
*/
typedef struct CUdevprop_st
{
int maxThreadsPerBlock; /**< Maximum number of threads per block */
int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
int sharedMemPerBlock; /**< Shared memory available per block in bytes */
int totalConstantMemory; /**< Constant memory available on device in bytes */
int SIMDWidth; /**< Warp size in threads */
int memPitch; /**< Maximum pitch in bytes allowed by memory copies */
int regsPerBlock; /**< 32-bit registers available per block */
int clockRate; /**< Clock frequency in kilohertz */
int textureAlign; /**< Alignment requirement for textures */
} CUdevprop;
/**
* Function properties
*/
typedef enum CUfunction_attribute_enum
{
/**
* The maximum number of threads per block, beyond which a launch of the
* function would fail. This number depends on both the function and the
* device on which the function is currently loaded.
*/
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
/**
* The size in bytes of statically-allocated shared memory required by
* this function. This does not include dynamically-allocated shared
* memory requested by the user at runtime.
*/
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
/**
* The size in bytes of user-allocated constant memory required by this
* function.
*/
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
/**
* The size in bytes of local memory used by each thread of this function.
*/
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
/**
* The number of registers used by each thread of this function.
*/
CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
/**
* The PTX virtual architecture version for which the function was
* compiled. This value is the major PTX version * 10 + the minor PTX
* version, so a PTX version 1.3 function would return the value 13.
* Note that this may return the undefined value of 0 for cubins
* compiled prior to CUDA 3.0.
*/
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
/**
* The binary architecture version for which the function was compiled.
* This value is the major binary version * 10 + the minor binary version,
* so a binary version 1.3 function would return the value 13. Note that
* this will return a value of 10 for legacy cubins that do not have a
* properly-encoded binary architecture version.
*/
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
CU_FUNC_ATTRIBUTE_MAX
} CUfunction_attribute;
/**
* Function cache configurations
*/
typedef enum CUfunc_cache_enum
{
CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */
CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */
CU_FUNC_CACHE_PREFER_L1 = 0x02 /**< prefer larger L1 cache and smaller shared memory */
} CUfunc_cache;
/**
* Memory types
*/
typedef enum CUmemorytype_enum
{
CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */
CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */
CU_MEMORYTYPE_ARRAY = 0x03 /**< Array memory */
#if __CUDA_API_VERSION >= 4000
, CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */
#endif
} CUmemorytype;
/**
* Compute Modes
*/
typedef enum CUcomputemode_enum
{
CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */
CU_COMPUTEMODE_EXCLUSIVE = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
CU_COMPUTEMODE_PROHIBITED = 2 /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
#if __CUDA_API_VERSION >= 4000
, CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
#endif
} CUcomputemode;
/**
* Online compiler options
*/
typedef enum CUjit_option_enum
{
/**
* Max number of registers that a thread may use.\n
* Option type: unsigned int
*/
CU_JIT_MAX_REGISTERS = 0,
/**
* IN: Specifies minimum number of threads per block to target compilation
* for\n
* OUT: Returns the number of threads the compiler actually targeted.
* This restricts the resource utilization fo the compiler (e.g. max
* registers) such that a block with the given number of threads should be
* able to launch based on register limitations. Note, this option does not
* currently take into account any other resource limitations, such as
* shared memory utilization.\n
* Option type: unsigned int
*/
CU_JIT_THREADS_PER_BLOCK,
/**
* Returns a float value in the option of the wall clock time, in
* milliseconds, spent creating the cubin\n
* Option type: float
*/
CU_JIT_WALL_TIME,
/**
* Pointer to a buffer in which to print any log messsages from PTXAS
* that are informational in nature (the buffer size is specified via
* option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
* Option type: char*
*/
CU_JIT_INFO_LOG_BUFFER,
/**
* IN: Log buffer size in bytes. Log messages will be capped at this size
* (including null terminator)\n
* OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
/**
* Pointer to a buffer in which to print any log messages from PTXAS that
* reflect errors (the buffer size is specified via option
* ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
* Option type: char*
*/
CU_JIT_ERROR_LOG_BUFFER,
/**
* IN: Log buffer size in bytes. Log messages will be capped at this size
* (including null terminator)\n
* OUT: Amount of log buffer filled with messages\n
* Option type: unsigned int
*/
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
/**
* Level of optimizations to apply to generated code (0 - 4), with 4
* being the default and highest level of optimizations.\n
* Option type: unsigned int
*/
CU_JIT_OPTIMIZATION_LEVEL,
/**
* No option value required. Determines the target based on the current
* attached context (default)\n
* Option type: No option value needed
*/
CU_JIT_TARGET_FROM_CUCONTEXT,
/**
* Target is chosen based on supplied ::CUjit_target_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_target_enum
*/
CU_JIT_TARGET,
/**
* Specifies choice of fallback strategy if matching cubin is not found.
* Choice is based on supplied ::CUjit_fallback_enum.\n
* Option type: unsigned int for enumerated type ::CUjit_fallback_enum
*/
CU_JIT_FALLBACK_STRATEGY
} CUjit_option;
/**
* Online compilation targets
*/
typedef enum CUjit_target_enum
{
CU_TARGET_COMPUTE_10 = 0, /**< Compute device class 1.0 */
CU_TARGET_COMPUTE_11, /**< Compute device class 1.1 */
CU_TARGET_COMPUTE_12, /**< Compute device class 1.2 */
CU_TARGET_COMPUTE_13, /**< Compute device class 1.3 */
CU_TARGET_COMPUTE_20, /**< Compute device class 2.0 */
CU_TARGET_COMPUTE_21 /**< Compute device class 2.1 */
} CUjit_target;
/**
* Cubin matching fallback strategies
*/
typedef enum CUjit_fallback_enum
{
CU_PREFER_PTX = 0, /**< Prefer to compile ptx */
CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code */
} CUjit_fallback;
/**
* Flags to register a graphics resource
*/
typedef enum CUgraphicsRegisterFlags_enum
{
CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00,
CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01,
CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04
} CUgraphicsRegisterFlags;
/**
* Flags for mapping and unmapping interop resources
*/
typedef enum CUgraphicsMapResourceFlags_enum
{
CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01,
CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
} CUgraphicsMapResourceFlags;
/**
* Array indices for cube faces
*/
typedef enum CUarray_cubemap_face_enum
{
CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */
CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */
CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */
CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */
CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */
CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */
} CUarray_cubemap_face;
/**
* Limits
*/
typedef enum CUlimit_enum
{
CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */
CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */
CU_LIMIT_MALLOC_HEAP_SIZE = 0x02 /**< GPU malloc heap size */
} CUlimit;
/**
* Error codes
*/
typedef enum cudaError_enum
{
/**
* The API call returned with no errors. In the case of query calls, this
* can also mean that the operation being queried is complete (see
* ::cuEventQuery() and ::cuStreamQuery()).
*/
CUDA_SUCCESS = 0,
/**
* This indicates that one or more of the parameters passed to the API call
* is not within an acceptable range of values.
*/
CUDA_ERROR_INVALID_VALUE = 1,
/**
* The API call failed because it was unable to allocate enough memory to
* perform the requested operation.
*/
CUDA_ERROR_OUT_OF_MEMORY = 2,
/**
* This indicates that the CUDA driver has not been initialized with
* ::cuInit() or that initialization has failed.
*/
CUDA_ERROR_NOT_INITIALIZED = 3,
/**
* This indicates that the CUDA driver is in the process of shutting down.
*/
CUDA_ERROR_DEINITIALIZED = 4,
/**
* This indicates profiling APIs are called while application is running
* in visual profiler mode.
*/
CUDA_ERROR_PROFILER_DISABLED = 5,
/**
* This indicates profiling has not been initialized for this context.
* Call cuProfilerInitialize() to resolve this.
*/
CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6,
/**
* This indicates profiler has already been started and probably
* cuProfilerStart() is incorrectly called.
*/
CUDA_ERROR_PROFILER_ALREADY_STARTED = 7,
/**
* This indicates profiler has already been stopped and probably
* cuProfilerStop() is incorrectly called.
*/
CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8,
/**
* This indicates that no CUDA-capable devices were detected by the installed
* CUDA driver.
*/
CUDA_ERROR_NO_DEVICE = 100,
/**
* This indicates that the device ordinal supplied by the user does not
* correspond to a valid CUDA device.
*/
CUDA_ERROR_INVALID_DEVICE = 101,
/**
* This indicates that the device kernel image is invalid. This can also
* indicate an invalid CUDA module.
*/
CUDA_ERROR_INVALID_IMAGE = 200,
/**
* This most frequently indicates that there is no context bound to the
* current thread. This can also be returned if the context passed to an
* API call is not a valid handle (such as a context that has had
* ::cuCtxDestroy() invoked on it). This can also be returned if a user
* mixes different API versions (i.e. 3010 context with 3020 API calls).
* See ::cuCtxGetApiVersion() for more details.
*/
CUDA_ERROR_INVALID_CONTEXT = 201,
/**
* This indicated that the context being supplied as a parameter to the
* API call was already the active context.
* \deprecated
* This error return is deprecated as of CUDA 3.2. It is no longer an
* error to attempt to push the active context via ::cuCtxPushCurrent().
*/
CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,
/**
* This indicates that a map or register operation has failed.
*/
CUDA_ERROR_MAP_FAILED = 205,
/**
* This indicates that an unmap or unregister operation has failed.
*/
CUDA_ERROR_UNMAP_FAILED = 206,
/**
* This indicates that the specified array is currently mapped and thus
* cannot be destroyed.
*/
CUDA_ERROR_ARRAY_IS_MAPPED = 207,
/**
* This indicates that the resource is already mapped.
*/
CUDA_ERROR_ALREADY_MAPPED = 208,
/**
* This indicates that there is no kernel image available that is suitable
* for the device. This can occur when a user specifies code generation
* options for a particular CUDA source file that do not include the
* corresponding device configuration.
*/
CUDA_ERROR_NO_BINARY_FOR_GPU = 209,
/**
* This indicates that a resource has already been acquired.
*/
CUDA_ERROR_ALREADY_ACQUIRED = 210,
/**
* This indicates that a resource is not mapped.
*/
CUDA_ERROR_NOT_MAPPED = 211,
/**
* This indicates that a mapped resource is not available for access as an
* array.
*/
CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212,
/**
* This indicates that a mapped resource is not available for access as a
* pointer.
*/
CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,
/**
* This indicates that an uncorrectable ECC error was detected during
* execution.
*/
CUDA_ERROR_ECC_UNCORRECTABLE = 214,
/**
* This indicates that the ::CUlimit passed to the API call is not
* supported by the active device.
*/
CUDA_ERROR_UNSUPPORTED_LIMIT = 215,
/**
* This indicates that the ::CUcontext passed to the API call can
* only be bound to a single CPU thread at a time but is already
* bound to a CPU thread.
*/
CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216,
/**
* This indicates that the device kernel source is invalid.
*/
CUDA_ERROR_INVALID_SOURCE = 300,
/**
* This indicates that the file specified was not found.
*/
CUDA_ERROR_FILE_NOT_FOUND = 301,
/**
* This indicates that a link to a shared object failed to resolve.
*/
CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
/**
* This indicates that initialization of a shared object failed.
*/
CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303,
/**
* This indicates that an OS call failed.
*/
CUDA_ERROR_OPERATING_SYSTEM = 304,
/**
* This indicates that a resource handle passed to the API call was not
* valid. Resource handles are opaque types like ::CUstream and ::CUevent.
*/
CUDA_ERROR_INVALID_HANDLE = 400,
/**
* This indicates that a named symbol was not found. Examples of symbols
* are global/constant variable names, texture names, and surface names.
*/
CUDA_ERROR_NOT_FOUND = 500,
/**
* This indicates that asynchronous operations issued previously have not
* completed yet. This result is not actually an error, but must be indicated
* differently than ::CUDA_SUCCESS (which indicates completion). Calls that
* may return this value include ::cuEventQuery() and ::cuStreamQuery().
*/
CUDA_ERROR_NOT_READY = 600,
/**
* An exception occurred on the device while executing a kernel. Common
* causes include dereferencing an invalid device pointer and accessing
* out of bounds shared memory. The context cannot be used, so it must
* be destroyed (and a new one should be created). All existing device
* memory allocations from this context are invalid and must be
* reconstructed if the program is to continue using CUDA.
*/
CUDA_ERROR_LAUNCH_FAILED = 700,
/**
* This indicates that a launch did not occur because it did not have
* appropriate resources. This error usually indicates that the user has
* attempted to pass too many arguments to the device kernel, or the
* kernel launch specifies too many threads for the kernel's register
* count. Passing arguments of the wrong size (i.e. a 64-bit pointer
* when a 32-bit int is expected) is equivalent to passing too many
* arguments and can also result in this error.
*/
CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,
/**
* This indicates that the device kernel took too long to execute. This can
* only occur if timeouts are enabled - see the device attribute
* ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
* context cannot be used (and must be destroyed similar to
* ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
* this context are invalid and must be reconstructed if the program is to
* continue using CUDA.
*/
CUDA_ERROR_LAUNCH_TIMEOUT = 702,
/**
* This error indicates a kernel launch that uses an incompatible texturing
* mode.
*/
CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703,
/**
* This error indicates that a call to ::cuCtxEnablePeerAccess() is
* trying to re-enable peer access to a context which has already
* had peer access to it enabled.
*/
CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
/**
* This error indicates that a call to ::cuMemPeerRegister is trying to
* register memory from a context which has not had peer access
* enabled yet via ::cuCtxEnablePeerAccess(), or that
* ::cuCtxDisablePeerAccess() is trying to disable peer access
* which has not been enabled yet.
*/
CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705,
/**
* This error indicates that a call to ::cuMemPeerRegister is trying to
* register already-registered memory.
*/
CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,
/**
* This error indicates that a call to ::cuMemPeerUnregister is trying to
* unregister memory that has not been registered.
*/
CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED = 707,
/**
* This error indicates that ::cuCtxCreate was called with the flag
* ::CU_CTX_PRIMARY on a device which already has initialized its
* primary context.
*/
CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708,
/**
* This error indicates that the context current to the calling thread
* has been destroyed using ::cuCtxDestroy, or is a primary context which
* has not yet been initialized.
*/
CUDA_ERROR_CONTEXT_IS_DESTROYED = 709,
/**
* This indicates that an unknown internal error has occurred.
*/
CUDA_ERROR_UNKNOWN = 999
} CUresult;
#if __CUDA_API_VERSION >= 4000
/**
* If set, host memory is portable between CUDA contexts.
* Flag for ::cuMemHostAlloc()
*/
#define CU_MEMHOSTALLOC_PORTABLE 0x01
/**
* If set, host memory is mapped into CUDA address space and
* ::cuMemHostGetDevicePointer() may be called on the host pointer.
* Flag for ::cuMemHostAlloc()
*/
#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
/**
* If set, host memory is allocated as write-combined - fast to write,
* faster to DMA, slow to read except via SSE4 streaming load instruction
* (MOVNTDQA).
* Flag for ::cuMemHostAlloc()
*/
#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
/**
* If set, host memory is portable between CUDA contexts.
* Flag for ::cuMemHostRegister()
*/
#define CU_MEMHOSTREGISTER_PORTABLE 0x01
/**
* If set, host memory is mapped into CUDA address space and
* ::cuMemHostGetDevicePointer() may be called on the host pointer.
* Flag for ::cuMemHostRegister()
*/
#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
/**
* If set, peer memory is mapped into CUDA address space and
* ::cuMemPeerGetDevicePointer() may be called on the host pointer.
* Flag for ::cuMemPeerRegister()
*/
#define CU_MEMPEERREGISTER_DEVICEMAP 0x02
#endif
#if __CUDA_API_VERSION >= 3020
/**
* 2D memory copy parameters
*/
typedef struct CUDA_MEMCPY2D_st
{
size_t srcXInBytes; /**< Source X in bytes */
size_t srcY; /**< Source Y */
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
const void *srcHost; /**< Source host pointer */
CUdeviceptr srcDevice; /**< Source device pointer */
CUarray srcArray; /**< Source array reference */
size_t srcPitch; /**< Source pitch (ignored when src is array) */
size_t dstXInBytes; /**< Destination X in bytes */
size_t dstY; /**< Destination Y */
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
void *dstHost; /**< Destination host pointer */
CUdeviceptr dstDevice; /**< Destination device pointer */
CUarray dstArray; /**< Destination array reference */
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
size_t WidthInBytes; /**< Width of 2D memory copy in bytes */
size_t Height; /**< Height of 2D memory copy */
} CUDA_MEMCPY2D;
/**
* 3D memory copy parameters
*/
typedef struct CUDA_MEMCPY3D_st
{
size_t srcXInBytes; /**< Source X in bytes */
size_t srcY; /**< Source Y */
size_t srcZ; /**< Source Z */
size_t srcLOD; /**< Source LOD */
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
const void *srcHost; /**< Source host pointer */
CUdeviceptr srcDevice; /**< Source device pointer */
CUarray srcArray; /**< Source array reference */
void *reserved0; /**< Must be NULL */
size_t srcPitch; /**< Source pitch (ignored when src is array) */
size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
size_t dstXInBytes; /**< Destination X in bytes */
size_t dstY; /**< Destination Y */
size_t dstZ; /**< Destination Z */
size_t dstLOD; /**< Destination LOD */
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
void *dstHost; /**< Destination host pointer */
CUdeviceptr dstDevice; /**< Destination device pointer */
CUarray dstArray; /**< Destination array reference */
void *reserved1; /**< Must be NULL */
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
size_t Height; /**< Height of 3D memory copy */
size_t Depth; /**< Depth of 3D memory copy */
} CUDA_MEMCPY3D;
/**
* 3D memory cross-context copy parameters
*/
typedef struct CUDA_MEMCPY3D_PEER_st
{
size_t srcXInBytes; /**< Source X in bytes */
size_t srcY; /**< Source Y */
size_t srcZ; /**< Source Z */
size_t srcLOD; /**< Source LOD */
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
const void *srcHost; /**< Source host pointer */
CUdeviceptr srcDevice; /**< Source device pointer */
CUarray srcArray; /**< Source array reference */
CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
size_t srcPitch; /**< Source pitch (ignored when src is array) */
size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
size_t dstXInBytes; /**< Destination X in bytes */
size_t dstY; /**< Destination Y */
size_t dstZ; /**< Destination Z */
size_t dstLOD; /**< Destination LOD */
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
void *dstHost; /**< Destination host pointer */
CUdeviceptr dstDevice; /**< Destination device pointer */
CUarray dstArray; /**< Destination array reference */
CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
size_t dstPitch; /**< Destination pitch (ignored when dst is array) */
size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
size_t WidthInBytes; /**< Width of 3D memory copy in bytes */
size_t Height; /**< Height of 3D memory copy */
size_t Depth; /**< Depth of 3D memory copy */
} CUDA_MEMCPY3D_PEER;
/**
* Array descriptor
*/
typedef struct CUDA_ARRAY_DESCRIPTOR_st
{
size_t Width; /**< Width of array */
size_t Height; /**< Height of array */
CUarray_format Format; /**< Array format */
unsigned int NumChannels; /**< Channels per array element */
} CUDA_ARRAY_DESCRIPTOR;
/**
* 3D array descriptor
*/
typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
{
size_t Width; /**< Width of 3D array */
size_t Height; /**< Height of 3D array */
size_t Depth; /**< Depth of 3D array */
CUarray_format Format; /**< Array format */
unsigned int NumChannels; /**< Channels per array element */
unsigned int Flags; /**< Flags */
} CUDA_ARRAY3D_DESCRIPTOR;
#endif /* __CUDA_API_VERSION >= 3020 */
/**
* If set, the CUDA array is a collection of layers, where each layer is either a 1D
* or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
* of layers, not the depth of a 3D array.
*/
#define CUDA_ARRAY3D_LAYERED 0x01
/**
* Deprecated, use CUDA_ARRAY3D_LAYERED
*/
#define CUDA_ARRAY3D_2DARRAY 0x01
/**
* This flag must be set in order to bind a surface reference
* to the CUDA array
*/
#define CUDA_ARRAY3D_SURFACE_LDST 0x02
/**
* Override the texref format with a format inferred from the array.
* Flag for ::cuTexRefSetArray()
*/
#define CU_TRSA_OVERRIDE_FORMAT 0x01
/**
* Read the texture as integers rather than promoting the values to floats
* in the range [0,1].
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_READ_AS_INTEGER 0x01
/**
* Use normalized texture coordinates in the range [0,1) instead of [0,dim).
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_NORMALIZED_COORDINATES 0x02
/**
* Perform sRGB->linear conversion during texture read.
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_SRGB 0x10
/**
* End of array terminator for the \p extra parameter to
* ::cuLaunchKernel
*/
#define CU_LAUNCH_PARAM_END ((void*)0x00)
/**
* Indicator that the next value in the \p extra parameter to
* ::cuLaunchKernel will be a pointer to a buffer containing all kernel
* parameters used for launching kernel \p f. This buffer needs to
* honor all alignment/padding requirements of the individual parameters.
* If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
* \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
* effect.
*/
#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
/**
* Indicator that the next value in the \p extra parameter to
* ::cuLaunchKernel will be a pointer to a size_t which contains the
* size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
* It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
* in the \p extra array if the value associated with
* ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
*/
#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
/**
* For texture references loaded into the module, use default texunit from
* texture reference.
*/
#define CU_PARAM_TR_DEFAULT -1
/**
* CUDA API made obselete at API version 3020
*/
#if defined(__CUDA_API_VERSION_INTERNAL)
#define CUdeviceptr CUdeviceptr_v1
#define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
#define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
#define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
#define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
#define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
#define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
#define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
#define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
#endif /* CUDA_FORCE_LEGACY32_INTERNAL */
#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
typedef unsigned int CUdeviceptr;
typedef struct CUDA_MEMCPY2D_st
{
unsigned int srcXInBytes; /**< Source X in bytes */
unsigned int srcY; /**< Source Y */
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
const void *srcHost; /**< Source host pointer */
CUdeviceptr srcDevice; /**< Source device pointer */
CUarray srcArray; /**< Source array reference */
unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
unsigned int dstXInBytes; /**< Destination X in bytes */
unsigned int dstY; /**< Destination Y */
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
void *dstHost; /**< Destination host pointer */
CUdeviceptr dstDevice; /**< Destination device pointer */
CUarray dstArray; /**< Destination array reference */
unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */
unsigned int Height; /**< Height of 2D memory copy */
} CUDA_MEMCPY2D;
typedef struct CUDA_MEMCPY3D_st
{
unsigned int srcXInBytes; /**< Source X in bytes */
unsigned int srcY; /**< Source Y */
unsigned int srcZ; /**< Source Z */
unsigned int srcLOD; /**< Source LOD */
CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
const void *srcHost; /**< Source host pointer */
CUdeviceptr srcDevice; /**< Source device pointer */
CUarray srcArray; /**< Source array reference */
void *reserved0; /**< Must be NULL */
unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */
unsigned int dstXInBytes; /**< Destination X in bytes */
unsigned int dstY; /**< Destination Y */
unsigned int dstZ; /**< Destination Z */
unsigned int dstLOD; /**< Destination LOD */
CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
void *dstHost; /**< Destination host pointer */
CUdeviceptr dstDevice; /**< Destination device pointer */
CUarray dstArray; /**< Destination array reference */
void *reserved1; /**< Must be NULL */
unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */
unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */
unsigned int Height; /**< Height of 3D memory copy */
unsigned int Depth; /**< Depth of 3D memory copy */
} CUDA_MEMCPY3D;
typedef struct CUDA_ARRAY_DESCRIPTOR_st
{
unsigned int Width; /**< Width of array */
unsigned int Height; /**< Height of array */
CUarray_format Format; /**< Array format */
unsigned int NumChannels; /**< Channels per array element */
} CUDA_ARRAY_DESCRIPTOR;
typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
{
unsigned int Width; /**< Width of 3D array */
unsigned int Height; /**< Height of 3D array */
unsigned int Depth; /**< Depth of 3D array */
CUarray_format Format; /**< Array format */
unsigned int NumChannels; /**< Channels per array element */
unsigned int Flags; /**< Flags */
} CUDA_ARRAY3D_DESCRIPTOR;
#endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
/*
* If set, the CUDA array contains an array of 2D slices
* and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
* the number of slices, not the depth of a 3D array.
*/
#define CUDA_ARRAY3D_2DARRAY 0x01
/**
* This flag must be set in order to bind a surface reference
* to the CUDA array
*/
#define CUDA_ARRAY3D_SURFACE_LDST 0x02
/**
* Override the texref format with a format inferred from the array.
* Flag for ::cuTexRefSetArray()
*/
#define CU_TRSA_OVERRIDE_FORMAT 0x01
/**
* Read the texture as integers rather than promoting the values to floats
* in the range [0,1].
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_READ_AS_INTEGER 0x01
/**
* Use normalized texture coordinates in the range [0,1) instead of [0,dim).
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_NORMALIZED_COORDINATES 0x02
/**
* Perform sRGB->linear conversion during texture read.
* Flag for ::cuTexRefSetFlags()
*/
#define CU_TRSF_SRGB 0x10
/**
* For texture references loaded into the module, use default texunit from
* texture reference.
*/
#define CU_PARAM_TR_DEFAULT -1
/** @} */ /* END CUDA_TYPES */
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
#define CUDAAPI __stdcall
#else
#define CUDAAPI
#endif
/**
* \defgroup CUDA_INITIALIZE Initialization
*
* This section describes the initialization functions of the low-level CUDA
* driver application programming interface.
*
* @{
*/
/*********************************
** Initialization
*********************************/
typedef CUresult CUDAAPI tcuInit(unsigned int Flags);
/*********************************
** Driver Version Query
*********************************/
typedef CUresult CUDAAPI tcuDriverGetVersion(int *driverVersion);
/************************************
**
** Device management
**
***********************************/
typedef CUresult CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
typedef CUresult CUDAAPI tcuDeviceGetCount(int *count);
typedef CUresult CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
typedef CUresult CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
#else
typedef CUresult CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
#endif
typedef CUresult CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
typedef CUresult CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
/************************************
**
** Context management
**
***********************************/
typedef CUresult CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
typedef CUresult CUDAAPI tcuCtxDestroy(CUcontext ctx);
typedef CUresult CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
typedef CUresult CUDAAPI tcuCtxDetach(CUcontext ctx);
typedef CUresult CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
typedef CUresult CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
typedef CUresult CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
typedef CUresult CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
typedef CUresult CUDAAPI tcuCtxGetDevice(CUdevice *device);
typedef CUresult CUDAAPI tcuCtxSynchronize(void);
/************************************
**
** Module management
**
***********************************/
typedef CUresult CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
typedef CUresult CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
typedef CUresult CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
typedef CUresult CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
typedef CUresult CUDAAPI tcuModuleUnload(CUmodule hmod);
typedef CUresult CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
#else
typedef CUresult CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
#endif
typedef CUresult CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
typedef CUresult CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
/************************************
**
** Memory management
**
***********************************/
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
size_t *pPitch,
size_t WidthInBytes,
size_t Height,
// size of biggest r/w to be performed by kernels on this memory
// 4, 8 or 16 bytes
unsigned int ElementSizeBytes
);
#else
typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
unsigned int *pPitch,
unsigned int WidthInBytes,
unsigned int Height,
// size of biggest r/w to be performed by kernels on this memory
// 4, 8 or 16 bytes
unsigned int ElementSizeBytes
);
#endif
typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
#else
typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
#endif
typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
/************************************
**
** Synchronous Memcpy
**
** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
** but if host memory is involved, they wait until the copy is done before returning.
**
***********************************/
// 1D functions
#if __CUDA_API_VERSION >= 3020
// system <-> device memory
typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
// device <-> device memory
typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
// device <-> array memory
typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
// system <-> array memory
typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
// array <-> array memory
typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
#else
// system <-> device memory
typedef CUresult CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
typedef CUresult CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
// device <-> device memory
typedef CUresult CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
// device <-> array memory
typedef CUresult CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
typedef CUresult CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
// system <-> array memory
typedef CUresult CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
typedef CUresult CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
// array <-> array memory
typedef CUresult CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
#endif
// 2D memcpy
typedef CUresult CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
typedef CUresult CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
// 3D memcpy
typedef CUresult CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
/************************************
**
** Asynchronous Memcpy
**
** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
** memcpy's done with these functions execute in parallel with the CPU and, if
** the hardware is available, may execute in parallel with the GPU.
** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
**
***********************************/
// 1D functions
#if __CUDA_API_VERSION >= 3020
// system <-> device memory
typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
// device <-> device memory
typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
// system <-> array memory
typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
const void *srcHost, size_t ByteCount, CUstream hStream);
typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
size_t ByteCount, CUstream hStream);
#else
// system <-> device memory
typedef CUresult CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
// device <-> device memory
typedef CUresult CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
// system <-> array memory
typedef CUresult CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
const void *srcHost, unsigned int ByteCount, CUstream hStream);
typedef CUresult CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
unsigned int ByteCount, CUstream hStream);
#endif
// 2D memcpy
typedef CUresult CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
// 3D memcpy
typedef CUresult CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
/************************************
**
** Memset
**
***********************************/
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
#else
typedef CUresult CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
typedef CUresult CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
typedef CUresult CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
typedef CUresult CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
typedef CUresult CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
typedef CUresult CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
#endif
/************************************
**
** Function management
**
***********************************/
typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
unsigned int sharedMemBytes,
CUstream hStream, void **kernelParams, void **extra);
/************************************
**
** Array management
**
***********************************/
typedef CUresult CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
typedef CUresult CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
typedef CUresult CUDAAPI tcuArrayDestroy(CUarray hArray);
typedef CUresult CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
typedef CUresult CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
/************************************
**
** Texture reference management
**
***********************************/
typedef CUresult CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
typedef CUresult CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
typedef CUresult CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
#else
typedef CUresult CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
typedef CUresult CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
#endif
typedef CUresult CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
typedef CUresult CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
typedef CUresult CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
typedef CUresult CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
typedef CUresult CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
typedef CUresult CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
typedef CUresult CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
typedef CUresult CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
typedef CUresult CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
typedef CUresult CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
/************************************
**
** Surface reference management
**
***********************************/
typedef CUresult CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
typedef CUresult CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
/************************************
**
** Parameter management
**
***********************************/
typedef CUresult CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
typedef CUresult CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
typedef CUresult CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
typedef CUresult CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
typedef CUresult CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
/************************************
**
** Launch functions
**
***********************************/
typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
/************************************
**
** Events
**
***********************************/
typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
/************************************
**
** Streams
**
***********************************/
typedef CUresult CUDAAPI tcuStreamCreate(CUstream *phStream, unsigned int Flags);
typedef CUresult CUDAAPI tcuStreamQuery(CUstream hStream);
typedef CUresult CUDAAPI tcuStreamSynchronize(CUstream hStream);
typedef CUresult CUDAAPI tcuStreamDestroy(CUstream hStream);
/************************************
**
** Graphics interop
**
***********************************/
typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
#if __CUDA_API_VERSION >= 3020
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
#else
typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
#endif
typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
/************************************
**
** Export tables
**
***********************************/
typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
/************************************
**
** Limits
**
***********************************/
typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
extern tcuDriverGetVersion *cuDriverGetVersion;
extern tcuDeviceGet *cuDeviceGet;
extern tcuDeviceGetCount *cuDeviceGetCount;
extern tcuDeviceGetName *cuDeviceGetName;
extern tcuDeviceComputeCapability *cuDeviceComputeCapability;
extern tcuDeviceGetProperties *cuDeviceGetProperties;
extern tcuDeviceGetAttribute *cuDeviceGetAttribute;
extern tcuCtxDestroy *cuCtxDestroy;
extern tcuCtxAttach *cuCtxAttach;
extern tcuCtxDetach *cuCtxDetach;
extern tcuCtxPushCurrent *cuCtxPushCurrent;
extern tcuCtxPopCurrent *cuCtxPopCurrent;
extern tcuCtxSetCurrent *cuCtxSetCurrent;
extern tcuCtxGetCurrent *cuCtxGetCurrent;
extern tcuCtxGetDevice *cuCtxGetDevice;
extern tcuCtxSynchronize *cuCtxSynchronize;
extern tcuModuleLoad *cuModuleLoad;
extern tcuModuleLoadData *cuModuleLoadData;
extern tcuModuleLoadDataEx *cuModuleLoadDataEx;
extern tcuModuleLoadFatBinary *cuModuleLoadFatBinary;
extern tcuModuleUnload *cuModuleUnload;
extern tcuModuleGetFunction *cuModuleGetFunction;
extern tcuModuleGetTexRef *cuModuleGetTexRef;
extern tcuModuleGetSurfRef *cuModuleGetSurfRef;
extern tcuMemFreeHost *cuMemFreeHost;
extern tcuMemHostAlloc *cuMemHostAlloc;
extern tcuMemHostGetFlags *cuMemHostGetFlags;
extern tcuMemHostRegister *cuMemHostRegister;
extern tcuMemHostUnregister *cuMemHostUnregister;
extern tcuMemcpy *cuMemcpy;
extern tcuMemcpyPeer *cuMemcpyPeer;
extern tcuDeviceTotalMem *cuDeviceTotalMem;
extern tcuCtxCreate *cuCtxCreate;
extern tcuModuleGetGlobal *cuModuleGetGlobal;
extern tcuMemGetInfo *cuMemGetInfo;
extern tcuMemAlloc *cuMemAlloc;
extern tcuMemAllocPitch *cuMemAllocPitch;
extern tcuMemFree *cuMemFree;
extern tcuMemGetAddressRange *cuMemGetAddressRange;
extern tcuMemAllocHost *cuMemAllocHost;
extern tcuMemHostGetDevicePointer *cuMemHostGetDevicePointer;
extern tcuFuncSetBlockShape *cuFuncSetBlockShape;
extern tcuFuncSetSharedSize *cuFuncSetSharedSize;
extern tcuFuncGetAttribute *cuFuncGetAttribute;
extern tcuFuncSetCacheConfig *cuFuncSetCacheConfig;
extern tcuLaunchKernel *cuLaunchKernel;
extern tcuArrayDestroy *cuArrayDestroy;
extern tcuTexRefCreate *cuTexRefCreate;
extern tcuTexRefDestroy *cuTexRefDestroy;
extern tcuTexRefSetArray *cuTexRefSetArray;
extern tcuTexRefSetFormat *cuTexRefSetFormat;
extern tcuTexRefSetAddressMode *cuTexRefSetAddressMode;
extern tcuTexRefSetFilterMode *cuTexRefSetFilterMode;
extern tcuTexRefSetFlags *cuTexRefSetFlags;
extern tcuTexRefGetArray *cuTexRefGetArray;
extern tcuTexRefGetAddressMode *cuTexRefGetAddressMode;
extern tcuTexRefGetFilterMode *cuTexRefGetFilterMode;
extern tcuTexRefGetFormat *cuTexRefGetFormat;
extern tcuTexRefGetFlags *cuTexRefGetFlags;
extern tcuSurfRefSetArray *cuSurfRefSetArray;
extern tcuSurfRefGetArray *cuSurfRefGetArray;
extern tcuParamSetSize *cuParamSetSize;
extern tcuParamSeti *cuParamSeti;
extern tcuParamSetf *cuParamSetf;
extern tcuParamSetv *cuParamSetv;
extern tcuParamSetTexRef *cuParamSetTexRef;
extern tcuLaunch *cuLaunch;
extern tcuLaunchGrid *cuLaunchGrid;
extern tcuLaunchGridAsync *cuLaunchGridAsync;
extern tcuEventCreate *cuEventCreate;
extern tcuEventRecord *cuEventRecord;
extern tcuEventQuery *cuEventQuery;
extern tcuEventSynchronize *cuEventSynchronize;
extern tcuEventDestroy *cuEventDestroy;
extern tcuEventElapsedTime *cuEventElapsedTime;
extern tcuStreamCreate *cuStreamCreate;
extern tcuStreamQuery *cuStreamQuery;
extern tcuStreamSynchronize *cuStreamSynchronize;
extern tcuStreamDestroy *cuStreamDestroy;
extern tcuGraphicsUnregisterResource *cuGraphicsUnregisterResource;
extern tcuGraphicsSubResourceGetMappedArray *cuGraphicsSubResourceGetMappedArray;
extern tcuGraphicsResourceSetMapFlags *cuGraphicsResourceSetMapFlags;
extern tcuGraphicsMapResources *cuGraphicsMapResources;
extern tcuGraphicsUnmapResources *cuGraphicsUnmapResources;
extern tcuGetExportTable *cuGetExportTable;
extern tcuCtxSetLimit *cuCtxSetLimit;
extern tcuCtxGetLimit *cuCtxGetLimit;
// These functions could be using the CUDA 3.2 interface (_v2)
extern tcuMemcpyHtoD *cuMemcpyHtoD;
extern tcuMemcpyDtoH *cuMemcpyDtoH;
extern tcuMemcpyDtoD *cuMemcpyDtoD;
extern tcuMemcpyDtoA *cuMemcpyDtoA;
extern tcuMemcpyAtoD *cuMemcpyAtoD;
extern tcuMemcpyHtoA *cuMemcpyHtoA;
extern tcuMemcpyAtoH *cuMemcpyAtoH;
extern tcuMemcpyAtoA *cuMemcpyAtoA;
extern tcuMemcpy2D *cuMemcpy2D;
extern tcuMemcpy2DUnaligned *cuMemcpy2DUnaligned;
extern tcuMemcpy3D *cuMemcpy3D;
extern tcuMemcpyHtoDAsync *cuMemcpyHtoDAsync;
extern tcuMemcpyDtoHAsync *cuMemcpyDtoHAsync;
extern tcuMemcpyDtoDAsync *cuMemcpyDtoDAsync;
extern tcuMemcpyHtoAAsync *cuMemcpyHtoAAsync;
extern tcuMemcpyAtoHAsync *cuMemcpyAtoHAsync;
extern tcuMemcpy2DAsync *cuMemcpy2DAsync;
extern tcuMemcpy3DAsync *cuMemcpy3DAsync;
extern tcuMemsetD8 *cuMemsetD8;
extern tcuMemsetD16 *cuMemsetD16;
extern tcuMemsetD32 *cuMemsetD32;
extern tcuMemsetD2D8 *cuMemsetD2D8;
extern tcuMemsetD2D16 *cuMemsetD2D16;
extern tcuMemsetD2D32 *cuMemsetD2D32;
extern tcuArrayCreate *cuArrayCreate;
extern tcuArrayGetDescriptor *cuArrayGetDescriptor;
extern tcuArray3DCreate *cuArray3DCreate;
extern tcuArray3DGetDescriptor *cuArray3DGetDescriptor;
extern tcuTexRefSetAddress *cuTexRefSetAddress;
extern tcuTexRefSetAddress2D *cuTexRefSetAddress2D;
extern tcuTexRefGetAddress *cuTexRefGetAddress;
extern tcuGraphicsResourceGetMappedPointer *cuGraphicsResourceGetMappedPointer;
/************************************/
CUresult CUDAAPI cuInit (unsigned int, int cudaVersion, void *hHandleDriver);
/************************************/
#ifdef __cplusplus
}
#endif
#endif //__cuda_cuda_h__
/*
* This copyright notice applies to this header file only:
*
* Copyright (c) 2010-2017 NVIDIA Corporation
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the software, and to permit persons to whom the
* software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*****************************************************************************************************/
//! \file cuviddec.h
//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
//! \date 2015-2017
//! This file contains constants, structure definitions and function prototypes used for decoding.
/*****************************************************************************************************/
#if !defined(__CUDA_VIDEO_H__)
#define __CUDA_VIDEO_H__
#ifndef __dynlink_cuda_h__
#include "dynlink_cuda.h"
#endif // __dynlink_cuda_h__
#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
#define __CUVID_DEVPTR64
#endif
#endif
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
typedef void *CUvideodecoder;
typedef struct _CUcontextlock_st *CUvideoctxlock;
/*********************************************************************************/
//! \enum cudaVideoCodec
//! Video codec enums
//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
/*********************************************************************************/
typedef enum cudaVideoCodec_enum {
cudaVideoCodec_MPEG1=0, /**< MPEG1 */
cudaVideoCodec_MPEG2, /**< MPEG2 */
cudaVideoCodec_MPEG4, /**< MPEG4 */
cudaVideoCodec_VC1, /**< VC1 */
cudaVideoCodec_H264, /**< H264 */
cudaVideoCodec_JPEG, /**< JPEG */
cudaVideoCodec_H264_SVC, /**< H264-SVC */
cudaVideoCodec_H264_MVC, /**< H264-MVC */
cudaVideoCodec_HEVC, /**< HEVC */
cudaVideoCodec_VP8, /**< VP8 */
cudaVideoCodec_VP9, /**< VP9 */
cudaVideoCodec_NumCodecs, /**< Max codecs */
// Uncompressed YUV
cudaVideoCodec_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')), /**< Y,U,V (4:2:0) */
cudaVideoCodec_YV12 = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')), /**< Y,V,U (4:2:0) */
cudaVideoCodec_NV12 = (('N'<<24)|('V'<<16)|('1'<<8)|('2')), /**< Y,UV (4:2:0) */
cudaVideoCodec_YUYV = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')), /**< YUYV/YUY2 (4:2:2) */
cudaVideoCodec_UYVY = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y')) /**< UYVY (4:2:2) */
} cudaVideoCodec;
/*********************************************************************************/
//! \enum cudaVideoSurfaceFormat
//! Video surface format enums used for output format of decoded output
//! These enums are used in CUVIDDECODECREATEINFO structure
/*********************************************************************************/
typedef enum cudaVideoSurfaceFormat_enum {
cudaVideoSurfaceFormat_NV12=0, /**< NV12 format */
cudaVideoSurfaceFormat_P016=1 /**< 16 bit semiplaner format. Can be used for 10 bit(6LSB bits 0),
12 bit (4LSB bits 0) */
} cudaVideoSurfaceFormat;
/******************************************************************************************************************/
//! \enum cudaVideoDeinterlaceMode
//! Deinterlacing mode enums
//! These enums are used in CUVIDDECODECREATEINFO structure
//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing
//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than other DImodes
/******************************************************************************************************************/
typedef enum cudaVideoDeinterlaceMode_enum {
cudaVideoDeinterlaceMode_Weave=0, /**< Weave both fields (no deinterlacing) */
cudaVideoDeinterlaceMode_Bob, /**< Drop one field */
cudaVideoDeinterlaceMode_Adaptive /**< Adaptive deinterlacing */
} cudaVideoDeinterlaceMode;
/**************************************************************************************************************/
//! \enum cudaVideoChromaFormat
//! Chroma format enums
//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
//! JPEG supports Monochrome, YUV 4:2:0, YUV 4:2:2 and YUV 4:4:4 chroma formats.
//! H264, HEVC, VP9, VP8, VC1, MPEG1, MPEG2 and MPEG4 support YUV 4:2:0 chroma format only.
/**************************************************************************************************************/
typedef enum cudaVideoChromaFormat_enum {
cudaVideoChromaFormat_Monochrome=0, /**< MonoChrome */
cudaVideoChromaFormat_420, /**< YUV 4:2:0 */
cudaVideoChromaFormat_422, /**< YUV 4:2:2 */
cudaVideoChromaFormat_444 /**< YUV 4:4:4 */
} cudaVideoChromaFormat;
/*************************************************************************************************************/
//! \enum cudaVideoCreateFlags
//! Decoder flag enums to select preferred decode path
//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible
/*************************************************************************************************************/
typedef enum cudaVideoCreateFlags_enum {
cudaVideoCreate_Default = 0x00, /**< Default operation mode: use dedicated video engines */
cudaVideoCreate_PreferCUDA = 0x01, /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */
cudaVideoCreate_PreferDXVA = 0x02, /**< Go through DXVA internally if possible (requires D3D9 interop) */
cudaVideoCreate_PreferCUVID = 0x04 /**< Use dedicated video engines directly */
} cudaVideoCreateFlags;
/**************************************************************************************************************/
//! \struct CUVIDDECODECAPS;
//! This structure is used in cuvidGetDecoderCaps API
/**************************************************************************************************************/
typedef struct _CUVIDDECODECAPS
{
cudaVideoCodec eCodecType; /**< IN: cudaVideoCodec_XXX */
cudaVideoChromaFormat eChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */
unsigned int nBitDepthMinus8; /**< IN: The Value "BitDepth minus 8" */
unsigned int reserved1[3]; /**< Reserved for future use - set to zero */
unsigned char bIsSupported; /**< OUT: 1 if codec supported, 0 if not supported */
unsigned char reserved2[3]; /**< Reserved for future use - set to zero */
unsigned int nMaxWidth; /**< OUT: Max supported coded width in pixels */
unsigned int nMaxHeight; /**< OUT: Max supported coded height in pixels */
unsigned int nMaxMBCount; /**< OUT: Max supported macroblock count
CodedWidth*CodedHeight/256 must be <= nMaxMBCount */
unsigned short nMinWidth; /**< OUT: Min supported coded width in pixels */
unsigned short nMinHeight; /**< OUT: Min supported coded height in pixels */
unsigned int reserved3[11]; /**< Reserved for future use - set to zero */
} CUVIDDECODECAPS;
/**************************************************************************************************************/
//! \struct CUVIDDECODECREATEINFO
//! This structure is used in cuvidCreateDecoder API
/**************************************************************************************************************/
typedef struct _CUVIDDECODECREATEINFO
{
unsigned long ulWidth; /**< IN: Coded sequence width in pixels */
unsigned long ulHeight; /**< IN: Coded sequence height in pixels */
unsigned long ulNumDecodeSurfaces; /**< IN: Maximum number of internal decode surfaces */
cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX */
unsigned long ulCreationFlags; /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX) */
unsigned long bitDepthMinus8; /**< IN: The value "BitDepth minus 8" */
unsigned long ulIntraDecodeOnly; /**< IN: Set 1 only if video has all intra frames (default value is 0). This will
optimize video memory for Intra frames only decoding. The support is limited
to specific codecs(H264 rightnow), the flag will be ignored for codecs which
are not supported. However decoding might fail if the flag is enabled in case
of supported codecs for regular bit streams having P and/or B frames. */
unsigned long Reserved1[3]; /**< Reserved for future use - set to zero */
/**
* IN: area of the frame that should be displayed
*/
struct {
short left;
short top;
short right;
short bottom;
} display_area;
cudaVideoSurfaceFormat OutputFormat; /**< IN: cudaVideoSurfaceFormat_XXX */
cudaVideoDeinterlaceMode DeinterlaceMode; /**< IN: cudaVideoDeinterlaceMode_XXX */
unsigned long ulTargetWidth; /**< IN: Post-processed output width (Should be aligned to 2) */
unsigned long ulTargetHeight; /**< IN: Post-processed output height (Should be aligbed to 2) */
unsigned long ulNumOutputSurfaces; /**< IN: Maximum number of output surfaces simultaneously mapped */
CUvideoctxlock vidLock; /**< IN: If non-NULL, context lock used for synchronizing ownership of
the cuda context. Needed for cudaVideoCreate_PreferCUDA decode */
/**
* IN: target rectangle in the output frame (for aspect ratio conversion)
* if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
*/
struct {
short left;
short top;
short right;
short bottom;
} target_rect;
unsigned long Reserved2[5]; /**< Reserved for future use - set to zero */
} CUVIDDECODECREATEINFO;
/*********************************************************/
//! \struct CUVIDH264DPBENTRY
//! H.264 DPB entry
//! This structure is used in CUVIDH264PICPARAMS structure
/*********************************************************/
typedef struct _CUVIDH264DPBENTRY
{
int PicIdx; /**< picture index of reference frame */
int FrameIdx; /**< frame_num(short-term) or LongTermFrameIdx(long-term) */
int is_long_term; /**< 0=short term reference, 1=long term reference */
int not_existing; /**< non-existing reference frame (corresponding PicIdx should be set to -1) */
int used_for_reference; /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields */
int FieldOrderCnt[2]; /**< field order count of top and bottom fields */
} CUVIDH264DPBENTRY;
/************************************************************/
//! \struct CUVIDH264MVCEXT
//! H.264 MVC picture parameters ext
//! This structure is used in CUVIDH264PICPARAMS structure
/************************************************************/
typedef struct _CUVIDH264MVCEXT
{
int num_views_minus1; /**< Max number of coded views minus 1 in video : Range - 0 to 1023 */
int view_id; /**< view identifier */
unsigned char inter_view_flag; /**< 1 if used for inter-view prediction, 0 if not */
unsigned char num_inter_view_refs_l0; /**< number of inter-view ref pics in RefPicList0 */
unsigned char num_inter_view_refs_l1; /**< number of inter-view ref pics in RefPicList1 */
unsigned char MVCReserved8Bits; /**< Reserved bits */
int InterViewRefsL0[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList0 */
int InterViewRefsL1[16]; /**< view id of the i-th view component for inter-view prediction in RefPicList1 */
} CUVIDH264MVCEXT;
/*********************************************************/
//! \struct CUVIDH264SVCEXT
//! H.264 SVC picture parameters ext
//! This structure is used in CUVIDH264PICPARAMS structure
/*********************************************************/
typedef struct _CUVIDH264SVCEXT
{
unsigned char profile_idc;
unsigned char level_idc;
unsigned char DQId;
unsigned char DQIdMax;
unsigned char disable_inter_layer_deblocking_filter_idc;
unsigned char ref_layer_chroma_phase_y_plus1;
signed char inter_layer_slice_alpha_c0_offset_div2;
signed char inter_layer_slice_beta_offset_div2;
unsigned short DPBEntryValidFlag;
unsigned char inter_layer_deblocking_filter_control_present_flag;
unsigned char extended_spatial_scalability_idc;
unsigned char adaptive_tcoeff_level_prediction_flag;
unsigned char slice_header_restriction_flag;
unsigned char chroma_phase_x_plus1_flag;
unsigned char chroma_phase_y_plus1;
unsigned char tcoeff_level_prediction_flag;
unsigned char constrained_intra_resampling_flag;
unsigned char ref_layer_chroma_phase_x_plus1_flag;
unsigned char store_ref_base_pic_flag;
unsigned char Reserved8BitsA;
unsigned char Reserved8BitsB;
short scaled_ref_layer_left_offset;
short scaled_ref_layer_top_offset;
short scaled_ref_layer_right_offset;
short scaled_ref_layer_bottom_offset;
unsigned short Reserved16Bits;
struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded.
Linked list ends at the target layer. */
int bRefBaseLayer; /**< whether to store ref base pic */
} CUVIDH264SVCEXT;
/******************************************************/
//! \struct CUVIDH264PICPARAMS
//! H.264 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/******************************************************/
typedef struct _CUVIDH264PICPARAMS
{
// SPS
int log2_max_frame_num_minus4;
int pic_order_cnt_type;
int log2_max_pic_order_cnt_lsb_minus4;
int delta_pic_order_always_zero_flag;
int frame_mbs_only_flag;
int direct_8x8_inference_flag;
int num_ref_frames; // NOTE: shall meet level 4.1 restrictions
unsigned char residual_colour_transform_flag;
unsigned char bit_depth_luma_minus8; // Must be 0 (only 8-bit supported)
unsigned char bit_depth_chroma_minus8; // Must be 0 (only 8-bit supported)
unsigned char qpprime_y_zero_transform_bypass_flag;
// PPS
int entropy_coding_mode_flag;
int pic_order_present_flag;
int num_ref_idx_l0_active_minus1;
int num_ref_idx_l1_active_minus1;
int weighted_pred_flag;
int weighted_bipred_idc;
int pic_init_qp_minus26;
int deblocking_filter_control_present_flag;
int redundant_pic_cnt_present_flag;
int transform_8x8_mode_flag;
int MbaffFrameFlag;
int constrained_intra_pred_flag;
int chroma_qp_index_offset;
int second_chroma_qp_index_offset;
int ref_pic_flag;
int frame_num;
int CurrFieldOrderCnt[2];
// DPB
CUVIDH264DPBENTRY dpb[16]; // List of reference frames within the DPB
// Quantization Matrices (raster-order)
unsigned char WeightScale4x4[6][16];
unsigned char WeightScale8x8[2][64];
// FMO/ASO
unsigned char fmo_aso_enable;
unsigned char num_slice_groups_minus1;
unsigned char slice_group_map_type;
signed char pic_init_qs_minus26;
unsigned int slice_group_change_rate_minus1;
union
{
unsigned long long slice_group_map_addr;
const unsigned char *pMb2SliceGroupMap;
} fmo;
unsigned int Reserved[12];
// SVC/MVC
union
{
CUVIDH264MVCEXT mvcext;
CUVIDH264SVCEXT svcext;
};
} CUVIDH264PICPARAMS;
/********************************************************/
//! \struct CUVIDMPEG2PICPARAMS
//! MPEG-2 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/********************************************************/
typedef struct _CUVIDMPEG2PICPARAMS
{
int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
int BackwardRefIdx; // Picture index of backward reference (B-frames)
int picture_coding_type;
int full_pel_forward_vector;
int full_pel_backward_vector;
int f_code[2][2];
int intra_dc_precision;
int frame_pred_frame_dct;
int concealment_motion_vectors;
int q_scale_type;
int intra_vlc_format;
int alternate_scan;
int top_field_first;
// Quantization matrices (raster order)
unsigned char QuantMatrixIntra[64];
unsigned char QuantMatrixInter[64];
} CUVIDMPEG2PICPARAMS;
// MPEG-4 has VOP types instead of Picture types
#define I_VOP 0
#define P_VOP 1
#define B_VOP 2
#define S_VOP 3
/*******************************************************/
//! \struct CUVIDMPEG4PICPARAMS
//! MPEG-4 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/*******************************************************/
typedef struct _CUVIDMPEG4PICPARAMS
{
int ForwardRefIdx; // Picture index of forward reference (P/B-frames)
int BackwardRefIdx; // Picture index of backward reference (B-frames)
// VOL
int video_object_layer_width;
int video_object_layer_height;
int vop_time_increment_bitcount;
int top_field_first;
int resync_marker_disable;
int quant_type;
int quarter_sample;
int short_video_header;
int divx_flags;
// VOP
int vop_coding_type;
int vop_coded;
int vop_rounding_type;
int alternate_vertical_scan_flag;
int interlaced;
int vop_fcode_forward;
int vop_fcode_backward;
int trd[2];
int trb[2];
// Quantization matrices (raster order)
unsigned char QuantMatrixIntra[64];
unsigned char QuantMatrixInter[64];
int gmc_enabled;
} CUVIDMPEG4PICPARAMS;
/********************************************************/
//! \struct CUVIDVC1PICPARAMS
//! VC1 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/********************************************************/
typedef struct _CUVIDVC1PICPARAMS
{
int ForwardRefIdx; /**< Picture index of forward reference (P/B-frames) */
int BackwardRefIdx; /**< Picture index of backward reference (B-frames) */
int FrameWidth; /**< Actual frame width */
int FrameHeight; /**< Actual frame height */
// PICTURE
int intra_pic_flag; /**< Set to 1 for I,BI frames */
int ref_pic_flag; /**< Set to 1 for I,P frames */
int progressive_fcm; /**< Progressive frame */
// SEQUENCE
int profile;
int postprocflag;
int pulldown;
int interlace;
int tfcntrflag;
int finterpflag;
int psf;
int multires;
int syncmarker;
int rangered;
int maxbframes;
// ENTRYPOINT
int panscan_flag;
int refdist_flag;
int extended_mv;
int dquant;
int vstransform;
int loopfilter;
int fastuvmc;
int overlap;
int quantizer;
int extended_dmv;
int range_mapy_flag;
int range_mapy;
int range_mapuv_flag;
int range_mapuv;
int rangeredfrm; // range reduction state
} CUVIDVC1PICPARAMS;
/***********************************************************/
//! \struct CUVIDJPEGPICPARAMS
//! JPEG picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/***********************************************************/
typedef struct _CUVIDJPEGPICPARAMS
{
int Reserved;
} CUVIDJPEGPICPARAMS;
/*******************************************************/
//! \struct CUVIDHEVCPICPARAMS
//! HEVC picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/*******************************************************/
typedef struct _CUVIDHEVCPICPARAMS
{
// sps
int pic_width_in_luma_samples;
int pic_height_in_luma_samples;
unsigned char log2_min_luma_coding_block_size_minus3;
unsigned char log2_diff_max_min_luma_coding_block_size;
unsigned char log2_min_transform_block_size_minus2;
unsigned char log2_diff_max_min_transform_block_size;
unsigned char pcm_enabled_flag;
unsigned char log2_min_pcm_luma_coding_block_size_minus3;
unsigned char log2_diff_max_min_pcm_luma_coding_block_size;
unsigned char pcm_sample_bit_depth_luma_minus1;
unsigned char pcm_sample_bit_depth_chroma_minus1;
unsigned char pcm_loop_filter_disabled_flag;
unsigned char strong_intra_smoothing_enabled_flag;
unsigned char max_transform_hierarchy_depth_intra;
unsigned char max_transform_hierarchy_depth_inter;
unsigned char amp_enabled_flag;
unsigned char separate_colour_plane_flag;
unsigned char log2_max_pic_order_cnt_lsb_minus4;
unsigned char num_short_term_ref_pic_sets;
unsigned char long_term_ref_pics_present_flag;
unsigned char num_long_term_ref_pics_sps;
unsigned char sps_temporal_mvp_enabled_flag;
unsigned char sample_adaptive_offset_enabled_flag;
unsigned char scaling_list_enable_flag;
unsigned char IrapPicFlag;
unsigned char IdrPicFlag;
unsigned char bit_depth_luma_minus8;
unsigned char bit_depth_chroma_minus8;
unsigned char reserved1[14];
// pps
unsigned char dependent_slice_segments_enabled_flag;
unsigned char slice_segment_header_extension_present_flag;
unsigned char sign_data_hiding_enabled_flag;
unsigned char cu_qp_delta_enabled_flag;
unsigned char diff_cu_qp_delta_depth;
signed char init_qp_minus26;
signed char pps_cb_qp_offset;
signed char pps_cr_qp_offset;
unsigned char constrained_intra_pred_flag;
unsigned char weighted_pred_flag;
unsigned char weighted_bipred_flag;
unsigned char transform_skip_enabled_flag;
unsigned char transquant_bypass_enabled_flag;
unsigned char entropy_coding_sync_enabled_flag;
unsigned char log2_parallel_merge_level_minus2;
unsigned char num_extra_slice_header_bits;
unsigned char loop_filter_across_tiles_enabled_flag;
unsigned char loop_filter_across_slices_enabled_flag;
unsigned char output_flag_present_flag;
unsigned char num_ref_idx_l0_default_active_minus1;
unsigned char num_ref_idx_l1_default_active_minus1;
unsigned char lists_modification_present_flag;
unsigned char cabac_init_present_flag;
unsigned char pps_slice_chroma_qp_offsets_present_flag;
unsigned char deblocking_filter_override_enabled_flag;
unsigned char pps_deblocking_filter_disabled_flag;
signed char pps_beta_offset_div2;
signed char pps_tc_offset_div2;
unsigned char tiles_enabled_flag;
unsigned char uniform_spacing_flag;
unsigned char num_tile_columns_minus1;
unsigned char num_tile_rows_minus1;
unsigned short column_width_minus1[21];
unsigned short row_height_minus1[21];
unsigned int reserved3[15];
// RefPicSets
int NumBitsForShortTermRPSInSlice;
int NumDeltaPocsOfRefRpsIdx;
int NumPocTotalCurr;
int NumPocStCurrBefore;
int NumPocStCurrAfter;
int NumPocLtCurr;
int CurrPicOrderCntVal;
int RefPicIdx[16]; // [refpic] Indices of valid reference pictures (-1 if unused for reference)
int PicOrderCntVal[16]; // [refpic]
unsigned char IsLongTerm[16]; // [refpic] 0=not a long-term reference, 1=long-term reference
unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15)
unsigned char RefPicSetStCurrAfter[8]; // [0..NumPocStCurrAfter-1] -> refpic (0..15)
unsigned char RefPicSetLtCurr[8]; // [0..NumPocLtCurr-1] -> refpic (0..15)
unsigned char RefPicSetInterLayer0[8];
unsigned char RefPicSetInterLayer1[8];
unsigned int reserved4[12];
// scaling lists (diag order)
unsigned char ScalingList4x4[6][16]; // [matrixId][i]
unsigned char ScalingList8x8[6][64]; // [matrixId][i]
unsigned char ScalingList16x16[6][64]; // [matrixId][i]
unsigned char ScalingList32x32[2][64]; // [matrixId][i]
unsigned char ScalingListDCCoeff16x16[6]; // [matrixId]
unsigned char ScalingListDCCoeff32x32[2]; // [matrixId]
} CUVIDHEVCPICPARAMS;
/***********************************************************/
//! \struct CUVIDVP8PICPARAMS
//! VP8 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/***********************************************************/
typedef struct _CUVIDVP8PICPARAMS
{
int width;
int height;
unsigned int first_partition_size;
//Frame Indexes
unsigned char LastRefIdx;
unsigned char GoldenRefIdx;
unsigned char AltRefIdx;
union {
struct {
unsigned char frame_type : 1; /**< 0 = KEYFRAME, 1 = INTERFRAME */
unsigned char version : 3;
unsigned char show_frame : 1;
unsigned char update_mb_segmentation_data : 1; /**< Must be 0 if segmentation is not enabled */
unsigned char Reserved2Bits : 2;
};
unsigned char wFrameTagFlags;
};
unsigned char Reserved1[4];
unsigned int Reserved2[3];
} CUVIDVP8PICPARAMS;
/***********************************************************/
//! \struct CUVIDVP9PICPARAMS
//! VP9 picture parameters
//! This structure is used in CUVIDPICPARAMS structure
/***********************************************************/
typedef struct _CUVIDVP9PICPARAMS
{
unsigned int width;
unsigned int height;
//Frame Indices
unsigned char LastRefIdx;
unsigned char GoldenRefIdx;
unsigned char AltRefIdx;
unsigned char colorSpace;
unsigned short profile : 3;
unsigned short frameContextIdx : 2;
unsigned short frameType : 1;
unsigned short showFrame : 1;
unsigned short errorResilient : 1;
unsigned short frameParallelDecoding : 1;
unsigned short subSamplingX : 1;
unsigned short subSamplingY : 1;
unsigned short intraOnly : 1;
unsigned short allow_high_precision_mv : 1;
unsigned short refreshEntropyProbs : 1;
unsigned short reserved2Bits : 2;
unsigned short reserved16Bits;
unsigned char refFrameSignBias[4];
unsigned char bitDepthMinus8Luma;
unsigned char bitDepthMinus8Chroma;
unsigned char loopFilterLevel;
unsigned char loopFilterSharpness;
unsigned char modeRefLfEnabled;
unsigned char log2_tile_columns;
unsigned char log2_tile_rows;
unsigned char segmentEnabled : 1;
unsigned char segmentMapUpdate : 1;
unsigned char segmentMapTemporalUpdate : 1;
unsigned char segmentFeatureMode : 1;
unsigned char reserved4Bits : 4;
unsigned char segmentFeatureEnable[8][4];
short segmentFeatureData[8][4];
unsigned char mb_segment_tree_probs[7];
unsigned char segment_pred_probs[3];
unsigned char reservedSegment16Bits[2];
int qpYAc;
int qpYDc;
int qpChDc;
int qpChAc;
unsigned int activeRefIdx[3];
unsigned int resetFrameContext;
unsigned int mcomp_filter_type;
unsigned int mbRefLfDelta[4];
unsigned int mbModeLfDelta[2];
unsigned int frameTagSize;
unsigned int offsetToDctParts;
unsigned int reserved128Bits[4];
} CUVIDVP9PICPARAMS;
/******************************************************************************************/
//! \struct CUVIDPICPARAMS
//! Picture parameters for decoding
//! This structure is used in cuvidDecodePicture API
//! IN for cuvidDecodePicture
/******************************************************************************************/
typedef struct _CUVIDPICPARAMS
{
int PicWidthInMbs; /**< IN: Coded frame size in macroblocks */
int FrameHeightInMbs; /**< IN: Coded frame height in macroblocks */
int CurrPicIdx; /**< IN: Output index of the current picture */
int field_pic_flag; /**< IN: 0=frame picture, 1=field picture */
int bottom_field_flag; /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */
int second_field; /**< IN: Second field of a complementary field pair */
// Bitstream data
unsigned int nBitstreamDataLen; /**< IN: Number of bytes in bitstream data buffer */
const unsigned char *pBitstreamData; /**< IN: Ptr to bitstream data for this picture (slice-layer) */
unsigned int nNumSlices; /**< IN: Number of slices in this picture */
const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within
the bitstream data buffer */
int ref_pic_flag; /**< IN: This picture is a reference picture */
int intra_pic_flag; /**< IN: This picture is entirely intra coded */
unsigned int Reserved[30]; /**< Reserved for future use */
// IN: Codec-specific data
union {
CUVIDMPEG2PICPARAMS mpeg2; /**< Also used for MPEG-1 */
CUVIDH264PICPARAMS h264;
CUVIDVC1PICPARAMS vc1;
CUVIDMPEG4PICPARAMS mpeg4;
CUVIDJPEGPICPARAMS jpeg;
CUVIDHEVCPICPARAMS hevc;
CUVIDVP8PICPARAMS vp8;
CUVIDVP9PICPARAMS vp9;
unsigned int CodecReserved[1024];
} CodecSpecific;
} CUVIDPICPARAMS;
/******************************************************/
//! \struct CUVIDPROCPARAMS
//! Picture parameters for postprocessing
//! This structure is used in cuvidMapVideoFrame API
/******************************************************/
typedef struct _CUVIDPROCPARAMS
{
int progressive_frame; /**< IN: Input is progressive (deinterlace_mode will be ignored) */
int second_field; /**< IN: Output the second field (ignored if deinterlace mode is Weave) */
int top_field_first; /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */
int unpaired_field; /**< IN: Input only contains one field (2nd field is invalid) */
// The fields below are used for raw YUV input
unsigned int reserved_flags; /**< Reserved for future use (set to zero) */
unsigned int reserved_zero; /**< Reserved (set to zero) */
unsigned long long raw_input_dptr; /**< IN: Input CUdeviceptr for raw YUV extensions */
unsigned int raw_input_pitch; /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately) */
unsigned int raw_input_format; /**< IN: Input YUV format (cudaVideoCodec_enum) */
unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions */
unsigned int raw_output_pitch; /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately) */
unsigned int Reserved1; /**< Reserved for future use (set to zero) */
CUstream output_stream; /**< IN: stream object used by cuvidMapVideoFrame */
unsigned int Reserved[46]; /**< Reserved for future use (set to zero) */
void *Reserved2[2]; /**< Reserved for future use (set to zero) */
} CUVIDPROCPARAMS;
/***********************************************************************************************************/
//! VIDEO_DECODER
//!
//! In order to minimize decode latencies, there should be always at least 2 pictures in the decode
//! queue at any time, in order to make sure that all decode engines are always busy.
//!
//! Overall data flow:
//! - cuvidGetDecoderCaps(...)
//! - cuvidCreateDecoder(...)
//! - For each picture:
//! + cuvidDecodePicture(N)
//! + cuvidMapVideoFrame(N-4)
//! + do some processing in cuda
//! + cuvidUnmapVideoFrame(N-4)
//! + cuvidDecodePicture(N+1)
//! + cuvidMapVideoFrame(N-3)
//! + ...
//! - cuvidDestroyDecoder(...)
//!
//! NOTE:
//! - When the cuda context is created from a D3D device, the D3D device must also be created
//! with the D3DCREATE_MULTITHREADED flag.
//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending
//! in the decode queue
/***********************************************************************************************************/
/**********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc)
//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters.
//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure
//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported
//! If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0.
//! E.g. on Geforce GTX 960:
//! App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0;
//! Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth = 48; nMinHeight = 16;
//! nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536;
//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount
/**********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidGetDecoderCaps(CUVIDDECODECAPS *pdc);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci)
//! Create the decoder object based on pdci. A handle to the created decoder is returned
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder)
//! Destroy the decoder object.
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidDestroyDecoder(CUvideodecoder hDecoder);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams)
//! Decode a single picture (field or frame)
//! Kicks off HW decoding
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
/************************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
//! unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and associated
//! pitch of the video frame
/************************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx,
unsigned int *pDevPtr, unsigned int *pPitch,
CUVIDPROCPARAMS *pVPP);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr)
//! Unmap a previously mapped video frame
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
#endif
#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
/************************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
//! unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and associated
//! pitch of the video frame
/************************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
//! Unmap a previously mapped video frame
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
#define tcuvidMapVideoFrame tcuvidMapVideoFrame64
#define tcuvidUnmapVideoFrame tcuvidUnmapVideoFrame64
#endif
#endif
/********************************************************************************************************************/
//!
//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions
//! provide a simple mutex-style host synchronization. If a non-NULL context is specified
//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
//! context before making any cuda calls.
//! A multi-threaded application could create a lock associated with a context handle so that
//! multiple threads can safely share the same cuda context:
//! - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
//! that can be passed to cuvidCtxLockCreate.
//! - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
//!
//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
/********************************************************************************************************************/
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx)
//! This API is used to create CtxLock object
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck)
//! This API is used to free CtxLock object
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCtxLockDestroy(CUvideoctxlock lck);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags)
//! This API is used to acquire ctxlock
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
/********************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags)
//! This API is used to release ctxlock
/********************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
/**********************************************************************************************/
extern tcuvidGetDecoderCaps *cuvidGetDecoderCaps;
extern tcuvidCreateDecoder *cuvidCreateDecoder;
extern tcuvidDestroyDecoder *cuvidDestroyDecoder;
extern tcuvidDecodePicture *cuvidDecodePicture;
extern tcuvidMapVideoFrame *cuvidMapVideoFrame;
extern tcuvidUnmapVideoFrame *cuvidUnmapVideoFrame;
#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
extern tcuvidMapVideoFrame64 *cuvidMapVideoFrame64;
extern tcuvidUnmapVideoFrame64 *cuvidUnmapVideoFrame64;
#endif
// extern tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
extern tcuvidCtxLockCreate *cuvidCtxLockCreate;
extern tcuvidCtxLockDestroy *cuvidCtxLockDestroy;
extern tcuvidCtxLock *cuvidCtxLock;
extern tcuvidCtxUnlock *cuvidCtxUnlock;
#if defined(__cplusplus)
}
// Auto-lock helper for C++ applications
class CCtxAutoLock
{
private:
CUvideoctxlock m_ctx;
public:
CCtxAutoLock(CUvideoctxlock ctx);
~CCtxAutoLock();
};
#endif /* __cplusplus */
#endif // __CUDA_VIDEO_H__
/*
* This copyright notice applies to this header file only:
*
* Copyright (c) 2010-2017 NVIDIA Corporation
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the software, and to permit persons to whom the
* software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/********************************************************************************************************************/
//! \file nvcuvid.h
//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
//! \date 2015-2017
//! This file contains the interface constants, structure definitions and function prototypes.
/********************************************************************************************************************/
#if !defined(__NVCUVID_H__)
#define __NVCUVID_H__
#include "dynlink_cuviddec.h"
#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */
/*********************************
** Initialization
*********************************/
CUresult CUDAAPI cuvidInit(unsigned int Flags);
/***********************************************/
//!
//! High-level helper APIs for video sources
//!
/***********************************************/
typedef void *CUvideosource;
typedef void *CUvideoparser;
typedef long long CUvideotimestamp;
/************************************************************************/
//! \enum cudaVideoState
//! Video source state enums
//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
/************************************************************************/
typedef enum {
cudaVideoState_Error = -1, /**< Error state (invalid source) */
cudaVideoState_Stopped = 0, /**< Source is stopped (or reached end-of-stream) */
cudaVideoState_Started = 1 /**< Source is running and delivering data */
} cudaVideoState;
/************************************************************************/
//! \enum cudaAudioCodec
//! Audio compression enums
//! Used in CUAUDIOFORMAT structure
/************************************************************************/
typedef enum {
cudaAudioCodec_MPEG1=0, /**< MPEG-1 Audio */
cudaAudioCodec_MPEG2, /**< MPEG-2 Audio */
cudaAudioCodec_MP3, /**< MPEG-1 Layer III Audio */
cudaAudioCodec_AC3, /**< Dolby Digital (AC3) Audio */
cudaAudioCodec_LPCM, /**< PCM Audio */
cudaAudioCodec_AAC, /**< AAC Audio */
} cudaAudioCodec;
/************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMAT
//! Video format
//! Used in cuvidGetSourceVideoFormat API
/************************************************************************************************/
typedef struct
{
cudaVideoCodec codec; /**< OUT: Compression format */
/**
* OUT: frame rate = numerator / denominator (for example: 30000/1001)
*/
struct {
/**< OUT: frame rate numerator (0 = unspecified or variable frame rate) */
unsigned int numerator;
/**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */
unsigned int denominator;
} frame_rate;
unsigned char progressive_sequence; /**< OUT: 0=interlaced, 1=progressive */
unsigned char bit_depth_luma_minus8; /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char bit_depth_chroma_minus8; /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
unsigned char reserved1; /**< Reserved for future use */
unsigned int coded_width; /**< OUT: coded frame width in pixels */
unsigned int coded_height; /**< OUT: coded frame height in pixels */
/**
* area of the frame that should be displayed
* typical example:
* coded_width = 1920, coded_height = 1088
* display_area = { 0,0,1920,1080 }
*/
struct {
int left; /**< OUT: left position of display rect */
int top; /**< OUT: top position of display rect */
int right; /**< OUT: right position of display rect */
int bottom; /**< OUT: bottom position of display rect */
} display_area;
cudaVideoChromaFormat chroma_format; /**< OUT: Chroma format */
unsigned int bitrate; /**< OUT: video bitrate (bps, 0=unknown) */
/**
* OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
*/
struct {
int x;
int y;
} display_aspect_ratio;
/**
* Video Signal Description
* Refer section E.2.1 (VUI parameters semantics) of H264 spec file
*/
struct {
unsigned char video_format : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified */
unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range */
unsigned char reserved_zero_bits : 4; /**< Reserved bits */
unsigned char color_primaries; /**< OUT: chromaticity coordinates of source primaries */
unsigned char transfer_characteristics; /**< OUT: opto-electronic transfer characteristic of the source picture */
unsigned char matrix_coefficients; /**< OUT: used in deriving luma and chroma signals from RGB primaries */
} video_signal_description;
unsigned int seqhdr_data_length; /**< OUT: Additional bytes following (CUVIDEOFORMATEX) */
} CUVIDEOFORMAT;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDEOFORMATEX
//! Video format including raw sequence header information
//! Used in cuvidGetSourceVideoFormat API
/****************************************************************/
typedef struct
{
CUVIDEOFORMAT format; /**< OUT: CUVIDEOFORMAT structure */
unsigned char raw_seqhdr_data[1024]; /**< OUT: Sequence header data */
} CUVIDEOFORMATEX;
/****************************************************************/
//! \ingroup STRUCTS
//! \struct CUAUDIOFORMAT
//! Audio formats
//! Used in cuvidGetSourceAudioFormat API
/****************************************************************/
typedef struct
{
cudaAudioCodec codec; /**< OUT: Compression format */
unsigned int channels; /**< OUT: number of audio channels */
unsigned int samplespersec; /**< OUT: sampling frequency */
unsigned int bitrate; /**< OUT: For uncompressed, can also be used to determine bits per sample */
unsigned int reserved1; /**< Reserved for future use */
unsigned int reserved2; /**< Reserved for future use */
} CUAUDIOFORMAT;
/***************************************************************/
//! \enum CUvideopacketflags
//! Data packet flags
//! Used in CUVIDSOURCEDATAPACKET structure
/***************************************************************/
typedef enum {
CUVID_PKT_ENDOFSTREAM = 0x01, /**< Set when this is the last packet for this stream */
CUVID_PKT_TIMESTAMP = 0x02, /**< Timestamp is valid */
CUVID_PKT_DISCONTINUITY = 0x04, /**< Set when a discontinuity has to be signalled */
CUVID_PKT_ENDOFPICTURE = 0x08, /**< Set when the packet contains exactly one frame */
} CUvideopacketflags;
/*****************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEDATAPACKET
//! Data Packet
//! Used in cuvidParseVideoData API
//! IN for cuvidParseVideoData
/*****************************************************************************/
typedef struct _CUVIDSOURCEDATAPACKET
{
unsigned long flags; /**< IN: Combination of CUVID_PKT_XXX flags */
unsigned long payload_size; /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
const unsigned char *payload; /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
CUvideotimestamp timestamp; /**< IN: Presentation time stamp (10MHz clock), only valid if
CUVID_PKT_TIMESTAMP flag is set */
} CUVIDSOURCEDATAPACKET;
// Callback for packet delivery
typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
/**************************************************************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDSOURCEPARAMS
//! Describes parameters needed in cuvidCreateVideoSource API
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
typedef struct _CUVIDSOURCEPARAMS
{
unsigned int ulClockRate; /**< IN: Time stamp units in Hz (0=default=10000000Hz) */
unsigned int uReserved1[7]; /**< Reserved for future use - set to zero */
void *pUserData; /**< IN: User private data passed in to the data handlers */
PFNVIDSOURCECALLBACK pfnVideoDataHandler; /**< IN: Called to deliver video packets */
PFNVIDSOURCECALLBACK pfnAudioDataHandler; /**< IN: Called to deliver audio packets. */
void *pvReserved2[8]; /**< Reserved for future use - set to NULL */
} CUVIDSOURCEPARAMS;
/**********************************************/
//! \ingroup ENUMS
//! \enum CUvideosourceformat_flags
//! CUvideosourceformat_flags
//! Used in cuvidGetSourceVideoFormat API
/**********************************************/
typedef enum {
CUVID_FMT_EXTFORMATINFO = 0x100 /**< Return extended format structure (CUVIDEOFORMATEX) */
} CUvideosourceformat_flags;
#if !defined(__APPLE__)
/**************************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks:
//! pfnVideoDataHandler() and pfnAudioDataHandler()
//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/**************************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
/****************************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
//! Create video source object and initialize
/****************************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
/*********************************************************************/
//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
//! Destroy video source
/*********************************************************************/
typedef CUresult CUDAAPI tcuvidDestroyVideoSource(CUvideosource obj);
/******************************************************************************************/
//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
//! Set video source state
/******************************************************************************************/
typedef CUresult CUDAAPI tcuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
/******************************************************************************************/
//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
//! Get video source state
/******************************************************************************************/
typedef cudaVideoState CUDAAPI tcuvidGetVideoSourceState(CUvideosource obj);
/****************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
//! Gets details of video stream in pvidfmt
/****************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
/****************************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
//! Get audio source format
//! NVDECODE API is intended for HW accelarated video decoding so CUvideosource doesn't have audio demuxer for all suppported
//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
/****************************************************************************************************************/
typedef CUresult CUDAAPI tcuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
#endif
/**********************************************************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERDISPINFO
//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
/**********************************************************************************/
typedef struct _CUVIDPARSERDISPINFO
{
int picture_index; /**< OUT: Index of the current picture */
int progressive_frame; /**< OUT: 1 if progressive frame; 0 otherwise */
int top_field_first; /**< OUT: 1 if top field is displayed first; 0 otherwise */
int repeat_first_field; /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling,
-1=unpaired field) */
CUvideotimestamp timestamp; /**< OUT: Presentation time stamp */
} CUVIDPARSERDISPINFO;
/***********************************************************************************************************************/
//! Parser callbacks
//! The parser will call these synchronously from within cuvidParseVideoData(), whenever a picture is ready to
//! be decoded and/or displayed. First argument in functions is "void *pUserData" member of structure CUVIDSOURCEPARAMS
/***********************************************************************************************************************/
typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
/**************************************/
//! \ingroup STRUCTS
//! \struct CUVIDPARSERPARAMS
//! Used in cuvidCreateVideoParser API
/**************************************/
typedef struct _CUVIDPARSERPARAMS
{
cudaVideoCodec CodecType; /**< IN: cudaVideoCodec_XXX */
unsigned int ulMaxNumDecodeSurfaces; /**< IN: Max # of decode surfaces (parser will cycle through these) */
unsigned int ulClockRate; /**< IN: Timestamp units in Hz (0=default=10000000Hz) */
unsigned int ulErrorThreshold; /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always
IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */
unsigned int ulMaxDisplayDelay; /**< IN: Max display queue delay (improves pipelining of decode with display)
0=no delay (recommended values: 2..4) */
unsigned int uReserved1[5]; /**< IN: Reserved for future use - set to 0 */
void *pUserData; /**< IN: User data for callbacks */
PFNVIDSEQUENCECALLBACK pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
PFNVIDDECODECALLBACK pfnDecodePicture; /**< IN: Called when a picture is ready to be decoded (decode order) */
PFNVIDDISPLAYCALLBACK pfnDisplayPicture; /**< IN: Called whenever a picture is ready to be displayed (display order) */
void *pvReserved2[7]; /**< Reserved for future use - set to NULL */
CUVIDEOFORMATEX *pExtVideoInfo; /**< IN: [Optional] sequence header data from system layer */
} CUVIDPARSERPARAMS;
/************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
//! Create video parser object and initialize
/************************************************************************************************/
typedef CUresult CUDAAPI tcuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
/************************************************************************************************/
//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
//! Parse the video data from source data packet in pPacket
//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and
//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding
/************************************************************************************************/
typedef CUresult CUDAAPI tcuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
/*******************************************************************/
//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
/*******************************************************************/
typedef CUresult CUDAAPI tcuvidDestroyVideoParser(CUvideoparser obj);
extern tcuvidCreateVideoSource *cuvidCreateVideoSource;
extern tcuvidCreateVideoSourceW *cuvidCreateVideoSourceW;
extern tcuvidDestroyVideoSource *cuvidDestroyVideoSource;
extern tcuvidSetVideoSourceState *cuvidSetVideoSourceState;
extern tcuvidGetVideoSourceState *cuvidGetVideoSourceState;
extern tcuvidGetSourceVideoFormat *cuvidGetSourceVideoFormat;
extern tcuvidGetSourceAudioFormat *cuvidGetSourceAudioFormat;
extern tcuvidCreateVideoParser *cuvidCreateVideoParser;
extern tcuvidParseVideoData *cuvidParseVideoData;
extern tcuvidDestroyVideoParser *cuvidDestroyVideoParser;
/**********************************************************************************************/
#if defined(__cplusplus)
}
#endif /* __cplusplus */
#endif // __NVCUVID_H__
#include "utils.h"
using namespace cv::cuda;
const char *INPUT_BLOB_NAME = "Input";
static Logger gLogger;
// TODO: refactor once done
static bool globalRunInInt8 = false;
#define RETURN_AND_LOG(ret, severity, message) \
do \
{ \
std::string error_message = "sample_uff_ssd: " + std::string(message); \
gLogger.log(ILogger::Severity::k##severity, error_message.c_str()); \
return (ret); \
} while (0)
const int OUTPUT_CLS_SIZE = 91;
const int OUTPUT_BBOX_SIZE = OUTPUT_CLS_SIZE * 4;
const char *OUTPUT_BLOB_NAME0 = "NMS";
//INT8 Calibration, currently set to calibrate over 100 images
static constexpr int CAL_BATCH_SIZE = 50;
static constexpr int FIRST_CAL_BATCH = 0, NB_CAL_BATCHES = 10;
// Concat layers
// mbox_priorbox, mbox_loc, mbox_conf
const int concatAxis[2] = {1, 1};
const bool ignoreBatch[2] = {false, false};
DetectionOutputParameters detectionOutputParam{true, false, 0, OUTPUT_CLS_SIZE, 100, 100, 0.5, 0.6, CodeTypeSSD::TF_CENTER, {0, 2, 1}, true, true};
// Visualization
const float visualizeThreshold = 0.5;
void printOutput(int64_t eltCount, DataType dtype, void *buffer)
{
std::cout << eltCount << " eltCount" << std::endl;
assert(samplesCommon::getElementSize(dtype) == sizeof(float));
std::cout << "--- OUTPUT ---" << std::endl;
size_t memSize = eltCount * samplesCommon::getElementSize(dtype);
float *outputs = new float[eltCount];
CHECK_TRT(cudaMemcpyAsync(outputs, buffer, memSize, cudaMemcpyDeviceToHost));
int maxIdx = std::distance(outputs, std::max_element(outputs, outputs + eltCount));
for (int64_t eltIdx = 0; eltIdx < eltCount; ++eltIdx)
{
std::cout << eltIdx << " => " << outputs[eltIdx] << "\t : ";
if (eltIdx == maxIdx)
std::cout << "***";
std::cout << "\n";
}
std::cout << std::endl;
delete[] outputs;
}
std::string locateFile(const std::string &input)
{
std::vector<std::string> dirs{"data/ssd/",
"data/ssd/VOC2007/",
"data/ssd/VOC2007/PPMImages/",
"data/samples/ssd/",
"data/samples/ssd/VOC2007/",
"data/samples/ssd/VOC2007/PPMImages/"};
return locateFile(input, dirs);
}
void populateTFInputData(float *data)
{
auto graphFileName = locateFile("inp_bus.txt");
std::ifstream labelFile(graphFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
istringstream iss(line);
float num;
iss >> num;
data[id++] = num;
}
return;
}
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName)
{
std::ifstream labelFile(labelFileName);
string line;
int id = 0;
while (getline(labelFile, line))
{
CLASSES.push_back(line);
}
return;
}
std::vector<std::pair<int64_t, DataType>>
calculateBindingBufferSizes(const ICudaEngine &engine, int nbBindings, int batchSize)
{
std::vector<std::pair<int64_t, DataType>> sizes;
for (int i = 0; i < nbBindings; ++i)
{
Dims dims = engine.getBindingDimensions(i);
DataType dtype = engine.getBindingDataType(i);
int64_t eltCount = samplesCommon::volume(dims) * batchSize;
sizes.push_back(std::make_pair(eltCount, dtype));
}
return sizes;
}
ICudaEngine *loadModelAndCreateEngine(const char *uffFile, int maxBatchSize,
IUffParser *parser, IInt8Calibrator *calibrator, IHostMemory *&trtModelStream, bool isInt8)
{
// Create the builder
IBuilder *builder = createInferBuilder(gLogger);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition *network = builder->createNetwork();
std::cout << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
RETURN_AND_LOG(nullptr, ERROR, "Fail to parse");
std::cout << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(1_GB); // We need about 1GB of scratch space for the plugin layer for batch size 5.
builder->setHalf2Mode(false);
if (isInt8)
{
builder->setInt8Mode(true);
builder->setInt8Calibrator(calibrator);
}
std::cout << "Begin building engine..." << std::endl;
ICudaEngine *engine = builder->buildCudaEngine(*network);
if (!engine)
RETURN_AND_LOG(nullptr, ERROR, "Unable to create engine");
std::cout << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
// Serialize the engine, then close everything down.
trtModelStream = engine->serialize();
builder->destroy();
shutdownProtobufLibrary();
return engine;
}
void doInference(IExecutionContext &context, float *inputData, float *detectionOut, int *keepCount, int batchSize)
{
const ICudaEngine &engine = context.getEngine();
// Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(),
// of these, but in this case we know that there is exactly 1 input and 2 output.
int nbBindings = engine.getNbBindings();
std::vector<void *> buffers(nbBindings);
std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings().
int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);
for (int i = 0; i < nbBindings; ++i)
{
// inputData is already allocated on the device
if (i == inputIndex)
{
continue;
}
auto bufferSizesOutput = buffersSizes[i];
buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
}
cudaStream_t stream;
CHECK_TRT(cudaStreamCreate(&stream));
// make sure the data we are about to use is allocated on the GPU
cudaPointerAttributes attributes;
cudaError_t err = cudaPointerGetAttributes(&attributes, inputData);
#if CUDART_VERSION >= 10000
assert(err != cudaErrorInvalidValue && attributes.type == cudaMemoryTypeDevice);
#else
assert(err != cudaErrorInvalidValue && attributes.memoryType == cudaMemoryTypeDevice);
#endif
buffers[inputIndex] = inputData;
auto t_start = std::chrono::high_resolution_clock::now();
context.execute(batchSize, &buffers[0]);
auto t_end = std::chrono::high_resolution_clock::now();
float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
//std::cout << "Time taken for inference is " << total << " ms." << std::endl;
for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
{
if (engine.bindingIsInput(bindingIdx))
continue;
#ifdef SSD_INT8_DEBUG
auto bufferSizesOutput = buffersSizes[bindingIdx];
printOutput(bufferSizesOutput.first, bufferSizesOutput.second,
buffers[bindingIdx]);
#endif
}
CHECK_TRT(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK_TRT(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
// Release the stream and the buffers
cudaStreamDestroy(stream);
CHECK_TRT(cudaFree(buffers[inputIndex]));
CHECK_TRT(cudaFree(buffers[outputIndex0]));
CHECK_TRT(cudaFree(buffers[outputIndex1]));
}
class FlattenConcat : public IPluginV2
{
public:
FlattenConcat(int concatAxis, bool ignoreBatch)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
{
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
}
//clone constructor
FlattenConcat(int concatAxis, bool ignoreBatch, int numInputs, int outputConcatAxis, int* inputConcatAxis)
: mIgnoreBatch(ignoreBatch)
, mConcatAxisID(concatAxis)
, mOutputConcatAxis(outputConcatAxis)
, mNumInputs(numInputs)
{
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
for (int i = 0; i < mNumInputs; ++i)
mInputConcatAxis[i] = inputConcatAxis[i];
}
FlattenConcat(const void* data, size_t length)
{
const char *d = reinterpret_cast<const char*>(data), *a = d;
mIgnoreBatch = read<bool>(d);
mConcatAxisID = read<int>(d);
assert(mConcatAxisID == 1 || mConcatAxisID == 2 || mConcatAxisID == 3);
mOutputConcatAxis = read<int>(d);
mNumInputs = read<int>(d);
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
CHECK_TRT(cudaMallocHost((void**) &mCopySize, mNumInputs * sizeof(int)));
std::for_each(mInputConcatAxis, mInputConcatAxis + mNumInputs, [&](int& inp) { inp = read<int>(d); });
mCHW = read<nvinfer1::DimsCHW>(d);
std::for_each(mCopySize, mCopySize + mNumInputs, [&](size_t& inp) { inp = read<size_t>(d); });
assert(d == a + length);
}
~FlattenConcat()
{
if (mInputConcatAxis)
CHECK_TRT(cudaFreeHost(mInputConcatAxis));
if (mCopySize)
CHECK_TRT(cudaFreeHost(mCopySize));
}
int getNbOutputs() const override { return 1; }
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override
{
assert(nbInputDims >= 1);
assert(index == 0);
mNumInputs = nbInputDims;
CHECK_TRT(cudaMallocHost((void**) &mInputConcatAxis, mNumInputs * sizeof(int)));
mOutputConcatAxis = 0;
#ifdef SSD_INT8_DEBUG
std::cout << " Concat nbInputs " << nbInputDims << "\n";
std::cout << " Concat axis " << mConcatAxisID << "\n";
for (int i = 0; i < 6; ++i)
for (int j = 0; j < 3; ++j)
std::cout << " Concat InputDims[" << i << "]"
<< "d[" << j << " is " << inputs[i].d[j] << "\n";
#endif
for (int i = 0; i < nbInputDims; ++i)
{
int flattenInput = 0;
assert(inputs[i].nbDims == 3);
if (mConcatAxisID != 1)
assert(inputs[i].d[0] == inputs[0].d[0]);
if (mConcatAxisID != 2)
assert(inputs[i].d[1] == inputs[0].d[1]);
if (mConcatAxisID != 3)
assert(inputs[i].d[2] == inputs[0].d[2]);
flattenInput = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2];
mInputConcatAxis[i] = flattenInput;
mOutputConcatAxis += mInputConcatAxis[i];
}
return DimsCHW(mConcatAxisID == 1 ? mOutputConcatAxis : 1,
mConcatAxisID == 2 ? mOutputConcatAxis : 1,
mConcatAxisID == 3 ? mOutputConcatAxis : 1);
}
int initialize() override
{
CHECK_TRT(cublasCreate(&mCublas));
return 0;
}
void terminate() override
{
CHECK_TRT(cublasDestroy(mCublas));
}
size_t getWorkspaceSize(int) const override { return 0; }
int enqueue(int batchSize, const void* const* inputs, void** outputs, void*, cudaStream_t stream) override
{
int numConcats = 1;
assert(mConcatAxisID != 0);
numConcats = std::accumulate(mCHW.d, mCHW.d + mConcatAxisID - 1, 1, std::multiplies<int>());
if (!mIgnoreBatch)
numConcats *= batchSize;
float* output = reinterpret_cast<float*>(outputs[0]);
int offset = 0;
for (int i = 0; i < mNumInputs; ++i)
{
const float* input = reinterpret_cast<const float*>(inputs[i]);
float* inputTemp;
CHECK_TRT(cudaMalloc(&inputTemp, mCopySize[i] * batchSize));
CHECK_TRT(cudaMemcpyAsync(inputTemp, input, mCopySize[i] * batchSize, cudaMemcpyDeviceToDevice, stream));
for (int n = 0; n < numConcats; ++n)
{
CHECK_TRT(cublasScopy(mCublas, mInputConcatAxis[i],
inputTemp + n * mInputConcatAxis[i], 1,
output + (n * mOutputConcatAxis + offset), 1));
}
CHECK_TRT(cudaFree(inputTemp));
offset += mInputConcatAxis[i];
}
return 0;
}
size_t getSerializationSize() const override
{
return sizeof(bool) + sizeof(int) * (3 + mNumInputs) + sizeof(nvinfer1::Dims) + (sizeof(mCopySize) * mNumInputs);
}
void serialize(void* buffer) const override
{
char *d = reinterpret_cast<char*>(buffer), *a = d;
write(d, mIgnoreBatch);
write(d, mConcatAxisID);
write(d, mOutputConcatAxis);
write(d, mNumInputs);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mInputConcatAxis[i]);
}
write(d, mCHW);
for (int i = 0; i < mNumInputs; ++i)
{
write(d, mCopySize[i]);
}
assert(d == a + getSerializationSize());
}
void configureWithFormat(const Dims* inputs, int nbInputs, const Dims* outputDims, int nbOutputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, int maxBatchSize) override
{
assert(nbOutputs == 1);
mCHW = inputs[0];
assert(inputs[0].nbDims == 3);
CHECK_TRT(cudaMallocHost((void**) &mCopySize, nbInputs * sizeof(int)));
for (int i = 0; i < nbInputs; ++i)
{
mCopySize[i] = inputs[i].d[0] * inputs[i].d[1] * inputs[i].d[2] * sizeof(float);
}
}
bool supportsFormat(DataType type, PluginFormat format) const override
{
return (type == DataType::kFLOAT && format == PluginFormat::kNCHW);
}
const char* getPluginType() const override { return "FlattenConcat_TRT"; }
const char* getPluginVersion() const override { return "1"; }
void destroy() override { delete this; }
IPluginV2* clone() const override
{
return new FlattenConcat(mConcatAxisID, mIgnoreBatch, mNumInputs, mOutputConcatAxis, mInputConcatAxis);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
template <typename T>
void write(char*& buffer, const T& val) const
{
*reinterpret_cast<T*>(buffer) = val;
buffer += sizeof(T);
}
template <typename T>
T read(const char*& buffer)
{
T val = *reinterpret_cast<const T*>(buffer);
buffer += sizeof(T);
return val;
}
size_t* mCopySize = nullptr;
bool mIgnoreBatch{false};
int mConcatAxisID{0}, mOutputConcatAxis{0}, mNumInputs{0};
int* mInputConcatAxis = nullptr;
nvinfer1::Dims mCHW;
cublasHandle_t mCublas;
std::string mNamespace;
};
namespace
{
const char *FLATTENCONCAT_PLUGIN_VERSION{"1"};
const char *FLATTENCONCAT_PLUGIN_NAME{"FlattenConcat_TRT"};
} // namespace
class FlattenConcatPluginCreator : public IPluginCreator
{
public:
FlattenConcatPluginCreator()
{
mPluginAttributes.emplace_back(PluginField("axis", nullptr, PluginFieldType::kINT32, 1));
mPluginAttributes.emplace_back(PluginField("ignoreBatch", nullptr, PluginFieldType::kINT32, 1));
mFC.nbFields = mPluginAttributes.size();
mFC.fields = mPluginAttributes.data();
}
~FlattenConcatPluginCreator() {}
const char* getPluginName() const override { return FLATTENCONCAT_PLUGIN_NAME; }
const char* getPluginVersion() const override { return FLATTENCONCAT_PLUGIN_VERSION; }
const PluginFieldCollection* getFieldNames() override { return &mFC; }
IPluginV2* createPlugin(const char* name, const PluginFieldCollection* fc) override
{
const PluginField* fields = fc->fields;
for (int i = 0; i < fc->nbFields; ++i)
{
const char* attrName = fields[i].name;
if (!strcmp(attrName, "axis"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mConcatAxisID = *(static_cast<const int*>(fields[i].data));
}
if (!strcmp(attrName, "ignoreBatch"))
{
assert(fields[i].type == PluginFieldType::kINT32);
mIgnoreBatch = *(static_cast<const bool*>(fields[i].data));
}
}
return new FlattenConcat(mConcatAxisID, mIgnoreBatch);
}
IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override
{
//This object will be deleted when the network is destroyed, which will
//call Concat::destroy()
return new FlattenConcat(serialData, serialLength);
}
void setPluginNamespace(const char* libNamespace) override { mNamespace = libNamespace; }
const char* getPluginNamespace() const override { return mNamespace.c_str(); }
private:
static PluginFieldCollection mFC;
bool mIgnoreBatch{false};
int mConcatAxisID;
static std::vector<PluginField> mPluginAttributes;
std::string mNamespace = "";
};
PluginFieldCollection FlattenConcatPluginCreator::mFC{};
std::vector<PluginField> FlattenConcatPluginCreator::mPluginAttributes;
REGISTER_TENSORRT_PLUGIN(FlattenConcatPluginCreator);
// 1. convert image to the right size
// 2. convert to float
// 3. normalize for inception
// 4. convert to flat vector, channels first
float * normalize_for_trt(const cv::cuda::GpuMat &img)
{
cv::Size size(INPUT_W, INPUT_H);
cv::cuda::GpuMat resizedMat;
cv::cuda::resize(img, resizedMat, size, 0, 0, CV_INTER_LINEAR);
cv::cuda::cvtColor(resizedMat, resizedMat, cv::COLOR_BGRA2RGB);
unsigned volChl = INPUT_H * INPUT_W;
float * data = (float *)samplesCommon::safeCudaMalloc(INPUT_C * volChl * sizeof(float));
// we treat the memory as if it's a one-channel, one row image
int rowSize = (int)resizedMat.step / (int)resizedMat.elemSize1();
// CUDA kernel to reshape the non-continuous GPU Mat structure and make it channel-first continuous
channelFirst(resizedMat.ptr<uint8_t>(), data, volChl, INPUT_C, INPUT_W * INPUT_C, rowSize);
return data;
}
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8)
{
initLibNvInferPlugins(&gLogger, "");
const int N = 10;
std::cout << graphFileName << std::endl;
auto parser = createUffParser();
BatchStream calibrationStream(CAL_BATCH_SIZE, NB_CAL_BATCHES);
parser->registerInput("Input", DimsCHW(INPUT_C, INPUT_H, INPUT_W), UffInputOrder::kNCHW);
parser->registerOutput("MarkOutput_0");
IHostMemory *trtModelStream{nullptr};
Int8EntropyCalibrator calibrator(calibrationStream, FIRST_CAL_BATCH, "CalibrationTableSSD");
ICudaEngine *tmpEngine = loadModelAndCreateEngine(graphFileName.c_str(), N, parser, &calibrator, trtModelStream, isInt8);
assert(tmpEngine != nullptr);
assert(trtModelStream != nullptr);
tmpEngine->destroy();
// Read a random sample image.
srand(unsigned(time(nullptr)));
// Deserialize the engine.
std::cout << "*** deserializing" << std::endl;
IRuntime *runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine *engine = runtime->deserializeCudaEngine(trtModelStream->data(), trtModelStream->size(), nullptr);
assert(engine != nullptr);
trtModelStream->destroy();
IExecutionContext *context = engine->createExecutionContext();
assert(context != nullptr);
return std::make_tuple(runtime, engine, context);
}
// mat representation of the image,
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat &img, IExecutionContext * context, vector<std::string>& CLASSES)
{
const int N = 1;
float * data = normalize_for_trt(img);
const std::string outFileRoot = "/home/borisk/images/";
// Host memory for outputs.
vector<float> detectionOut(N * detectionOutputParam.keepTopK * 7);
vector<int> keepCount(N);
// Run inference. This will also free the "data" pointer
doInference(*context, data, &detectionOut[0], &keepCount[0], N);
return std::make_tuple(detectionOut, keepCount);
}
\ No newline at end of file
#include "inference_base.h"
using tensorflow::Status;
using namespace std;
using namespace cv;
using namespace std::chrono;
int InferenceBase::ReadClassLabels()
{
Status readLabelsMapStatus = readLabelsMapFile(labelsFile, labelsMap);
if (!readLabelsMapStatus.ok())
{
LOG(ERROR) << "readLabelsMapFile(): ERROR" << readLabelsMapFile;
return -1;
}
else
LOG(INFO) << "readLabelsMapFile(): labels map loaded with " << labelsMap.size() << " label(s)" << endl;
return 0;
}
void InferenceBase::InitCuda()
{
void *hHandleDriver = nullptr;
CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
cuda_res = cuvidInit(0);
if (cuda_res != CUDA_SUCCESS)
{
throw exception();
}
std::cout << "CUDA init: SUCCESS" << endl;
cv::cuda::printCudaDeviceInfo(cv::cuda::getDevice());
isCudaInited = true;
}
int InferenceBase::Init(string videoStream)
{
if (!isCudaInited)
{
InitCuda();
}
if (ReadClassLabels() != 0)
{
LOG(ERROR) << "ReadClassLabels returned non-zero\n";
return -1;
}
LOG(INFO) << "CUDA INIT DONE\n";
/*
if (ReadGraph() != 0)
{
LOG(ERROR) << "Could not load inference graph";
return -1;
}
LOG(INFO) << "Inference graph loaded";
// create video stream
d_reader = GetVideoReader(videoStream);
if (d_reader == nullptr)
{
LOG(ERROR) << "Could not create video stream";
throw exception();
}
// save off frame dimensions
auto formatStruct = d_reader->format();
width = formatStruct.width;
height = formatStruct.height;
*/
isInitialized = true;
return 0;
}
void InferenceBase::RunInferenceOnStream()
{
if (!isInitialized)
{
LOG(ERROR) << "Video streaming not initialized";
return;
}
cuda::GpuMat d_frame;
int iFrame = 0, nFrames = 30;
double fps = 0., infer_tf_ms = 0.;
high_resolution_clock::time_point start = high_resolution_clock::now();
high_resolution_clock::time_point end;
double duration = 0.;
for (;;)
{
start = high_resolution_clock::now();
if (!d_reader->nextFrame(d_frame))
{
break;
}
if (doInference(d_frame) != 0)
{
LOG(ERROR) << "Inference failed";
return;
}
end = high_resolution_clock::now();
duration += (double) duration_cast<milliseconds>(end - start).count();
visualize(d_frame, fps);
if (++iFrame % nFrames == 0)
{
fps = 1. * nFrames / duration * 1000.;
duration = 0.;
}
if (iFrame % 100 == 0)
{
LOG(INFO) << "Speed: " << to_string(fps).substr(0, 5);
}
}
}
#pragma once
#include "utils.h"
using namespace std;
class InferenceBase
{
private:
bool isCudaInited;
cv::Ptr<cv::cudacodec::VideoReader> GetVideoReader(string video_file)
{return cv::cudacodec::createVideoReader(video_file);}
protected:
string labelsFile;
string graphFile;
map<int, string> labelsMap;
virtual int ReadClassLabels();
virtual int ReadGraph() = 0;
void InitCuda();
cv::Ptr<cv::cudacodec::VideoReader> d_reader;
double thresholdScore;
double thresholdIOU;
// frame width and height
int height;
int width;
int debug;
bool isInitialized;
public:
InferenceBase(const string &labelsFile, const string &graphFile, double threshScore, double threshIOU, int dbg)
: labelsFile(labelsFile)
, graphFile(graphFile)
, isCudaInited(false)
, thresholdScore(threshScore)
, thresholdIOU(threshIOU)
, isInitialized(false)
, labelsMap()
, width(1280)
, height(720)
, debug(dbg)
{}
virtual ~InferenceBase() {}
void RunInferenceOnStream();
virtual int doInference(cv::cuda::GpuMat&) = 0;
virtual void visualize(cv::cuda::GpuMat&, double) = 0;
virtual int Init(string video_stream);
map<int, string> get_labels_map() {return labelsMap;}
void set_debug(int dbg) {debug = dbg;}
};
#include "inference_tf.h"
using tensorflow::Status;
using tensorflow::Tensor;
using namespace cv;
using tensorflow::int32;
int InferenceTensorflow::ReadGraph()
{
LOG(INFO) << "graphFile:" << graphFile;
Status loadGraphStatus = loadGraph(graphFile, &session);
if (!loadGraphStatus.ok())
{
LOG(ERROR) << "loadGraph(): ERROR" << loadGraphStatus;
return -1;
}
else
LOG(INFO) << "loadGraph(): frozen graph loaded" << endl;
return 0;
}
// allocate input tensor
int InferenceTensorflow::Init(string videoStream)
{
if (InferenceBase::Init(videoStream) != 0)
{
LOG(INFO) << "Init(videostream) exit non-zero (aka huge fail)";
return -1;
}
LOG(INFO) << "Init(videostream): PASS\n";
LOG(INFO) << "The session must exist at this point, see loadGraph() in inference_base.cpp";
// configure callable options
opts.add_feed(inputLayer);
for (auto const &value : outputLayer)
{
opts.add_fetch(value);
}
const string gpu_device_name = GPUDeviceName(session.get());
opts.clear_fetch_devices();
opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});
auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok())
{
LOG(ERROR) << "Failed to make callable";
}
LOG(INFO) << "Shape of the GPU tensor: (1, " << height << ", " << width << ", 3)\n";
// allocate tensor on the GPU
tensorflow::TensorShape shape = tensorflow::TensorShape({1, height, width, 3});
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
new tensorflow::GPUMemAllocator(
tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
platform_gpu_id, false /*use_unified_memory*/, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
LOG(INFO) << "Is Cuda Tensor: " << IsCUDATensor(inputTensor);
return 0;
}
int InferenceTensorflow::doInference(cv::cuda::GpuMat &d_frame)
{
Status runStatus;
readTensorFromGpuMat(d_frame, inputTensor);
runStatus = session->RunCallable(feed_gpu_fetch_cpu, {inputTensor}, &outputs, nullptr);
if (!runStatus.ok())
{
LOG(ERROR) << "Running model failed: " << runStatus;
return -1;
}
return 0;
}
void InferenceTensorflow::visualize(cv::cuda::GpuMat &d_frame, double fps)
{
// Extract results from the outputs vector
tensorflow::TTypes<float>::Flat scores = outputs[1].flat<float>();
tensorflow::TTypes<float>::Flat classes = outputs[2].flat<float>();
tensorflow::TTypes<float>::Flat numDetections = outputs[3].flat<float>();
tensorflow::TTypes<float, 3>::Tensor boxes = outputs[0].flat_outer_dims<float, 3>();
vector<size_t> goodIdxs = filterBoxes(scores, boxes, thresholdIOU, thresholdScore);
if (debug & 0x1)
{
for (size_t i = 0; i < goodIdxs.size(); i++)
LOG(INFO) << "score:" << scores(goodIdxs.at(i)) << ",class:" << labelsMap[classes(goodIdxs.at(i))]
<< " (" << classes(goodIdxs.at(i)) << "), box:"
<< "," << boxes(0, goodIdxs.at(i), 0) << ","
<< boxes(0, goodIdxs.at(i), 1) << "," << boxes(0, goodIdxs.at(i), 2) << ","
<< boxes(0, goodIdxs.at(i), 3);
}
// Draw bboxes and captions
if (debug & 0x2)
{
Mat frame;
d_frame.download(frame);
drawBoundingBoxesOnImage(frame, scores, classes, boxes, labelsMap, goodIdxs);
auto color = Scalar(255, 0, 255);
drawFrameworkSignature(frame, fps, "Tensorflow", color);
}
}
#pragma once
#include "inference_base.h"
using namespace std;
using tensorflow::CallableOptions;
using tensorflow::Tensor;
using tensorflow::Session;
class InferenceTensorflow : public InferenceBase
{
private:
const string inputLayer = "image_tensor:0";
const vector<string> outputLayer = {"detection_boxes:0", "detection_scores:0", "detection_classes:0", "num_detections:0"};
CallableOptions opts;
std::unique_ptr<tensorflow::Session> session;
Session::CallableHandle feed_gpu_fetch_cpu;
// Allocate input tensor on the gpu
Tensor inputTensor;
vector<Tensor> outputs;
protected:
int ReadGraph() override;
int doInference(cv::cuda::GpuMat& d_frame) override;
void visualize(cv::cuda::GpuMat &d_frame, double) override;
public:
InferenceTensorflow(const string &labelsFile, const string &graphFile, double threshScore = 0.5, double threshIOU = 0.8, int dbg = 0)
: InferenceBase(labelsFile, graphFile, threshScore, threshIOU, dbg)
, opts()
{ }
int Init(string videoStream) override;
virtual ~InferenceTensorflow() { session->ReleaseCallable(feed_gpu_fetch_cpu);}
};
\ No newline at end of file
#include "inference_base.h" #include "cublas_v2.h"
#include "inference_tf.h"
#include <cuda_profiler_api.h> // Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
#include "tensorflow/core/common_runtime/direct_session.h"
#include "tensorflow/cc/framework/scope.h"
#include "tensorflow/cc/ops/standard_ops.h"
// CUDA includes. Order matters
#include "dynlink_nvcuvid.h"
#include "cuda_runtime_api.h"
// CUDA kernel function (my)
#include "array.h"
using namespace std;
using tensorflow::CallableOptions; using tensorflow::CallableOptions;
using tensorflow::int32; using tensorflow::DeviceAttributes;
using tensorflow::Scope;
using tensorflow::Session;
using tensorflow::Status; using tensorflow::Status;
using tensorflow::string;
using tensorflow::Tensor; using tensorflow::Tensor;
using namespace std;
using namespace cv;
using namespace std::chrono;
int main(int argc, char *argv[]) Status loadGraph(unique_ptr<tensorflow::Session> *session){
{
if (!tensorflow::IsGoogleCudaEnabled()) tensorflow::GraphDef graph_def;
{
LOG(ERROR) << "Tensorflow built without CUDA. Rebuild with -c opt --config=cuda"; using namespace tensorflow;
using namespace tensorflow::ops;
auto scope = Scope::NewRootScope();
// TF likes power of 2
tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
auto a = Placeholder(scope.WithOpName("array_tensor_in"), DT_UINT8, Placeholder::Shape(shape));
auto b = Identity(scope.WithOpName("array_tensor_out"), a);
TF_CHECK_OK(scope.ToGraphDef(&graph_def));
//tensorflow::WriteTextProto(Env::Default(), "mygraph.pbtxt", graph_def);
tensorflow::SessionOptions session_options;
session_options.config.mutable_gpu_options()->set_allow_growth(true);
//session_options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.1);
session->reset(tensorflow::NewSession(session_options));
Status session_create_status = (*session)->Create(graph_def);
if (!session_create_status.ok()){
LOG(ERROR) << "loadGraph(): ERROR" << session_create_status;
}
return Status::OK();
}
bool IsCUDATensor(const Tensor &t){
cudaPointerAttributes attributes;
cudaError_t err =
cudaPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == cudaErrorInvalidValue)
return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
#if CUDART_VERSION >= 10000
return (attributes.type == cudaMemoryTypeDevice);
#else
return (attributes.memoryType == cudaMemoryTypeDevice);
#endif
}
string GPUDeviceName(Session* session) {
std::vector<DeviceAttributes> devices;
TF_CHECK_OK(session->ListDevices(&devices));
for (const DeviceAttributes& d : devices) {
LOG(INFO) << "Device: " << d.name();
if (d.device_type() == "GPU" || d.device_type() == "gpu") {
return d.name();
}
}
return "";
}
int main(int, char**) {
// check: TF built with CUDA support
if (!tensorflow::IsGoogleCudaEnabled()){
LOG(INFO) << "Tensorflow built without CUDA. Rebuild with -c opt --config=cuda";
return -1; return -1;
}else{
LOG(INFO) << "Tensorflow built with CUDA, keep running" << endl;
} }
const String keys = // check and init CUDA drivers and libs
"{d display |1 | view video while objects are detected}" void *hHandleDriver = nullptr;
//"{t tensorrt|false | use tensorrt}" CUresult cuda_res = cuInit(0, __CUDA_API_VERSION, hHandleDriver);
//"{i int8|false| use INT8 (requires callibration)}" if (cuda_res != CUDA_SUCCESS)
"{v video | | video for detection}" {
"{graph ||frozen graph location}" throw exception();
"{labels ||trained labels filelocation}"; }
cuda_res = cuvidInit(0);
// Set dirs variables if (cuda_res != CUDA_SUCCESS)
string ROOTDIR = ""; {
throw exception();
CommandLineParser parser(argc, argv, keys); }
int showWindow = parser.get<int>("d"); LOG(INFO) << "\033[1;32m" << "CUDA init: ok" << "\033[0m";
String video_file = parser.get<String>("v");
//bool is_tensor_rt = parser.get<bool>("t"); std::unique_ptr<tensorflow::Session> session;
//bool is_int8 = parser.get<bool>("i");
String LABELS = parser.get<String>("labels"); loadGraph(&session);
String GRAPH = parser.get<String>("graph");
const string inputLayer = "array_tensor_in:0";
unique_ptr<InferenceBase> infer((InferenceBase *) new InferenceTensorflow(LABELS, GRAPH)); const string outputLayer = "array_tensor_out:0";
infer->set_debug(showWindow); // do the opts
CallableOptions opts;
cout << "Init()\n"; Session::CallableHandle feed_gpu_fetch_cpu;
infer->Init(video_file); opts.add_feed(inputLayer);
opts.add_fetch(outputLayer);
// never reached?
cout << "EXIT 0\n"; const string gpu_device_name = GPUDeviceName(session.get());
//infer->RunInferenceOnStream();
opts.clear_fetch_devices();
opts.mutable_feed_devices()->insert({inputLayer, gpu_device_name});
auto runStatus = session->MakeCallable(opts, &feed_gpu_fetch_cpu);
if (!runStatus.ok())
{
LOG(ERROR) << "\033[1;31m" << "Failed to make callable" << "\033[0m";
}
// TF likes power of 2
tensorflow::TensorShape shape = tensorflow::TensorShape({1,32,32,1});
// allocate tensor on the GPU
tensorflow::PlatformGpuId platform_gpu_id(0);
tensorflow::GPUMemAllocator *sub_allocator =
new tensorflow::GPUMemAllocator(
tensorflow::GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(),
platform_gpu_id, false, {}, {});
tensorflow::GPUBFCAllocator *allocator =
new tensorflow::GPUBFCAllocator(sub_allocator, shape.num_elements() * sizeof(tensorflow::uint8), "GPU_0_bfc");
auto inputTensor = Tensor(allocator, tensorflow::DT_UINT8, shape);
LOG(INFO) << "\033[1;37m" << "Is CUDA Tensor? " << (IsCUDATensor(inputTensor)?"\033[1;32myes":"\033[1;31mno") << "\033[0m";
tensorflow::uint8 *p = inputTensor.flat<tensorflow::uint8>().data();
unsigned char* ptr;
//ptr = p;
// Testing array initialization
//myCreateCUDAArray(ptr);
//check tensor data here
session->ReleaseCallable(feed_gpu_fetch_cpu);
return 0; return 0;
} }
./build/tf_detector_example -d=$1 \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/home/boris/model/frozen_inference_graph.pb \
-labels=/home/boris/model/mscoco_label_map.pbtxt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
./build/tf_detector_example \
-d=$1 \
-i \
-t \
-v=/home/boris/Videos/ride_2.mp4 \
-graph=/usr/src/tensorrt/data/ssd/sample_ssd_relu6.uff \
-labels=/usr/src/tensorrt/data/ssd/ssd_coco_labels.txt
\ No newline at end of file
#include "utils.h"
using namespace std;
using namespace cv;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::string;
using tensorflow::int32;
using tensorflow::DeviceAttributes;
/** Read a model graph definition (xxx.pb) from disk, and creates a session object you can use to run it.
*/
Status loadGraph(const string &graph_file_name,
unique_ptr<tensorflow::Session> *session) {
tensorflow::GraphDef graph_def;
Status load_graph_status =
ReadBinaryProto(tensorflow::Env::Default(), graph_file_name, &graph_def);
if (!load_graph_status.ok()) {
return tensorflow::errors::NotFound("Failed to load compute graph at '",
graph_file_name, "'");
}
tensorflow::SessionOptions session_options;
session_options.config.mutable_gpu_options()->set_allow_growth(true);
session->reset(tensorflow::NewSession(session_options));
Status session_create_status = (*session)->Create(graph_def);
if (!session_create_status.ok()) {
return session_create_status;
}
return Status::OK();
}
/** Read a labels map file (xxx.pbtxt) from disk to translate class numbers into human-readable labels.
*/
Status readLabelsMapFile(const string &fileName, map<int, string> &labelsMap) {
// Read file into a string
ifstream t(fileName);
if (t.bad())
return tensorflow::errors::NotFound("Failed to load labels map at '", fileName, "'");
stringstream buffer;
buffer << t.rdbuf();
string fileString = buffer.str();
// Search entry patterns of type 'item { ... }' and parse each of them
smatch matcherEntry;
smatch matcherId;
smatch matcherName;
const regex reEntry("item \\{([\\S\\s]*?)\\}");
const regex reId("id: [0-9]+");
const regex reDisplayName("display_name: (\"|\').+(\"|\')");
const regex reName("name: (\"|\').+(\"|\')");
string entry;
const string namePrefix = "name: \"";
const string display_name = "display_name: \"";
const size_t idOffset = string("id: ").length();
size_t nameOffset = display_name.length();
// we first try to parse "display_name"
// and fall back if it does not exist
bool isParsingName = false;
auto stringBegin = sregex_iterator(fileString.begin(), fileString.end(), reEntry);
auto stringEnd = sregex_iterator();
int id;
string name;
for (sregex_iterator i = stringBegin; i != stringEnd; i++) {
matcherEntry = *i;
entry = matcherEntry.str();
regex_search(entry, matcherId, reId);
if (!matcherId.empty())
id = stoi(matcherId[0].str().substr(idOffset, matcherId[0].str().length() - idOffset));
else
continue;
if(!isParsingName)
{
regex_search(entry, matcherName, reDisplayName);
if(matcherName.empty())
{
isParsingName = true;
nameOffset = namePrefix.length();
}
}
if(isParsingName)
{
regex_search(entry, matcherName, reName);
}
if (!matcherName.empty())
name = matcherName[0].str().substr(nameOffset, matcherName[0].str().length() - nameOffset - 1);
else
continue;
labelsMap.insert(pair<int, string>(id, name));
}
return Status::OK();
}
/** Convert Mat image into tensor of shape (1, height, width, d) where last three dims are equal to the original dims.
*/
Status readTensorFromMat(const Mat &mat, Tensor &outTensor) {
// Trick from https://github.com/tensorflow/tensorflow/issues/8033
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
Mat fakeMat(mat.rows, mat.cols, CV_8UC3, p);
cv::cvtColor(mat, fakeMat, COLOR_BGR2RGB);
return Status::OK();
}
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor) {
tensorflow::uint8 *p = outTensor.flat<tensorflow::uint8>().data();
cv::cuda::GpuMat fakeMat(g_mat.rows, g_mat.cols, CV_8UC3, p);
// comes in with 4 channels -> 3 channels
cv::cuda::cvtColor(g_mat, fakeMat, COLOR_BGRA2RGB);
return Status::OK();
}
/** Draw bounding box and add caption to the image.
* Boolean flag _scaled_ shows if the passed coordinates are in relative units (true by default in tensorflow detection)
*/
void drawBoundingBoxOnImage(Mat &image, double yMin, double xMin, double yMax, double xMax, double score, string label, bool scaled) {
cv::Point tl, br;
if (scaled) {
tl = cv::Point((int) (xMin * image.cols), (int) (yMin * image.rows));
br = cv::Point((int) (xMax * image.cols), (int) (yMax * image.rows));
} else {
tl = cv::Point((int) xMin, (int) yMin);
br = cv::Point((int) xMax, (int) yMax);
}
cv::rectangle(image, tl, br, cv::Scalar(0, 255, 255), 1);
// Ceiling the score down to 3 decimals (weird!)
float scoreRounded = floorf(score * 1000) / 1000;
string scoreString = to_string(scoreRounded).substr(0, 5);
string caption = label + " (" + scoreString + ")";
// Adding caption of type "LABEL (X.XXX)" to the top-left corner of the bounding box
int fontCoeff = 12;
cv::Point brRect = cv::Point(tl.x + caption.length() * fontCoeff / 1.6, tl.y + fontCoeff);
cv::rectangle(image, tl, brRect, cv::Scalar(0, 255, 255), -1);
cv::Point textCorner = cv::Point(tl.x, tl.y + fontCoeff * 0.9);
cv::putText(image, caption, textCorner, FONT_HERSHEY_SIMPLEX, 0.4, cv::Scalar(255, 0, 0));
}
/** Draw bounding boxes and add captions to the image.
* Box is drawn only if corresponding score is higher than the _threshold_.
*/
void drawFrameworkSignature(Mat& image, double fps, string signature, Scalar& color)
{
putText(image, "TensorFlow", Point(0, image.rows - 30), FONT_HERSHEY_SIMPLEX, 0.7, color, 2);
putText(image, to_string(fps).substr(0, 5), Point(0, image.rows - 5), FONT_HERSHEY_SIMPLEX, 0.7, Scalar(255, 255, 255), 2);
imshow("stream", image);
waitKey(1);
}
void drawBoundingBoxesOnImage(Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
map<int, string> &labelsMap,
vector<size_t> &idxs) {
for (int j = 0; j < idxs.size(); j++)
drawBoundingBoxOnImage(image,
boxes(0,idxs.at(j),0), boxes(0,idxs.at(j),1),
boxes(0,idxs.at(j),2), boxes(0,idxs.at(j),3),
scores(idxs.at(j)), labelsMap[classes(idxs.at(j))]);
}
/** Calculate intersection-over-union (IOU) for two given bbox Rects.
*/
double IOU(Rect2f box1, Rect2f box2) {
float xA = max(box1.tl().x, box2.tl().x);
float yA = max(box1.tl().y, box2.tl().y);
float xB = min(box1.br().x, box2.br().x);
float yB = min(box1.br().y, box2.br().y);
float intersectArea = abs((xB - xA) * (yB - yA));
float unionArea = abs(box1.area()) + abs(box2.area()) - intersectArea;
return 1. * intersectArea / unionArea;
}
/** Return idxs of good boxes (ones with highest confidence score (>= thresholdScore)
* and IOU <= thresholdIOU with others).
*/
vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore) {
vector<size_t> sortIdxs(scores.size());
iota(sortIdxs.begin(), sortIdxs.end(), 0);
// Create set of "bad" idxs
set<size_t> badIdxs = set<size_t>();
size_t i = 0;
while (i < sortIdxs.size()) {
if (scores(sortIdxs.at(i)) < thresholdScore)
badIdxs.insert(sortIdxs[i]);
if (badIdxs.find(sortIdxs.at(i)) != badIdxs.end()) {
i++;
continue;
}
Rect2f box1 = Rect2f(Point2f(boxes(0, sortIdxs.at(i), 1), boxes(0, sortIdxs.at(i), 0)),
Point2f(boxes(0, sortIdxs.at(i), 3), boxes(0, sortIdxs.at(i), 2)));
for (size_t j = i + 1; j < sortIdxs.size(); j++) {
if (scores(sortIdxs.at(j)) < thresholdScore) {
badIdxs.insert(sortIdxs[j]);
continue;
}
Rect2f box2 = Rect2f(Point2f(boxes(0, sortIdxs.at(j), 1), boxes(0, sortIdxs.at(j), 0)),
Point2f(boxes(0, sortIdxs.at(j), 3), boxes(0, sortIdxs.at(j), 2)));
if (IOU(box1, box2) > thresholdIOU)
badIdxs.insert(sortIdxs[j]);
}
i++;
}
// Prepare "good" idxs for return
vector<size_t> goodIdxs = vector<size_t>();
for (auto it = sortIdxs.begin(); it != sortIdxs.end(); it++)
if (badIdxs.find(sortIdxs.at(*it)) == badIdxs.end())
goodIdxs.push_back(*it);
return goodIdxs;
}
string type2str(int type) {
string r;
uchar depth = type & CV_MAT_DEPTH_MASK;
uchar chans = 1 + (type >> CV_CN_SHIFT);
switch ( depth ) {
case CV_8U: r = "8U"; break;
case CV_8S: r = "8S"; break;
case CV_16U: r = "16U"; break;
case CV_16S: r = "16S"; break;
case CV_32S: r = "32S"; break;
case CV_32F: r = "32F"; break;
case CV_64F: r = "64F"; break;
default: r = "User"; break;
}
r += "C";
r += (chans+'0');
return r;
}
bool IsCUDATensor(const Tensor &t)
{
cudaPointerAttributes attributes;
cudaError_t err =
cudaPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == cudaErrorInvalidValue)
return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
#if CUDART_VERSION >= 10000
return (attributes.type == cudaMemoryTypeDevice);
#else
return (attributes.memoryType == cudaMemoryTypeDevice);
#endif
}
string GPUDeviceName(Session* session) {
std::vector<DeviceAttributes> devices;
TF_CHECK_OK(session->ListDevices(&devices));
for (const DeviceAttributes& d : devices) {
LOG(INFO) << "Device: " << d.name();
if (d.device_type() == "GPU" || d.device_type() == "gpu") {
return d.name();
}
}
return "";
}
\ No newline at end of file
#ifndef TF_DETECTOR_EXAMPLE_UTILS_H
#define TF_DETECTOR_EXAMPLE_UTILS_H
#endif //TF_DETECTOR_EXAMPLE_UTILS_H
#include <vector>
#include <string>
#include <fstream>
#include <iostream>
#include <map>
#include <unordered_map>
#include <math.h>
#include <regex>
#include <tuple>
#include <cassert>
#include <cublas_v2.h>
#include <cudnn.h>
#include <sstream>
#include <time.h>
#include "BatchStreamPPM.h"
#include "NvUffParser.h"
#include "common.h"
#include "NvInferPlugin.h"
// Required for CUDA check
#include "tensorflow/core/util/port.h"
// GPU allocator
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"
// Direct session
#include "tensorflow/core/common_runtime/direct_session.h"
#include <cv.hpp>
#include <opencv2/cudacodec.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
// CUDA includes. Order matters
#include <dynlink_nvcuvid.h>
#include "cuda_runtime_api.h"
using namespace std;
using tensorflow::Tensor;
using tensorflow::Status;
using tensorflow::Session;
using namespace nvinfer1;
using namespace nvuffparser;
string type2str(int type);
Status readLabelsMapFile(const string &fileName, std::map<int, string> &labelsMap);
Status loadGraph(const string &graph_file_name,
std::unique_ptr<tensorflow::Session> *session);
Status readTensorFromMat(const cv::Mat &mat, Tensor &outTensor);
Status readTensorFromGpuMat(const cv::cuda::GpuMat& g_mat, Tensor& outTensor);
void drawBoundingBoxOnImage(cv::Mat &image, double xMin, double yMin, double xMax, double yMax, double score, std::string label, bool scaled = true);
void drawBoundingBoxesOnImage(cv::Mat &image,
tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float>::Flat &classes,
tensorflow::TTypes<float,3>::Tensor &boxes,
std::map<int, string> &labelsMap,
std::vector<size_t> &idxs);
void drawFrameworkSignature(cv::Mat& image, double fps, string signature, cv::Scalar& color);
double IOU(cv::Rect box1, cv::Rect box2);
std::vector<size_t> filterBoxes(tensorflow::TTypes<float>::Flat &scores,
tensorflow::TTypes<float, 3>::Tensor &boxes,
double thresholdIOU, double thresholdScore);
bool IsCUDATensor(const Tensor &t);
string GPUDeviceName(Session* session);
std::tuple<vector<float>, vector<int>> doInferenceWithTrt(cv::cuda::GpuMat& img, IExecutionContext * context, vector<std::string>& CLASSES);
std::tuple<IRuntime*, ICudaEngine *, IExecutionContext*> CreateTrtEngineAndContext(std::string &graphFileName, bool isInt8);
extern DetectionOutputParameters detectionOutputParam;
void populateClassLabels(std::vector<std::string>& CLASSES, const std::string &labelFileName);
void channelFirst(unsigned char * source, float * dest, int channelSize, int channelsNum, int rowElements, int rowSize);
extern const int OUTPUT_CLS_SIZE;
extern const int OUTPUT_BBOX_SIZE;
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment