docker stuff

9bc751e7 · fierval · 4dfdcc86 · 9bc751e7 · 9bc751e7 · 9bc751e7
Commit 9bc751e7 authored Mar 10, 2019 by fierval
6 changed files
--- a/cpall.sh
+++ b/cpall.sh
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
+FROM fierval/tensorrt:19.02-py3
+# tensorflow libraries
+ADD tensorflow.tar /
+# nvcuvid
+ADD nvcuvid/* /usr/local/cuda/targets/x86_64-linux/include/
+# opencv
+RUN apt-get update
+RUN apt-get install -y git libgtk2.0-dev curl pkg-config libavcodec-dev \
+libavformat-dev libswscale-dev python-dev python-numpy libtbb2 libtbb-dev \
+libjpeg-dev libpng-dev libtiff-dev libjasper-dev libdc1394-22-dev unzip libcurl4-gnutls-dev zlib1g-dev
+RUN apt-get install -y wget
+RUN apt-get install -y vim
+## CMAKE
+ADD https://cmake.org/files/v3.13/cmake-3.13.0.tar.gz /
+RUN tar xzvf /cmake-3.13.0.tar.gz -C / \
+    && cd /cmake-3.13.0 \
+    && ./bootstrap \
+    && make -j15 \
+    && make install
+# Second: get and build OpenCV 4.0.1
+#
+ADD https://github.com/opencv/opencv_contrib/archive/4.0.1.zip  / 
+RUN unzip -o /4.0.1.zip
+#     && cd opencv-4.0.1/build \
+#     && cmake -DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-4.0.1/modules/ .. \
+#     && make -j8 \
+#     && make install \
+#     && cd ../.. \
+#     && rm 4.0.1.zip
+ADD https://github.com/opencv/opencv/archive/4.0.1.zip / 
+RUN unzip -o /4.0.1.zip 
+RUN cd /workspace/opencv-4.0.1 \
+    && mkdir build \
+    && cd build \
+    && cmake -DBUILD_TIFF=ON \
+       -DBUILD_opencv_java=OFF \
+       -DBUILD_SHARED_LIBS=OFF \
+       -DWITH_CUDA=ON \
+       -DBUILD_PERF_TESTS=OFF \
+       -DBUILD_TESTS=OFF \
+       # -DENABLE_FAST_MATH=1 \
+       # -DCUDA_FAST_MATH=1 \
+       -DWITH_CUBLAS=1 \
+       -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda \
+       -DOPENCV_EXTRA_MODULES_PATH=../../opencv_contrib-4.0.1/modules/ \
+       ##
+       ## Should compile for most card
+       ## 3.5 binary code for devices with compute capability 3.5 and 3.7,
+       ## 5.0 binary code for devices with compute capability 5.0 and 5.2,
+       ## 6.0 binary code for devices with compute capability 6.0 and 6.1,
+       -DCUDA_ARCH_BIN='7.0' \
+       -DCUDA_ARCH_PTX="" \
+       ##
+       ## AVX in dispatch because not all machines have it
+       -DCPU_DISPATCH=AVX,AVX2 \
+       -DENABLE_PRECOMPILED_HEADERS=OFF \
+       -DWITH_OPENGL=OFF \
+       -DWITH_OPENCL=OFF \
+       -DWITH_QT=OFF \
+       -DWITH_NVCUVID=ON \
+       -DWITH_IPP=ON \
+       -DWITH_TBB=ON \
+       -DFORCE_VTK=ON \
+       -DWITH_EIGEN=ON \
+       -DWITH_V4L=ON \
+       -DWITH_XINE=ON \
+       -DWITH_GDAL=ON \
+       -DWITH_1394=OFF \
+       -DWITH_FFMPEG=OFF \
+       -DBUILD_PROTOBUF=OFF \
+       -DBUILD_TESTS=OFF \
+       -DBUILD_PERF_TESTS=OFF \
+       -DBUILD_opencv_xfeatures2d=OFF \
+       -DCMAKE_BUILD_TYPE=RELEASE \
+       -DCMAKE_INSTALL_PREFIX=/usr/local \
+    .. \
+    && make -j15 \
+    && make install \
+    && rm /4.0.1.zip \
+    && rm /cmake-3.13.0.tar.gz
--- a/docker/nvcuvid/dynlink_cuda.h
+++ b/docker/nvcuvid/dynlink_cuda.h
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+#ifndef __cuda_h__
+#define __cuda_h__
+/**
+* CUDA API version support
+*/
+#include "dynlink_cuda_cuda.h"
+#endif //__cuda_h__
--- a/docker/nvcuvid/dynlink_cuda_cuda.h
+++ b/docker/nvcuvid/dynlink_cuda_cuda.h
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+#include <stdlib.h>
+#ifndef __CUDA_API_VERSION
+#define __CUDA_API_VERSION 4000
+#endif
+/**
+ * \defgroup CUDA_DRIVER CUDA Driver API
+ *
+ * This section describes the low-level CUDA driver application programming
+ * interface.
+ *
+ * @{
+ */
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * @{
+ */
+/**
+ * CUDA API version number
+ */
+#define CUDA_VERSION 4000 /* 4.0 */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * CUDA device pointer
+ */
+#if __CUDA_API_VERSION >= 3020
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined(__aarch64__)
+    typedef unsigned long long CUdeviceptr;
+#else
+    typedef unsigned int CUdeviceptr;
+#endif
+#endif /* __CUDA_API_VERSION >= 3020 */
+typedef int CUdevice;                                     /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                       /**< CUDA context */
+typedef struct CUmod_st *CUmodule;                        /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                     /**< CUDA function */
+typedef struct CUarray_st *CUarray;                       /**< CUDA array */
+typedef struct CUtexref_st *CUtexref;                     /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                   /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                       /**< CUDA event */
+typedef struct CUstream_st *CUstream;                     /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */
+#if 0
+typedef struct CUuuid_st                                  /**< CUDA definition of UUID */
+{
+    char bytes[16];
+} CUuuid;
+#endif
+/**
+ * Context creation flags
+ */
+typedef enum CUctx_flags_enum
+{
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling \deprecated */
+    CU_CTX_MAP_HOST            = 0x08, /**< Support mapped pinned allocations */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+#if __CUDA_API_VERSION < 4000
+    CU_CTX_SCHED_MASK          = 0x03,
+    CU_CTX_FLAGS_MASK          = 0x1f
+#else
+    CU_CTX_SCHED_MASK          = 0x07,
+    CU_CTX_PRIMARY             = 0x20, /**< Initialize and return the primary context */
+    CU_CTX_FLAGS_MASK          = 0x3f
+#endif
+} CUctx_flags;
+/**
+ * Event creation flags
+ */
+typedef enum CUevent_flags_enum
+{
+    CU_EVENT_DEFAULT        = 0, /**< Default event flag */
+    CU_EVENT_BLOCKING_SYNC  = 1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 2  /**< Event will not record timing data */
+} CUevent_flags;
+/**
+ * Array formats
+ */
+typedef enum CUarray_format_enum
+{
+    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT          = 0x20  /**< 32-bit floating point */
+} CUarray_format;
+/**
+ * Texture reference addressing modes
+ */
+typedef enum CUaddress_mode_enum
+{
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+/**
+ * Texture reference filtering modes
+ */
+typedef enum CUfilter_mode_enum
+{
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+/**
+ * Device properties
+ */
+typedef enum CUdevice_attribute_enum
+{
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,              /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                    /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                    /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                    /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                     /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                     /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                     /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,        /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,            /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,              /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                         /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                         /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,           /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,               /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                        /**< Peak clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                 /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                       /**< Device can possibly copy memory and execute a kernel concurrently */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,              /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,               /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                        /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,               /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                      /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,           /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,           /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,          /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,           /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,          /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,           /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,     /**< Maximum texture array width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,    /**< Maximum texture array height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Maximum slices in a texture array */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                 /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                       /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                        /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                     /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35                         /**< Device is using TCC driver model */
+#if __CUDA_API_VERSION >= 4000
+  , CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                 /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,           /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                     /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,    /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                /**< Device uses shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,   /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43   /**< Maximum layers in a 1D layered texture */
+#endif
+} CUdevice_attribute;
+/**
+ * Legacy device properties
+ */
+typedef struct CUdevprop_st
+{
+    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
+    int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
+    int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
+    int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
+    int totalConstantMemory;    /**< Constant memory available on device in bytes */
+    int SIMDWidth;              /**< Warp size in threads */
+    int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
+    int regsPerBlock;           /**< 32-bit registers available per block */
+    int clockRate;              /**< Clock frequency in kilohertz */
+    int textureAlign;           /**< Alignment requirement for textures */
+} CUdevprop;
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum
+{
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+    CU_FUNC_ATTRIBUTE_MAX
+} CUfunction_attribute;
+/**
+ * Function cache configurations
+ */
+typedef enum CUfunc_cache_enum
+{
+    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
+    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1      = 0x02  /**< prefer larger L1 cache and smaller shared memory */
+} CUfunc_cache;
+/**
+ * Memory types
+ */
+typedef enum CUmemorytype_enum
+{
+    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
+    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
+    CU_MEMORYTYPE_ARRAY   = 0x03     /**< Array memory */
+#if __CUDA_API_VERSION >= 4000
+  , CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
+#endif
+} CUmemorytype;
+/**
+ * Compute Modes
+ */
+typedef enum CUcomputemode_enum
+{
+    CU_COMPUTEMODE_DEFAULT           = 0,  /**< Default compute mode (Multiple contexts allowed per device) */
+    CU_COMPUTEMODE_EXCLUSIVE         = 1, /**< Compute-exclusive-thread mode (Only one context used by a single thread can be present on this device at a time) */
+    CU_COMPUTEMODE_PROHIBITED        = 2  /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+#if __CUDA_API_VERSION >= 4000
+  , CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
+#endif
+} CUcomputemode;
+/**
+ * Online compiler options
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization fo the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_THREADS_PER_BLOCK,
+    /**
+     * Returns a float value in the option of the wall clock time, in
+     * milliseconds, spent creating the cubin\n
+     * Option type: float
+     */
+    CU_JIT_WALL_TIME,
+    /**
+     * Pointer to a buffer in which to print any log messsages from PTXAS
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) \n
+     * Option type: char*
+     */
+    CU_JIT_INFO_LOG_BUFFER,
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+    /**
+     * Pointer to a buffer in which to print any log messages from PTXAS that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char*
+     */
+    CU_JIT_ERROR_LOG_BUFFER,
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int
+     */
+    CU_JIT_OPTIMIZATION_LEVEL,
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT,
+    /**
+     * Target is chosen based on supplied ::CUjit_target_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target_enum
+     */
+    CU_JIT_TARGET,
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback_enum
+     */
+    CU_JIT_FALLBACK_STRATEGY
+} CUjit_option;
+/**
+ * Online compilation targets
+ */
+typedef enum CUjit_target_enum
+{
+    CU_TARGET_COMPUTE_10 = 0,   /**< Compute device class 1.0 */
+    CU_TARGET_COMPUTE_11,       /**< Compute device class 1.1 */
+    CU_TARGET_COMPUTE_12,       /**< Compute device class 1.2 */
+    CU_TARGET_COMPUTE_13,       /**< Compute device class 1.3 */
+    CU_TARGET_COMPUTE_20,       /**< Compute device class 2.0 */
+    CU_TARGET_COMPUTE_21        /**< Compute device class 2.1 */
+} CUjit_target;
+/**
+ * Cubin matching fallback strategies
+ */
+typedef enum CUjit_fallback_enum
+{
+    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx */
+    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code */
+} CUjit_fallback;
+/**
+ * Flags to register a graphics resource
+ */
+typedef enum CUgraphicsRegisterFlags_enum
+{
+    CU_GRAPHICS_REGISTER_FLAGS_NONE          = 0x00,
+    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY     = 0x01,
+    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02,
+    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST  = 0x04
+} CUgraphicsRegisterFlags;
+/**
+ * Flags for mapping and unmapping interop resources
+ */
+typedef enum CUgraphicsMapResourceFlags_enum
+{
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
+} CUgraphicsMapResourceFlags;
+/**
+ * Array indices for cube faces
+ */
+typedef enum CUarray_cubemap_face_enum
+{
+    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
+} CUarray_cubemap_face;
+/**
+ * Limits
+ */
+typedef enum CUlimit_enum
+{
+    CU_LIMIT_STACK_SIZE        = 0x00, /**< GPU thread stack size */
+    CU_LIMIT_PRINTF_FIFO_SIZE  = 0x01, /**< GPU printf FIFO size */
+    CU_LIMIT_MALLOC_HEAP_SIZE  = 0x02  /**< GPU malloc heap size */
+} CUlimit;
+/**
+ * Error codes
+ */
+typedef enum cudaError_enum
+{
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * can also mean that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+    /**
+     * This indicates profiling APIs are called while application is running
+     * in visual profiler mode.
+    */
+    CUDA_ERROR_PROFILER_DISABLED           = 5,
+    /**
+     * This indicates profiling has not been initialized for this context.
+     * Call cuProfilerInitialize() to resolve this.
+    */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+    /**
+     * This indicates profiler has already been started and probably
+     * cuProfilerStart() is incorrectly called.
+    */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+    /**
+     * This indicates profiler has already been stopped and probably
+     * cuProfilerStop() is incorrectly called.
+    */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+    /**
+     * This indicated that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, texture names, and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. The context cannot be used, so it must
+     * be destroyed (and a new one should be created). All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 700,
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The
+     * context cannot be used (and must be destroyed similar to
+     * ::CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from
+     * this context are invalid and must be reconstructed if the program is to
+     * continue using CUDA.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704,
+    /**
+     * This error indicates that a call to ::cuMemPeerRegister is trying to
+     * register memory from a context which has not had peer access
+     * enabled yet via ::cuCtxEnablePeerAccess(), or that
+     * ::cuCtxDisablePeerAccess() is trying to disable peer access
+     * which has not been enabled yet.
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED    = 705,
+    /**
+     * This error indicates that a call to ::cuMemPeerRegister is trying to
+     * register already-registered memory.
+     */
+    CUDA_ERROR_PEER_MEMORY_ALREADY_REGISTERED = 706,
+    /**
+     * This error indicates that a call to ::cuMemPeerUnregister is trying to
+     * unregister memory that has not been registered.
+     */
+    CUDA_ERROR_PEER_MEMORY_NOT_REGISTERED     = 707,
+    /**
+     * This error indicates that ::cuCtxCreate was called with the flag
+     * ::CU_CTX_PRIMARY on a device which already has initialized its
+     * primary context.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
+#if __CUDA_API_VERSION >= 4000
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE     0x01
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
+/**
+ * If set, peer memory is mapped into CUDA address space and
+ * ::cuMemPeerGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemPeerRegister()
+ */
+#define CU_MEMPEERREGISTER_DEVICEMAP    0x02
+#endif
+#if __CUDA_API_VERSION >= 3020
+/**
+ * 2D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY2D_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+/**
+ * 3D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+/**
+ * 3D memory cross-context copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st
+{
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER;
+/**
+ * Array descriptor
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+/**
+ * 3D array descriptor
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array */
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR;
+#endif /* __CUDA_API_VERSION >= 3020 */
+/**
+ * If set, the CUDA array is a collection of layers, where each layer is either a 1D
+ * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
+ * of layers, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_LAYERED        0x01
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB  0x10
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel
+ */
+#define CU_LAUNCH_PARAM_END            ((void*)0x00)
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f.  This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE    ((void*)0x02)
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+/**
+ * CUDA API made obselete at API version 3020
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #define CUdeviceptr                  CUdeviceptr_v1
+    #define CUDA_MEMCPY2D_st             CUDA_MEMCPY2D_v1_st
+    #define CUDA_MEMCPY2D                CUDA_MEMCPY2D_v1
+    #define CUDA_MEMCPY3D_st             CUDA_MEMCPY3D_v1_st
+    #define CUDA_MEMCPY3D                CUDA_MEMCPY3D_v1
+    #define CUDA_ARRAY_DESCRIPTOR_st     CUDA_ARRAY_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY_DESCRIPTOR        CUDA_ARRAY_DESCRIPTOR_v1
+    #define CUDA_ARRAY3D_DESCRIPTOR_st   CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    #define CUDA_ARRAY3D_DESCRIPTOR      CUDA_ARRAY3D_DESCRIPTOR_v1
+#endif /* CUDA_FORCE_LEGACY32_INTERNAL */
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
+typedef unsigned int CUdeviceptr;
+typedef struct CUDA_MEMCPY2D_st
+{
+    unsigned int srcXInBytes;   /**< Source X in bytes */
+    unsigned int srcY;          /**< Source Y */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+    unsigned int dstXInBytes;   /**< Destination X in bytes */
+    unsigned int dstY;          /**< Destination Y */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+    unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+    unsigned int Height;        /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D;
+typedef struct CUDA_MEMCPY3D_st
+{
+    unsigned int srcXInBytes;   /**< Source X in bytes */
+    unsigned int srcY;          /**< Source Y */
+    unsigned int srcZ;          /**< Source Z */
+    unsigned int srcLOD;        /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+    unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+    unsigned int dstXInBytes;   /**< Destination X in bytes */
+    unsigned int dstY;          /**< Destination Y */
+    unsigned int dstZ;          /**< Destination Z */
+    unsigned int dstLOD;        /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+    unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+    unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+    unsigned int Height;        /**< Height of 3D memory copy */
+    unsigned int Depth;         /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    unsigned int Width;         /**< Width of array */
+    unsigned int Height;        /**< Height of array */
+    CUarray_format Format;      /**< Array format */
+    unsigned int NumChannels;   /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    unsigned int Width;         /**< Width of 3D array */
+    unsigned int Height;        /**< Height of 3D array */
+    unsigned int Depth;         /**< Depth of 3D array */
+    CUarray_format Format;      /**< Array format */
+    unsigned int NumChannels;   /**< Channels per array element */
+    unsigned int Flags;         /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR;
+#endif /* (__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020 */
+/*
+ * If set, the CUDA array contains an array of 2D slices
+ * and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies
+ * the number of slices, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags()
+ */
+#define CU_TRSF_SRGB  0x10
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+/** @} */ /* END CUDA_TYPES */
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    #define CUDAAPI __stdcall
+#else
+    #define CUDAAPI
+#endif
+/**
+ * \defgroup CUDA_INITIALIZE Initialization
+ *
+ * This section describes the initialization functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+/*********************************
+ ** Initialization
+ *********************************/
+typedef CUresult  CUDAAPI tcuInit(unsigned int Flags);
+/*********************************
+ ** Driver Version Query
+ *********************************/
+typedef CUresult  CUDAAPI tcuDriverGetVersion(int *driverVersion);
+/************************************
+ **
+ **    Device management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuDeviceGet(CUdevice *device, int ordinal);
+typedef CUresult  CUDAAPI tcuDeviceGetCount(int *count);
+typedef CUresult  CUDAAPI tcuDeviceGetName(char *name, int len, CUdevice dev);
+typedef CUresult  CUDAAPI tcuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuDeviceTotalMem(size_t *bytes, CUdevice dev);
+#else
+    typedef CUresult  CUDAAPI tcuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
+#endif
+typedef CUresult  CUDAAPI tcuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+typedef CUresult  CUDAAPI tcuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+/************************************
+ **
+ **    Context management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+typedef CUresult  CUDAAPI tcuCtxDestroy(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxAttach(CUcontext *pctx, unsigned int flags);
+typedef CUresult  CUDAAPI tcuCtxDetach(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxPushCurrent(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxPopCurrent(CUcontext *pctx);
+typedef CUresult  CUDAAPI tcuCtxSetCurrent(CUcontext ctx);
+typedef CUresult  CUDAAPI tcuCtxGetCurrent(CUcontext *pctx);
+typedef CUresult  CUDAAPI tcuCtxGetDevice(CUdevice *device);
+typedef CUresult  CUDAAPI tcuCtxSynchronize(void);
+/************************************
+ **
+ **    Module management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuModuleLoad(CUmodule *module, const char *fname);
+typedef CUresult  CUDAAPI tcuModuleLoadData(CUmodule *module, const void *image);
+typedef CUresult  CUDAAPI tcuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult  CUDAAPI tcuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+typedef CUresult  CUDAAPI tcuModuleUnload(CUmodule hmod);
+typedef CUresult  CUDAAPI tcuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
+#else
+    typedef CUresult  CUDAAPI tcuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+#endif
+typedef CUresult  CUDAAPI tcuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult  CUDAAPI tcuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+/************************************
+ **
+ **    Memory management
+ **
+ ***********************************/
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuMemGetInfo(size_t *free, size_t *total);
+    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
+    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
+                                              size_t *pPitch,
+                                              size_t WidthInBytes,
+                                              size_t Height,
+                                              // size of biggest r/w to be performed by kernels on this memory
+                                              // 4, 8 or 16 bytes
+                                              unsigned int ElementSizeBytes
+                                             );
+#else
+    typedef CUresult CUDAAPI tcuMemGetInfo(unsigned int *free, unsigned int *total);
+    typedef CUresult CUDAAPI tcuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize);
+    typedef CUresult CUDAAPI tcuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr);
+    typedef CUresult CUDAAPI tcuMemAllocPitch(CUdeviceptr *dptr,
+                                              unsigned int *pPitch,
+                                              unsigned int WidthInBytes,
+                                              unsigned int Height,
+                                              // size of biggest r/w to be performed by kernels on this memory
+                                              // 4, 8 or 16 bytes
+                                              unsigned int ElementSizeBytes
+                                             );
+#endif
+typedef CUresult CUDAAPI tcuMemFree(CUdeviceptr dptr);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, size_t bytesize);
+#else
+    typedef CUresult CUDAAPI tcuMemAllocHost(void **pp, unsigned int bytesize);
+#endif
+typedef CUresult CUDAAPI tcuMemFreeHost(void *p);
+typedef CUresult CUDAAPI tcuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostGetFlags(unsigned int *pFlags, void *p);
+typedef CUresult CUDAAPI tcuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult CUDAAPI tcuMemHostUnregister(void *p);;
+typedef CUresult CUDAAPI tcuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+typedef CUresult CUDAAPI tcuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+/************************************
+ **
+ **    Synchronous Memcpy
+ **
+ ** Intra-device memcpy's done with these functions may execute in parallel with the CPU,
+ ** but if host memory is involved, they wait until the copy is done before returning.
+ **
+ ***********************************/
+// 1D functions
+#if __CUDA_API_VERSION >= 3020
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+    // device <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    // array <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+#else
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount);
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount);
+    // device <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    // array <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+#endif
+// 2D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
+typedef CUresult  CUDAAPI tcuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+// 3D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+/************************************
+ **
+ **    Asynchronous Memcpy
+ **
+ ** Any host memory involved must be DMA'able (e.g., allocated with cuMemAllocHost).
+ ** memcpy's done with these functions execute in parallel with the CPU and, if
+ ** the hardware is available, may execute in parallel with the GPU.
+ ** Asynchronous memcpy must be accompanied by appropriate stream synchronization.
+ **
+ ***********************************/
+// 1D functions
+#if __CUDA_API_VERSION >= 3020
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
+                                                 const void *srcHost, size_t ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
+                                                 CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
+                                                 CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset,
+                                                 const void *srcHost, size_t ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset,
+                                                 size_t ByteCount, CUstream hStream);
+#else
+    // system <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoDAsync(CUdeviceptr dstDevice,
+                                                 const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyDtoHAsync(void *dstHost,
+                                                 CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+    // device <-> device memory
+    typedef CUresult  CUDAAPI tcuMemcpyDtoDAsync(CUdeviceptr dstDevice,
+                                                 CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream);
+    // system <-> array memory
+    typedef CUresult  CUDAAPI tcuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset,
+                                                 const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult  CUDAAPI tcuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset,
+                                                 unsigned int ByteCount, CUstream hStream);
+#endif
+// 2D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+// 3D memcpy
+typedef CUresult  CUDAAPI tcuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+/************************************
+ **
+ **    Memset
+ **
+ ***********************************/
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+    typedef CUresult  CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
+    typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+    typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, size_t Width, size_t Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, size_t Width, size_t Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, size_t Width, size_t Height);
+#else
+    typedef CUresult  CUDAAPI tcuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult  CUDAAPI tcuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult  CUDAAPI tcuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N);
+    typedef CUresult  CUDAAPI tcuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult  CUDAAPI tcuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+#endif
+/************************************
+ **
+ **    Function management
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+typedef CUresult CUDAAPI tcuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+typedef CUresult CUDAAPI tcuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult CUDAAPI tcuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult CUDAAPI tcuLaunchKernel(CUfunction f,
+                                         unsigned int gridDimX,  unsigned int gridDimY,  unsigned int gridDimZ,
+                                         unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
+                                         unsigned int sharedMemBytes,
+                                         CUstream hStream, void **kernelParams, void **extra);
+/************************************
+ **
+ **    Array management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+typedef CUresult  CUDAAPI tcuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+typedef CUresult  CUDAAPI tcuArrayDestroy(CUarray hArray);
+typedef CUresult  CUDAAPI tcuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+typedef CUresult  CUDAAPI tcuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+/************************************
+ **
+ **    Texture reference management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuTexRefCreate(CUtexref *pTexRef);
+typedef CUresult  CUDAAPI tcuTexRefDestroy(CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+#else
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes);
+    typedef CUresult  CUDAAPI tcuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch);
+#endif
+typedef CUresult  CUDAAPI tcuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult  CUDAAPI tcuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult  CUDAAPI tcuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult  CUDAAPI tcuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult  CUDAAPI tcuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult  CUDAAPI tcuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult  CUDAAPI tcuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+/************************************
+ **
+ **    Surface reference management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult  CUDAAPI tcuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+/************************************
+ **
+ **    Parameter management
+ **
+ ***********************************/
+typedef CUresult  CUDAAPI tcuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult  CUDAAPI tcuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+typedef CUresult  CUDAAPI tcuParamSetf(CUfunction hfunc, int offset, float value);
+typedef CUresult  CUDAAPI tcuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult  CUDAAPI tcuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+/************************************
+ **
+ **    Launch functions
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuLaunch(CUfunction f);
+typedef CUresult CUDAAPI tcuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+typedef CUresult CUDAAPI tcuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+/************************************
+ **
+ **    Events
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuEventCreate(CUevent *phEvent, unsigned int Flags);
+typedef CUresult CUDAAPI tcuEventRecord(CUevent hEvent, CUstream hStream);
+typedef CUresult CUDAAPI tcuEventQuery(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventSynchronize(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventDestroy(CUevent hEvent);
+typedef CUresult CUDAAPI tcuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+/************************************
+ **
+ **    Streams
+ **
+ ***********************************/
+typedef CUresult CUDAAPI  tcuStreamCreate(CUstream *phStream, unsigned int Flags);
+typedef CUresult CUDAAPI  tcuStreamQuery(CUstream hStream);
+typedef CUresult CUDAAPI  tcuStreamSynchronize(CUstream hStream);
+typedef CUresult CUDAAPI  tcuStreamDestroy(CUstream hStream);
+/************************************
+ **
+ **    Graphics interop
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuGraphicsUnregisterResource(CUgraphicsResource resource);
+typedef CUresult CUDAAPI tcuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+#if __CUDA_API_VERSION >= 3020
+    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+#else
+    typedef CUresult CUDAAPI tcuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+#endif
+typedef CUresult CUDAAPI tcuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult CUDAAPI tcuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult CUDAAPI tcuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+/************************************
+ **
+ **    Export tables
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+/************************************
+ **
+ **    Limits
+ **
+ ***********************************/
+typedef CUresult CUDAAPI tcuCtxSetLimit(CUlimit limit, size_t value);
+typedef CUresult CUDAAPI tcuCtxGetLimit(size_t *pvalue, CUlimit limit);
+extern tcuDriverGetVersion             *cuDriverGetVersion;
+extern tcuDeviceGet                    *cuDeviceGet;
+extern tcuDeviceGetCount               *cuDeviceGetCount;
+extern tcuDeviceGetName                *cuDeviceGetName;
+extern tcuDeviceComputeCapability      *cuDeviceComputeCapability;
+extern tcuDeviceGetProperties          *cuDeviceGetProperties;
+extern tcuDeviceGetAttribute           *cuDeviceGetAttribute;
+extern tcuCtxDestroy                   *cuCtxDestroy;
+extern tcuCtxAttach                    *cuCtxAttach;
+extern tcuCtxDetach                    *cuCtxDetach;
+extern tcuCtxPushCurrent               *cuCtxPushCurrent;
+extern tcuCtxPopCurrent                *cuCtxPopCurrent;
+extern tcuCtxSetCurrent                *cuCtxSetCurrent;
+extern tcuCtxGetCurrent                *cuCtxGetCurrent;
+extern tcuCtxGetDevice                 *cuCtxGetDevice;
+extern tcuCtxSynchronize               *cuCtxSynchronize;
+extern tcuModuleLoad                   *cuModuleLoad;
+extern tcuModuleLoadData               *cuModuleLoadData;
+extern tcuModuleLoadDataEx             *cuModuleLoadDataEx;
+extern tcuModuleLoadFatBinary          *cuModuleLoadFatBinary;
+extern tcuModuleUnload                 *cuModuleUnload;
+extern tcuModuleGetFunction            *cuModuleGetFunction;
+extern tcuModuleGetTexRef              *cuModuleGetTexRef;
+extern tcuModuleGetSurfRef             *cuModuleGetSurfRef;
+extern tcuMemFreeHost                  *cuMemFreeHost;
+extern tcuMemHostAlloc                 *cuMemHostAlloc;
+extern tcuMemHostGetFlags              *cuMemHostGetFlags;
+extern tcuMemHostRegister              *cuMemHostRegister;
+extern tcuMemHostUnregister            *cuMemHostUnregister;
+extern tcuMemcpy                       *cuMemcpy;
+extern tcuMemcpyPeer                   *cuMemcpyPeer;
+extern tcuDeviceTotalMem               *cuDeviceTotalMem;
+extern tcuCtxCreate                    *cuCtxCreate;
+extern tcuModuleGetGlobal              *cuModuleGetGlobal;
+extern tcuMemGetInfo                   *cuMemGetInfo;
+extern tcuMemAlloc                     *cuMemAlloc;
+extern tcuMemAllocPitch                *cuMemAllocPitch;
+extern tcuMemFree                      *cuMemFree;
+extern tcuMemGetAddressRange           *cuMemGetAddressRange;
+extern tcuMemAllocHost                 *cuMemAllocHost;
+extern tcuMemHostGetDevicePointer      *cuMemHostGetDevicePointer;
+extern tcuFuncSetBlockShape            *cuFuncSetBlockShape;
+extern tcuFuncSetSharedSize            *cuFuncSetSharedSize;
+extern tcuFuncGetAttribute             *cuFuncGetAttribute;
+extern tcuFuncSetCacheConfig           *cuFuncSetCacheConfig;
+extern tcuLaunchKernel                 *cuLaunchKernel;
+extern tcuArrayDestroy                 *cuArrayDestroy;
+extern tcuTexRefCreate                 *cuTexRefCreate;
+extern tcuTexRefDestroy                *cuTexRefDestroy;
+extern tcuTexRefSetArray               *cuTexRefSetArray;
+extern tcuTexRefSetFormat              *cuTexRefSetFormat;
+extern tcuTexRefSetAddressMode         *cuTexRefSetAddressMode;
+extern tcuTexRefSetFilterMode          *cuTexRefSetFilterMode;
+extern tcuTexRefSetFlags               *cuTexRefSetFlags;
+extern tcuTexRefGetArray               *cuTexRefGetArray;
+extern tcuTexRefGetAddressMode         *cuTexRefGetAddressMode;
+extern tcuTexRefGetFilterMode          *cuTexRefGetFilterMode;
+extern tcuTexRefGetFormat              *cuTexRefGetFormat;
+extern tcuTexRefGetFlags               *cuTexRefGetFlags;
+extern tcuSurfRefSetArray              *cuSurfRefSetArray;
+extern tcuSurfRefGetArray              *cuSurfRefGetArray;
+extern tcuParamSetSize                 *cuParamSetSize;
+extern tcuParamSeti                    *cuParamSeti;
+extern tcuParamSetf                    *cuParamSetf;
+extern tcuParamSetv                    *cuParamSetv;
+extern tcuParamSetTexRef               *cuParamSetTexRef;
+extern tcuLaunch                       *cuLaunch;
+extern tcuLaunchGrid                   *cuLaunchGrid;
+extern tcuLaunchGridAsync              *cuLaunchGridAsync;
+extern tcuEventCreate                  *cuEventCreate;
+extern tcuEventRecord                  *cuEventRecord;
+extern tcuEventQuery                   *cuEventQuery;
+extern tcuEventSynchronize             *cuEventSynchronize;
+extern tcuEventDestroy                 *cuEventDestroy;
+extern tcuEventElapsedTime             *cuEventElapsedTime;
+extern tcuStreamCreate                 *cuStreamCreate;
+extern tcuStreamQuery                  *cuStreamQuery;
+extern tcuStreamSynchronize            *cuStreamSynchronize;
+extern tcuStreamDestroy                *cuStreamDestroy;
+extern tcuGraphicsUnregisterResource   *cuGraphicsUnregisterResource;
+extern tcuGraphicsSubResourceGetMappedArray  *cuGraphicsSubResourceGetMappedArray;
+extern tcuGraphicsResourceSetMapFlags  *cuGraphicsResourceSetMapFlags;
+extern tcuGraphicsMapResources         *cuGraphicsMapResources;
+extern tcuGraphicsUnmapResources       *cuGraphicsUnmapResources;
+extern tcuGetExportTable               *cuGetExportTable;
+extern tcuCtxSetLimit                  *cuCtxSetLimit;
+extern tcuCtxGetLimit                  *cuCtxGetLimit;
+// These functions could be using the CUDA 3.2 interface (_v2)
+extern tcuMemcpyHtoD                   *cuMemcpyHtoD;
+extern tcuMemcpyDtoH                   *cuMemcpyDtoH;
+extern tcuMemcpyDtoD                   *cuMemcpyDtoD;
+extern tcuMemcpyDtoA                   *cuMemcpyDtoA;
+extern tcuMemcpyAtoD                   *cuMemcpyAtoD;
+extern tcuMemcpyHtoA                   *cuMemcpyHtoA;
+extern tcuMemcpyAtoH                   *cuMemcpyAtoH;
+extern tcuMemcpyAtoA                   *cuMemcpyAtoA;
+extern tcuMemcpy2D                     *cuMemcpy2D;
+extern tcuMemcpy2DUnaligned            *cuMemcpy2DUnaligned;
+extern tcuMemcpy3D                     *cuMemcpy3D;
+extern tcuMemcpyHtoDAsync              *cuMemcpyHtoDAsync;
+extern tcuMemcpyDtoHAsync              *cuMemcpyDtoHAsync;
+extern tcuMemcpyDtoDAsync              *cuMemcpyDtoDAsync;
+extern tcuMemcpyHtoAAsync              *cuMemcpyHtoAAsync;
+extern tcuMemcpyAtoHAsync              *cuMemcpyAtoHAsync;
+extern tcuMemcpy2DAsync                *cuMemcpy2DAsync;
+extern tcuMemcpy3DAsync                *cuMemcpy3DAsync;
+extern tcuMemsetD8                     *cuMemsetD8;
+extern tcuMemsetD16                    *cuMemsetD16;
+extern tcuMemsetD32                    *cuMemsetD32;
+extern tcuMemsetD2D8                   *cuMemsetD2D8;
+extern tcuMemsetD2D16                  *cuMemsetD2D16;
+extern tcuMemsetD2D32                  *cuMemsetD2D32;
+extern tcuArrayCreate                  *cuArrayCreate;
+extern tcuArrayGetDescriptor           *cuArrayGetDescriptor;
+extern tcuArray3DCreate                *cuArray3DCreate;
+extern tcuArray3DGetDescriptor         *cuArray3DGetDescriptor;
+extern tcuTexRefSetAddress             *cuTexRefSetAddress;
+extern tcuTexRefSetAddress2D           *cuTexRefSetAddress2D;
+extern tcuTexRefGetAddress             *cuTexRefGetAddress;
+extern tcuGraphicsResourceGetMappedPointer   *cuGraphicsResourceGetMappedPointer;
+/************************************/
+CUresult CUDAAPI cuInit   (unsigned int, int cudaVersion, void *hHandleDriver);
+/************************************/
+#ifdef __cplusplus
+}
+#endif
+#endif //__cuda_cuda_h__
--- a/docker/nvcuvid/dynlink_cuviddec.h
+++ b/docker/nvcuvid/dynlink_cuviddec.h
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2017 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+/*****************************************************************************************************/
+//! \file cuviddec.h
+//! NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! \date 2015-2017
+//! This file contains constants, structure definitions and function prototypes used for decoding.
+/*****************************************************************************************************/
+#if !defined(__CUDA_VIDEO_H__)
+#define __CUDA_VIDEO_H__
+#ifndef __dynlink_cuda_h__
+#include "dynlink_cuda.h"
+#endif // __dynlink_cuda_h__
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+#if (CUDA_VERSION >= 3020) && (!defined(CUDA_FORCE_API_VERSION) || (CUDA_FORCE_API_VERSION >= 3020))
+#define __CUVID_DEVPTR64
+#endif
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+typedef void *CUvideodecoder;
+typedef struct _CUcontextlock_st *CUvideoctxlock;
+/*********************************************************************************/
+//! \enum cudaVideoCodec
+//! Video codec enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+/*********************************************************************************/
+typedef enum cudaVideoCodec_enum {
+    cudaVideoCodec_MPEG1=0,                                         /**<  MPEG1             */
+    cudaVideoCodec_MPEG2,                                           /**<  MPEG2             */
+    cudaVideoCodec_MPEG4,                                           /**<  MPEG4             */
+    cudaVideoCodec_VC1,                                             /**<  VC1               */
+    cudaVideoCodec_H264,                                            /**<  H264              */
+    cudaVideoCodec_JPEG,                                            /**<  JPEG              */
+    cudaVideoCodec_H264_SVC,                                        /**<  H264-SVC          */
+    cudaVideoCodec_H264_MVC,                                        /**<  H264-MVC          */
+    cudaVideoCodec_HEVC,                                            /**<  HEVC              */
+    cudaVideoCodec_VP8,                                             /**<  VP8               */
+    cudaVideoCodec_VP9,                                             /**<  VP9               */
+    cudaVideoCodec_NumCodecs,                                       /**<  Max codecs        */
+    // Uncompressed YUV
+    cudaVideoCodec_YUV420 = (('I'<<24)|('Y'<<16)|('U'<<8)|('V')),   /**< Y,U,V (4:2:0)      */
+    cudaVideoCodec_YV12   = (('Y'<<24)|('V'<<16)|('1'<<8)|('2')),   /**< Y,V,U (4:2:0)      */
+    cudaVideoCodec_NV12   = (('N'<<24)|('V'<<16)|('1'<<8)|('2')),   /**< Y,UV  (4:2:0)      */
+    cudaVideoCodec_YUYV   = (('Y'<<24)|('U'<<16)|('Y'<<8)|('V')),   /**< YUYV/YUY2 (4:2:2)  */
+    cudaVideoCodec_UYVY   = (('U'<<24)|('Y'<<16)|('V'<<8)|('Y'))    /**< UYVY (4:2:2)       */
+} cudaVideoCodec;
+/*********************************************************************************/
+//! \enum cudaVideoSurfaceFormat
+//! Video surface format enums used for output format of decoded output
+//! These enums are used in CUVIDDECODECREATEINFO structure
+/*********************************************************************************/
+typedef enum cudaVideoSurfaceFormat_enum {
+    cudaVideoSurfaceFormat_NV12=0,       /**< NV12 format          */
+    cudaVideoSurfaceFormat_P016=1        /**< 16 bit semiplaner format. Can be used for 10 bit(6LSB bits 0), 
+                                              12 bit (4LSB bits 0) */
+} cudaVideoSurfaceFormat;
+/******************************************************************************************************************/
+//! \enum cudaVideoDeinterlaceMode
+//! Deinterlacing mode enums
+//! These enums are used in CUVIDDECODECREATEINFO structure
+//! Use cudaVideoDeinterlaceMode_Weave for progressive content and for content that doesn't need deinterlacing
+//! cudaVideoDeinterlaceMode_Adaptive needs more video memory than other DImodes
+/******************************************************************************************************************/
+typedef enum cudaVideoDeinterlaceMode_enum {
+    cudaVideoDeinterlaceMode_Weave=0,   /**< Weave both fields (no deinterlacing) */
+    cudaVideoDeinterlaceMode_Bob,       /**< Drop one field                       */
+    cudaVideoDeinterlaceMode_Adaptive   /**< Adaptive deinterlacing               */
+} cudaVideoDeinterlaceMode;
+/**************************************************************************************************************/
+//! \enum cudaVideoChromaFormat
+//! Chroma format enums
+//! These enums are used in CUVIDDECODECREATEINFO and CUVIDDECODECAPS structures
+//! JPEG supports Monochrome, YUV 4:2:0, YUV 4:2:2 and YUV 4:4:4 chroma formats.
+//! H264, HEVC, VP9, VP8, VC1, MPEG1, MPEG2 and MPEG4 support YUV 4:2:0 chroma format only.
+/**************************************************************************************************************/
+typedef enum cudaVideoChromaFormat_enum {
+    cudaVideoChromaFormat_Monochrome=0,  /**< MonoChrome */
+    cudaVideoChromaFormat_420,           /**< YUV 4:2:0  */
+    cudaVideoChromaFormat_422,           /**< YUV 4:2:2  */
+    cudaVideoChromaFormat_444            /**< YUV 4:4:4  */
+} cudaVideoChromaFormat;
+/*************************************************************************************************************/
+//! \enum cudaVideoCreateFlags
+//! Decoder flag enums to select preferred decode path
+//! cudaVideoCreate_Default and cudaVideoCreate_PreferCUVID are most optimized, use these whenever possible
+/*************************************************************************************************************/
+typedef enum cudaVideoCreateFlags_enum {
+    cudaVideoCreate_Default     = 0x00,     /**< Default operation mode: use dedicated video engines                        */
+    cudaVideoCreate_PreferCUDA  = 0x01,     /**< Use CUDA-based decoder (requires valid vidLock object for multi-threading) */
+    cudaVideoCreate_PreferDXVA  = 0x02,     /**< Go through DXVA internally if possible (requires D3D9 interop)             */
+    cudaVideoCreate_PreferCUVID = 0x04      /**< Use dedicated video engines directly                                       */
+} cudaVideoCreateFlags;
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECAPS;
+//! This structure is used in cuvidGetDecoderCaps API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECAPS
+{
+    cudaVideoCodec          eCodecType;                 /**< IN: cudaVideoCodec_XXX                                 */
+    cudaVideoChromaFormat   eChromaFormat;              /**< IN: cudaVideoChromaFormat_XXX                          */
+    unsigned int            nBitDepthMinus8;            /**< IN: The Value "BitDepth minus 8"                       */
+    unsigned int            reserved1[3];               /**< Reserved for future use - set to zero                  */
+    unsigned char           bIsSupported;               /**< OUT: 1 if codec supported, 0 if not supported          */
+    unsigned char           reserved2[3];               /**< Reserved for future use - set to zero                  */
+    unsigned int            nMaxWidth;                  /**< OUT: Max supported coded width in pixels               */
+    unsigned int            nMaxHeight;                 /**< OUT: Max supported coded height in pixels              */
+    unsigned int            nMaxMBCount;                /**< OUT: Max supported macroblock count
+                                                                  CodedWidth*CodedHeight/256 must be <= nMaxMBCount */
+    unsigned short          nMinWidth;                  /**< OUT: Min supported coded width in pixels               */
+    unsigned short          nMinHeight;                 /**< OUT: Min supported coded height in pixels              */
+    unsigned int            reserved3[11];              /**< Reserved for future use - set to zero                  */
+} CUVIDDECODECAPS;
+/**************************************************************************************************************/
+//! \struct CUVIDDECODECREATEINFO
+//! This structure is used in cuvidCreateDecoder API
+/**************************************************************************************************************/
+typedef struct _CUVIDDECODECREATEINFO
+{
+    unsigned long ulWidth;              /**< IN: Coded sequence width in pixels                                             */
+    unsigned long ulHeight;             /**< IN: Coded sequence height in pixels                                            */
+    unsigned long ulNumDecodeSurfaces;  /**< IN: Maximum number of internal decode surfaces                                 */
+    cudaVideoCodec CodecType;           /**< IN: cudaVideoCodec_XXX                                                         */
+    cudaVideoChromaFormat ChromaFormat; /**< IN: cudaVideoChromaFormat_XXX                                                  */
+    unsigned long ulCreationFlags;      /**< IN: Decoder creation flags (cudaVideoCreateFlags_XXX)                          */
+    unsigned long bitDepthMinus8;       /**< IN: The value "BitDepth minus 8"                                               */
+    unsigned long ulIntraDecodeOnly;    /**< IN: Set 1 only if video has all intra frames (default value is 0). This will
+                                             optimize video memory for Intra frames only decoding. The support is limited
+                                             to specific codecs(H264 rightnow), the flag will be ignored for codecs which
+                                             are not supported. However decoding might fail if the flag is enabled in case 
+                                             of supported codecs for regular bit streams having P and/or B frames.          */
+    unsigned long Reserved1[3];         /**< Reserved for future use - set to zero                                          */
+    /**
+    * IN: area of the frame that should be displayed
+    */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } display_area;
+    cudaVideoSurfaceFormat OutputFormat;       /**< IN: cudaVideoSurfaceFormat_XXX                                     */
+    cudaVideoDeinterlaceMode DeinterlaceMode;  /**< IN: cudaVideoDeinterlaceMode_XXX                                   */
+    unsigned long ulTargetWidth;               /**< IN: Post-processed output width (Should be aligned to 2)           */
+    unsigned long ulTargetHeight;              /**< IN: Post-processed output height (Should be aligbed to 2)          */
+    unsigned long ulNumOutputSurfaces;         /**< IN: Maximum number of output surfaces simultaneously mapped        */
+    CUvideoctxlock vidLock;                    /**< IN: If non-NULL, context lock used for synchronizing ownership of 
+                                                    the cuda context. Needed for cudaVideoCreate_PreferCUDA decode     */
+    /**
+    * IN: target rectangle in the output frame (for aspect ratio conversion)
+    * if a null rectangle is specified, {0,0,ulTargetWidth,ulTargetHeight} will be used
+    */
+    struct {
+        short left;
+        short top;
+        short right;
+        short bottom;
+    } target_rect;
+    unsigned long Reserved2[5];                /**< Reserved for future use - set to zero */
+} CUVIDDECODECREATEINFO;
+/*********************************************************/
+//! \struct CUVIDH264DPBENTRY
+//! H.264 DPB entry
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264DPBENTRY
+{
+    int PicIdx;                 /**< picture index of reference frame                                        */
+    int FrameIdx;               /**< frame_num(short-term) or LongTermFrameIdx(long-term)                    */
+    int is_long_term;           /**< 0=short term reference, 1=long term reference                           */
+    int not_existing;           /**< non-existing reference frame (corresponding PicIdx should be set to -1) */
+    int used_for_reference;     /**< 0=unused, 1=top_field, 2=bottom_field, 3=both_fields                    */
+    int FieldOrderCnt[2];       /**< field order count of top and bottom fields                              */
+} CUVIDH264DPBENTRY;
+/************************************************************/
+//! \struct CUVIDH264MVCEXT
+//! H.264 MVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/************************************************************/
+typedef struct _CUVIDH264MVCEXT
+{
+    int num_views_minus1;                  /**< Max number of coded views minus 1 in video : Range - 0 to 1023              */
+    int view_id;                           /**< view identifier                                                             */
+    unsigned char inter_view_flag;         /**< 1 if used for inter-view prediction, 0 if not                               */
+    unsigned char num_inter_view_refs_l0;  /**< number of inter-view ref pics in RefPicList0                                */
+    unsigned char num_inter_view_refs_l1;  /**< number of inter-view ref pics in RefPicList1                                */
+    unsigned char MVCReserved8Bits;        /**< Reserved bits                                                               */
+    int InterViewRefsL0[16];               /**< view id of the i-th view component for inter-view prediction in RefPicList0 */
+    int InterViewRefsL1[16];               /**< view id of the i-th view component for inter-view prediction in RefPicList1 */
+} CUVIDH264MVCEXT;
+/*********************************************************/
+//! \struct CUVIDH264SVCEXT
+//! H.264 SVC picture parameters ext
+//! This structure is used in CUVIDH264PICPARAMS structure
+/*********************************************************/
+typedef struct _CUVIDH264SVCEXT
+{
+    unsigned char profile_idc;
+    unsigned char level_idc;
+    unsigned char DQId;
+    unsigned char DQIdMax;
+    unsigned char disable_inter_layer_deblocking_filter_idc;
+    unsigned char ref_layer_chroma_phase_y_plus1;
+    signed char   inter_layer_slice_alpha_c0_offset_div2;
+    signed char   inter_layer_slice_beta_offset_div2;
+    unsigned short DPBEntryValidFlag;
+    unsigned char inter_layer_deblocking_filter_control_present_flag;
+    unsigned char extended_spatial_scalability_idc;
+    unsigned char adaptive_tcoeff_level_prediction_flag;
+    unsigned char slice_header_restriction_flag;
+    unsigned char chroma_phase_x_plus1_flag;
+    unsigned char chroma_phase_y_plus1;
+    unsigned char tcoeff_level_prediction_flag;
+    unsigned char constrained_intra_resampling_flag;
+    unsigned char ref_layer_chroma_phase_x_plus1_flag;
+    unsigned char store_ref_base_pic_flag;
+    unsigned char Reserved8BitsA;
+    unsigned char Reserved8BitsB;
+    short scaled_ref_layer_left_offset;
+    short scaled_ref_layer_top_offset;
+    short scaled_ref_layer_right_offset;
+    short scaled_ref_layer_bottom_offset;
+    unsigned short Reserved16Bits;
+    struct _CUVIDPICPARAMS *pNextLayer; /**< Points to the picparams for the next layer to be decoded. 
+                                             Linked list ends at the target layer. */
+    int bRefBaseLayer;                  /**< whether to store ref base pic */
+} CUVIDH264SVCEXT;
+/******************************************************/
+//! \struct CUVIDH264PICPARAMS
+//! H.264 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/******************************************************/
+typedef struct _CUVIDH264PICPARAMS
+{
+    // SPS
+    int log2_max_frame_num_minus4;
+    int pic_order_cnt_type;
+    int log2_max_pic_order_cnt_lsb_minus4;
+    int delta_pic_order_always_zero_flag;
+    int frame_mbs_only_flag;
+    int direct_8x8_inference_flag;
+    int num_ref_frames;             // NOTE: shall meet level 4.1 restrictions
+    unsigned char residual_colour_transform_flag;
+    unsigned char bit_depth_luma_minus8;    // Must be 0 (only 8-bit supported)
+    unsigned char bit_depth_chroma_minus8;  // Must be 0 (only 8-bit supported)
+    unsigned char qpprime_y_zero_transform_bypass_flag;
+    // PPS
+    int entropy_coding_mode_flag;
+    int pic_order_present_flag;
+    int num_ref_idx_l0_active_minus1;
+    int num_ref_idx_l1_active_minus1;
+    int weighted_pred_flag;
+    int weighted_bipred_idc;
+    int pic_init_qp_minus26;
+    int deblocking_filter_control_present_flag;
+    int redundant_pic_cnt_present_flag;
+    int transform_8x8_mode_flag;
+    int MbaffFrameFlag;
+    int constrained_intra_pred_flag;
+    int chroma_qp_index_offset;
+    int second_chroma_qp_index_offset;
+    int ref_pic_flag;
+    int frame_num;
+    int CurrFieldOrderCnt[2];
+    // DPB
+    CUVIDH264DPBENTRY dpb[16];          // List of reference frames within the DPB
+    // Quantization Matrices (raster-order)
+    unsigned char WeightScale4x4[6][16];
+    unsigned char WeightScale8x8[2][64];
+    // FMO/ASO
+    unsigned char fmo_aso_enable;
+    unsigned char num_slice_groups_minus1;
+    unsigned char slice_group_map_type;
+    signed char pic_init_qs_minus26;
+    unsigned int slice_group_change_rate_minus1;
+    union
+    {
+        unsigned long long slice_group_map_addr;
+        const unsigned char *pMb2SliceGroupMap;
+    } fmo;
+    unsigned int  Reserved[12];
+    // SVC/MVC
+    union
+    {
+        CUVIDH264MVCEXT mvcext;
+        CUVIDH264SVCEXT svcext;
+    };
+} CUVIDH264PICPARAMS;
+/********************************************************/
+//! \struct CUVIDMPEG2PICPARAMS
+//! MPEG-2 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDMPEG2PICPARAMS
+{
+    int ForwardRefIdx;          // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx;         // Picture index of backward reference (B-frames)
+    int picture_coding_type;
+    int full_pel_forward_vector;
+    int full_pel_backward_vector;
+    int f_code[2][2];
+    int intra_dc_precision;
+    int frame_pred_frame_dct;
+    int concealment_motion_vectors;
+    int q_scale_type;
+    int intra_vlc_format;
+    int alternate_scan;
+    int top_field_first;
+    // Quantization matrices (raster order)
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+} CUVIDMPEG2PICPARAMS;
+// MPEG-4 has VOP types instead of Picture types
+#define I_VOP 0
+#define P_VOP 1
+#define B_VOP 2
+#define S_VOP 3
+/*******************************************************/
+//! \struct CUVIDMPEG4PICPARAMS
+//! MPEG-4 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDMPEG4PICPARAMS
+{
+    int ForwardRefIdx;          // Picture index of forward reference (P/B-frames)
+    int BackwardRefIdx;         // Picture index of backward reference (B-frames)
+    // VOL
+    int video_object_layer_width;
+    int video_object_layer_height;
+    int vop_time_increment_bitcount;
+    int top_field_first;
+    int resync_marker_disable;
+    int quant_type;
+    int quarter_sample;
+    int short_video_header;
+    int divx_flags;
+    // VOP
+    int vop_coding_type;
+    int vop_coded;
+    int vop_rounding_type;
+    int alternate_vertical_scan_flag;
+    int interlaced;
+    int vop_fcode_forward;
+    int vop_fcode_backward;
+    int trd[2];
+    int trb[2];
+    // Quantization matrices (raster order)
+    unsigned char QuantMatrixIntra[64];
+    unsigned char QuantMatrixInter[64];
+    int gmc_enabled;
+} CUVIDMPEG4PICPARAMS;
+/********************************************************/
+//! \struct CUVIDVC1PICPARAMS
+//! VC1 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/********************************************************/
+typedef struct _CUVIDVC1PICPARAMS
+{
+    int ForwardRefIdx;      /**< Picture index of forward reference (P/B-frames) */
+    int BackwardRefIdx;     /**< Picture index of backward reference (B-frames)  */
+    int FrameWidth;         /**< Actual frame width                              */
+    int FrameHeight;        /**< Actual frame height                             */
+    // PICTURE
+    int intra_pic_flag;     /**< Set to 1 for I,BI frames */
+    int ref_pic_flag;       /**< Set to 1 for I,P frames  */
+    int progressive_fcm;    /**< Progressive frame        */
+    // SEQUENCE
+    int profile;
+    int postprocflag;
+    int pulldown;
+    int interlace;
+    int tfcntrflag;
+    int finterpflag;
+    int psf;
+    int multires;
+    int syncmarker;
+    int rangered;
+    int maxbframes;
+    // ENTRYPOINT
+    int panscan_flag;
+    int refdist_flag;
+    int extended_mv;
+    int dquant;
+    int vstransform;
+    int loopfilter;
+    int fastuvmc;
+    int overlap;
+    int quantizer;
+    int extended_dmv;
+    int range_mapy_flag;
+    int range_mapy;
+    int range_mapuv_flag;
+    int range_mapuv;
+    int rangeredfrm;    // range reduction state
+} CUVIDVC1PICPARAMS;
+/***********************************************************/
+//! \struct CUVIDJPEGPICPARAMS
+//! JPEG picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDJPEGPICPARAMS
+{
+    int Reserved;
+} CUVIDJPEGPICPARAMS;
+/*******************************************************/
+//! \struct CUVIDHEVCPICPARAMS
+//! HEVC picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/*******************************************************/
+typedef struct _CUVIDHEVCPICPARAMS
+{
+    // sps
+    int pic_width_in_luma_samples;
+    int pic_height_in_luma_samples;
+    unsigned char log2_min_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_luma_coding_block_size;
+    unsigned char log2_min_transform_block_size_minus2;
+    unsigned char log2_diff_max_min_transform_block_size;
+    unsigned char pcm_enabled_flag;
+    unsigned char log2_min_pcm_luma_coding_block_size_minus3;
+    unsigned char log2_diff_max_min_pcm_luma_coding_block_size;
+    unsigned char pcm_sample_bit_depth_luma_minus1;
+    unsigned char pcm_sample_bit_depth_chroma_minus1;
+    unsigned char pcm_loop_filter_disabled_flag;
+    unsigned char strong_intra_smoothing_enabled_flag;
+    unsigned char max_transform_hierarchy_depth_intra;
+    unsigned char max_transform_hierarchy_depth_inter;
+    unsigned char amp_enabled_flag;
+    unsigned char separate_colour_plane_flag;
+    unsigned char log2_max_pic_order_cnt_lsb_minus4;
+    unsigned char num_short_term_ref_pic_sets;
+    unsigned char long_term_ref_pics_present_flag;
+    unsigned char num_long_term_ref_pics_sps;
+    unsigned char sps_temporal_mvp_enabled_flag;
+    unsigned char sample_adaptive_offset_enabled_flag;
+    unsigned char scaling_list_enable_flag;
+    unsigned char IrapPicFlag;
+    unsigned char IdrPicFlag;
+    unsigned char bit_depth_luma_minus8;
+    unsigned char bit_depth_chroma_minus8;
+    unsigned char reserved1[14];
+    // pps
+    unsigned char dependent_slice_segments_enabled_flag;
+    unsigned char slice_segment_header_extension_present_flag;
+    unsigned char sign_data_hiding_enabled_flag;
+    unsigned char cu_qp_delta_enabled_flag;
+    unsigned char diff_cu_qp_delta_depth;
+    signed char init_qp_minus26;
+    signed char pps_cb_qp_offset;
+    signed char pps_cr_qp_offset;
+    unsigned char constrained_intra_pred_flag;
+    unsigned char weighted_pred_flag;
+    unsigned char weighted_bipred_flag;
+    unsigned char transform_skip_enabled_flag;
+    unsigned char transquant_bypass_enabled_flag;
+    unsigned char entropy_coding_sync_enabled_flag;
+    unsigned char log2_parallel_merge_level_minus2;
+    unsigned char num_extra_slice_header_bits;
+    unsigned char loop_filter_across_tiles_enabled_flag;
+    unsigned char loop_filter_across_slices_enabled_flag;
+    unsigned char output_flag_present_flag;
+    unsigned char num_ref_idx_l0_default_active_minus1;
+    unsigned char num_ref_idx_l1_default_active_minus1;
+    unsigned char lists_modification_present_flag;
+    unsigned char cabac_init_present_flag;
+    unsigned char pps_slice_chroma_qp_offsets_present_flag;
+    unsigned char deblocking_filter_override_enabled_flag;
+    unsigned char pps_deblocking_filter_disabled_flag;
+    signed char   pps_beta_offset_div2;
+    signed char   pps_tc_offset_div2;
+    unsigned char tiles_enabled_flag;
+    unsigned char uniform_spacing_flag;
+    unsigned char num_tile_columns_minus1;
+    unsigned char num_tile_rows_minus1;
+    unsigned short column_width_minus1[21];
+    unsigned short row_height_minus1[21];
+    unsigned int   reserved3[15];
+    // RefPicSets
+    int NumBitsForShortTermRPSInSlice;
+    int NumDeltaPocsOfRefRpsIdx;
+    int NumPocTotalCurr;
+    int NumPocStCurrBefore;
+    int NumPocStCurrAfter;
+    int NumPocLtCurr;
+    int CurrPicOrderCntVal;
+    int RefPicIdx[16];                      // [refpic] Indices of valid reference pictures (-1 if unused for reference)
+    int PicOrderCntVal[16];                 // [refpic]
+    unsigned char IsLongTerm[16];           // [refpic] 0=not a long-term reference, 1=long-term reference
+    unsigned char RefPicSetStCurrBefore[8]; // [0..NumPocStCurrBefore-1] -> refpic (0..15)
+    unsigned char RefPicSetStCurrAfter[8];  // [0..NumPocStCurrAfter-1] -> refpic (0..15)
+    unsigned char RefPicSetLtCurr[8];       // [0..NumPocLtCurr-1] -> refpic (0..15)
+    unsigned char RefPicSetInterLayer0[8];
+    unsigned char RefPicSetInterLayer1[8];
+    unsigned int  reserved4[12];
+    // scaling lists (diag order)
+    unsigned char ScalingList4x4[6][16];       // [matrixId][i]
+    unsigned char ScalingList8x8[6][64];       // [matrixId][i]
+    unsigned char ScalingList16x16[6][64];     // [matrixId][i]
+    unsigned char ScalingList32x32[2][64];     // [matrixId][i]
+    unsigned char ScalingListDCCoeff16x16[6];  // [matrixId]
+    unsigned char ScalingListDCCoeff32x32[2];  // [matrixId]
+} CUVIDHEVCPICPARAMS;
+/***********************************************************/
+//! \struct CUVIDVP8PICPARAMS
+//! VP8 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP8PICPARAMS
+{
+    int width;
+    int height;
+    unsigned int first_partition_size;
+    //Frame Indexes
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    union {
+        struct {
+            unsigned char frame_type : 1;    /**< 0 = KEYFRAME, 1 = INTERFRAME  */
+            unsigned char version : 3;
+            unsigned char show_frame : 1;
+            unsigned char update_mb_segmentation_data : 1;    /**< Must be 0 if segmentation is not enabled */
+            unsigned char Reserved2Bits : 2;
+        };
+        unsigned char wFrameTagFlags;
+    };
+    unsigned char Reserved1[4];
+    unsigned int  Reserved2[3];
+} CUVIDVP8PICPARAMS;
+/***********************************************************/
+//! \struct CUVIDVP9PICPARAMS
+//! VP9 picture parameters
+//! This structure is used in CUVIDPICPARAMS structure
+/***********************************************************/
+typedef struct _CUVIDVP9PICPARAMS
+{
+    unsigned int width;
+    unsigned int height;
+    //Frame Indices
+    unsigned char LastRefIdx;
+    unsigned char GoldenRefIdx;
+    unsigned char AltRefIdx;
+    unsigned char colorSpace;
+    unsigned short profile : 3;
+    unsigned short frameContextIdx : 2;
+    unsigned short frameType : 1;
+    unsigned short showFrame : 1;
+    unsigned short errorResilient : 1;
+    unsigned short frameParallelDecoding : 1;
+    unsigned short subSamplingX : 1;
+    unsigned short subSamplingY : 1;
+    unsigned short intraOnly : 1;
+    unsigned short allow_high_precision_mv : 1;
+    unsigned short refreshEntropyProbs : 1;
+    unsigned short reserved2Bits : 2;
+    unsigned short reserved16Bits;
+    unsigned char  refFrameSignBias[4];
+    unsigned char bitDepthMinus8Luma;
+    unsigned char bitDepthMinus8Chroma;
+    unsigned char loopFilterLevel;
+    unsigned char loopFilterSharpness;
+    unsigned char modeRefLfEnabled;
+    unsigned char log2_tile_columns;
+    unsigned char log2_tile_rows;
+    unsigned char segmentEnabled : 1;
+    unsigned char segmentMapUpdate : 1;
+    unsigned char segmentMapTemporalUpdate : 1;
+    unsigned char segmentFeatureMode : 1;
+    unsigned char reserved4Bits : 4;
+    unsigned char segmentFeatureEnable[8][4];
+    short         segmentFeatureData[8][4];
+    unsigned char mb_segment_tree_probs[7];
+    unsigned char segment_pred_probs[3];
+    unsigned char reservedSegment16Bits[2];
+    int qpYAc;
+    int qpYDc;
+    int qpChDc;
+    int qpChAc;
+    unsigned int activeRefIdx[3];
+    unsigned int resetFrameContext;
+    unsigned int mcomp_filter_type;
+    unsigned int mbRefLfDelta[4];
+    unsigned int mbModeLfDelta[2];
+    unsigned int frameTagSize;
+    unsigned int offsetToDctParts;
+    unsigned int reserved128Bits[4];
+} CUVIDVP9PICPARAMS;
+/******************************************************************************************/
+//! \struct CUVIDPICPARAMS
+//! Picture parameters for decoding
+//! This structure is used in cuvidDecodePicture API
+//! IN  for cuvidDecodePicture
+/******************************************************************************************/
+typedef struct _CUVIDPICPARAMS
+{
+    int PicWidthInMbs;                     /**< IN: Coded frame size in macroblocks                           */
+    int FrameHeightInMbs;                  /**< IN: Coded frame height in macroblocks                         */
+    int CurrPicIdx;                        /**< IN: Output index of the current picture                       */
+    int field_pic_flag;                    /**< IN: 0=frame picture, 1=field picture                          */
+    int bottom_field_flag;                 /**< IN: 0=top field, 1=bottom field (ignored if field_pic_flag=0) */
+    int second_field;                      /**< IN: Second field of a complementary field pair                */
+    // Bitstream data
+    unsigned int nBitstreamDataLen;        /**< IN: Number of bytes in bitstream data buffer                  */
+    const unsigned char *pBitstreamData;   /**< IN: Ptr to bitstream data for this picture (slice-layer)      */
+    unsigned int nNumSlices;               /**< IN: Number of slices in this picture                          */
+    const unsigned int *pSliceDataOffsets; /**< IN: nNumSlices entries, contains offset of each slice within 
+                                                        the bitstream data buffer                             */
+    int ref_pic_flag;                      /**< IN: This picture is a reference picture                       */
+    int intra_pic_flag;                    /**< IN: This picture is entirely intra coded                      */
+    unsigned int Reserved[30];             /**< Reserved for future use                                       */
+    // IN: Codec-specific data
+    union {
+        CUVIDMPEG2PICPARAMS mpeg2;         /**< Also used for MPEG-1 */
+        CUVIDH264PICPARAMS  h264;
+        CUVIDVC1PICPARAMS   vc1;
+        CUVIDMPEG4PICPARAMS mpeg4;
+        CUVIDJPEGPICPARAMS  jpeg;
+        CUVIDHEVCPICPARAMS  hevc;
+        CUVIDVP8PICPARAMS   vp8;
+        CUVIDVP9PICPARAMS   vp9;
+        unsigned int CodecReserved[1024];
+    } CodecSpecific;
+} CUVIDPICPARAMS;
+/******************************************************/
+//! \struct CUVIDPROCPARAMS
+//! Picture parameters for postprocessing
+//! This structure is used in cuvidMapVideoFrame API
+/******************************************************/
+typedef struct _CUVIDPROCPARAMS
+{
+    int progressive_frame;              /**< IN: Input is progressive (deinterlace_mode will be ignored)                */
+    int second_field;                   /**< IN: Output the second field (ignored if deinterlace mode is Weave)         */
+    int top_field_first;                /**< IN: Input frame is top field first (1st field is top, 2nd field is bottom) */
+    int unpaired_field;                 /**< IN: Input only contains one field (2nd field is invalid)                   */
+    // The fields below are used for raw YUV input
+    unsigned int reserved_flags;        /**< Reserved for future use (set to zero)                                      */
+    unsigned int reserved_zero;         /**< Reserved (set to zero)                                                     */
+    unsigned long long raw_input_dptr;  /**< IN: Input CUdeviceptr for raw YUV extensions                               */
+    unsigned int raw_input_pitch;       /**< IN: pitch in bytes of raw YUV input (should be aligned appropriately)      */
+    unsigned int raw_input_format;      /**< IN: Input YUV format (cudaVideoCodec_enum)                                 */
+    unsigned long long raw_output_dptr; /**< IN: Output CUdeviceptr for raw YUV extensions                              */
+    unsigned int raw_output_pitch;      /**< IN: pitch in bytes of raw YUV output (should be aligned appropriately)     */
+    unsigned int Reserved1;             /**< Reserved for future use (set to zero)                                      */
+    CUstream output_stream;             /**< IN: stream object used by cuvidMapVideoFrame                               */
+    unsigned int Reserved[46];          /**< Reserved for future use (set to zero)                                      */
+    void *Reserved2[2];                 /**< Reserved for future use (set to zero)                                      */
+} CUVIDPROCPARAMS;
+/***********************************************************************************************************/
+//! VIDEO_DECODER
+//!
+//! In order to minimize decode latencies, there should be always at least 2 pictures in the decode
+//! queue at any time, in order to make sure that all decode engines are always busy.
+//!
+//! Overall data flow:
+//!  - cuvidGetDecoderCaps(...)
+//!  - cuvidCreateDecoder(...)
+//!  - For each picture:
+//!    + cuvidDecodePicture(N)
+//!    + cuvidMapVideoFrame(N-4)
+//!    + do some processing in cuda
+//!    + cuvidUnmapVideoFrame(N-4)
+//!    + cuvidDecodePicture(N+1)
+//!    + cuvidMapVideoFrame(N-3)
+//!    + ...
+//!  - cuvidDestroyDecoder(...)
+//!
+//! NOTE:
+//! - When the cuda context is created from a D3D device, the D3D device must also be created
+//!   with the D3DCREATE_MULTITHREADED flag.
+//! - There is a limit to how many pictures can be mapped simultaneously (ulNumOutputSurfaces)
+//! - cuvidDecodePicture may block the calling thread if there are too many pictures pending
+//!   in the decode queue
+/***********************************************************************************************************/
+/**********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetDecoderCaps(CUVIDDECODECAPS *pdc)
+//! Queries decode capabilities of NVDEC-HW based on CodecType, ChromaFormat and BitDepthMinus8 parameters.
+//! 1. Application fills IN parameters CodecType, ChromaFormat and BitDepthMinus8 of CUVIDDECODECAPS structure
+//! 2. On calling cuvidGetDecoderCaps, driver fills OUT parameters if the IN parameters are supported
+//!    If IN parameters passed to the driver are not supported by NVDEC-HW, then all OUT params are set to 0.
+//! E.g. on Geforce GTX 960:
+//!   App fills - eCodecType = cudaVideoCodec_H264; eChromaFormat = cudaVideoChromaFormat_420; nBitDepthMinus8 = 0;
+//!   Given IN parameters are supported, hence driver fills: bIsSupported = 1; nMinWidth   = 48; nMinHeight  = 16; 
+//!   nMaxWidth = 4096; nMaxHeight = 4096; nMaxMBCount = 65536;
+//! CodedWidth*CodedHeight/256 must be less than or equal to nMaxMBCount
+/**********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidGetDecoderCaps(CUVIDDECODECAPS *pdc);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci)
+//! Create the decoder object based on pdci. A handle to the created decoder is returned
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCreateDecoder(CUvideodecoder *phDecoder, CUVIDDECODECREATEINFO *pdci);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyDecoder(CUvideodecoder hDecoder)
+//! Destroy the decoder object.
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidDestroyDecoder(CUvideodecoder hDecoder);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams)
+//! Decode a single picture (field or frame)
+//! Kicks off HW decoding 
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidDecodePicture(CUvideodecoder hDecoder, CUVIDPICPARAMS *pPicParams);
+#if !defined(__CUVID_DEVPTR64) || defined(__CUVID_INTERNAL)
+/************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx, unsigned int *pDevPtr,
+//!                                         unsigned int  *pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and associated
+//! pitch of the video frame
+/************************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidMapVideoFrame(CUvideodecoder hDecoder, int nPicIdx,
+                                           unsigned int *pDevPtr, unsigned int *pPitch,
+                                           CUVIDPROCPARAMS *pVPP);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr)
+//! Unmap a previously mapped video frame
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidUnmapVideoFrame(CUvideodecoder hDecoder, unsigned int DevPtr);
+#endif
+#if defined(_WIN64) || defined(__LP64__) || defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+/************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+//!                                           unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+//! Post-process and map video frame corresponding to nPicIdx for use in cuda. Returns cuda device pointer and associated
+//! pitch of the video frame
+/************************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidMapVideoFrame64(CUvideodecoder hDecoder, int nPicIdx, unsigned long long *pDevPtr,
+                                             unsigned int *pPitch, CUVIDPROCPARAMS *pVPP);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+//! Unmap a previously mapped video frame
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidUnmapVideoFrame64(CUvideodecoder hDecoder, unsigned long long DevPtr);
+#if defined(__CUVID_DEVPTR64) && !defined(__CUVID_INTERNAL)
+#define tcuvidMapVideoFrame      tcuvidMapVideoFrame64
+#define tcuvidUnmapVideoFrame    tcuvidUnmapVideoFrame64
+#endif
+#endif
+/********************************************************************************************************************/
+//!
+//! Context-locking: to facilitate multi-threaded implementations, the following 4 functions
+//! provide a simple mutex-style host synchronization. If a non-NULL context is specified
+//! in CUVIDDECODECREATEINFO, the codec library will acquire the mutex associated with the given
+//! context before making any cuda calls.
+//! A multi-threaded application could create a lock associated with a context handle so that
+//! multiple threads can safely share the same cuda context:
+//!  - use cuCtxPopCurrent immediately after context creation in order to create a 'floating' context
+//!    that can be passed to cuvidCtxLockCreate.
+//!  - When using a floating context, all cuda calls should only be made within a cuvidCtxLock/cuvidCtxUnlock section.
+//!
+//! NOTE: This is a safer alternative to cuCtxPushCurrent and cuCtxPopCurrent, and is not related to video
+//! decoder in any way (implemented as a critical section associated with cuCtx{Push|Pop}Current calls).
+/********************************************************************************************************************/
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx)
+//! This API is used to create CtxLock object
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCtxLockCreate(CUvideoctxlock *pLock, CUcontext ctx);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLockDestroy(CUvideoctxlock lck)
+//! This API is used to free CtxLock object
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCtxLockDestroy(CUvideoctxlock lck);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to acquire ctxlock
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCtxLock(CUvideoctxlock lck, unsigned int reserved_flags);
+/********************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags)
+//! This API is used to release ctxlock
+/********************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCtxUnlock(CUvideoctxlock lck, unsigned int reserved_flags);
+/**********************************************************************************************/
+extern tcuvidGetDecoderCaps       *cuvidGetDecoderCaps;
+extern tcuvidCreateDecoder        *cuvidCreateDecoder;
+extern tcuvidDestroyDecoder       *cuvidDestroyDecoder;
+extern tcuvidDecodePicture        *cuvidDecodePicture;
+extern tcuvidMapVideoFrame        *cuvidMapVideoFrame;
+extern tcuvidUnmapVideoFrame      *cuvidUnmapVideoFrame;
+#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64)
+extern tcuvidMapVideoFrame64      *cuvidMapVideoFrame64;
+extern tcuvidUnmapVideoFrame64    *cuvidUnmapVideoFrame64;
+#endif
+// extern tcuvidGetVideoFrameSurface *cuvidGetVideoFrameSurface;
+extern tcuvidCtxLockCreate        *cuvidCtxLockCreate;
+extern tcuvidCtxLockDestroy       *cuvidCtxLockDestroy;
+extern tcuvidCtxLock              *cuvidCtxLock;
+extern tcuvidCtxUnlock            *cuvidCtxUnlock;
+#if defined(__cplusplus)
+}
+// Auto-lock helper for C++ applications
+class CCtxAutoLock
+{
+private:
+    CUvideoctxlock m_ctx;
+public:
+    CCtxAutoLock(CUvideoctxlock ctx);
+    ~CCtxAutoLock();
+};
+#endif /* __cplusplus */
+#endif // __CUDA_VIDEO_H__
--- a/docker/nvcuvid/dynlink_nvcuvid.h
+++ b/docker/nvcuvid/dynlink_nvcuvid.h
+/*
+ * This copyright notice applies to this header file only:
+ *
+ * Copyright (c) 2010-2017 NVIDIA Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the software, and to permit persons to whom the
+ * software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+/********************************************************************************************************************/
+//! \file nvcuvid.h
+//!   NVDECODE API provides video decoding interface to NVIDIA GPU devices.
+//! \date 2015-2017
+//!  This file contains the interface constants, structure definitions and function prototypes.
+/********************************************************************************************************************/
+#if !defined(__NVCUVID_H__)
+#define __NVCUVID_H__
+#include "dynlink_cuviddec.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+/*********************************
+** Initialization
+*********************************/
+CUresult  CUDAAPI cuvidInit(unsigned int Flags);
+/***********************************************/
+//!
+//! High-level helper APIs for video sources
+//!
+/***********************************************/
+typedef void *CUvideosource;
+typedef void *CUvideoparser;
+typedef long long CUvideotimestamp;
+/************************************************************************/
+//! \enum cudaVideoState
+//! Video source state enums
+//! Used in cuvidSetVideoSourceState and cuvidGetVideoSourceState APIs
+/************************************************************************/
+typedef enum {
+    cudaVideoState_Error   = -1,    /**< Error state (invalid source)                  */
+    cudaVideoState_Stopped = 0,     /**< Source is stopped (or reached end-of-stream)  */
+    cudaVideoState_Started = 1      /**< Source is running and delivering data         */
+} cudaVideoState;
+/************************************************************************/
+//! \enum cudaAudioCodec
+//! Audio compression enums
+//! Used in CUAUDIOFORMAT structure
+/************************************************************************/
+typedef enum {
+    cudaAudioCodec_MPEG1=0,         /**< MPEG-1 Audio               */
+    cudaAudioCodec_MPEG2,           /**< MPEG-2 Audio               */
+    cudaAudioCodec_MP3,             /**< MPEG-1 Layer III Audio     */
+    cudaAudioCodec_AC3,             /**< Dolby Digital (AC3) Audio  */
+    cudaAudioCodec_LPCM,            /**< PCM Audio                  */
+    cudaAudioCodec_AAC,             /**< AAC Audio                  */
+} cudaAudioCodec;
+/************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMAT
+//! Video format
+//! Used in cuvidGetSourceVideoFormat API
+/************************************************************************************************/
+typedef struct
+{
+    cudaVideoCodec codec;                   /**< OUT: Compression format          */
+   /**
+    * OUT: frame rate = numerator / denominator (for example: 30000/1001)
+    */
+    struct {
+        /**< OUT: frame rate numerator   (0 = unspecified or variable frame rate) */
+        unsigned int numerator;
+        /**< OUT: frame rate denominator (0 = unspecified or variable frame rate) */
+        unsigned int denominator;
+    } frame_rate;
+    unsigned char progressive_sequence;     /**< OUT: 0=interlaced, 1=progressive                                      */
+    unsigned char bit_depth_luma_minus8;    /**< OUT: high bit depth luma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth   */
+    unsigned char bit_depth_chroma_minus8;  /**< OUT: high bit depth chroma. E.g, 2 for 10-bitdepth, 4 for 12-bitdepth */
+    unsigned char reserved1;                /**< Reserved for future use                                               */
+    unsigned int coded_width;               /**< OUT: coded frame width in pixels                                      */
+    unsigned int coded_height;              /**< OUT: coded frame height in pixels                                     */
+   /**
+    * area of the frame that should be displayed
+    * typical example:
+    * coded_width = 1920, coded_height = 1088
+    * display_area = { 0,0,1920,1080 }
+    */
+    struct {
+        int left;                           /**< OUT: left position of display rect    */
+        int top;                            /**< OUT: top position of display rect     */
+        int right;                          /**< OUT: right position of display rect   */
+        int bottom;                         /**< OUT: bottom position of display rect  */
+    } display_area;
+    cudaVideoChromaFormat chroma_format;    /**< OUT:  Chroma format                   */
+    unsigned int bitrate;                   /**< OUT: video bitrate (bps, 0=unknown)   */
+   /**
+    * OUT: Display Aspect Ratio = x:y (4:3, 16:9, etc)
+    */
+    struct {
+        int x;
+        int y;
+    } display_aspect_ratio;
+    /**
+    * Video Signal Description
+    * Refer section E.2.1 (VUI parameters semantics) of H264 spec file
+    */
+    struct {
+        unsigned char video_format          : 3; /**< OUT: 0-Component, 1-PAL, 2-NTSC, 3-SECAM, 4-MAC, 5-Unspecified     */
+        unsigned char video_full_range_flag : 1; /**< OUT: indicates the black level and luma and chroma range           */
+        unsigned char reserved_zero_bits    : 4; /**< Reserved bits                                                      */
+        unsigned char color_primaries;           /**< OUT: chromaticity coordinates of source primaries                  */
+        unsigned char transfer_characteristics;  /**< OUT: opto-electronic transfer characteristic of the source picture */
+        unsigned char matrix_coefficients;       /**< OUT: used in deriving luma and chroma signals from RGB primaries   */
+    } video_signal_description;
+    unsigned int seqhdr_data_length;             /**< OUT: Additional bytes following (CUVIDEOFORMATEX)                  */
+} CUVIDEOFORMAT;
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDEOFORMATEX
+//! Video format including raw sequence header information
+//! Used in cuvidGetSourceVideoFormat API
+/****************************************************************/
+typedef struct
+{
+    CUVIDEOFORMAT format;                 /**< OUT: CUVIDEOFORMAT structure */
+    unsigned char raw_seqhdr_data[1024];  /**< OUT: Sequence header data    */
+} CUVIDEOFORMATEX;
+/****************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUAUDIOFORMAT
+//! Audio formats
+//! Used in cuvidGetSourceAudioFormat API
+/****************************************************************/
+typedef struct
+{
+    cudaAudioCodec codec;       /**< OUT: Compression format                                              */
+    unsigned int channels;      /**< OUT: number of audio channels                                        */
+    unsigned int samplespersec; /**< OUT: sampling frequency                                              */
+    unsigned int bitrate;       /**< OUT: For uncompressed, can also be used to determine bits per sample */
+    unsigned int reserved1;     /**< Reserved for future use                                              */
+    unsigned int reserved2;     /**< Reserved for future use                                              */
+} CUAUDIOFORMAT;
+/***************************************************************/
+//! \enum CUvideopacketflags
+//! Data packet flags
+//! Used in CUVIDSOURCEDATAPACKET structure
+/***************************************************************/
+typedef enum {
+    CUVID_PKT_ENDOFSTREAM   = 0x01,   /**< Set when this is the last packet for this stream  */
+    CUVID_PKT_TIMESTAMP     = 0x02,   /**< Timestamp is valid                                */
+    CUVID_PKT_DISCONTINUITY = 0x04,   /**< Set when a discontinuity has to be signalled      */
+    CUVID_PKT_ENDOFPICTURE  = 0x08,   /**< Set when the packet contains exactly one frame    */
+} CUvideopacketflags;
+/*****************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEDATAPACKET
+//! Data Packet
+//! Used in cuvidParseVideoData API
+//! IN for cuvidParseVideoData
+/*****************************************************************************/
+typedef struct _CUVIDSOURCEDATAPACKET
+{
+    unsigned long flags;            /**< IN: Combination of CUVID_PKT_XXX flags                              */
+    unsigned long payload_size;     /**< IN: number of bytes in the payload (may be zero if EOS flag is set) */
+    const unsigned char *payload;   /**< IN: Pointer to packet payload data (may be NULL if EOS flag is set) */
+    CUvideotimestamp timestamp;     /**< IN: Presentation time stamp (10MHz clock), only valid if
+                                             CUVID_PKT_TIMESTAMP flag is set                                 */
+} CUVIDSOURCEDATAPACKET;
+// Callback for packet delivery
+typedef int (CUDAAPI *PFNVIDSOURCECALLBACK)(void *, CUVIDSOURCEDATAPACKET *);
+/**************************************************************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDSOURCEPARAMS
+//! Describes parameters needed in cuvidCreateVideoSource API
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported
+//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
+/**************************************************************************************************************************/
+typedef struct _CUVIDSOURCEPARAMS
+{
+    unsigned int ulClockRate;                   /**< IN: Time stamp units in Hz (0=default=10000000Hz)      */
+    unsigned int uReserved1[7];                 /**< Reserved for future use - set to zero                  */
+    void *pUserData;                            /**< IN: User private data passed in to the data handlers   */
+    PFNVIDSOURCECALLBACK pfnVideoDataHandler;   /**< IN: Called to deliver video packets                    */
+    PFNVIDSOURCECALLBACK pfnAudioDataHandler;   /**< IN: Called to deliver audio packets.                   */
+    void *pvReserved2[8];                       /**< Reserved for future use - set to NULL                  */
+} CUVIDSOURCEPARAMS;
+/**********************************************/
+//! \ingroup ENUMS
+//! \enum CUvideosourceformat_flags
+//! CUvideosourceformat_flags
+//! Used in cuvidGetSourceVideoFormat API
+/**********************************************/
+typedef enum {
+    CUVID_FMT_EXTFORMATINFO = 0x100             /**< Return extended format structure (CUVIDEOFORMATEX) */
+} CUvideosourceformat_flags;
+#if !defined(__APPLE__)
+/**************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create CUvideosource object. CUvideosource spawns demultiplexer thread that provides two callbacks: 
+//! pfnVideoDataHandler() and pfnAudioDataHandler()
+//! NVDECODE API is intended for HW accelerated video decoding so CUvideosource doesn't have audio demuxer for all supported 
+//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
+/**************************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCreateVideoSource(CUvideosource *pObj, const char *pszFileName, CUVIDSOURCEPARAMS *pParams);
+/****************************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams)
+//! Create video source object and initialize
+/****************************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCreateVideoSourceW(CUvideosource *pObj, const wchar_t *pwszFileName, CUVIDSOURCEPARAMS *pParams);
+/*********************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyVideoSource(CUvideosource obj)
+//! Destroy video source
+/*********************************************************************/
+typedef CUresult CUDAAPI tcuvidDestroyVideoSource(CUvideosource obj);
+/******************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state)
+//! Set video source state
+/******************************************************************************************/
+typedef CUresult CUDAAPI tcuvidSetVideoSourceState(CUvideosource obj, cudaVideoState state);
+/******************************************************************************************/
+//! \fn cudaVideoState CUDAAPI cuvidGetVideoSourceState(CUvideosource obj)
+//! Get video source state
+/******************************************************************************************/
+typedef cudaVideoState CUDAAPI tcuvidGetVideoSourceState(CUvideosource obj);
+/****************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags)
+//! Gets details of video stream in pvidfmt
+/****************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidGetSourceVideoFormat(CUvideosource obj, CUVIDEOFORMAT *pvidfmt, unsigned int flags);
+/****************************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags)
+//! Get audio source format
+//! NVDECODE API is intended for HW accelarated video decoding so CUvideosource doesn't have audio demuxer for all suppported 
+//! containers. It's recommended to clients to use their own or third party demuxer if audio support is needed.
+/****************************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidGetSourceAudioFormat(CUvideosource obj, CUAUDIOFORMAT *paudfmt, unsigned int flags);
+#endif
+/**********************************************************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERDISPINFO
+//! Used in cuvidParseVideoData API with PFNVIDDISPLAYCALLBACK pfnDisplayPicture
+/**********************************************************************************/
+typedef struct _CUVIDPARSERDISPINFO
+{
+    int picture_index;          /**< OUT: Index of the current picture                                                         */
+    int progressive_frame;      /**< OUT: 1 if progressive frame; 0 otherwise                                                  */
+    int top_field_first;        /**< OUT: 1 if top field is displayed first; 0 otherwise                                       */
+    int repeat_first_field;     /**< OUT: Number of additional fields (1=ivtc, 2=frame doubling, 4=frame tripling, 
+                                     -1=unpaired field)                                                                        */
+    CUvideotimestamp timestamp; /**< OUT: Presentation time stamp                                                              */
+} CUVIDPARSERDISPINFO;
+/***********************************************************************************************************************/
+//! Parser callbacks
+//! The parser will call these synchronously from within cuvidParseVideoData(), whenever a picture is ready to
+//! be decoded and/or displayed. First argument in functions is "void *pUserData" member of structure CUVIDSOURCEPARAMS
+/***********************************************************************************************************************/
+typedef int (CUDAAPI *PFNVIDSEQUENCECALLBACK)(void *, CUVIDEOFORMAT *);
+typedef int (CUDAAPI *PFNVIDDECODECALLBACK)(void *, CUVIDPICPARAMS *);
+typedef int (CUDAAPI *PFNVIDDISPLAYCALLBACK)(void *, CUVIDPARSERDISPINFO *);
+/**************************************/
+//! \ingroup STRUCTS
+//! \struct CUVIDPARSERPARAMS
+//! Used in cuvidCreateVideoParser API
+/**************************************/
+typedef struct _CUVIDPARSERPARAMS
+{
+    cudaVideoCodec CodecType;                   /**< IN: cudaVideoCodec_XXX                                                  */
+    unsigned int ulMaxNumDecodeSurfaces;        /**< IN: Max # of decode surfaces (parser will cycle through these)          */
+    unsigned int ulClockRate;                   /**< IN: Timestamp units in Hz (0=default=10000000Hz)                        */
+    unsigned int ulErrorThreshold;              /**< IN: % Error threshold (0-100) for calling pfnDecodePicture (100=always 
+                                                     IN: call pfnDecodePicture even if picture bitstream is fully corrupted) */
+    unsigned int ulMaxDisplayDelay;             /**< IN: Max display queue delay (improves pipelining of decode with display)
+                                                         0=no delay (recommended values: 2..4)                               */
+    unsigned int uReserved1[5];                 /**< IN: Reserved for future use - set to 0                                  */
+    void *pUserData;                            /**< IN: User data for callbacks                                             */
+    PFNVIDSEQUENCECALLBACK pfnSequenceCallback; /**< IN: Called before decoding frames and/or whenever there is a fmt change */
+    PFNVIDDECODECALLBACK pfnDecodePicture;      /**< IN: Called when a picture is ready to be decoded (decode order)         */
+    PFNVIDDISPLAYCALLBACK pfnDisplayPicture;    /**< IN: Called whenever a picture is ready to be displayed (display order)  */
+    void *pvReserved2[7];                       /**< Reserved for future use - set to NULL                                   */
+    CUVIDEOFORMATEX *pExtVideoInfo;             /**< IN: [Optional] sequence header data from system layer                   */
+} CUVIDPARSERPARAMS;
+/************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams)
+//! Create video parser object and initialize
+/************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidCreateVideoParser(CUvideoparser *pObj, CUVIDPARSERPARAMS *pParams);
+/************************************************************************************************/
+//! \fn CUresult CUDAAPI cuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket)
+//! Parse the video data from source data packet in pPacket 
+//! Extracts parameter sets like SPS, PPS, bitstream etc. from pPacket and 
+//! calls back pfnDecodePicture with CUVIDPICPARAMS data for kicking of HW decoding
+/************************************************************************************************/
+typedef CUresult CUDAAPI tcuvidParseVideoData(CUvideoparser obj, CUVIDSOURCEDATAPACKET *pPacket);
+/*******************************************************************/
+//! \fn CUresult CUDAAPI cuvidDestroyVideoParser(CUvideoparser obj)
+/*******************************************************************/
+typedef CUresult CUDAAPI tcuvidDestroyVideoParser(CUvideoparser obj);
+extern tcuvidCreateVideoSource               *cuvidCreateVideoSource;
+extern tcuvidCreateVideoSourceW              *cuvidCreateVideoSourceW;
+extern tcuvidDestroyVideoSource              *cuvidDestroyVideoSource;
+extern tcuvidSetVideoSourceState             *cuvidSetVideoSourceState;
+extern tcuvidGetVideoSourceState             *cuvidGetVideoSourceState;
+extern tcuvidGetSourceVideoFormat            *cuvidGetSourceVideoFormat;
+extern tcuvidGetSourceAudioFormat            *cuvidGetSourceAudioFormat;
+extern tcuvidCreateVideoParser               *cuvidCreateVideoParser;
+extern tcuvidParseVideoData                  *cuvidParseVideoData;
+extern tcuvidDestroyVideoParser              *cuvidDestroyVideoParser;
+/**********************************************************************************************/
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+#endif // __NVCUVID_H__