TensorRT系列传送门(不定期更新): 深度框架|TensorRT



  记录NvInferRuntime.h头文件中的几个重要的接口类以及类中各接口函数的作用
头文件中都有英文注释,这里只是简单中文翻译一下,如有误,欢迎讨论。业余时间,看多少记录多少,纯属笔记。
NvInferRuntime.h是运行TensorRT的最重要的接口头文件之一,NV都说了
This is the top-level API file for TensorRT extended runtime library

介绍几个重要的类
1、ICudaEngine
2、IExecutionContext
3、IPluginFactory
4、IRuntime


一、ICudaEngine类

用于在已构建好的网络上执行推理的引擎。官方提示:不要继承这个类,否则会破坏API和ABI的前向兼容性(forward-compatibility)。
介绍几个常用的类函数,为了排版,删除原有的英文注释,笔者看不懂的部分,直接贴上英文注释。

class ICudaEngine
{
public:

    //! Get the number of bindings; e.g. with one input binding and one output binding the total is 2.
    virtual int getNbBindings() const noexcept = 0;
    //! Get the binding index for the given tensor (node) name.
    virtual int getBindingIndex(const char* name) const noexcept = 0;
    //! Get the tensor (node) name for the given binding index.
    virtual const char* getBindingName(int bindingIndex) const noexcept = 0;
    //! Determine whether the binding is an input binding.
    virtual bool bindingIsInput(int bindingIndex) const noexcept = 0;
    //! Get the dimensions of the binding.
    virtual Dims getBindingDimensions(int bindingIndex) const noexcept = 0;
    //! Get the data type of the binding.
    virtual DataType getBindingDataType(int bindingIndex) const noexcept = 0;
    //! Get the maximum batch size.
    virtual int getMaxBatchSize() const noexcept = 0;
    //! \return The number of layers in the network.
    //!
    virtual int getNbLayers() const noexcept = 0;

    //! Get the workspace size; according to the article author it is usually smaller than the configured value.
    TRT_DEPRECATED
    virtual std::size_t getWorkspaceSize() const noexcept = 0;
    //! Serialize the engine; the serialized result can then be saved to local storage.
    virtual IHostMemory* serialize() const noexcept = 0;
    //! Create an execution context; once created it is used to run inference (see IExecutionContext).
    virtual IExecutionContext* createExecutionContext() noexcept = 0;
    //! Destroy this object and release its resources.
    virtual void destroy() noexcept = 0;
    //! Get the location of the tensor for the given binding index (on the GPU or on the CPU, per the article author).
    virtual TensorLocation getLocation(int bindingIndex) const noexcept = 0;

protected:
    virtual ~ICudaEngine() {}

public:
    //! \see getDeviceMemorySize() IExecutionContext::setDeviceMemory()
    //! By default, creating an IExecutionContext allocates persistent device memory to hold activation data.
    //! To avoid that, create the context with this function and supply the memory through
    //! IExecutionContext::setDeviceMemory().
    virtual IExecutionContext* createExecutionContextWithoutDeviceMemory() noexcept = 0;
    //! Get the device memory size.
    virtual size_t getDeviceMemorySize() const noexcept = 0;
    //! Return whether this engine can be refitted (modified).
    virtual bool isRefittable() const noexcept = 0;
    //! Return the number of bytes per component of an element.
    virtual int getBindingBytesPerComponent(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Return the number of components included in one element.
    //!
    //! The number of elements in the vectors is returned if getBindingVectorizedDim() != -1.
    //!
    //! \param bindingIndex The binding Index.
    //!
    //! \see ICudaEngine::getBindingVectorizedDim()
    //!
    virtual int getBindingComponentsPerElement(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Return the binding format.
    //!
    //! \param bindingIndex The binding Index.
    //! Returns the data format of the binding as a TensorFormat enum value.
    virtual TensorFormat getBindingFormat(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Return the human readable description of the tensor format.
    //!
    //! The description includes the order, vectorization, data type, strides,
    //! and etc. Examples are shown as follows:
    //!   Example 1: kCHW + FP32
    //!     "Row major linear FP32 format"
    //!   Example 2: kCHW2 + FP16
    //!     "Two wide channel vectorized row major FP16 format"
    //!   Example 3: kHWC8 + FP16 + Line Stride = 32
    //!     "Channel major FP16 format where C % 8 == 0 and H Stride % 32 == 0"
    //!
    //! \param bindingIndex The binding Index.
    //! Same information as getBindingFormat(), but returned as a string instead of an enum value.
    //! Article author's observation: an NCHW FP32 binding yields "Row major linear FP32 format (kLINEAR)".
    virtual const char* getBindingFormatDesc(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Return the dimension index that the buffer is vectorized.
    //!
    //! Specifically -1 is returned if scalars per vector is 1.
    //!
    //! \param bindingIndex The binding Index.
    //!
    virtual int getBindingVectorizedDim(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Returns the name of the network associated with the engine.
    //!
    //! The name is set during network creation and is retrieved after
    //! building or deserialization.
    //!
    //! \see INetworkDefinition::setName(), INetworkDefinition::getName()
    //!
    //! \return A zero delimited C-style string representing the name of the network.
    //!
    virtual const char* getName() const noexcept = 0;

    //!
    //! \brief Get the number of optimization profiles defined for this engine.
    //!
    //! \return Number of optimization profiles. It is always at least 1.
    //!
    //! \see IExecutionContext::setOptimizationProfile()
    virtual int getNbOptimizationProfiles() const noexcept = 0;

    //!
    //! \brief Get the minimum / optimum / maximum dimensions for a particular binding under an optimization profile.
    //!
    //! \param bindingIndex The binding index (must be between 0 and getNbBindings() - 1)
    //!
    //! \param profileIndex The profile index (must be between 0 and getNbOptimizationProfiles()-1)
    //!
    //! \param select Whether to query the minimum, optimum, or maximum dimensions for this binding.
    //!
    //! \return The minimum / optimum / maximum dimensions for this binding in this profile.
    virtual Dims getProfileDimensions(int bindingIndex, int profileIndex, OptProfileSelector select) const noexcept = 0;

    //!
    //! \brief Get minimum / optimum / maximum values for an input shape binding under an optimization profile.
    //!
    //! \param profileIndex The profile index (must be between 0 and getNbOptimizationProfiles()-1)
    //!
    //! \param inputIndex The input index (must be between 0 and getNbBindings() - 1)
    //!
    //! \param select Whether to query the minimum, optimum, or maximum shape values for this binding.
    //!
    //! \return If the binding is an input shape binding, return a pointer to an array that has
    //!         the same number of elements as the corresponding tensor, i.e. 1 if dims.nbDims == 0, or dims.d[0]
    //!         if dims.nbDims == 1, where dims = getBindingDimensions(inputIndex). The array contains
    //!         the elementwise minimum / optimum / maximum values for this shape binding under the profile.
    //!         If either of the indices is out of range, or if the binding is not an input shape binding, return
    //!         nullptr.
    virtual const int32_t* getProfileShapeValues(int profileIndex, int inputIndex, OptProfileSelector select) const
        noexcept
        = 0;

    //!
    //! \brief True if tensor is required as input for shape calculations or output from them.
    //!
    //! TensorRT evaluates a network in two phases:
    //!
    //! 1. Compute shape information required to determine memory allocation requirements
    //!    and validate that runtime sizes make sense.
    //!
    //! 2. Process tensors on the device.
    //!
    //! Some tensors are required in phase 1.  These tensors are called "shape tensors", and always
    //! have type Int32 and no more than one dimension.  These tensors are not always shapes
    //! themselves, but might be used to calculate tensor shapes for phase 2.
    //!
    //! isShapeBinding(i) returns true if the tensor is a required input or an output computed in phase 1.
    //! isExecutionBinding(i) returns true if the tensor is a required input or an output computed in phase 2.
    //!
    //! For example, if a network uses an input tensor with binding i as an addend
    //! to an IElementWiseLayer that computes the "reshape dimensions" for IShuffleLayer,
    //! then isShapeBinding(i) == true.
    //!
    //! It's possible to have a tensor be required by both phases.  For instance, a tensor
    //! can be used for the "reshape dimensions" and as the indices for an IGatherLayer
    //! collecting floating-point data.
    //!
    //! It's also possible to have a tensor be required by neither phase, but nonetheless
    //! shows up in the engine's inputs.  For example, if an input tensor is used only
    //! as an input to IShapeLayer, only its shape matters and its values are irrelevant.
    //!
    //! \see isExecutionBinding()
    //!
    virtual bool isShapeBinding(int bindingIndex) const noexcept = 0;

    //!
    //! \brief True if pointer to tensor data is required for execution phase, false if nullptr can be supplied.
    //!
    //! For example, if a network uses an input tensor with binding i ONLY as the "reshape dimensions"
    //! input of IShuffleLayer, then isExecutionBinding(i) is false, and a nullptr can be
    //! supplied for it when calling IExecutionContext::execute or IExecutionContext::enqueue.
    //!
    //! \see isShapeBinding()
    //!
    virtual bool isExecutionBinding(int bindingIndex) const noexcept = 0;

    //!
    //! \brief determine that execution capability this engine has.
    //!
    //! If the engine has EngineCapability::kDEFAULT, then all engine functionality is valid..
    //! If the engine has EngineCapability::kSAFE_GPU, then only the functionality in safe::ICudaEngine is valid.
    //! If the engine has EngineCapability::kSAFE_DLA, then only serialize, destroy, and const-accessor functions are valid.
    //!
    //! \return The EngineCapability flag that the engine was built for.
    //!
    virtual EngineCapability getEngineCapability() const noexcept = 0;

    //! \brief Set the ErrorRecorder for this interface
    //!
    //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution.
    //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting
    //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if
    //! a recorder has been registered.
    //!
    //! \param recorder The error recorder to register with this interface.
    //!
    //! \see getErrorRecorder
    //!
    virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0;

    //!
    //! \brief get the ErrorRecorder assigned to this interface.
    //!
    //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist,
    //! so a nullptr will be returned if setErrorRecorder has not been called.
    //!
    //! \return A pointer to the IErrorRecorder object that has been registered.
    //!
    //! \see setErrorRecorder
    //!
    virtual IErrorRecorder* getErrorRecorder() const noexcept = 0;

    //!
    //! \brief Query whether the engine was built with an implicit batch dimension.
    //!
    //! \return True if tensors have implicit batch dimension, false otherwise.
    //!
    //! This is an engine-wide property.  Either all tensors in the engine
    //! have an implicit batch dimension or none of them do.
    //!
    //! hasImplicitBatchDimension() is true if and only if the INetworkDefinition
    //! from which this engine was built was created with createNetwork() or
    //! createNetworkV2() without NetworkDefinitionCreationFlag::kEXPLICIT_BATCH flag.
    //!
    //! \see createNetworkV2
    //!
    // NOTE(review): this is the only declaration in the listing that uses TRTNOEXCEPT instead of
    // noexcept; presumably the header's macro was left unreplaced here - confirm against the header.
    virtual bool hasImplicitBatchDimension() const TRTNOEXCEPT = 0;
};

二、IExecutionContext接口类

IExecutionContext负责执行推断:建立engine后,再通过IExecutionContext进行推断。一个ICudaEngine实例可以拥有多个执行context,从而允许同一个引擎同时执行多个批处理。如果引擎支持动态形状,则并发使用的每个执行上下文必须使用各自独立的优化配置文件(optimization profile)。
为防止解释有误,保留部分英文注释

class IExecutionContext
{
public:
    //! \return True if execution succeeded.
    //! Run inference synchronously.
    virtual bool execute(int batchSize, void** bindings) noexcept = 0;
    //! \return True if the kernels were enqueued successfully.
    //! Run inference asynchronously.
    virtual bool enqueue(int batchSize, void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) noexcept = 0;
    //! Set the debug sync flag. If this flag is set to true, the engine logs the successful execution of
    //! each kernel during execute(). It has no effect when using enqueue().
    virtual void setDebugSync(bool sync) noexcept = 0;
    //! \see setDebugSync()
    //! Get whether the debug sync flag is set.
    virtual bool getDebugSync() const noexcept = 0;
    //! Set the profiler; it is driven through IProfiler callbacks, e.g. to inspect the time spent in each layer.
    virtual void setProfiler(IProfiler*) noexcept = 0;
    //! \see IProfiler setProfiler()
    //! Get the IProfiler object.
    virtual IProfiler* getProfiler() const noexcept = 0;
    //! Get the associated engine.
    virtual const ICudaEngine& getEngine() const noexcept = 0;
    //! Destroy (release) this object.
    virtual void destroy() noexcept = 0;

protected:
    virtual ~IExecutionContext() noexcept {}

public:
    //!
    //! \brief Set the name of the execution context.
    //!
    //! This method copies the name string.
    //!
    //! \see getName()
    //!
    virtual void setName(const char* name) noexcept = 0;

    //!
    //! \brief Return the name of the execution context.
    //!
    //! \see setName()
    //!
    virtual const char* getName() const noexcept = 0;

    //!
    //! \brief set the device memory for use by this execution context.
    //!
    //! The memory must be aligned with cuda memory alignment property (using cudaGetDeviceProperties()), and its size must be at least that
    //! returned by getDeviceMemorySize(). If using enqueue() to run the network, The memory is in
    //! use from the invocation of enqueue() until network execution is complete. If using execute(),
    //! it is in use until execute() returns. Releasing or otherwise using the memory for other
    //! purposes during this time will result in undefined behavior.
    //!
    //! \see ICudaEngine::getDeviceMemorySize() ICudaEngine::createExecutionContextWithoutDeviceMemory()
    //!
    //! In short: supplies the device memory that this execution context will use.
    virtual void setDeviceMemory(void* memory) noexcept = 0;

    //!
    //! \brief Return the strides of the buffer for the given binding.
    //!
    //! Note that strides can be different for different execution contexts
    //! with dynamic shapes.
    //!
    //! \param bindingIndex The binding index.
    //!
    virtual Dims getStrides(int bindingIndex) const noexcept = 0;

public:
    //!
    //! \brief Select an optimization profile for the current context.
    //!
    //! \param profileIndex Index of the profile. It must lie between 0 and
    //!        getEngine().getNbOptimizationProfiles() - 1
    //!
    //! The selected profile will be used in subsequent calls to execute() or enqueue().
    //!
    //! If the associated CUDA engine has dynamic inputs, this method must be called at least once
    //! with a unique profileIndex before calling execute or enqueue (i.e. the profile index
    //! may not be in use by another execution context that has not been destroyed yet).
    //! For the first execution context that is created for an engine, setOptimizationProfile(0)
    //! is called implicitly.
    //!
    //! If the associated CUDA engine does not have inputs with dynamic shapes, this method need not be
    //! called, in which case the default profile index of 0 will be used (this is particularly
    //! the case for all safe engines).
    //!
    //! setOptimizationProfile() must be called before calling setBindingDimensions() and
    //! setInputShapeBinding() for all dynamic input tensors or input shape tensors, which in
    //! turn must be called before either execute() or enqueue().
    //!
    //! \return true if the call succeeded, else false (e.g. input out of range)
    //!
    //! \see ICudaEngine::getNbOptimizationProfiles()
    //!
    //! In short: selects an optimization profile for this context; the argument is the profile's index.
    virtual bool setOptimizationProfile(int profileIndex) noexcept = 0;

    //!
    //! \brief Get the index of the currently selected optimization profile.
    //!
    //! If the profile index has not been set yet (implicitly to 0 for the first execution context
    //! to be created, or explicitly for all subsequent contexts), an invalid value of -1 will be returned
    //! and all calls to enqueue() or execute() will fail until a valid profile index has been set.
    //!
    virtual int getOptimizationProfile() const noexcept = 0;

    //!
    //! \brief Set the dynamic dimensions of a binding
    //!
    //! Requires the engine to be built without an implicit batch dimension.
    //! The binding must be an input tensor, and all dimensions must be compatible with
    //! the network definition (i.e. only the wildcard dimension -1 can be replaced with a
    //! new dimension > 0). Furthermore, the dimensions must be in the valid range for the
    //! currently selected optimization profile, and the corresponding engine must not be
    //! safety-certified.
    //! This method will fail unless a valid optimization profile is defined for the current
    //! execution context (getOptimizationProfile() must not be -1).
    //!
    //! For all dynamic non-output bindings (which have at least one wildcard dimension of -1),
    //! this method needs to be called before either enqueue() or execute() may be called.
    //! This can be checked using the method allInputDimensionsSpecified().
    //!
    //! \return false if an error occurs (e.g. index out of range), else true
    //!
    //! In short: sets the dynamic dimensions of an input binding; it must be called for every
    //! dynamic input before enqueue() or execute().
    virtual bool setBindingDimensions(int bindingIndex, Dims dimensions) noexcept = 0;

    //!
    //! \brief Get the dynamic dimensions of a binding
    //!
    //! If the engine was built with an implicit batch dimension, same as ICudaEngine::getBindingDimensions.
    //!
    //! If setBindingDimensions() has been called on this binding (or if there are no
    //! dynamic dimensions), all dimensions will be positive. Otherwise, it is necessary to
    //! call setBindingDimensions() before enqueue() or execute() may be called.
    //!
    //! If the bindingIndex is out of range, an invalid Dims with nbDims == -1 is returned.
    //! The same invalid Dims will be returned if the engine was not built with an implicit
    //! batch dimension and if the execution context is not currently associated with a valid
    //! optimization profile (i.e. if getOptimizationProfile() returns -1).
    //!
    //! If ICudaEngine::bindingIsInput(bindingIndex) is false, then both
    //! allInputDimensionsSpecified() and allInputShapesSpecified() must be true
    //! before calling this method.
    //!
    //! \return Currently selected binding dimensions
    //!
    virtual Dims getBindingDimensions(int bindingIndex) const noexcept = 0;

    //!
    //! \brief Set values of input tensor required by shape calculations.
    //!
    //! \param bindingIndex index of an input tensor for which
    //!        ICudaEngine::isShapeBinding(bindingIndex) and ICudaEngine::bindingIsInput(bindingIndex)
    //!        are both true.
    //!
    //! \param data pointer to values of the input tensor.  The number of values should be
    //!         the product of the dimensions returned by getBindingDimensions(bindingIndex).
    //!
    //! If ICudaEngine::isShapeBinding(bindingIndex) and ICudaEngine::bindingIsInput(bindingIndex)
    //! are both true, this method must be called before enqueue() or execute() may be called.
    //! This method will fail unless a valid optimization profile is defined for the current
    //! execution context (getOptimizationProfile() must not be -1).
    //!
    virtual bool setInputShapeBinding(int bindingIndex, const int32_t* data) noexcept = 0;

    //!
    //! \brief Get values of an input tensor required for shape calculations or an output tensor produced by shape calculations.
    //!
    //! \param bindingIndex index of an input or output tensor for which
    //!        ICudaEngine::isShapeBinding(bindingIndex) is true.
    //!
    //! \param data pointer to where values will be written.  The number of values written is
    //!        the product of the dimensions returned by getBindingDimensions(bindingIndex).
    //!
    //! If ICudaEngine::bindingIsInput(bindingIndex) is false, then both
    //! allInputDimensionsSpecified() and allInputShapesSpecified() must be true
    //! before calling this method. The method will also fail if no valid optimization profile
    //! has been set for the current execution context, i.e. if getOptimizationProfile() returns -1.
    //!
    //! \see isShapeBinding(bindingIndex)
    //!
    virtual bool getShapeBinding(int bindingIndex, int32_t* data) const noexcept = 0;

    //!
    //! \brief Whether all dynamic dimensions of input tensors have been specified
    //!
    //! \return True if all dynamic dimensions of input tensors have been specified
    //!         by calling setBindingDimensions().
    //!
    //! Trivially true if network has no dynamically shaped input tensors.
    //!
    //! \see setBindingDimensions(bindingIndex,dimensions)
    //!
    virtual bool allInputDimensionsSpecified() const noexcept = 0;

    //!
    //! \brief Whether all input shape bindings have been specified
    //!
    //! \return True if all input shape bindings have been specified by setInputShapeBinding().
    //!
    //! Trivially true if network has no input shape bindings.
    //!
    //! \see isShapeBinding(bindingIndex)
    //!
    virtual bool allInputShapesSpecified() const noexcept = 0;

    //!
    //! \brief Set the ErrorRecorder for this interface
    //!
    //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution.
    //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting
    //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if
    //! a recorder has been registered.
    //!
    //! \param recorder The error recorder to register with this interface.
    //!
    //! \see getErrorRecorder
    //!
    virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0;

    //!
    //! \brief get the ErrorRecorder assigned to this interface.
    //!
    //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist,
    //! so a nullptr will be returned if setErrorRecorder has not been called.
    //!
    //! \return A pointer to the IErrorRecorder object that has been registered.
    //!
    //! \see setErrorRecorder
    //!
    virtual IErrorRecorder* getErrorRecorder() const noexcept = 0;

    //!
    //! \brief Synchronously execute inference a network.
    //!
    //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be
    //! queried using ICudaEngine::getBindingIndex().
    //! This method only works for execution contexts built with full dimension networks.
    //! \param bindings An array of pointers to input and output buffers for the network.
    //!
    //! \return True if execution succeeded.
    //!
    //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize()
    //! V2 variant of synchronous inference.
    virtual bool executeV2(void** bindings) noexcept = 0;

    //!
    //! \brief Asynchronously execute inference.
    //!
    //! This method requires an array of input and output buffers. The mapping from tensor names to indices can be
    //! queried using ICudaEngine::getBindingIndex().
    //! This method only works for execution contexts built with full dimension networks.
    //! \param bindings An array of pointers to input and output buffers for the network.
    //! \param stream A cuda stream on which the inference kernels will be enqueued
    //! \param inputConsumed An optional event which will be signaled when the input buffers can be refilled with new
    //! data
    //!
    //! \return True if the kernels were enqueued successfully.
    //!
    //! \see ICudaEngine::getBindingIndex() ICudaEngine::getMaxBatchSize()
    //! V2 variant of asynchronous inference.
    virtual bool enqueueV2(void** bindings, cudaStream_t stream, cudaEvent_t* inputConsumed) noexcept = 0;
};

三、IRuntime类

允许一个序列化功能不安全的engine能够被反序列化
常用的就是反序列化一个Engine出来,见最后一节例子
关于dlaCore,没有弄懂,推荐一篇博客
在TensorRT推断期间在DLA上运行

//! Allows a serialized, functionally unsafe engine to be deserialized.
class IRuntime
{
public:
    //!
    //! \brief Deserialize an engine from a stream.
    //!
    //! \param blob The memory that holds the serialized engine.
    //! \param size The size of the memory.
    //! \param pluginFactory The plugin factory, if any plugins are used by the network, otherwise nullptr.
    //!
    //! \return The engine, or nullptr if it could not be deserialized.
    //!
    //! Note: the third argument is only needed when the network uses plugins (extra custom
    //! operations); otherwise pass nullptr.
    virtual nvinfer1::ICudaEngine* deserializeCudaEngine(const void* blob, std::size_t size, IPluginFactory* pluginFactory) noexcept = 0;

    //!
    //! \brief Set the DLA core that the deserialized engine must execute on.
    //! \param dlaCore The DLA core to execute the engine on (0 to N-1, where N is the maximum number of DLA's present on the device). Default value is 0.
    //! \see getDLACore()
    //!
    virtual void setDLACore(int dlaCore) noexcept = 0;

    //!
    //! \brief Get the DLA core that the engine executes on.
    //! \return If setDLACore is called, returns DLA core from 0 to N-1, else returns 0.
    //!
    virtual int getDLACore() const noexcept = 0;

    //!
    //! \brief Returns number of DLA hardware cores accessible.
    //!
    virtual int getNbDLACores() const noexcept = 0;

    //!
    //! \brief Destroy this object.
    //!
    virtual void destroy() noexcept = 0;

protected:
    virtual ~IRuntime() {}

public:
    //!
    //! \brief Set the GPU allocator.
    //! \param allocator Set the GPU allocator to be used by the runtime. All GPU memory acquired will use this allocator. If NULL is passed, the default allocator will be used.
    //!
    //! Default: uses cudaMalloc/cudaFree.
    //!
    //! If nullptr is passed, the default allocator will be used.
    //!
    virtual void setGpuAllocator(IGpuAllocator* allocator) noexcept = 0;

    //!
    //! \brief Set the ErrorRecorder for this interface
    //!
    //! Assigns the ErrorRecorder to this interface. The ErrorRecorder will track all errors during execution.
    //! This function will call incRefCount of the registered ErrorRecorder at least once. Setting
    //! recorder to nullptr unregisters the recorder with the interface, resulting in a call to decRefCount if
    //! a recorder has been registered.
    //!
    //! \param recorder The error recorder to register with this interface.
    //!
    //! \see getErrorRecorder
    //!
    virtual void setErrorRecorder(IErrorRecorder* recorder) noexcept = 0;

    //!
    //! \brief get the ErrorRecorder assigned to this interface.
    //!
    //! Retrieves the assigned error recorder object for the given class. A default error recorder does not exist,
    //! so a nullptr will be returned if setErrorRecorder has not been called.
    //!
    //! \return A pointer to the IErrorRecorder object that has been registered.
    //!
    //! \see setErrorRecorder
    //!
    virtual IErrorRecorder* getErrorRecorder() const noexcept = 0;

    //!
    //! \brief Deserialize an engine from a stream when plugin factory is not used.
    //!
    //! \param blob The memory that holds the serialized engine.
    //! \param size The size of the memory.
    //!
    //! \return The engine, or nullptr if it could not be deserialized.
    //!
    //! Convenience overload: forwards to the three-argument version with a null plugin factory.
    nvinfer1::ICudaEngine* deserializeCudaEngine(const void* blob, std::size_t size) noexcept
    {
        return deserializeCudaEngine(blob, size, nullptr);
    }
};

四、其他类和函数

IRefitter 更新Engine上的weights
IProfiler 以回调函数的方式传入到context中,主要作用是分析每一层消耗的时间
使用见TensorRT(3):FP16与分析网络每层的消耗时间
createInferRuntime函数,创建一个IRuntime类,用来记录日志等
如:

// gLogger is a logger object; one must be supplied, although it is not the focus here. You can provide your own by subclassing.
    IRuntime* runtime = createInferRuntime(gLogger);

五、代码举例:

加载本地fp32的序列化后的trt模型,了解engine、context等接口的作用

/*====================================================================
文件 : sampleCaffeClassf.cc
功能 : TensorRT学习系列4、ICudaEngine接口类
====================================================================*/
#include "NvCaffeParser.h"
#include "NvInfer.h"
#include "NvInferPlugin.h"
#include "NvInferRuntimeCommon.h"
#include "logger.h"
#include "cuda_runtime_api.h"
#include "common.h"

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>

using namespace nvinfer1;
using namespace plugin;
using namespace nvcaffeparser1;

// Compile-time model parameters used by this sample.
constexpr int MODEL_HEIGHT{256};      // model input height
constexpr int MODEL_WIDTH{256};       // model input width
constexpr int MODEL_CHANNEL{3};       // model input channel count
constexpr int MODEL_OUTPUT_SIZE{5};   // number of output values (5-class classification)

/**********************************
 * @brief Resize an image to the model input size, convert it to float, then
 *        subtract the per-channel mean and divide by the per-channel std.
 *
 * @param matSrc Input image. It must have 3 channels, because the mean/std
 *               planes it is combined with are CV_32FC3.
 *               NOTE: the image is resized in place, so the caller's Mat is
 *               modified to MODEL_WIDTH x MODEL_HEIGHT.
 * @param matDst Receives the normalized CV_32FC3 image of size
 *               MODEL_WIDTH x MODEL_HEIGHT.
 * @return Nothing; the result is written to matDst.
 *********************************/
void preData(cv::Mat &matSrc, cv::Mat &matDst)
{
    const cv::Size modelSize(MODEL_WIDTH, MODEL_HEIGHT);
    cv::resize(matSrc, matSrc, modelSize);

    // Both planes are sized from the model constants (previously the std plane
    // was hard-coded to 256x256), so the element-wise arithmetic below always
    // operates on matrices of matching size.
    // Presumably these are BGR-ordered channel means to match cv::imread - TODO confirm.
    const cv::Mat matMean(modelSize, CV_32FC3, cv::Scalar(103.53f, 116.28f, 123.675f));
    const cv::Mat matStd(modelSize, CV_32FC3, cv::Scalar(1.0f, 1.0f, 1.0f));

    cv::Mat matF32Img;
    matSrc.convertTo(matF32Img, CV_32FC3);
    matDst = (matF32Img - matMean) / matStd;
}

int main()
{
    std::string strTrtSavedPath = "./savedTrt.trt";
    // gLogger
    // gLogger是一个日志类,必须要有,但又不是那么重要,可以自己继承
    IRuntime* runtime = createInferRuntime(gLogger);
    std::ifstream fin(strTrtSavedPath);

    // 1、将文件中的内容读取至cached_engine字符串
    std::string modelData = "";
    while (fin.peek() != EOF){ // 使用fin.peek()防止文件读取时无限循环
        std::stringstream buffer;
        buffer << fin.rdbuf();
        modelData.append(buffer.str());
    }
    fin.close();

    // 2、 将序列化得到的结果进行反序列化,以执行后续的inference
    ICudaEngine* engine = runtime->deserializeCudaEngine(modelData.data(), modelData.size(), nullptr);
    // inference推断过程

    IExecutionContext *context = engine->createExecutionContext();   // inference推断过程

    int nInputIdx = engine->getBindingIndex("data");     // 获取输入节点索引
    int nOutputIndex = engine->getBindingIndex("prob");  // 获取输出节点索引
    int nNumIndex =  engine->getNbBindings();            // 获取总索引个数
    char achInputTensorName[128];                        
    strcpy(achInputTensorName,  engine->getBindingName(nInputIdx));//获取对应索引的节点名
    bool bIsInputTensor = engine->bindingIsInput(nInputIdx); // 是否是输入节点
    Dims tDimInput = engine->getBindingDimensions(nInputIdx); // 获取输入检点的维度
    DataType emInputDataType = engine->getBindingDataType(nInputIdx);// 获取输入数据的类型
    int nMaxBatchSize = engine->getMaxBatchSize();    // 获取最大BatchSize
    int nNumLayers = engine->getNbLayers(); // 获取网络层的个数
    int nWorkSpaceSize = engine->getWorkspaceSize(); // 获取工作空间的大小, 通常小于设置的值
    TensorLocation emLocattion= engine->getLocation(nInputIdx); // 获取索引对应的tensor在gpu上还是cpu上
    int nBindingSize = engine->getBindingBytesPerComponent(0); //返回元素每个组成部分的字节数
    TensorFormat emTensorFormat = engine->getBindingFormat(nInputIdx); // 返回数据格式
    
    std::cout << " 输入节点索引 nInputIdx = " << nInputIdx << std::endl;
    std::cout << " 输入节点索引 nOutputIdx = " << nOutputIndex << std::endl;
    std::cout << " 总索引个数 nNumIndex = " << engine->getNbBindings() << std::endl;
    std::cout << " input 节点名 = " <<  achInputTensorName << std::endl;
    for(int i=0; i<tDimInput.nbDims; ++i)
    {
        std::cout << " 输入维度 dim[" << i << "] = " << tDimInput.d[i] << std::endl;
    }
    //申请GPU显存
    std::cout << " 输入数据类型为 " << int(emInputDataType) << std::endl;
    std::cout << " 工作空间大小为 " << nWorkSpaceSize << std::endl;
    std::cout << " 输入数据在 " << int(emLocattion) << std::endl;
    std::cout << " 每个元素空间字节大小:" << nBindingSize << std::endl;
    std::cout << " 输入数据格式: " << int(emTensorFormat) << std::endl;
    std::cout << " 输入数据格式: " << engine->getBindingFormatDesc(nInputIdx) << std::endl;
        // inference推断过程
    std::cout << " 网络layer的个数:" << nNumLayers << std::endl;

      void* buffers[2] = {NULL, NULL};
    int nBatchSize = 1;
    int nOutputSize = MODEL_OUTPUT_SIZE;
    CHECK(cudaMalloc(&buffers[nInputIdx], nBatchSize * MODEL_CHANNEL * MODEL_HEIGHT * MODEL_WIDTH * sizeof(float)));
    CHECK(cudaMalloc(&buffers[nOutputIndex], nBatchSize * nOutputSize * sizeof(float)));

    // 创建cuda流
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    cudaEvent_t start, end; //calculate run time
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&end));

    cv::Mat matBgrImg = cv::imread("./data/fram_25.jpg");
    cv::Mat matNormImage;
    preData(matBgrImg, matNormImage); // 减均值除方差


    std::vector<std::vector<cv::Mat>> nChannels;
    std::vector<cv::Mat> rgbChannels(3);
    cv::split(matNormImage, rgbChannels);
    nChannels.push_back(rgbChannels); //  NHWC  转NCHW 

    void *data = malloc(nBatchSize * MODEL_CHANNEL * MODEL_HEIGHT * MODEL_WIDTH *sizeof(float));;
    if (NULL == data)
    {
        printf("malloc error!\n");
        return 0;
    }
    for (int c = 0; c < 3; ++c) 
    {
        cv::Mat cur_imag_plane = nChannels[0][c];
        memcpy(data + c * MODEL_HEIGHT * MODEL_WIDTH * sizeof(float), cur_imag_plane.ptr<unsigned char>(0), 256 *256 * sizeof(float));
    }

    // DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
    CHECK(cudaMemcpyAsync(buffers[nInputIdx], data, \
        nBatchSize * MODEL_CHANNEL * MODEL_WIDTH * MODEL_HEIGHT * sizeof(float), cudaMemcpyHostToDevice, stream));

    bool bIsSucess = context->execute(nBatchSize, buffers); // 同步执行
    if ( !bIsSucess)
    {
        std::cerr << " 推断执行失败 " << std::endl;
        return -1;
    }

    context->setName("Vgg16");
    std::cout <<"context 名为 " << context->getName() << std::endl;

    tDimInput = context->getStrides(nInputIdx);
    for(int i=0; i<tDimInput.nbDims; ++i)
    {
        std::cout << " 输入维度 dim[" << i << "] = " << tDimInput.d[i] << std::endl;
    }
    
    printf("\nend ... TensorRt \n");
    return 0;
    
}
Logo

开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!

更多推荐