TensorRT10系列的api使用以及部署案例
和前面的部署思路一致,我们从onnx开始构建模型推理引擎,然后加载推理引擎,推理模型。构建模型和推理代码的过程都需要logger,这个logger用来记录模型构建和推理过程中出现的异常或错误。此时推理出来的结果数据被存储在绑定的输出地址上,直接读取绑定的输出地址即可以进行后处理的操作。在10系列的trt构建的过程中,加载onnx模型开始构建推理引擎的时候,基于nvonnxparser这个动态库实现
TensorRT更新到10系列后,一些接口发生了变化。本文简单介绍使用TensorRT 10部署模型时用到的接口,并分享一个部署例子。和前面的部署思路一致,我们从onnx开始构建模型推理引擎,然后加载推理引擎,推理模型。
1. 准备模型的logger
构建模型和推理代码的过程都需要logger,这个logger用来记录模型构建和推理过程中出现的异常或错误。需要手动实现这个logger类,logger类需要继承自TensorRT的nvinfer1::ILogger类。
我的实现的方法如下:
其中头文件的代码如下(下文的trtlogger.cpp通过 #include "TrtLogger.hpp" 引用它,因此文件名应与之保持一致):
#ifndef __LOGGER_HPP__
#define __LOGGER_HPP__
#include <NvInfer.h>
#include <string>
#include <stdarg.h>
#include <memory>
// Convenience logging macros. Each one forwards a printf-style format string
// and its arguments to trtlogger::Logger::__log_info with a fixed level.
// The level is qualified with `trtlogger::`, the namespace in which Level is
// declared, so the macros can be used from any scope.
#define LOGF(...) trtlogger::Logger::__log_info(trtlogger::Level::FATAL, __VA_ARGS__)
#define LOGE(...) trtlogger::Logger::__log_info(trtlogger::Level::ERROR, __VA_ARGS__)
#define LOGW(...) trtlogger::Logger::__log_info(trtlogger::Level::WARN, __VA_ARGS__)
#define LOG(...) trtlogger::Logger::__log_info(trtlogger::Level::INFO, __VA_ARGS__)
#define LOGV(...) trtlogger::Logger::__log_info(trtlogger::Level::VERB, __VA_ARGS__)
#define LOGD(...) trtlogger::Logger::__log_info(trtlogger::Level::DEBUG, __VA_ARGS__)

// Terminal escape sequences used to colour the level tag of each message;
// CLEAR resets the attributes again.
#define DGREEN "\033[1;36m"
#define BLUE "\033[1;34m"
#define PURPLE "\033[1;35m"
#define GREEN "\033[1;32m"
#define YELLOW "\033[1;33m"
#define RED "\033[1;31m"
#define CLEAR "\033[0m"
namespace trtlogger{

/// Verbosity levels of this logger, numbered from FATAL (0) up to DEBUG (5).
/// A message is printed when its level is numerically <= the active level.
enum class Level : int32_t{
    FATAL = 0,
    ERROR = 1,
    WARN = 2,
    INFO = 3,
    VERB = 4,
    DEBUG = 5
};

/// Logger implementing nvinfer1::ILogger so it can be passed to TensorRT, and
/// offering a static printf-style entry point used by the LOG* macros.
class Logger : public nvinfer1::ILogger{
public:
    /// Creates a logger at Level::INFO by delegating to Logger(Level).
    /// Defined inline because no out-of-line definition is provided.
    Logger() : Logger(Level::INFO) {}

    /// Creates a logger at the given level. This also overwrites the static
    /// level shared by all Logger instances.
    /// NOTE(review): not marked `explicit`; functions taking a Logger by value
    /// can therefore be called with a Level — confirm before changing.
    Logger(Level level);

    /// nvinfer1::ILogger override: receives a message with its severity.
    virtual void log(Severity severity, const char* msg) noexcept override;

    /// Formats and prints a message with a coloured level tag.
    /// @param level  level of this message
    /// @param format printf-style format string followed by its arguments
    static void __log_info(Level level, const char* format, ...);

    /// Maps a Level to the corresponding TensorRT severity.
    Severity get_severity(Level level);

    /// Maps a TensorRT severity to the corresponding Level.
    Level get_level(Severity severity);

private:
    static Level m_level;   // active level, shared by every Logger instance
    Severity m_severity;    // TensorRT severity matching the constructor's level
};

/// Creates a shared Logger at the given level.
std::shared_ptr<Logger> create_logger(Level level);

} // namespace trtlogger
#endif //__LOGGER_HPP__
其中的trtlogger.cpp代码如下:
#include "TrtLogger.hpp"
#include <NvInfer.h>
#include <cstdlib>
using namespace std;
namespace trtlogger {
// Storage for the static level shared by all Logger instances; INFO until a
// Logger is constructed with another level.
Level Logger::m_level = Level::INFO;

// Builds a logger for `level`: caches the matching TensorRT severity on the
// instance and overwrites the shared static level.
Logger::Logger(Level level)
    : m_severity(get_severity(level)) {
    Logger::m_level = level;
}
// Translates a logger Level into the TensorRT severity with the same meaning.
// Levels without a dedicated mapping (DEBUG included) become kVERBOSE.
Logger::Severity Logger::get_severity(Level level) {
    if (level == Level::FATAL) return Severity::kINTERNAL_ERROR;
    if (level == Level::ERROR) return Severity::kERROR;
    if (level == Level::WARN)  return Severity::kWARNING;
    if (level == Level::INFO)  return Severity::kINFO;
    // VERB, DEBUG and any other value.
    return Severity::kVERBOSE;
}
// Translates a TensorRT severity into the logger Level with the same meaning.
// A severity without a dedicated mapping is reported as FATAL.
Level Logger::get_level(Severity severity) {
    if (severity == Severity::kINTERNAL_ERROR) return Level::FATAL;
    if (severity == Severity::kERROR)          return Level::ERROR;
    if (severity == Severity::kWARNING)        return Level::WARN;
    if (severity == Severity::kINFO)           return Level::INFO;
    if (severity == Severity::kVERBOSE)        return Level::VERB;
    return Level::FATAL;
}
// nvinfer1::ILogger override.
// TensorRT can emit many fine-grained messages, so its output is filtered:
//  - messages at warning severity or more severe are always forwarded;
//  - INFO / VERBOSE messages are forwarded only when the logger level is
//    DEBUG or higher.
void Logger::log(Severity severity, const char* msg) noexcept {
    const bool warningOrWorse = severity <= get_severity(Level::WARN);
    const bool debugEnabled = m_level >= Level::DEBUG;
    if (!warningOrWorse && !debugEnabled) {
        return;
    }
    __log_info(get_level(severity), "%s", msg);
}
// Formats a printf-style message, prefixes it with a coloured level tag and
// prints it to stdout when `level` is within the active level (m_level).
// Messages at ERROR or FATAL level terminate the process after flushing.
//
// @param level  level of this message
// @param format printf-style format string, followed by its arguments
void Logger::__log_info(Level level, const char* format, ...) {
    char msg[1000];
    int n = 0;

    // Write the coloured tag first; the formatted text is appended after it.
    switch (level) {
        case Level::DEBUG: n += snprintf(msg + n, sizeof(msg) - n, DGREEN "[debug]" CLEAR); break;
        case Level::VERB:  n += snprintf(msg + n, sizeof(msg) - n, PURPLE "[verb]" CLEAR); break;
        case Level::INFO:  n += snprintf(msg + n, sizeof(msg) - n, YELLOW "[info]" CLEAR); break;
        case Level::WARN:  n += snprintf(msg + n, sizeof(msg) - n, BLUE "[warn]" CLEAR); break;
        case Level::ERROR: n += snprintf(msg + n, sizeof(msg) - n, RED "[error]" CLEAR); break;
        default:           n += snprintf(msg + n, sizeof(msg) - n, RED "[fatal]" CLEAR); break;
    }

    va_list args;
    va_start(args, format);
    // vsnprintf truncates to the remaining buffer space and always terminates.
    vsnprintf(msg + n, sizeof(msg) - n, format, args);
    va_end(args);

    if (level <= m_level)
        fprintf(stdout, "%s\n", msg);

    if (level <= Level::ERROR) {
        fflush(stdout);
        // Terminating because of an error must not report success to the
        // parent process, so exit with a failure status instead of 0.
        exit(EXIT_FAILURE);
    }
}
// Factory: returns a shared Logger constructed with the given level.
shared_ptr<Logger> create_logger(Level level) {
    auto instance = make_shared<Logger>(level);
    return instance;
}
} // namespace logger
2. 构建模型推理引擎
加载onnx模型,构建推理引擎。
这部分的代码和TensorRT 8版本相比变化不大,构建过程总体上分为如下几步:
1. 声明构建模型的四件套:(logger)、builder、network、config、parser
2. 配置模型优化参数:dynamic shape、Calibration、DLA
3. 申请workspace
4. 序列化保存推理引擎文件
需要引用前面的logger方法,使用TensorRT的api构建模型推理引擎的完整代码如下:
/**
 * Builds a TensorRT engine from an ONNX file and writes the serialized plan
 * to disk.
 *
 * Steps: create builder / network / config / ONNX parser, parse the model,
 * add an optimization profile for the first input (dimension 0 ranges from 1
 * to maxbatch, optimum 1), enable FP16 and GPU fallback, select DLA core 0
 * when the builder reports DLA cores (GPU otherwise), build, print the
 * network's input/output dimensions and save the plan.
 *
 * @param onnx_file_path   path of the ONNX model to parse
 * @param save_engine_path path the serialized engine is written to
 * @param level            logger handed to TensorRT (a copy is made)
 * @param maxbatch         largest value of the first input's dimension 0; must be >= 1
 * @return true when the engine file was written, false on any failure
 */
bool genEngine(std::string onnx_file_path, std::string save_engine_path, trtlogger::Logger level, int maxbatch){
    if(maxbatch < 1){
        std::cout<<" (T_T)~~~, maxbatch must be at least 1."<<std::endl;
        return false;
    }

    // Declared before every TensorRT object so it is destroyed after them.
    auto logger = std::make_shared<trtlogger::Logger>(level);

    // Create the builder.
    auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(*logger));
    if(!builder){
        std::cout<<" (T_T)~~~, Failed to create builder."<<std::endl;
        return false;
    }

    auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(0U));
    if(!network){
        std::cout<<" (T_T)~~~, Failed to create network."<<std::endl;
        return false;
    }

    // Create the builder config.
    auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());
    if(!config){
        std::cout<<" (T_T)~~~, Failed to create config."<<std::endl;
        return false;
    }

    // Create the parser that populates the network from ONNX; without it every
    // layer would have to be added by hand.
    auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, *logger));
    if(!parser){
        std::cout<<" (T_T)~~~, Failed to create parser."<<std::endl;
        return false;
    }

    // Parse the ONNX file; the verbosity argument is the integer value of kERROR (1).
    const bool parsed = parser->parseFromFile(
        onnx_file_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kERROR));
    if(!parsed){
        std::cout<<" (T_T)~~~ ,Failed to parse onnx file."<<std::endl;
        return false;
    }

    // getInput(0) below would be null for a network without inputs.
    if(network->getNbInputs() < 1){
        std::cout<<" (T_T)~~~, Parsed network has no input."<<std::endl;
        return false;
    }

    {
        auto input = network->getInput(0);
        auto input_dims = input->getDimensions();
        auto profile = builder->createOptimizationProfile();
        if(!profile){
            std::cout<<" (T_T)~~~, Failed to create optimization profile."<<std::endl;
            return false;
        }

        // Minimum and optimum use dimension 0 == 1, maximum uses maxbatch.
        input_dims.d[0] = 1;
        bool profile_ok = profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMIN, input_dims);
        profile_ok = profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kOPT, input_dims) && profile_ok;
        input_dims.d[0] = maxbatch;
        profile_ok = profile->setDimensions(input->getName(), nvinfer1::OptProfileSelector::kMAX, input_dims) && profile_ok;
        if(!profile_ok || config->addOptimizationProfile(profile) < 0){
            std::cout<<" (T_T)~~~, Failed to configure optimization profile."<<std::endl;
            return false;
        }

        // Half-precision kernels are always allowed (the FP16 switch is not wired up).
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
        // Allow falling back to the GPU; set once, in both device branches.
        config->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);

        // Select DLA only when the builder reports cores; otherwise use the GPU.
        const int numDLACores = builder->getNbDLACores();
        if (numDLACores > 0) {
            std::cout << "DLA is available. Number of DLA cores: " << numDLACores << std::endl;
            config->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
            const int coreToUse = 0; // first DLA core; adjust as needed
            config->setDLACore(coreToUse);
            std::cout << "Using DLA core: " << coreToUse << std::endl;
        } else {
            std::cerr << "DLA not available on this platform, falling back to GPU." << std::endl;
            config->setDefaultDeviceType(nvinfer1::DeviceType::kGPU);
        }
    }

    // Workspace memory pool limit: 1 << 28 bytes.
    config->setMemoryPoolLimit(nvinfer1::MemoryPoolType::kWORKSPACE, std::size_t{1} << 28);

    // Build and serialize the engine.
    auto plan = std::unique_ptr<nvinfer1::IHostMemory>(builder->buildSerializedNetwork(*network, *config));
    if(!plan){
        std::cout<<" (T_T)~~~, Failed to SerializedNetwork."<<std::endl;
        return false;
    }

    // Print the network inputs.
    const auto numInput = network->getNbInputs();
    std::cout<<"模型的输入个数是:"<<numInput<<std::endl;
    for(int i = 0; i<numInput; ++i){
        std::cout<<" 模型的第"<<i<<"个输入:";
        const auto mInputDims = network->getInput(i)->getDimensions();
        std::cout<<" ✨~ model input dims: "<<mInputDims.nbDims <<std::endl;
        for(int ii=0; ii<mInputDims.nbDims; ++ii){
            std::cout<<" ✨^_^ model input dim"<<ii<<": "<<mInputDims.d[ii] <<std::endl;
        }
    }

    // Print the network outputs.
    const auto numOutput = network->getNbOutputs();
    std::cout<<"模型的输出个数是:"<<numOutput<<std::endl;
    for(int i=0; i<numOutput; ++i){
        std::cout<<" 模型的第"<<i<<"个输出:";
        const auto mOutputDims = network->getOutput(i)->getDimensions();
        std::cout<<" ✨~ model output dims: "<<mOutputDims.nbDims <<std::endl;
        for(int jj=0; jj<mOutputDims.nbDims; ++jj){
            std::cout<<" ✨^_^ model output dim"<<jj<<": "<<mOutputDims.d[jj] <<std::endl;
        }
    }

    // Write the serialized engine to disk.
    std::ofstream engine_file(save_engine_path, std::ios::binary);
    if(!engine_file.good()){
        std::cout<<" (T_T)~~~, Failed to open engine file"<<std::endl;
        return false;
    }
    engine_file.write(static_cast<const char*>(plan->data()), static_cast<std::streamsize>(plan->size()));
    engine_file.close();
    // A short write (e.g. disk full) must not be reported as success.
    if(engine_file.fail()){
        std::cout<<" (T_T)~~~, Failed to write engine file"<<std::endl;
        return false;
    }

    std::cout << " ~~Congratulations! 🎉🎉🎉~ Engine build success!!! ✨✨✨~~ " << std::endl;
    return true;
}
在10系列的trt构建的过程中,加载onnx模型开始构建推理引擎的时候,基于nvonnxparser这个动态库实现的。
3. input_dir--engine--output_dir对接
把准备好的数据传给engine,engine推理这个数据,然后输出推理的结果。B=engine(A),其中A是模型可以接收的数据格式,B是engine推理的结果。此时,需要把engine和A、B对接好接口。在当前的版本中,使用context->setTensorAddress(name, buffers.getDeviceBuffer(name))接口完成对接。
// Allocates the buffers used for inference and binds the two device buffers
// to the execution context by tensor name.
// The tensor names "images" and "output0" are hard-coded; the names collected
// in IOName are not used by this function.
// Always returns true; allocation calls are wrapped in checkRuntime.
bool TrtModel::trtIOMemory() {
    // Shapes currently reported by the execution context.
    m_inputDims  = m_context->getTensorShape("images");
    m_outputDims = m_context->getTensorShape("output0");

    const auto& in  = m_inputDims;
    const auto& out = m_outputDims;

    // Input dimensions 2 and 3 are stored as height and width
    // (assumes an NCHW input layout — TODO confirm).
    this->kInputH = in.d[2];
    this->kInputW = in.d[3];

    // Byte sizes: four input dimensions and three output dimensions of float.
    m_inputSize  = in.d[0] * in.d[1] * in.d[2] * in.d[3] * sizeof(float);
    m_outputSize = out.d[0] * out.d[1] * out.d[2] * sizeof(float);

    // Device buffers bound to the engine's input and output tensors.
    checkRuntime(cudaMalloc(&buffers[0], m_inputSize));
    checkRuntime(cudaMalloc(&buffers[1], m_outputSize));
    m_context->setTensorAddress("images", buffers[0]);
    m_context->setTensorAddress("output0", buffers[1]);

    // Input staging buffer and an output buffer allocated with cudaMallocHost.
    checkRuntime(cudaMallocHost(&m_inputMemory, m_inputSize));
    checkRuntime(cudaMallocHost(&m_outputMemory[0], m_outputSize));

    // Extra output buffer allocated with cudaMalloc.
    checkRuntime(cudaMalloc(&m_outputMemory[1], m_outputSize));

    // Two buffers sized for one float plus kMaxNumOutputBbox entries of
    // kNumBoxElement floats: one via cudaMalloc, one via cudaMallocHost.
    const auto boxBytes = (1 + kMaxNumOutputBbox * kNumBoxElement) * sizeof(float);
    checkRuntime(cudaMalloc(&m_outputMemory[2], boxBytes));
    checkRuntime(cudaMallocHost(&m_outputMemory[3], boxBytes));

    checkRuntime(cudaStreamCreate(&m_stream));
    return true;
}
这里主要给模型的输入输出申请内存,然后绑定对接输入输出
4. 推理模型
推理:声明推理模型的三件套:(load_engine)、(logger)、runtime、deserialize、ExecutionContext
bool TrtModel::Runtime(std::string engine_file_path, trtlogger::Logger level, int maxBatch){
auto logger = std::make_shared<trtlogger::Logger>(level);
std::ifstream engineFile(engine_file_path, std::ios::binary);
long int fsize = 0;
engineFile.seekg(0, engineFile.end);
fsize = engineFile.tellg();
engineFile.seekg(0, engineFile.beg);
std::vector<char> engineString(fsize);
engineFile.read(engineString.data(), fsize);
if (engineString.size() == 0) { std::cout << "Failed getting serialized engine!" << std::endl; return false; }
// 创建推理引擎
m_runtime.reset(nvinfer1::createInferRuntime(*logger));
if(!m_runtime){
std::cout<<" (T_T)~~~, Failed to create runtime."<<std::endl;
return false;
}
// 反序列化推理引擎
m_engine.reset(m_runtime->deserializeCudaEngine(engineString.data(), fsize));
if(!m_engine){
std::cout<<" (T_T)~~~, Failed to deserialize."<<std::endl;
return false;
}
// 获取优化后的模型的输入维度和输出维度
// int nbBindings = m_engine->getNbBindings(); // trt8.5 以前版本
int nbBindings = m_engine->getNbIOTensors(); // trt8.5 以后版本
auto num_tensors = m_engine->getNbIOTensors();
for(auto i=0; i<num_tensors; ++i){
std::string name = std::string(m_engine->getIOTensorName(i)); // 获取张量名称
auto shape = m_engine->getTensorShape(name.c_str());
IOName.push_back(name);
auto dtype = m_engine->getTensorDataType(name.c_str()); // 获取张量数据类型
bool input = (m_engine->getTensorIOMode(name.c_str()) == nvinfer1::TensorIOMode::kINPUT); // 判断张量是否为输入
if(input){
std::cout<<"input of model:"<<std::endl;
for(auto i=0; i<shape.nbDims; ++i){
std::cout<< i <<" dims: "<<shape.d[i]<<std::endl;
}
}else{
std::cout<<"output of model:"<<std::endl;
for(auto i=0; i<shape.nbDims; ++i){
std::cout<< i <<" dims: "<<shape.d[i]<<std::endl;
}
}
}
// 推理执行上下文
m_context.reset(m_engine->createExecutionContext());
if(!m_context){
std::cout<<" (T_T)~~~, Failed to create ExecutionContext."<<std::endl;
return false;
}
auto input_dims = m_context->getTensorShape("images");
input_dims.d[0] = maxBatch;
m_context->setInputShape("images", input_dims);
std::cout << " ~~Congratulations! 🎉🎉🎉~ create execution context success!!! ✨✨✨~~ " << std::endl;
return true;
}
这部分代码主要用来加载推理引擎文件,并依次调用createInferRuntime、deserializeCudaEngine、createExecutionContext创建runtime、engine和执行上下文。完成上述准备并绑定好输入输出地址后,调用:
// Enqueue inference on m_stream; `status` receives the bool returned by enqueueV3.
// NOTE(review): presumably m_stream must be synchronized before the bound
// output buffer is read — confirm against the calling code.
bool status = this->m_context->enqueueV3(m_stream);
进行enqueueV3即可以得到模型推理的结果。此时推理出来的结果数据被存储在绑定的输出地址上,直接读取绑定的输出地址即可以进行后处理的操作。
总体来说和之前的部署思路完全一致,api变动是推理引擎的输入输出的绑定接口和推理模型的接口。
更多推荐


所有评论(0)