ONNX Runtime #
Runtime 概述 #
ONNX Runtime 是微软开发的高性能推理引擎,专为 ONNX 模型设计,支持 CPU、GPU、NPU 等多种硬件后端。
text
┌─────────────────────────────────────────────────────────────┐
│ ONNX Runtime 架构 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 应用层 │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Python / C++ / C# / Java API │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ ONNX Runtime Core │ │
│ │ ┌─────────┐ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ │
│ │ │图优化 │ │算子内核 │ │内存管理 │ │执行调度 │ │ │
│ │ └─────────┘ └─────────┘ └─────────┘ └─────────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │ │
│ ▼ │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Execution Providers │ │
│ │ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ │ │
│ │ │CPU │ │CUDA │ │TensorRT│ │OpenVINO│ │CoreML│ │... │ │ │
│ │ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ └─────┘ │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
基本使用 #
创建推理会话 #
python
import onnxruntime as ort

# Load an ONNX model and list its declared inputs and outputs
# (name, shape, element type).
session = ort.InferenceSession("model.onnx")

# FIX: the original used f-strings with no placeholders (f"输入:");
# plain string literals print the same text.
print("输入:")
for inp in session.get_inputs():
    print(f" {inp.name}: {inp.shape} ({inp.type})")

print("输出:")
for out in session.get_outputs():
    print(f" {out.name}: {out.shape} ({out.type})")
执行推理 #
python
import onnxruntime as ort
import numpy as np

# Basic inference: feed a random NCHW float32 tensor to the model's
# first input.
session = ort.InferenceSession("model.onnx")
first_input = session.get_inputs()[0].name
feed = {first_input: np.random.randn(1, 3, 224, 224).astype(np.float32)}

# Passing None as the first argument requests every model output.
outputs = session.run(None, feed)
print(f"输出形状: {outputs[0].shape}")

# Equivalent call with the output names spelled out explicitly.
wanted = [out.name for out in session.get_outputs()]
outputs = session.run(wanted, feed)
获取输出 #
python
import onnxruntime as ort
import numpy as np  # FIX: np was used below without being imported

# Run the model and inspect the results. session.run returns a plain
# list of numpy arrays, ordered like session.get_outputs().
session = ort.InferenceSession("model.onnx")
input_name = session.get_inputs()[0].name
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

result = session.run(None, {input_name: input_data})
print(f"输出列表: {[r.shape for r in result]}")

# FIX: renamed the misleading local `output_dict` — run() returns a
# list, not a dict; pair it with get_outputs() to recover output names.
outputs = session.run(None, {input_name: input_data})
output_name = session.get_outputs()[0].name
print(f"输出 {output_name}: {outputs[0].shape}")
执行提供者 #
查看可用提供者 #
python
import onnxruntime as ort

# Show which execution providers this onnxruntime build supports, then
# which ones an actual session ended up using.
print(f"可用提供者: {ort.get_available_providers()}")
print(f"设备提供者: {ort.get_device()}")

sess = ort.InferenceSession("model.onnx")
print(f"当前提供者: {sess.get_providers()}")
CPU 提供者 #
python
import onnxruntime as ort

# Pin the session to the CPU provider with explicit threading and
# graph-optimization settings.
opts = ort.SessionOptions()
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
opts.intra_op_num_threads = 4  # threads inside a single operator
opts.inter_op_num_threads = 4  # threads across independent operators

session = ort.InferenceSession(
    "model.onnx", opts, providers=["CPUExecutionProvider"]
)
CUDA GPU 提供者 #
python
import onnxruntime as ort

# Prefer CUDA; onnxruntime falls back to the CPU provider when the GPU
# provider is unavailable.
cuda_options = {
    'device_id': 0,
    'arena_extend_strategy': 'kNextPowerOfTwo',
    'gpu_mem_limit': 2 * 1024 * 1024 * 1024,  # cap the arena at 2 GiB
    'cudnn_conv_algo_search': 'EXHAUSTIVE',
    'do_copy_in_default_stream': True,
}
session = ort.InferenceSession(
    "model.onnx",
    providers=[
        ('CUDAExecutionProvider', cuda_options),
        'CPUExecutionProvider',
    ],
)
print(f"使用提供者: {session.get_providers()}")
TensorRT 提供者 #
python
import onnxruntime as ort

# Provider priority: TensorRT first, then CUDA, then CPU. Engine caching
# avoids rebuilding the TensorRT engine on every process start.
trt_options = {
    'device_id': 0,
    'trt_max_workspace_size': 1 << 30,  # 1 GiB workspace
    'trt_fp16_enable': True,
    'trt_engine_cache_enable': True,
    'trt_engine_cache_path': './trt_cache',
}
session = ort.InferenceSession(
    "model.onnx",
    providers=[
        ('TensorrtExecutionProvider', trt_options),
        'CUDAExecutionProvider',
        'CPUExecutionProvider',
    ],
)
OpenVINO 提供者 #
python
import onnxruntime as ort

# Run through OpenVINO on CPU at full (FP32) precision.
# NOTE(review): enable_opencl_throttling targets Intel GPU devices; its
# effect when device_type='CPU' should be confirmed against the
# OpenVINO EP documentation.
openvino_options = {
    'device_type': 'CPU',
    'precision': 'FP32',
    'enable_opencl_throttling': True,
}
session = ort.InferenceSession(
    "model.onnx",
    providers=[('OpenVINOExecutionProvider', openvino_options)],
)
SessionOptions 配置 #
基本配置 #
python
import onnxruntime as ort

# A typical single-stream configuration: sequential execution, one
# inter-op thread, full graph optimization, memory reuse enabled.
opts = ort.SessionOptions()
opts.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
opts.intra_op_num_threads = 4
opts.inter_op_num_threads = 1
opts.enable_mem_pattern = True   # reuse allocation patterns across runs
opts.enable_cpu_mem_arena = True # pool CPU allocations in an arena

session = ort.InferenceSession("model.onnx", opts)
图优化级别 #
text
┌─────────────────────────────────────────────────────────────┐
│ 图优化级别 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ORT_DISABLE_ALL: │
│ - 禁用所有优化 │
│ - 用于调试 │
│ │
│ ORT_ENABLE_BASIC: │
│ - 基础优化 │
│ - 冗余节点消除、常量折叠 │
│ │
│ ORT_ENABLE_EXTENDED: │
│ - 扩展优化 │
│ - 算子融合 │
│ │
│ ORT_ENABLE_ALL: │
│ - 启用所有优化(默认) │
│ - 包括布局优化等 │
│ │
└─────────────────────────────────────────────────────────────┘
执行模式 #
python
import onnxruntime as ort

sess_options = ort.SessionOptions()

# FIX: the original assigned ORT_SEQUENTIAL and then immediately
# overwrote it with ORT_PARALLEL, leaving a dead assignment. Pick one:
#   ort.ExecutionMode.ORT_SEQUENTIAL - run nodes one after another
#   ort.ExecutionMode.ORT_PARALLEL   - run independent nodes concurrently
sess_options.execution_mode = ort.ExecutionMode.ORT_PARALLEL
sess_options.intra_op_num_threads = 4
日志配置 #
python
import onnxruntime as ort

# Maximum logging detail: severity level 0 is VERBOSE, and the
# verbosity level tunes VERBOSE output further.
options = ort.SessionOptions()
options.log_severity_level = 0
options.log_verbosity_level = 0

session = ort.InferenceSession("model.onnx", options)
性能优化 #
IO Binding #
python
# IO binding lets inference write its output on the device, avoiding
# the implicit device-to-host copy a plain session.run() performs.
import onnxruntime as ort
import numpy as np
session = ort.InferenceSession("model.onnx", providers=['CUDAExecutionProvider'])
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
io_binding = session.io_binding()
# Bind the host-side numpy array as input; ORT copies it to the device.
io_binding.bind_cpu_input(input_name, input_data)
# Bind the output to CUDA device 0 as float32 with the given shape;
# ORT allocates the buffer on the GPU.
# NOTE(review): shape [1, 1000] is assumed to match this model — confirm.
io_binding.bind_output(output_name, "cuda", 0, np.float32, [1, 1000])
session.run_with_iobinding(io_binding)
# Copy the device-resident result back into a numpy array.
output = io_binding.get_outputs()[0].numpy()
print(f"输出形状: {output.shape}")
预分配输出 #
python
import onnxruntime as ort
import numpy as np

# Preallocate the output buffer once and have every run write into it.
session = ort.InferenceSession("model.onnx")
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name

# NOTE(review): get_outputs()[0].shape may contain symbolic (str/None)
# entries for dynamic axes; np.zeros needs concrete ints — this pattern
# assumes a fully static output shape. Confirm for the target model.
output_shape = session.get_outputs()[0].shape
output_buffer = np.zeros(output_shape, dtype=np.float32)
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)

# FIX: IOBinding's bind_* methods return None, so the original chained
# .bind_cpu_input(...).bind_output(...) raised AttributeError. Bind each
# piece on the binding object, wrapping the preallocated buffer in an
# OrtValue so the run writes directly into it.
io_binding = session.io_binding()
io_binding.bind_cpu_input(input_name, input_data)
io_binding.bind_ortvalue_output(
    output_name, ort.OrtValue.ortvalue_from_numpy(output_buffer)
)
session.run_with_iobinding(io_binding)
批量推理 #
python
import onnxruntime as ort
import numpy as np

# Batched inference: stack samples along the leading (batch) axis and
# push them through the model in one call.
session = ort.InferenceSession("model.onnx")
input_name = session.get_inputs()[0].name

batch_size = 32
batch = np.random.randn(batch_size, 3, 224, 224).astype(np.float32)
outputs = session.run(None, {input_name: batch})
print(f"批量输出形状: {outputs[0].shape}")
异步推理 #
python
import onnxruntime as ort
import numpy as np
import asyncio


async def run_inference_async(session, input_data):
    """Run one blocking session.run call in the default thread pool.

    FIX: uses asyncio.get_running_loop() — asyncio.get_event_loop()
    inside a coroutine is deprecated since Python 3.10.
    """
    loop = asyncio.get_running_loop()
    input_name = session.get_inputs()[0].name
    return await loop.run_in_executor(
        None,
        lambda: session.run(None, {input_name: input_data}),
    )


async def main():
    # One session is shared across all tasks; only run() calls overlap,
    # each on a worker thread.
    session = ort.InferenceSession("model.onnx")
    tasks = [
        run_inference_async(
            session, np.random.randn(1, 3, 224, 224).astype(np.float32)
        )
        for _ in range(10)
    ]
    results = await asyncio.gather(*tasks)
    print(f"完成 {len(results)} 次推理")


asyncio.run(main())
模型信息 #
获取模型元数据 #
python
import onnxruntime as ort

session = ort.InferenceSession("model.onnx")

# Model-level metadata baked into the ONNX file at export time.
meta = session.get_modelmeta()
print("模型元数据:")
print(f" 生产者: {meta.producer_name}")
print(f" 版本: {meta.producer_version}")
print(f" 域: {meta.domain}")
print(f" 描述: {meta.description}")
print(f" 图描述: {meta.graph_description}")

# Arbitrary key/value pairs attached by the exporter.
print("\n自定义元数据:")
for key, value in meta.custom_metadata_map.items():
    print(f" {key}: {value}")
获取输入输出信息 #
python
import onnxruntime as ort

session = ort.InferenceSession("model.onnx")

# Dump name / element type / shape for every model input and output.
print("输入:")
for tensor in session.get_inputs():
    print(f" 名称: {tensor.name}")
    print(f" 类型: {tensor.type}")
    print(f" 形状: {tensor.shape}")
    print()

print("输出:")
for tensor in session.get_outputs():
    print(f" 名称: {tensor.name}")
    print(f" 类型: {tensor.type}")
    print(f" 形状: {tensor.shape}")
    print()
内存配置 #
内存模式 #
python
import onnxruntime as ort

sess_options = ort.SessionOptions()

# Reuse allocation patterns across runs (effective with sequential
# execution) and pool CPU allocations in an arena.
sess_options.enable_mem_pattern = True
sess_options.enable_cpu_mem_arena = True

# FIX: the original also assigned sess_options.memory_pattern_enable and
# sess_options.arena_extend_strategy, neither of which is a
# SessionOptions attribute in current onnxruntime releases (the pybind
# object rejects unknown attributes with AttributeError).
# enable_mem_pattern above already controls memory patterns; the arena
# extend strategy is configured per execution provider, e.g.
#   ('CUDAExecutionProvider', {'arena_extend_strategy': 'kNextPowerOfTwo'})
GPU 内存配置 #
python
import onnxruntime as ort

# CUDA provider tuned for memory: grow the arena only by the requested
# amount and cap GPU memory at 4 GiB.
cuda_cfg = {
    'device_id': 0,
    'arena_extend_strategy': 'kSameAsRequested',
    'gpu_mem_limit': 4 * 1024 * 1024 * 1024,
    'cudnn_conv_algo_search': 'DEFAULT',
    'do_copy_in_default_stream': True,
}
session = ort.InferenceSession(
    "model.onnx", providers=[('CUDAExecutionProvider', cuda_cfg)]
)
性能分析 #
Profiling #
python
import onnxruntime as ort
import numpy as np  # FIX: np was used below without being imported

# Enable the built-in profiler; each run appends events, and
# end_profiling() flushes them to a chrome-trace JSON file.
sess_options = ort.SessionOptions()
sess_options.enable_profiling = True
sess_options.profile_file_prefix = "onnx_profile"

session = ort.InferenceSession("model.onnx", sess_options)
input_name = session.get_inputs()[0].name  # FIX: don't hard-code "input"

for _ in range(10):
    input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
    session.run(None, {input_name: input_data})

profile_file = session.end_profiling()
print(f"Profile 文件: {profile_file}")
性能计数器 #
python
import json  # FIX: moved to the top (was imported mid-script)
import onnxruntime as ort
import numpy as np  # FIX: np was used below without being imported

sess_options = ort.SessionOptions()
sess_options.enable_profiling = True
session = ort.InferenceSession("model.onnx", sess_options)
input_name = session.get_inputs()[0].name  # FIX: don't hard-code "input"

for _ in range(10):
    input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
    session.run(None, {input_name: input_data})

# The profile is a chrome-trace JSON list; 'dur' is in microseconds.
profile_file = session.end_profiling()
with open(profile_file, 'r') as f:
    profile_data = json.load(f)

for event in profile_data:
    # Some trace events may omit 'dur'; skip those defensively.
    if 'dur' in event:
        print(f"{event['name']}: {event['dur']} us")
错误处理 #
异常捕获 #
python
import onnxruntime as ort
import numpy as np

# FIX: ort.RuntimeException is not a stable top-level attribute of the
# onnxruntime package (its C++-originated error types live under
# onnxruntime.capi); resolve it defensively so that evaluating the
# except clause cannot itself raise AttributeError.
OrtRuntimeError = getattr(ort, "RuntimeException", RuntimeError)

try:
    session = ort.InferenceSession("model.onnx")
    input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
    # NOTE(review): assumes the model's first input is named "input".
    outputs = session.run(None, {"input": input_data})
except OrtRuntimeError as e:
    print(f"运行时错误: {e}")
except ValueError as e:
    print(f"值错误: {e}")
except Exception as e:
    print(f"其他错误: {e}")
输入验证 #
python
import onnxruntime as ort
import numpy as np
def validate_and_run(session, input_data):
    """Validate input dtype/shape against the model's first input, then run.

    Args:
        session: an onnxruntime.InferenceSession (anything exposing
            get_inputs() and run() the same way works).
        input_data: numpy array fed to the model's first input.

    Returns:
        The list of outputs from session.run.

    Raises:
        ValueError: if a concrete (static) input dimension mismatches.
    """
    input_info = session.get_inputs()[0]
    expected_shape = input_info.shape
    expected_type = input_info.type

    # Promote to float32 when the model expects tensor(float).
    if input_data.dtype != np.float32 and expected_type == 'tensor(float)':
        input_data = input_data.astype(np.float32)

    # FIX: dynamic axes appear in .shape as symbolic strings (or None),
    # not only -1; the original compared them directly to the actual int
    # dimension and raised a spurious ValueError. Only check concrete,
    # non-negative integer dims.
    for i, (actual, expected) in enumerate(zip(input_data.shape, expected_shape)):
        if isinstance(expected, int) and expected >= 0 and actual != expected:
            raise ValueError(f"维度 {i} 不匹配: 期望 {expected}, 实际 {actual}")

    return session.run(None, {input_info.name: input_data})
下一步 #
现在你已经了解了 ONNX Runtime,接下来学习 模型优化,深入了解 ONNX 模型优化技术!
最后更新:2026-04-04