性能优化 #
优化概览 #
Whisper 的性能优化可以从多个维度进行,包括硬件加速、算法优化、模型压缩等。
text
┌─────────────────────────────────────────────────────────────┐
│ 性能优化策略 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 硬件层面: │
│ ├── GPU 加速 │
│ ├── 多 GPU 并行 │
│ └── 内存优化 │
│ │
│ 算法层面: │
│ ├── 批处理 │
│ ├── 量化压缩 │
│ └── 缓存优化 │
│ │
│ 实现层面: │
│ ├── faster-whisper │
│ ├── whisper.cpp │
│ └── whisper-jax │
│ │
└─────────────────────────────────────────────────────────────┘
GPU 加速 #
基本配置 #
python
import torch

import whisper

# Report the CUDA environment before deciding where the model will run.
cuda_ok = torch.cuda.is_available()
print(f"CUDA 可用: {cuda_ok}")
print(f"CUDA 版本: {torch.version.cuda}")
print(f"GPU 数量: {torch.cuda.device_count()}")
if cuda_ok:
    props = torch.cuda.get_device_properties(0)
    print(f"GPU 名称: {torch.cuda.get_device_name(0)}")
    print(f"GPU 显存: {props.total_memory / 1024**3:.1f} GB")

# Fall back to CPU when no GPU is present.
device = "cuda" if cuda_ok else "cpu"
model = whisper.load_model("base", device=device)
FP16 vs FP32 #
python
import time

import whisper

# fp16 vs fp32 is selected by transcribe()'s `fp16` argument, not by the
# checkpoint — loading two identical "base" models (as the original did)
# only doubles GPU memory use.  One instance is enough.
model = whisper.load_model("base", device="cuda")

# Time half-precision decoding.
start = time.time()
result_fp16 = model.transcribe("audio.mp3", fp16=True)
time_fp16 = time.time() - start

# Time full-precision decoding on the same model.
start = time.time()
result_fp32 = model.transcribe("audio.mp3", fp16=False)
time_fp32 = time.time() - start

print(f"FP16 时间: {time_fp16:.2f}s")
print(f"FP32 时间: {time_fp32:.2f}s")
print(f"加速比: {time_fp32/time_fp16:.2f}x")
内存管理 #
python
import gc

import torch

import whisper


def transcribe_with_memory_management(audio_path, model_size="base"):
    """Load a Whisper model, transcribe one file, and free GPU memory.

    The model is released in a ``finally`` block so CUDA memory is
    returned to the allocator even when transcription raises (the
    original skipped cleanup on the error path).
    """
    # Clear allocations left over from any previously loaded model.
    torch.cuda.empty_cache()
    gc.collect()
    model = whisper.load_model(model_size, device="cuda")
    try:
        return model.transcribe(audio_path)
    finally:
        # Drop the weights and return their GPU memory, success or not.
        del model
        torch.cuda.empty_cache()
        gc.collect()


result = transcribe_with_memory_management("audio.mp3", "medium")
print(result["text"])
批处理优化 #
音频批处理 #
python
import whisper

model = whisper.load_model("base")


def batch_transcribe(audio_paths, batch_size=4):
    """Transcribe audio_paths in groups of batch_size.

    Returns a list of ``{"file": path, "text": transcript}`` dicts in
    input order.  Note that ``model.transcribe()`` processes one file at
    a time, so ``batch_size`` only controls how the work is chunked, not
    true GPU batching.
    """
    results = []
    for i in range(0, len(audio_paths), batch_size):
        batch = audio_paths[i:i + batch_size]
        # The original also precomputed and stacked log-mel spectrograms
        # here but never fed them to the model — that dead work is removed.
        for path in batch:
            result = model.transcribe(path)
            results.append({"file": path, "text": result["text"]})
    return results


audio_files = ["audio1.mp3", "audio2.mp3", "audio3.mp3", "audio4.mp3"]
results = batch_transcribe(audio_files)
for r in results:
    print(f"{r['file']}: {r['text'][:50]}...")
使用 insanely-fast-whisper #
bash
pip install insanely-fast-whisper
python
from insanely_fast_whisper import Whisper

# NOTE(review): upstream insanely-fast-whisper is primarily a CLI tool;
# confirm the installed package really exposes a `Whisper` class with
# this constructor before relying on it.
asr = Whisper(model_name="large-v3", device="cuda")
result = asr.transcribe("audio.mp3")
print(result["text"])
faster-whisper #
安装 #
bash
pip install faster-whisper
基本使用 #
python
from faster_whisper import WhisperModel

# CTranslate2 backend; float16 halves GPU memory use and speeds up inference.
model = WhisperModel("base", device="cuda", compute_type="float16")

segments, info = model.transcribe("audio.mp3", language="zh")
print(f"检测语言: {info.language} (概率: {info.language_probability:.2f})")

# Segments are yielded lazily as decoding progresses.
for seg in segments:
    print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")
性能对比 #
python
import time

import whisper
from faster_whisper import WhisperModel

audio_path = "audio.mp3"

# --- original openai-whisper ---
start = time.time()
model_original = whisper.load_model("base")
result_original = model_original.transcribe(audio_path)
time_original = time.time() - start

# --- faster-whisper ---
start = time.time()
model_faster = WhisperModel("base", device="cuda", compute_type="float16")
segments, info = model_faster.transcribe(audio_path)
# faster-whisper returns a lazy generator: decoding only happens while the
# segments are iterated, so they must be consumed inside the timed region —
# otherwise the "benchmark" measures only model construction.
segments = list(segments)
time_faster = time.time() - start

print(f"原始 Whisper: {time_original:.2f}s")
print(f"faster-whisper: {time_faster:.2f}s")
print(f"加速比: {time_original/time_faster:.2f}x")
faster-whisper 配置 #
python
from faster_whisper import WhisperModel

# Model construction: where and how the weights run.
model = WhisperModel(
    "large-v3",
    device="cuda",
    device_index=0,          # which GPU to use
    compute_type="float16",  # half precision on GPU
    cpu_threads=4,
    num_workers=1,
)

# Transcription: decoding strategy plus VAD-based silence skipping.
segments, info = model.transcribe(
    "audio.mp3",
    language="zh",
    task="transcribe",
    beam_size=5,
    best_of=5,
    temperature=0.0,
    vad_filter=True,
    vad_parameters={"min_silence_duration_ms": 500},
)

for seg in segments:
    print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")
whisper.cpp #
安装 #
bash
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
make
下载模型 #
bash
bash ./models/download-ggml-model.sh base
命令行使用 #
bash
./main -f audio.mp3 -m models/ggml-base.bin -l zh
Python 绑定 #
bash
pip install whisper-cpp
python
from whisper_cpp import Whisper

# ggml checkpoint produced by whisper.cpp's download script.
ggml_path = "models/ggml-base.bin"
model = Whisper(ggml_path)
print(model.transcribe("audio.mp3"))
量化压缩 #
INT8 量化 #
python
import torch

import whisper

# Quantization starts from the fp32 CPU model.
model = whisper.load_model("base", device="cpu")

# Dynamically quantize all Linear layers to int8: weights are stored
# quantized and dequantized on the fly, shrinking memory and speeding up
# CPU matmuls; activations stay fp32.
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# transcribe() defaults to fp16=True, which is not supported on CPU and
# triggers a fallback warning — request fp32 decoding explicitly.
result = quantized_model.transcribe("audio.mp3", fp16=False)
print(result["text"])
模型大小对比 #
python
import whisper
import os

models = ["tiny", "base", "small", "medium", "large"]
for model_name in models:
    model = whisper.load_model(model_name)
    # Parameter count in millions.
    param_count = sum(p.numel() for p in model.parameters()) / 1e6
    # fp32 storage is 4 bytes per parameter.  The original computed
    # `param_count * 4 / 1024`, which under-reports the size by a factor
    # of roughly 1000 (it forgot that param_count is in millions).
    size_mb = param_count * 1e6 * 4 / 1024**2
    print(f"{model_name}: {param_count:.1f}M 参数, ~{size_mb:.0f}MB")
缓存优化 #
模型缓存 #
python
import whisper
import os

# Process-wide registry of loaded models, keyed by size name.
MODEL_CACHE = {}


def get_cached_model(model_size="base"):
    """Return a cached Whisper model, loading it on first request."""
    try:
        return MODEL_CACHE[model_size]
    except KeyError:
        loaded = MODEL_CACHE[model_size] = whisper.load_model(model_size)
        return loaded


model = get_cached_model("base")
result = model.transcribe("audio.mp3")
print(result["text"])
频谱图缓存 #
python
import hashlib
import os
import pickle

import whisper

CACHE_DIR = "./spectrogram_cache"
os.makedirs(CACHE_DIR, exist_ok=True)


def get_cached_mel_spectrogram(audio_path):
    """Return the log-mel spectrogram for audio_path, caching it on disk.

    The cache key is the MD5 of the file contents, so editing the audio
    file invalidates its cached entry automatically.
    """
    # Hash in fixed-size chunks instead of reading the whole file into
    # memory at once (the original did f.read()) — audio can be large.
    digest = hashlib.md5()
    with open(audio_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    cache_path = os.path.join(CACHE_DIR, f"{digest.hexdigest()}.pkl")

    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio)
    with open(cache_path, "wb") as f:
        pickle.dump(mel, f)
    return mel


mel = get_cached_mel_spectrogram("audio.mp3")
并行处理 #
多进程处理 #
python
import os
from multiprocessing import Pool

import whisper

# Worker-local model, populated once per process by _init_worker.  The
# original reloaded the model for every single file, which dominates the
# runtime for short clips.
_worker_model = None


def _init_worker(model_size):
    """Pool initializer: load the model once in each worker process."""
    global _worker_model
    _worker_model = whisper.load_model(model_size)


def transcribe_file(args):
    """Transcribe one (audio_path, model_size) pair; returns file + text."""
    audio_path, model_size = args
    # Fallback for direct (non-Pool) callers, where no initializer ran.
    model = _worker_model or whisper.load_model(model_size)
    result = model.transcribe(audio_path)
    return {"file": audio_path, "text": result["text"]}


def parallel_transcribe(audio_files, model_size="base", num_workers=4):
    """Fan transcription out over num_workers processes, preserving order."""
    args = [(f, model_size) for f in audio_files]
    with Pool(num_workers, initializer=_init_worker,
              initargs=(model_size,)) as pool:
        results = pool.map(transcribe_file, args)
    return results


audio_files = [f for f in os.listdir(".") if f.endswith(".mp3")]
results = parallel_transcribe(audio_files, "base", 4)
for r in results:
    print(f"{r['file']}: {r['text'][:50]}...")
多 GPU 并行 #
python
from concurrent.futures import ThreadPoolExecutor

import torch

import whisper


def transcribe_on_gpu(audio_path, model_size, gpu_id):
    """Load a model on cuda:<gpu_id> and transcribe a single file."""
    device = f"cuda:{gpu_id}"
    model = whisper.load_model(model_size, device=device)
    result = model.transcribe(audio_path)
    return result["text"]


def multi_gpu_transcribe(audio_files, model_size="base"):
    """Round-robin audio_files across visible GPUs; returns texts in order.

    Raises RuntimeError when no CUDA device is available — the original
    crashed with ZeroDivisionError on ``i % num_gpus`` in that case.
    """
    num_gpus = torch.cuda.device_count()
    if num_gpus == 0:
        raise RuntimeError("multi_gpu_transcribe requires at least one CUDA device")
    # Threads suffice here: each task spends its time inside CUDA work,
    # one in-flight task per GPU.
    with ThreadPoolExecutor(max_workers=num_gpus) as executor:
        futures = [
            executor.submit(transcribe_on_gpu, path, model_size, i % num_gpus)
            for i, path in enumerate(audio_files)
        ]
        return [f.result() for f in futures]


audio_files = ["audio1.mp3", "audio2.mp3", "audio3.mp3", "audio4.mp3"]
results = multi_gpu_transcribe(audio_files)
性能基准测试 #
完整基准测试 #
python
import time

import torch

import whisper


def benchmark_whisper(audio_path, models=("tiny", "base", "small", "medium")):
    """Time transcription of audio_path for each model size on CUDA and CPU.

    Returns a list of dicts with model name, device, elapsed seconds,
    transcript length and segment count.  The default is a tuple rather
    than a list to avoid the shared-mutable-default pitfall.
    """
    results = []
    for model_name in models:
        for device in ("cuda", "cpu"):
            if device == "cuda" and not torch.cuda.is_available():
                continue
            model = whisper.load_model(model_name, device=device)
            # Plain `if` statements replace the original's
            # `expr if cond else None` used as a statement.
            if device == "cuda":
                torch.cuda.empty_cache()
            start_time = time.time()
            result = model.transcribe(audio_path)
            elapsed_time = time.time() - start_time
            results.append({
                "model": model_name,
                "device": device,
                "time": elapsed_time,
                "text_length": len(result["text"]),
                "segments": len(result["segments"]),
            })
            # Release this model before loading the next size.
            del model
            if device == "cuda":
                torch.cuda.empty_cache()
    return results


results = benchmark_whisper("test.mp3")
print(f"{'模型':<10} {'设备':<6} {'时间':<10} {'分段数':<8}")
print("-" * 40)
for r in results:
    print(f"{r['model']:<10} {r['device']:<6} {r['time']:.2f}s{'':<4} {r['segments']:<8}")
实时因子 (RTF) #
python
import time

import whisper


def calculate_rtf(audio_path, model_size="base"):
    """Compute the real-time factor: processing time / audio duration."""
    model = whisper.load_model(model_size)

    # Whisper resamples everything to 16 kHz, so duration = samples / 16000.
    audio = whisper.load_audio(audio_path)
    audio_duration = len(audio) / 16000

    start_time = time.time()
    model.transcribe(audio_path)
    processing_time = time.time() - start_time

    rtf = processing_time / audio_duration
    print(f"音频时长: {audio_duration:.2f}s")
    print(f"处理时间: {processing_time:.2f}s")
    print(f"实时因子: {rtf:.2f}x")
    print(f"{'可以实时处理' if rtf < 1 else '无法实时处理'}")
    return rtf


rtf = calculate_rtf("audio.mp3", "base")
优化建议总结 #
text
场景优化建议:
实时转录:
├── 使用 tiny 或 base 模型
├── 使用 GPU 加速
├── 考虑 faster-whisper
└── 使用 FP16 精度
批量处理:
├── 使用 faster-whisper
├── 启用批处理
├── 多进程/多 GPU 并行
└── 缓存模型实例
低资源环境:
├── 使用 tiny 模型
├── 使用 whisper.cpp
├── CPU + INT8 量化
└── 减少束搜索大小
高精度需求:
├── 使用 large-v3 模型
├── 增加束搜索大小
├── 使用初始提示
└── 考虑 faster-whisper + large
下一步 #
掌握了性能优化后,继续学习 高级用法 了解更多高级功能!
最后更新:2026-04-05