性能优化 #

优化概览 #

Whisper 的性能优化可以从多个维度进行,包括硬件加速、算法优化、模型压缩等。

text
┌─────────────────────────────────────────────────────────────┐
│                    性能优化策略                              │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  硬件层面:                                                   │
│  ├── GPU 加速                                               │
│  ├── 多 GPU 并行                                            │
│  └── 内存优化                                               │
│                                                             │
│  算法层面:                                                   │
│  ├── 批处理                                                 │
│  ├── 量化压缩                                               │
│  └── 缓存优化                                               │
│                                                             │
│  实现层面:                                                   │
│  ├── faster-whisper                                         │
│  ├── whisper.cpp                                            │
│  └── whisper-jax                                            │
│                                                             │
└─────────────────────────────────────────────────────────────┘

GPU 加速 #

基本配置 #

python
import whisper
import torch

# Probe the CUDA environment once and reuse the answer below.
cuda_ok = torch.cuda.is_available()

print(f"CUDA 可用: {cuda_ok}")
print(f"CUDA 版本: {torch.version.cuda}")
print(f"GPU 数量: {torch.cuda.device_count()}")

if cuda_ok:
    props = torch.cuda.get_device_properties(0)
    print(f"GPU 名称: {torch.cuda.get_device_name(0)}")
    print(f"GPU 显存: {props.total_memory / 1024**3:.1f} GB")

# Prefer the GPU when present, otherwise fall back to CPU.
device = "cuda" if cuda_ok else "cpu"
model = whisper.load_model("base", device=device)

FP16 vs FP32 #

python
import whisper
import time

# One model instance is enough: the numeric precision is chosen per call
# via the fp16= flag to transcribe(). The original loaded two identical
# copies of the same weights, doubling GPU memory use for no benefit.
model = whisper.load_model("base", device="cuda")

start = time.time()
result_fp16 = model.transcribe("audio.mp3", fp16=True)
time_fp16 = time.time() - start

start = time.time()
result_fp32 = model.transcribe("audio.mp3", fp16=False)
time_fp32 = time.time() - start

print(f"FP16 时间: {time_fp16:.2f}s")
print(f"FP32 时间: {time_fp32:.2f}s")
print(f"加速比: {time_fp32/time_fp16:.2f}x")

内存管理 #

python
import whisper
import torch
import gc

def transcribe_with_memory_management(audio_path, model_size="base"):
    """Transcribe ``audio_path`` on GPU and release model memory afterwards.

    Args:
        audio_path: path to the audio file to transcribe.
        model_size: Whisper model size name (e.g. "base", "medium").

    Returns:
        The dict returned by ``model.transcribe`` (includes "text").
    """
    # Start from a clean slate in case a previous model left cached blocks.
    torch.cuda.empty_cache()
    gc.collect()

    model = whisper.load_model(model_size, device="cuda")
    try:
        return model.transcribe(audio_path)
    finally:
        # Free the model even when transcribe() raises — the original
        # version leaked the model (and its GPU memory) on any exception.
        del model
        torch.cuda.empty_cache()
        gc.collect()

result = transcribe_with_memory_management("audio.mp3", "medium")
print(result["text"])

批处理优化 #

音频批处理 #

python
import whisper
import torch

model = whisper.load_model("base")

def batch_transcribe(audio_paths, batch_size=4):
    """Transcribe files in true batches through the Whisper decoder.

    The original version computed and stacked the mel spectrograms but
    never used them — it then fell back to one ``transcribe()`` call per
    file, so the "batching" was dead code. Here the stacked mel batch is
    actually decoded in a single ``whisper.decode`` call per chunk.

    Args:
        audio_paths: list of audio file paths.
        batch_size: number of files decoded together.

    Returns:
        List of ``{"file": path, "text": transcription}`` dicts.
    """
    results = []

    for i in range(0, len(audio_paths), batch_size):
        batch = audio_paths[i:i + batch_size]

        mels = []
        for path in batch:
            audio = whisper.load_audio(path)
            # pad_or_trim fixes every clip to the 30 s window the model expects.
            audio = whisper.pad_or_trim(audio)
            mels.append(whisper.log_mel_spectrogram(audio))

        # Stack to (batch, n_mels, frames) and decode all files at once.
        mel_batch = torch.stack(mels).to(model.device)
        options = whisper.DecodingOptions(fp16=(model.device.type == "cuda"))
        decoded = whisper.decode(model, mel_batch, options)

        for path, dec in zip(batch, decoded):
            results.append({"file": path, "text": dec.text})

    return results

audio_files = ["audio1.mp3", "audio2.mp3", "audio3.mp3", "audio4.mp3"]
results = batch_transcribe(audio_files)

for r in results:
    print(f"{r['file']}: {r['text'][:50]}...")

使用 insanely-fast-whisper #

bash
pip install insanely-fast-whisper
python
from insanely_fast_whisper import Whisper

# Name the instance `asr` rather than `whisper`, which would shadow the
# openai-whisper module name used elsewhere in this guide.
asr = Whisper(model_name="large-v3", device="cuda")

transcription = asr.transcribe("audio.mp3")
print(transcription["text"])

faster-whisper #

安装 #

bash
pip install faster-whisper

基本使用 #

python
from faster_whisper import WhisperModel

# FP16 compute on GPU; faster-whisper runs on the CTranslate2 backend.
model = WhisperModel("base", device="cuda", compute_type="float16")

segments, info = model.transcribe("audio.mp3", language="zh")

# info carries the language-detection result for the whole file.
print(f"检测语言: {info.language} (概率: {info.language_probability:.2f})")

# segments is lazy — transcription happens while iterating it.
for seg in segments:
    print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")

性能对比 #

python
import whisper
from faster_whisper import WhisperModel
import time

audio_path = "audio.mp3"

start = time.time()
model_original = whisper.load_model("base")
result_original = model_original.transcribe(audio_path)
time_original = time.time() - start

start = time.time()
model_faster = WhisperModel("base", device="cuda", compute_type="float16")
segments, info = model_faster.transcribe(audio_path)
# faster-whisper returns a LAZY generator: the audio is only transcribed
# while the segments are iterated. Without consuming it here the timer
# stopped almost immediately and the "speedup" figure was meaningless.
segments = list(segments)
time_faster = time.time() - start

print(f"原始 Whisper: {time_original:.2f}s")
print(f"faster-whisper: {time_faster:.2f}s")
print(f"加速比: {time_original/time_faster:.2f}x")

faster-whisper 配置 #

python
from faster_whisper import WhisperModel

# Model-level settings: where the weights live and how they are computed.
model = WhisperModel(
    "large-v3",
    device="cuda",
    device_index=0,            # which GPU to use
    compute_type="float16",
    cpu_threads=4,
    num_workers=1,
)

# Per-call decoding settings for one transcription run.
segments, info = model.transcribe(
    "audio.mp3",
    language="zh",
    task="transcribe",
    beam_size=5,
    best_of=5,
    temperature=0.0,
    vad_filter=True,           # skip long silences via voice-activity detection
    vad_parameters=dict(min_silence_duration_ms=500),
)

for seg in segments:
    print(f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}")

whisper.cpp #

安装 #

bash
# Build whisper.cpp from source (requires git, make and a C/C++ toolchain).
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
make

下载模型 #

bash
# Fetch the quantizable GGML weights for the "base" model into ./models/.
bash ./models/download-ggml-model.sh base

命令行使用 #

bash
# NOTE(review): the `main` example binary historically accepts only 16 kHz
# 16-bit WAV input — confirm mp3 support in your build (newer releases also
# rename the binary to `whisper-cli`).
./main -f audio.mp3 -m models/ggml-base.bin -l zh

Python 绑定 #

bash
pip install whisper-cpp
python
from whisper_cpp import Whisper

# Load the GGML weights downloaded earlier and run a transcription.
ggml_model = Whisper("models/ggml-base.bin")
transcription = ggml_model.transcribe("audio.mp3")
print(transcription)

量化压缩 #

INT8 量化 #

python
import whisper
import torch

# Dynamic INT8 quantization is a CPU-side technique, so load on CPU.
model = whisper.load_model("base", device="cpu")

# Swap every Linear layer for an INT8 dynamically-quantized equivalent;
# activations stay in float, weights are stored as qint8.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

result = quantized_model.transcribe("audio.mp3")
print(result["text"])

模型大小对比 #

python
import whisper
import os

models = ["tiny", "base", "small", "medium", "large"]

for model_name in models:
    model = whisper.load_model(model_name)

    # Parameter count in millions.
    param_count = sum(p.numel() for p in model.parameters()) / 1e6
    # FP32 weights take 4 bytes per parameter, so N million parameters
    # occupy roughly N * 4 MB. (The original divided by 1024 again,
    # under-reporting every model's size by three orders of magnitude.)
    size_mb = param_count * 4

    print(f"{model_name}: {param_count:.1f}M 参数, ~{size_mb:.0f}MB")

    # Drop the reference before loading the next (larger) model so the
    # loop's peak memory stays at one model at a time.
    del model

缓存优化 #

模型缓存 #

python
import whisper
import os

# Process-wide cache: each model size is loaded at most once.
MODEL_CACHE = {}

def get_cached_model(model_size="base"):
    """Return a cached Whisper model, loading it on first request."""
    try:
        return MODEL_CACHE[model_size]
    except KeyError:
        loaded = MODEL_CACHE[model_size] = whisper.load_model(model_size)
        return loaded

model = get_cached_model("base")
result = model.transcribe("audio.mp3")
print(result["text"])

频谱图缓存 #

python
import whisper
import hashlib
import pickle
import os

CACHE_DIR = "./spectrogram_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

def get_cached_mel_spectrogram(audio_path):
    """Return the log-mel spectrogram for ``audio_path``, caching on disk.

    The cache key is the MD5 of the raw file bytes, so a renamed file
    still hits the cache while modified audio gets recomputed.
    """
    with open(audio_path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()

    cache_path = os.path.join(CACHE_DIR, f"{digest}.pkl")

    # Cache hit: load the previously pickled spectrogram.
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)

    # Cache miss: compute, persist, then return.
    waveform = whisper.pad_or_trim(whisper.load_audio(audio_path))
    mel = whisper.log_mel_spectrogram(waveform)

    with open(cache_path, "wb") as f:
        pickle.dump(mel, f)

    return mel

mel = get_cached_mel_spectrogram("audio.mp3")

并行处理 #

多进程处理 #

python
import whisper
from multiprocessing import Pool
import os

# Worker-local model: loaded ONCE per worker process by _init_worker.
# The original reloaded the model for every single file, which dominated
# the runtime and defeated the point of parallelism.
_worker_model = None

def _init_worker(model_size):
    """Pool initializer: load the model once in each worker process."""
    global _worker_model
    _worker_model = whisper.load_model(model_size)

def transcribe_file(args):
    """Transcribe one file using the worker's cached model.

    Keeps the original ``(audio_path, model_size)`` tuple interface; the
    model_size element is only used as a fallback when the function is
    called outside a Pool worker.
    """
    audio_path, model_size = args
    model = _worker_model if _worker_model is not None else whisper.load_model(model_size)
    result = model.transcribe(audio_path)
    return {"file": audio_path, "text": result["text"]}

def parallel_transcribe(audio_files, model_size="base", num_workers=4):
    """Fan transcription of ``audio_files`` out over ``num_workers`` processes."""
    args = [(f, model_size) for f in audio_files]

    with Pool(num_workers, initializer=_init_worker,
              initargs=(model_size,)) as pool:
        results = pool.map(transcribe_file, args)

    return results

# The __main__ guard is required for multiprocessing: under the spawn
# start method (the default on Windows and macOS) child processes
# re-import this module, and unguarded Pool creation would recurse.
if __name__ == "__main__":
    audio_files = [f for f in os.listdir(".") if f.endswith(".mp3")]
    results = parallel_transcribe(audio_files, "base", 4)

    for r in results:
        print(f"{r['file']}: {r['text'][:50]}...")

多 GPU 并行 #

python
import whisper
import torch
from concurrent.futures import ThreadPoolExecutor

def transcribe_on_gpu(audio_path, model_size, gpu_id):
    """Transcribe one file on CUDA device ``gpu_id`` (or CPU if None)."""
    device = f"cuda:{gpu_id}" if gpu_id is not None else "cpu"
    model = whisper.load_model(model_size, device=device)
    result = model.transcribe(audio_path)
    return result["text"]

def multi_gpu_transcribe(audio_files, model_size="base"):
    """Round-robin ``audio_files`` across all visible GPUs.

    Returns the transcribed texts in the same order as ``audio_files``.
    """
    num_gpus = torch.cuda.device_count()

    # Fall back to sequential CPU processing when no GPU is present —
    # the original crashed here (ZeroDivisionError on `i % num_gpus`
    # and ValueError from ThreadPoolExecutor(max_workers=0)).
    if num_gpus == 0:
        return [transcribe_on_gpu(path, model_size, None) for path in audio_files]

    with ThreadPoolExecutor(max_workers=num_gpus) as executor:
        futures = [
            executor.submit(transcribe_on_gpu, path, model_size, i % num_gpus)
            for i, path in enumerate(audio_files)
        ]
        results = [f.result() for f in futures]

    return results

audio_files = ["audio1.mp3", "audio2.mp3", "audio3.mp3", "audio4.mp3"]
results = multi_gpu_transcribe(audio_files)

性能基准测试 #

完整基准测试 #

python
import whisper
import time
import torch

def benchmark_whisper(audio_path, models=("tiny", "base", "small", "medium")):
    """Time transcription of ``audio_path`` for each model/device combo.

    Args:
        audio_path: audio file used for every measurement.
        models: model size names to benchmark (immutable default — the
            original used a mutable list default, a classic Python pitfall).

    Returns:
        List of dicts with model, device, elapsed time, text length and
        segment count.
    """
    results = []

    for model_name in models:
        for device in ("cuda", "cpu"):
            # Skip CUDA rows on machines without a GPU.
            if device == "cuda" and not torch.cuda.is_available():
                continue

            model = whisper.load_model(model_name, device=device)

            # Plain statements instead of the original
            # `expr if cond else None` pseudo-statements.
            if device == "cuda":
                torch.cuda.empty_cache()

            start_time = time.time()
            result = model.transcribe(audio_path)
            elapsed_time = time.time() - start_time

            results.append({
                "model": model_name,
                "device": device,
                "time": elapsed_time,
                "text_length": len(result["text"]),
                "segments": len(result["segments"])
            })

            # Free this model before benchmarking the next one.
            del model
            if device == "cuda":
                torch.cuda.empty_cache()

    return results

results = benchmark_whisper("test.mp3")

print(f"{'模型':<10} {'设备':<6} {'时间':<10} {'分段数':<8}")
print("-" * 40)
for r in results:
    print(f"{r['model']:<10} {r['device']:<6} {r['time']:.2f}s{'':<4} {r['segments']:<8}")

实时因子 (RTF) #

python
import whisper

def calculate_rtf(audio_path, model_size="base"):
    """Measure the real-time factor: processing time / audio duration.

    An RTF below 1.0 means the model transcribes faster than the audio
    plays, i.e. real-time processing is feasible.
    """
    import time

    model = whisper.load_model(model_size)

    # whisper.load_audio resamples to 16 kHz, so duration = samples / 16000.
    samples = whisper.load_audio(audio_path)
    audio_duration = len(samples) / 16000

    t0 = time.time()
    model.transcribe(audio_path)
    processing_time = time.time() - t0

    rtf = processing_time / audio_duration

    print(f"音频时长: {audio_duration:.2f}s")
    print(f"处理时间: {processing_time:.2f}s")
    print(f"实时因子: {rtf:.2f}x")
    print(f"{'可以实时处理' if rtf < 1 else '无法实时处理'}")

    return rtf

rtf = calculate_rtf("audio.mp3", "base")

优化建议总结 #

text
场景优化建议:

实时转录:
├── 使用 tiny 或 base 模型
├── 使用 GPU 加速
├── 考虑 faster-whisper
└── 使用 FP16 精度

批量处理:
├── 使用 faster-whisper
├── 启用批处理
├── 多进程/多 GPU 并行
└── 缓存模型实例

低资源环境:
├── 使用 tiny 模型
├── 使用 whisper.cpp
├── CPU + INT8 量化
└── 减少束搜索大小

高精度需求:
├── 使用 large-v3 模型
├── 增加束搜索大小
├── 使用初始提示
└── 考虑 faster-whisper + large

下一步 #

掌握了性能优化后,继续学习 高级用法 了解更多高级功能!

最后更新:2026-04-05