语音转录 #
转录基础 #
什么是语音转录? #
语音转录是将音频中的语音内容转换为文本的过程。Whisper 的转录功能支持 99 种语言,能够自动处理口音、背景噪音和技术术语。
text
┌─────────────────────────────────────────────────────────────┐
│ 转录流程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ 音频输入 → 预处理 → 特征提取 → 模型推理 → 文本输出 │
│ │
│ 输入: audio.mp3 │
│ 输出: "你好,欢迎使用 Whisper 语音识别系统" │
│ │
└─────────────────────────────────────────────────────────────┘
基本用法 #
python
# Minimal transcription example: load a model, transcribe a file, print the text.
import whisper
model = whisper.load_model("base")  # "base" trades some accuracy for speed
result = model.transcribe("audio.mp3")
print(result["text"])  # full transcript as a single string
输出格式 #
JSON 格式 #
python
# Dump the full transcription result (text, segments, language) as JSON.
import whisper
import json
model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
# ensure_ascii=False keeps non-ASCII (e.g. Chinese) characters readable.
print(json.dumps(result, indent=2, ensure_ascii=False))
输出结构:
json
{
"text": "完整的转录文本",
"segments": [
{
"id": 0,
"seek": 0,
"start": 0.0,
"end": 3.5,
"text": "第一段文本",
"tokens": [50364, 50364, ...],
"temperature": 0.0,
"avg_logprob": -0.35,
"compression_ratio": 1.2,
"no_speech_prob": 0.05
}
],
"language": "zh"
}
字段说明 #
| 字段 | 说明 |
|---|---|
| text | 完整的转录文本 |
| segments | 分段信息数组 |
| language | 检测到的语言 |
| id | 分段序号 |
| start | 开始时间(秒) |
| end | 结束时间(秒) |
| tokens | 词元 ID 数组 |
| temperature | 使用的温度值 |
| avg_logprob | 平均对数概率 |
| compression_ratio | 压缩比 |
| no_speech_prob | 无语音概率 |
SRT 字幕格式 #
python
# Transcribe the audio first; its segments are converted to SRT below.
import whisper
model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
def format_timestamp(seconds):
    """Convert a time in seconds to an SRT timestamp string (HH:MM:SS,mmm).

    Works from a single rounded millisecond total so the fields always stay
    consistent; truncating each field separately (the previous approach) could
    turn e.g. 1.9996 s into "00:00:01,999" instead of "00:00:02,000".
    """
    total_millis = round(seconds * 1000)
    hours, rem = divmod(total_millis, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    # SRT uses a comma as the decimal separator.
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
def write_srt(segments, output_path):
    """Write transcription segments to *output_path* as an SRT subtitle file.

    Each segment becomes one numbered cue: index line, time range line, text,
    then a blank separator line.
    """
    with open(output_path, "w", encoding="utf-8") as srt_file:
        for index, seg in enumerate(segments, 1):
            cue = (
                f"{index}\n"
                f"{format_timestamp(seg['start'])} --> {format_timestamp(seg['end'])}\n"
                f"{seg['text'].strip()}\n\n"
            )
            srt_file.write(cue)
# Emit one SRT cue per transcription segment.
write_srt(result["segments"], "output.srt")
输出示例:
text
1
00:00:00,000 --> 00:00:03,500
第一段文本内容
2
00:00:03,500 --> 00:00:07,200
第二段文本内容
VTT 字幕格式 #
python
def write_vtt(segments, output_path):
    """Write transcription segments to *output_path* as a WebVTT subtitle file.

    The file starts with the mandatory "WEBVTT" header, followed by one cue
    (time range + text + blank line) per segment.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("WEBVTT\n\n")
        for segment in segments:
            start = format_timestamp_vtt(segment["start"])
            end = format_timestamp_vtt(segment["end"])
            text = segment["text"].strip()
            f.write(f"{start} --> {end}\n")
            f.write(f"{text}\n\n")

def format_timestamp_vtt(seconds):
    """Convert a time in seconds to a WebVTT timestamp string (HH:MM:SS.mmm).

    Uses a single rounded millisecond total so the fields stay consistent;
    truncating each field separately (the previous approach) could turn
    1.9996 s into "00:00:01.999" instead of "00:00:02.000".
    """
    total_millis = round(seconds * 1000)
    hours, rem = divmod(total_millis, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    # WebVTT uses a dot as the decimal separator (SRT uses a comma).
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"
时间戳对齐 #
获取精确时间戳 #
python
# word_timestamps=True adds per-word timing under each segment's "words" key.
import whisper
model = whisper.load_model("base")
result = model.transcribe("audio.mp3", word_timestamps=True)
for segment in result["segments"]:
    # Segment-level timing first...
    print(f"[{segment['start']:.2f} - {segment['end']:.2f}] {segment['text']}")
    if "words" in segment:
        # ...then the finer word-level timing nested inside the segment.
        for word in segment["words"]:
            print(f" [{word['start']:.2f} - {word['end']:.2f}] {word['word']}")
词级时间戳 #
python
# Flatten word-level timestamps from every segment into one list of dicts.
result = model.transcribe("audio.mp3", word_timestamps=True)
words_with_timestamps = []
for segment in result["segments"]:
    if "words" in segment:
        for word in segment["words"]:
            words_with_timestamps.append({
                "word": word["word"],
                "start": word["start"],
                "end": word["end"],
                # Fall back to 1.0 when no per-word probability is reported.
                "probability": word.get("probability", 1.0)
            })
for w in words_with_timestamps:
    print(f"{w['word']}: {w['start']:.2f}s - {w['end']:.2f}s")
多语言转录 #
自动语言检测 #
python
# Without a language argument, Whisper detects the spoken language itself.
import whisper
model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
print(f"检测到的语言: {result['language']}")
print(f"转录文本: {result['text']}")
指定语言 #
python
# Pinning the language skips auto-detection entirely.
import whisper
model = whisper.load_model("base")
result = model.transcribe("audio.mp3", language="zh")
print(result["text"])
支持的语言 #
python
# List every language code Whisper's tokenizer supports.
import whisper
print("支持的语言代码:")
for lang_code, lang_name in whisper.tokenizer.LANGUAGES.items():
    print(f" {lang_code}: {lang_name}")
常用语言代码:
| 代码 | 语言 | 代码 | 语言 |
|---|---|---|---|
| zh | 中文 | en | 英语 |
| ja | 日语 | ko | 韩语 |
| fr | 法语 | de | 德语 |
| es | 西班牙语 | ru | 俄语 |
| ar | 阿拉伯语 | pt | 葡萄牙语 |
| it | 意大利语 | hi | 印地语 |
转录选项 #
温度参数 #
python
# temperature=0.0 gives deterministic (greedy) decoding.
import whisper
model = whisper.load_model("base")
result = model.transcribe(
    "audio.mp3",
    temperature=0.0
)
# A tuple of temperatures enables fallback: decoding is retried at the next
# higher temperature when the lower one fails Whisper's quality heuristics.
result_fallback = model.transcribe(
    "audio.mp3",
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)
)
束搜索 #
python
# beam_size: beams kept during beam search (temperature 0);
# best_of: candidates sampled when temperature > 0.
result = model.transcribe(
    "audio.mp3",
    beam_size=5,
    best_of=5
)
初始提示 #
python
# An initial prompt biases the decoder toward the expected domain vocabulary.
result = model.transcribe(
    "audio.mp3",
    initial_prompt="这是一段关于机器学习的技术演讲。"
)
条件文本 #
python
# Feed previously decoded text as context for the next window: improves
# coherence, but an early error can propagate into later segments.
result = model.transcribe(
    "audio.mp3",
    condition_on_previous_text=True
)
处理长音频 #
自动分段 #
Whisper 自动将长音频分成 30 秒的片段处理:
python
# transcribe() internally splits long audio into 30-second windows.
import whisper
model = whisper.load_model("base")
result = model.transcribe("long_audio.mp3")
print(f"总分段数: {len(result['segments'])}")
for segment in result["segments"]:
    print(f"[{segment['start']:.1f}s - {segment['end']:.1f}s] {segment['text']}")
手动分段处理 #
python
# Manually chunk the waveform and decode each chunk independently.
import whisper
import numpy as np
model = whisper.load_model("base")
audio = whisper.load_audio("long_audio.mp3")
# 30 seconds at Whisper's fixed 16 kHz sample rate — a length in SAMPLES,
# despite the "duration" name.
chunk_duration = 30 * 16000
chunks = [audio[i:i+chunk_duration] for i in range(0, len(audio), chunk_duration)]
full_text = []
for i, chunk in enumerate(chunks):
    chunk = whisper.pad_or_trim(chunk)  # last chunk may be short; pad to 30 s
    mel = whisper.log_mel_spectrogram(chunk).to(model.device)
    options = whisper.DecodingOptions(language="zh")
    result = whisper.decode(model, mel, options)
    full_text.append(result.text)
    print(f"处理片段 {i+1}/{len(chunks)}")
print("\n完整文本:")
print(" ".join(full_text))
提高转录质量 #
音频预处理 #
python
import whisper
import numpy as np
from scipy import signal
def preprocess_audio(audio_path):
    """Load an audio file, fit it to Whisper's 30-second window, and peak-normalize.

    Returns the waveform as produced by whisper.load_audio (a float array),
    scaled so its peak absolute amplitude is 1.0 when the signal is non-silent.
    """
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    peak = np.max(np.abs(audio))
    # Guard against division by zero: a silent clip would otherwise become
    # NaN/inf everywhere. Leave silence unscaled instead.
    if peak > 0:
        audio = audio / peak
    return audio
# Decode a single preprocessed 30-second window directly.
model = whisper.load_model("base")
audio = preprocess_audio("audio.mp3")
mel = whisper.log_mel_spectrogram(audio).to(model.device)
options = whisper.DecodingOptions(language="zh")
result = whisper.decode(model, mel, options)
print(result.text)
使用更大的模型 #
python
# Compare output quality across model sizes on the same audio.
# NOTE(review): this keeps four models resident at once, which is
# memory-hungry; load one at a time if RAM/VRAM is limited.
import whisper
models_comparison = {
    "tiny": whisper.load_model("tiny"),
    "base": whisper.load_model("base"),
    "small": whisper.load_model("small"),
    "medium": whisper.load_model("medium"),
}
for name, model in models_comparison.items():
    result = model.transcribe("audio.mp3", language="zh")
    print(f"{name}: {result['text'][:100]}...")
使用初始提示提高准确性 #
python
# Try several domain-specific prompts and compare the resulting transcripts.
import whisper
model = whisper.load_model("base")
prompts = [
    "这是一段关于人工智能的技术演讲。",
    "演讲者正在讨论机器学习和深度学习。",
    "内容涉及神经网络和自然语言处理。"
]
for prompt in prompts:
    result = model.transcribe(
        "audio.mp3",
        initial_prompt=prompt,
        language="zh"
    )
    print(f"提示: {prompt}")
    print(f"结果: {result['text'][:100]}...\n")
转录质量评估 #
计算WER(词错误率) #
python
def calculate_wer(reference, hypothesis):
    """Compute the word error rate (WER) between two transcripts.

    WER = (substitutions + insertions + deletions) / reference word count,
    where the edit counts come from a word-level Levenshtein distance.
    Words are produced by str.split(), i.e. whitespace-separated tokens.

    Returns 0.0 when both transcripts are empty and inf when the reference
    is empty but the hypothesis is not (previously this raised
    ZeroDivisionError).
    """
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    if not ref_words:
        # Empty reference: WER is undefined; use the conventional limits.
        return 0.0 if not hyp_words else float("inf")
    n, m = len(ref_words), len(hyp_words)
    # d[i][j] = edit distance between the first i reference words and the
    # first j hypothesis words.
    d = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n + 1):
        d[i][0] = i  # delete all i reference words
    for j in range(m + 1):
        d[0][j] = j  # insert all j hypothesis words
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            if ref_words[i-1] == hyp_words[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                d[i][j] = min(
                    d[i-1][j] + 1,    # deletion
                    d[i][j-1] + 1,    # insertion
                    d[i-1][j-1] + 1   # substitution
                )
    return d[n][m] / n
# NOTE(review): str.split() separates on whitespace, so these unsegmented
# Chinese strings are each treated as ONE "word"; tokenize/segment first
# for a meaningful Chinese WER (or compute a character error rate instead).
reference = "这是正确的转录文本"
hypothesis = "这是转录的文本"
wer = calculate_wer(reference, hypothesis)
print(f"词错误率: {wer:.2%}")
置信度分析 #
python
# Inspect the per-segment confidence signals reported by Whisper.
import whisper
model = whisper.load_model("base")
result = model.transcribe("audio.mp3")
for segment in result["segments"]:
    avg_logprob = segment["avg_logprob"]        # mean log-probability of tokens
    no_speech_prob = segment["no_speech_prob"]  # chance the window is non-speech
    # Heuristic: treat (1 - no_speech_prob) as a rough confidence score.
    confidence = 1 - no_speech_prob
    print(f"文本: {segment['text']}")
    print(f"平均对数概率: {avg_logprob:.2f}")
    print(f"无语音概率: {no_speech_prob:.2f}")
    print(f"置信度: {confidence:.2%}\n")
实用工具函数 #
提取关键词时间 #
python
def find_keyword_timestamps(result, keyword):
    """Return every segment whose text contains *keyword* (case-insensitive).

    Each hit is a dict with the segment's "start", "end", and "text".
    """
    needle = keyword.lower()
    return [
        {"start": seg["start"], "end": seg["end"], "text": seg["text"]}
        for seg in result["segments"]
        if needle in seg["text"].lower()
    ]
# Locate every segment mentioning the keyword, with its time range.
result = model.transcribe("audio.mp3")
keyword = "人工智能"
timestamps = find_keyword_timestamps(result, keyword)
for match in timestamps:
    print(f"[{match['start']:.1f}s - {match['end']:.1f}s] {match['text']}")
生成摘要时间轴 #
python
def generate_timeline(result, interval=60):
    """Build a coarse timeline with roughly one entry per *interval* seconds.

    Each entry pairs a segment's start time ("time") with a preview of that
    segment's text ("text", truncated to 100 characters with an ellipsis only
    when actually truncated).

    Fixes over the previous version: entries used to carry the PREVIOUS
    marker's time with the NEW segment's text (mislabeling every entry), the
    very first segment was never included, and "..." was appended even to
    short texts.
    """
    timeline = []
    last_marked = None
    for segment in result["segments"]:
        start = segment["start"]
        if last_marked is None or start >= last_marked + interval:
            text = segment["text"]
            preview = text[:100] + "..." if len(text) > 100 else text
            timeline.append({
                "time": start,
                "text": preview
            })
            last_marked = start
    return timeline
# Print one preview line per sampled interval of the recording.
result = model.transcribe("long_audio.mp3")
timeline = generate_timeline(result, interval=60)
for item in timeline:
    minutes = item["time"] // 60  # segment times are floats, so this is a float
    print(f"[{minutes:.0f}分钟] {item['text']}")
下一步 #
掌握了语音转录功能后,继续学习 语音翻译 了解如何将音频翻译成英文!
最后更新:2026-04-05