语音合成 #
合成流程概述 #
text
┌─────────────────────────────────────────────────────────────┐
│ 语音合成流程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ 文本输入 │ → │ 文本预处理 │ → │ 声学模型 │ → │ 声码器 │ │
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
│ │ │ │ │ │
│ ↓ ↓ ↓ ↓ │
│ 原始文本 音素转换 梅尔频谱 音频波形 │
│ │
└─────────────────────────────────────────────────────────────┘
基础合成 #
Python API 基础用法 #
python
from TTS.api import TTS
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/en/ljspeech/vits").to(device)
# 基础合成
tts.tts_to_file(text="Hello, world!", file_path="output.wav")
# 获取音频数据
wav = tts.tts(text="Hello, world!")
print(f"音频形状: {wav.shape}")
print(f"音频长度: {len(wav) / 22050:.2f} 秒")
CLI 基础用法 #
bash
# 基础合成
tts --text "Hello, world!" --out_path output.wav
# 指定模型
tts --text "Hello, world!" \
--model_name tts_models/en/ljspeech/vits \
--out_path output.wav
文本预处理 #
文本规范化 #
python
from TTS.tts.utils.text import cleaners
# 英文文本规范化
text = "Hello, I'm 25 years old."
cleaned = cleaners.english_cleaners(text)
print(cleaned) # "hello i am twenty five years old"
# 自定义预处理
def custom_preprocess(text):
    """Normalize raw text for synthesis.

    Lowercases the input, collapses runs of whitespace into single
    spaces, and spells out '&' as 'and' so the TTS front end does not
    have to pronounce the symbol.
    """
    lowered = text.lower()
    collapsed = " ".join(lowered.split())
    return collapsed.replace("&", "and")
text = "Hello & Welcome!"
processed = custom_preprocess(text)
print(processed) # "hello and welcome!"
音素转换 #
python
from TTS.tts.utils.text.phonemizers import ESpeak
# 创建音素转换器
phonemizer = ESpeak(language="en-us")
# 转换文本到音素
text = "Hello world"
phonemes = phonemizer.phonemize(text)
print(phonemes) # "həloʊ wɜːld"
# 中文音素转换
phonemizer_zh = ESpeak(language="cmn")
text_zh = "你好世界"
phonemes_zh = phonemizer_zh.phonemize(text_zh)
print(phonemes_zh)
处理特殊文本 #
python
import re
def process_special_text(text):
    """Rewrite text constructs that TTS engines mispronounce.

    Spells out standalone integers via ``num_to_words``, expands a
    small set of common abbreviations, and collapses URLs and e-mail
    addresses into speakable placeholders.
    """
    # Standalone integers -> English words.
    text = re.sub(r'\b(\d+)\b', lambda match: num_to_words(int(match.group())), text)
    # Expand common abbreviations (plain substring replacement, so the
    # trailing period is part of the match and gets consumed).
    abbreviations = {
        "Mr.": "Mister",
        "Mrs.": "Misses",
        "Dr.": "Doctor",
        "etc.": "etcetera",
    }
    for short_form, expanded in abbreviations.items():
        text = text.replace(short_form, expanded)
    # Links and addresses are unreadable aloud; use placeholders.
    text = re.sub(r'https?://\S+', 'URL', text)
    text = re.sub(r'\S+@\S+', 'email', text)
    return text
def num_to_words(n):
    """Spell out an integer as English words.

    Covers 0-999 ("twenty-five", "two hundred five"); negative values
    and anything >= 1000 fall back to the plain decimal string so the
    caller never loses information.

    Fixes two defects of the original version: 0 used to render as an
    empty string (silently dropping the number from the TTS text), and
    the tens table stopped at fifty so 60-99 were left as digits.
    """
    ones = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
    teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty',
            'sixty', 'seventy', 'eighty', 'ninety']
    if n < 0 or n >= 1000:
        return str(n)
    if n < 10:
        return ones[n]
    if n < 20:
        return teens[n - 10]
    if n < 100:
        word = tens[n // 10]
        return word if n % 10 == 0 else word + '-' + ones[n % 10]
    # 100-999: "<ones> hundred [<remainder>]", recursing for the tail.
    word = ones[n // 100] + ' hundred'
    return word if n % 100 == 0 else word + ' ' + num_to_words(n % 100)
# 使用
text = "Dr. Smith is 25 years old. Email: test@example.com"
processed = process_special_text(text)
print(processed)
多说话人合成 #
查看可用说话人 #
python
from TTS.api import TTS
tts = TTS("tts_models/en/vctk/vits")
# 列出所有说话人
print("可用说话人:")
for i, speaker in enumerate(tts.speakers):
print(f" {i}: {speaker}")
选择说话人 #
python
from TTS.api import TTS
tts = TTS("tts_models/en/vctk/vits")
# 方式 1:使用索引
tts.tts_to_file(
text="Hello, I am speaker one",
speaker=0,
file_path="speaker_0.wav"
)
# 方式 2:使用名称
tts.tts_to_file(
text="Hello, I am speaker two",
speaker="p225",
file_path="speaker_p225.wav"
)
# 批量生成所有说话人
for speaker in tts.speakers[:5]:
tts.tts_to_file(
text=f"Hello, this is {speaker}",
speaker=speaker,
file_path=f"output_{speaker}.wav"
)
说话人嵌入 #
python
from TTS.api import TTS
import numpy as np
tts = TTS("tts_models/en/vctk/vits")
# 获取说话人嵌入
speaker_embedding = tts.synthesizer.tts_model.speaker_manager.get_embedding("p225")
# 使用嵌入合成
tts.tts_to_file(
text="Custom speaker embedding",
speaker_embedding=speaker_embedding,
file_path="custom_speaker.wav"
)
# 混合说话人
embedding1 = tts.synthesizer.tts_model.speaker_manager.get_embedding("p225")
embedding2 = tts.synthesizer.tts_model.speaker_manager.get_embedding("p226")
mixed_embedding = (embedding1 + embedding2) / 2
tts.tts_to_file(
text="Mixed speaker voice",
speaker_embedding=mixed_embedding,
file_path="mixed_speaker.wav"
)
多语言合成 #
使用 XTTS 多语言模型 #
python
from TTS.api import TTS
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
# 查看支持的语言
print("支持的语言:")
for lang in tts.languages[:20]:
print(f" {lang}")
# 英文合成
tts.tts_to_file(
text="Hello, this is English.",
language="en",
file_path="english.wav"
)
# 中文合成
tts.tts_to_file(
text="你好,这是中文语音合成。",
language="zh-cn",
file_path="chinese.wav"
)
# 日文合成
tts.tts_to_file(
text="こんにちは、これは日本語の音声合成です。",
language="ja",
file_path="japanese.wav"
)
# 法语合成
tts.tts_to_file(
text="Bonjour, c'est la synthèse vocale française.",
language="fr",
file_path="french.wav"
)
语言检测与自动选择 #
python
from TTS.api import TTS
import langdetect
def detect_and_synthesize(text, tts, output_path):
    """Detect the language of *text* and synthesize it with a matching voice.

    Maps the langdetect code to the TTS language tag (unknown languages
    default to English). If detection or synthesis fails, falls back to
    synthesizing the text as English — best effort, never raises.
    """
    try:
        lang = langdetect.detect(text)
        lang_map = {
            "en": "en",
            "zh": "zh-cn",  # XTTS uses "zh-cn", not bare "zh"
            "ja": "ja",
            "ko": "ko",
            "fr": "fr",
            "de": "de",
            "es": "es",
        }
        tts_lang = lang_map.get(lang, "en")
        tts.tts_to_file(
            text=text,
            language=tts_lang,
            file_path=output_path
        )
        print(f"检测语言: {lang}, 使用: {tts_lang}")
    except Exception:
        # Was a bare `except:` — that also swallowed KeyboardInterrupt and
        # SystemExit. Keep the deliberate best-effort fallback, narrowed.
        tts.tts_to_file(text=text, language="en", file_path=output_path)
# 使用
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
detect_and_synthesize("Hello world", tts, "auto_en.wav")
detect_and_synthesize("你好世界", tts, "auto_zh.wav")
参数调优 #
语速控制 #
python
from TTS.api import TTS
tts = TTS("tts_models/en/ljspeech/glow-tts")
# Glow-TTS 支持语速控制
tts.tts_to_file(
text="This is normal speed.",
file_path="normal.wav"
)
# 注意:语速控制需要模型支持
# FastSpeech2 和 Glow-TTS 支持语速控制
情感控制 #
python
from TTS.api import TTS
# 使用支持情感的模型
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC_GST")
# GST (Global Style Token) 可以控制情感
# 具体参数取决于模型配置
音量调整 #
python
import numpy as np
import soundfile as sf
from TTS.api import TTS
tts = TTS("tts_models/en/ljspeech/vits")
# 获取原始音频
wav = tts.tts(text="Hello world")
# 调整音量
def adjust_volume(audio, factor):
    """Scale audio amplitude by *factor*, clamped to the [-1, 1] range."""
    scaled = audio * factor
    return np.clip(scaled, -1.0, 1.0)
# 增大音量
louder = adjust_volume(wav, 1.5)
# 减小音量
quieter = adjust_volume(wav, 0.5)
# 保存
sf.write("louder.wav", louder, 22050)
sf.write("quieter.wav", quieter, 22050)
高级合成技术 #
流式合成 #
python
from TTS.api import TTS
tts = TTS("tts_models/en/ljspeech/vits")
# 流式生成(适用于长文本)
for chunk in tts.tts_stream(text="This is a long text that will be streamed."):
# chunk 是音频片段
pass
长文本处理 #
python
from TTS.api import TTS
import numpy as np
import soundfile as sf
def synthesize_long_text(tts, text, max_length=200, sample_rate=22050, gap_seconds=0.5):
    """Synthesize long text sentence by sentence and join with short pauses.

    Args:
        tts: any object exposing ``tts(text=...) -> 1-D audio samples``.
        text: the full input text; split on ". " sentence boundaries.
        max_length: maximum characters per synthesized chunk; sentences
            longer than this are further split at word boundaries.
            (The original accepted this parameter but never used it.)
        sample_rate: sample rate used to size the inter-sentence silence
            (was hard-coded to 22050).
        gap_seconds: silence inserted after each chunk.

    Returns:
        A 1-D numpy array with all chunks concatenated, each followed by
        ``gap_seconds`` of silence; empty array for empty input.
    """
    def _split_long(sentence):
        # Break a sentence exceeding max_length at word boundaries.
        chunks, current = [], ""
        for word in sentence.split():
            candidate = (current + " " + word).strip()
            if len(candidate) > max_length and current:
                chunks.append(current)
                current = word
            else:
                current = candidate
        if current:
            chunks.append(current)
        return chunks

    pieces = []
    for sentence in text.split(". "):
        sentence = sentence.strip()
        if sentence:
            pieces.extend(_split_long(sentence))

    if not pieces:
        return np.array([])

    silence = np.zeros(int(sample_rate * gap_seconds))
    parts = []
    for piece in pieces:
        # Re-terminate the chunk, but don't double up an existing "." —
        # the original blindly appended "." and produced "..".
        if not piece.endswith(('.', '!', '?')):
            piece += "."
        parts.append(np.asarray(tts.tts(text=piece)))
        parts.append(silence)
    return np.concatenate(parts)
# 使用
tts = TTS("tts_models/en/ljspeech/vits")
long_text = """
This is the first sentence. This is the second sentence.
And this is the third sentence. Finally, this is the last one.
"""
audio = synthesize_long_text(tts, long_text)
sf.write("long_text.wav", audio, 22050)
批量并行处理 #
python
from TTS.api import TTS
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
def process_text(args):
text, output_path, tts = args
tts.tts_to_file(text=text, file_path=output_path)
return output_path
def batch_synthesize(texts, output_dir, max_workers=4):
    """Synthesize every string in *texts* to ``output_dir/audio_<i>.wav``.

    One model instance is shared across a thread pool of *max_workers*
    workers; returns the generated file paths in input order.
    """
    # NOTE(review): assumes the TTS model tolerates concurrent calls from
    # multiple threads — confirm before raising max_workers.
    engine = TTS("tts_models/en/ljspeech/vits")
    target = Path(output_dir)
    target.mkdir(exist_ok=True)
    jobs = [
        (sentence, str(target / f"audio_{index}.wav"), engine)
        for index, sentence in enumerate(texts)
    ]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(process_text, jobs))
# 使用
texts = [
"First sentence.",
"Second sentence.",
"Third sentence.",
]
results = batch_synthesize(texts, "output")
音频后处理 #
添加背景音乐 #
python
import numpy as np
import soundfile as sf
from TTS.api import TTS
def add_background_music(speech_path, music_path, output_path, music_volume=0.3):
    """Mix a background music track under a speech recording.

    Args:
        speech_path: path of the speech audio file.
        music_path: path of the background music file.
        output_path: where to write the mixed result.
        music_volume: linear gain applied to the music before mixing.

    The music is resampled to the speech sample rate if needed, looped
    to cover the full speech duration, trimmed, mixed, and clipped to
    [-1, 1].
    """
    # NOTE(review): assumes both files have the same channel layout
    # (e.g. both mono) — confirm, since sf.read returns 2-D for stereo.
    speech, speech_sr = sf.read(speech_path)
    music, music_sr = sf.read(music_path)
    # Resample the music to match the speech sample rate.
    if speech_sr != music_sr:
        import librosa
        # librosa >= 0.10 removed the positional form resample(y, sr, sr);
        # the sample rates must be passed as keywords.
        music = librosa.resample(music, orig_sr=music_sr, target_sr=speech_sr)
    # Loop the music if it is shorter than the speech, then trim to length.
    if len(music) < len(speech):
        music = np.tile(music, int(np.ceil(len(speech) / len(music))))
    music = music[:len(speech)]
    mixed = np.clip(speech + music * music_volume, -1.0, 1.0)
    sf.write(output_path, mixed, speech_sr)
# 使用
tts = TTS("tts_models/en/ljspeech/vits")
tts.tts_to_file(text="Welcome to our podcast!", file_path="speech.wav")
add_background_music("speech.wav", "background.mp3", "mixed.wav")
音频标准化 #
python
import numpy as np
import soundfile as sf
def normalize_audio(audio_path, output_path, target_db=-20):
    """Normalize an audio file to a target RMS level (in dBFS).

    Args:
        audio_path: input audio file path.
        output_path: where to write the normalized audio.
        target_db: desired RMS level in dB relative to full scale.

    Computes the RMS of the signal, derives the gain needed to hit
    *target_db*, applies it, and clips to [-1, 1].
    """
    audio, sample_rate = sf.read(audio_path)
    rms = np.sqrt(np.mean(audio ** 2))
    if rms == 0:
        # Silent input: log10(0) would yield -inf and a NaN/inf gain.
        # Nothing to scale — write the audio through unchanged.
        sf.write(output_path, audio, sample_rate)
        return
    current_db = 20 * np.log10(rms)
    gain = 10 ** ((target_db - current_db) / 20)
    normalized = np.clip(audio * gain, -1.0, 1.0)
    sf.write(output_path, normalized, sample_rate)
# 使用
normalize_audio("output.wav", "normalized.wav")
实用示例 #
示例 1:文本朗读器 #
python
from TTS.api import TTS
from pathlib import Path
class TextReader:
def __init__(self, model_name="tts_models/en/ljspeech/vits"):
self.tts = TTS(model_name)
def read_file(self, file_path, output_dir="output"):
output_dir = Path(output_dir)
output_dir.mkdir(exist_ok=True)
with open(file_path, "r", encoding="utf-8") as f:
lines = f.readlines()
for i, line in enumerate(lines):
text = line.strip()
if text:
output_path = output_dir / f"line_{i:04d}.wav"
self.tts.tts_to_file(text=text, file_path=str(output_path))
print(f"已处理: {output_path}")
# 使用
reader = TextReader()
reader.read_file("document.txt")
示例 2:多语言新闻播报 #
python
from TTS.api import TTS
import torch
class NewsBroadcaster:
def __init__(self):
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
def broadcast(self, news_items):
for i, item in enumerate(news_items):
text = item["text"]
lang = item["language"]
output = f"news_{i}_{lang}.wav"
self.tts.tts_to_file(
text=text,
language=lang,
file_path=output
)
print(f"已生成: {output}")
# 使用
broadcaster = NewsBroadcaster()
news = [
{"text": "Good morning, here is today's news.", "language": "en"},
{"text": "早上好,这是今天的新闻。", "language": "zh-cn"},
{"text": "おはようございます、今日のニュースです。", "language": "ja"},
]
broadcaster.broadcast(news)
下一步 #
掌握了语音合成后,继续学习 声音克隆,了解如何使用参考音频克隆任意声音!
最后更新:2026-04-05