语音合成 #

合成流程概述 #

text
┌─────────────────────────────────────────────────────────────┐
│                     语音合成流程                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  ┌──────────┐   ┌──────────┐   ┌──────────┐   ┌──────────┐ │
│  │ 文本输入  │ → │ 文本预处理 │ → │ 声学模型  │ → │ 声码器    │ │
│  └──────────┘   └──────────┘   └──────────┘   └──────────┘ │
│       │              │              │              │       │
│       ↓              ↓              ↓              ↓       │
│  原始文本       音素转换        梅尔频谱       音频波形     │
│                                                             │
└─────────────────────────────────────────────────────────────┘

基础合成 #

Python API 基础用法 #

python
from TTS.api import TTS
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/en/ljspeech/vits").to(device)

# 基础合成
tts.tts_to_file(text="Hello, world!", file_path="output.wav")

# 获取音频数据
wav = tts.tts(text="Hello, world!")
print(f"音频形状: {wav.shape}")
print(f"音频长度: {len(wav) / 22050:.2f} 秒")

CLI 基础用法 #

bash
# 基础合成
tts --text "Hello, world!" --out_path output.wav

# 指定模型
tts --text "Hello, world!" \
    --model_name tts_models/en/ljspeech/vits \
    --out_path output.wav

文本预处理 #

文本规范化 #

python
from TTS.tts.utils.text import cleaners

# 英文文本规范化
text = "Hello, I'm 25 years old."
cleaned = cleaners.english_cleaners(text)
print(cleaned)  # "hello i am twenty five years old"

# 自定义预处理
def custom_preprocess(text):
    """Lowercase *text*, collapse whitespace, and spell out ampersands."""
    normalized = text.lower()
    # Collapse any run of whitespace into a single space.
    normalized = " ".join(normalized.split())
    # Spell out the ampersand so the TTS model reads it naturally.
    return normalized.replace("&", "and")

text = "Hello & Welcome!"
processed = custom_preprocess(text)
print(processed)  # "hello and welcome!"

音素转换 #

python
from TTS.tts.utils.text.phonemizers import ESpeak

# 创建音素转换器
phonemizer = ESpeak(language="en-us")

# 转换文本到音素
text = "Hello world"
phonemes = phonemizer.phonemize(text)
print(phonemes)  # "həloʊ wɜːld"

# 中文音素转换
phonemizer_zh = ESpeak(language="cmn")
text_zh = "你好世界"
phonemes_zh = phonemizer_zh.phonemize(text_zh)
print(phonemes_zh)

处理特殊文本 #

python
import re

def process_special_text(text):
    """Normalize special tokens in *text* for TTS input.

    Replaces URLs and e-mail addresses with placeholder words, spells out
    standalone integers via ``num_to_words``, and expands common
    abbreviations.

    Note: URLs and e-mails are replaced *before* number expansion so that
    digits embedded in them (e.g. ``example.com/42`` or ``user25@x.com``)
    are not spelled out, which would break the URL/e-mail patterns.
    """
    # Replace URLs first so digits in paths are left untouched below.
    text = re.sub(r'https?://\S+', 'URL', text)

    # Replace e-mail addresses (rough match: any token containing '@').
    text = re.sub(r'\S+@\S+', 'email', text)

    # Spell out standalone integers.
    text = re.sub(r'\b(\d+)\b', lambda m: num_to_words(int(m.group())), text)

    # Expand common abbreviations.
    abbreviations = {
        "Mr.": "Mister",
        "Mrs.": "Misses",
        "Dr.": "Doctor",
        "etc.": "etcetera",
    }
    for abbr, full in abbreviations.items():
        text = text.replace(abbr, full)

    return text

def num_to_words(n):
    """Spell out integer *n* in English words.

    Handles 0 through 99; any other value falls back to ``str(n)`` so the
    digits are at least preserved in the output text.  (The original
    version stopped at 59 and mapped 0 to the empty string, silently
    deleting zeros from the text.)
    """
    ones = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
            'eight', 'nine']
    teens = ['ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
            'seventy', 'eighty', 'ninety']

    if 0 <= n < 10:
        return ones[n]
    if 10 <= n < 20:
        return teens[n - 10]
    if 20 <= n < 100:
        word = tens[n // 10]
        if n % 10:
            word += '-' + ones[n % 10]
        return word
    # Out of supported range (negative or >= 100): leave as digits.
    return str(n)

# 使用
text = "Dr. Smith is 25 years old. Email: test@example.com"
processed = process_special_text(text)
print(processed)

多说话人合成 #

查看可用说话人 #

python
from TTS.api import TTS

tts = TTS("tts_models/en/vctk/vits")

# 列出所有说话人
print("可用说话人:")
for i, speaker in enumerate(tts.speakers):
    print(f"  {i}: {speaker}")

选择说话人 #

python
from TTS.api import TTS

tts = TTS("tts_models/en/vctk/vits")

# 方式 1:使用索引
tts.tts_to_file(
    text="Hello, I am speaker one",
    speaker=0,
    file_path="speaker_0.wav"
)

# 方式 2:使用名称
tts.tts_to_file(
    text="Hello, I am speaker two",
    speaker="p225",
    file_path="speaker_p225.wav"
)

# 批量生成所有说话人
for speaker in tts.speakers[:5]:
    tts.tts_to_file(
        text=f"Hello, this is {speaker}",
        speaker=speaker,
        file_path=f"output_{speaker}.wav"
    )

说话人嵌入 #

python
from TTS.api import TTS
import numpy as np

tts = TTS("tts_models/en/vctk/vits")

# 获取说话人嵌入
speaker_embedding = tts.synthesizer.tts_model.speaker_manager.get_embedding("p225")

# 使用嵌入合成
tts.tts_to_file(
    text="Custom speaker embedding",
    speaker_embedding=speaker_embedding,
    file_path="custom_speaker.wav"
)

# 混合说话人
embedding1 = tts.synthesizer.tts_model.speaker_manager.get_embedding("p225")
embedding2 = tts.synthesizer.tts_model.speaker_manager.get_embedding("p226")
mixed_embedding = (embedding1 + embedding2) / 2

tts.tts_to_file(
    text="Mixed speaker voice",
    speaker_embedding=mixed_embedding,
    file_path="mixed_speaker.wav"
)

多语言合成 #

使用 XTTS 多语言模型 #

python
from TTS.api import TTS
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# 查看支持的语言
print("支持的语言:")
for lang in tts.languages[:20]:
    print(f"  {lang}")

# 英文合成
tts.tts_to_file(
    text="Hello, this is English.",
    language="en",
    file_path="english.wav"
)

# 中文合成
tts.tts_to_file(
    text="你好,这是中文语音合成。",
    language="zh-cn",
    file_path="chinese.wav"
)

# 日文合成
tts.tts_to_file(
    text="こんにちは、これは日本語の音声合成です。",
    language="ja",
    file_path="japanese.wav"
)

# 法语合成
tts.tts_to_file(
    text="Bonjour, c'est la synthèse vocale française.",
    language="fr",
    file_path="french.wav"
)

语言检测与自动选择 #

python
from TTS.api import TTS
import langdetect

def detect_and_synthesize(text, tts, output_path):
    """Detect the language of *text* and synthesize it with *tts*.

    Falls back to English when language detection fails or the detected
    language is not in the supported map.

    Args:
        text: input text in any language.
        tts: a multilingual TTS instance (e.g. XTTS v2).
        output_path: path of the WAV file to write.
    """
    # Maps langdetect codes to the codes the XTTS model expects.
    lang_map = {
        "en": "en",
        "zh": "zh-cn",
        "ja": "ja",
        "ko": "ko",
        "fr": "fr",
        "de": "de",
        "es": "es",
    }
    try:
        lang = langdetect.detect(text)
        tts_lang = lang_map.get(lang, "en")

        tts.tts_to_file(
            text=text,
            language=tts_lang,
            file_path=output_path
        )
        print(f"检测语言: {lang}, 使用: {tts_lang}")
    except Exception:
        # Was a bare `except:` which also swallowed KeyboardInterrupt /
        # SystemExit.  Detection (or synthesis) failed; retry in English
        # rather than crashing.
        tts.tts_to_file(text=text, language="en", file_path=output_path)

# 使用
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
detect_and_synthesize("Hello world", tts, "auto_en.wav")
detect_and_synthesize("你好世界", tts, "auto_zh.wav")

参数调优 #

语速控制 #

python
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/glow-tts")

# Glow-TTS 支持语速控制
tts.tts_to_file(
    text="This is normal speed.",
    file_path="normal.wav"
)

# 注意:语速控制需要模型支持
# FastSpeech2 和 Glow-TTS 支持语速控制

情感控制 #

python
from TTS.api import TTS

# 使用支持情感的模型
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC_GST")

# GST (Global Style Token) 可以控制情感
# 具体参数取决于模型配置

音量调整 #

python
import numpy as np
import soundfile as sf
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")

# 获取原始音频
wav = tts.tts(text="Hello world")

# 调整音量
def adjust_volume(audio, factor):
    """Scale *audio* by *factor* and clip the result to [-1.0, 1.0]."""
    scaled = audio * factor
    return np.clip(scaled, -1.0, 1.0)

# 增大音量
louder = adjust_volume(wav, 1.5)

# 减小音量
quieter = adjust_volume(wav, 0.5)

# 保存
sf.write("louder.wav", louder, 22050)
sf.write("quieter.wav", quieter, 22050)

高级合成技术 #

流式合成 #

python
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")

# 流式生成(适用于长文本)
for chunk in tts.tts_stream(text="This is a long text that will be streamed."):
    # chunk 是音频片段
    pass

长文本处理 #

python
from TTS.api import TTS
import numpy as np
import soundfile as sf

def synthesize_long_text(tts, text, max_length=200):
    """Synthesize long *text* in chunks and join the resulting audio.

    Sentences are grouped into chunks of at most *max_length* characters
    (the parameter was previously accepted but never used) so each
    synthesis call stays short.  Chunks are separated by half a second of
    silence; no trailing silence is appended after the last chunk.

    Args:
        tts: a TTS instance exposing ``tts(text=...) -> waveform``.
        text: the (possibly long) input text; sentences separated by ". ".
        max_length: maximum number of characters per synthesis call.

    Returns:
        np.ndarray: the concatenated waveform (22050 Hz sample rate
        assumed for the silence gaps — confirm against the model config).
    """
    # Split on sentence boundaries and drop empty fragments.
    sentences = [s.strip() for s in text.split(". ") if s.strip()]

    # Group sentences into chunks no longer than max_length characters.
    chunks, current = [], ""
    for sentence in sentences:
        # Re-add the period only when it was consumed by the split,
        # avoiding the ".." the original produced on the last sentence.
        if not sentence.endswith("."):
            sentence += "."
        candidate = f"{current} {sentence}".strip()
        if current and len(candidate) > max_length:
            chunks.append(current)
            current = sentence
        else:
            current = candidate
    if current:
        chunks.append(current)

    # Synthesize each chunk; insert 0.5 s of silence between chunks only.
    silence = np.zeros(int(22050 * 0.5))
    pieces = []
    for i, chunk in enumerate(chunks):
        if i:
            pieces.append(silence)
        pieces.append(np.asarray(tts.tts(text=chunk)))

    if not pieces:
        return np.array([])
    return np.concatenate(pieces)

# 使用
tts = TTS("tts_models/en/ljspeech/vits")
long_text = """
This is the first sentence. This is the second sentence. 
And this is the third sentence. Finally, this is the last one.
"""

audio = synthesize_long_text(tts, long_text)
sf.write("long_text.wav", audio, 22050)

批量并行处理 #

python
from TTS.api import TTS
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

def process_text(args):
    """Synthesize a single work item.

    Args:
        args: a ``(text, output_path, tts)`` tuple.

    Returns:
        The output path, echoed back for progress tracking by the caller.
    """
    text, output_path, engine = args
    engine.tts_to_file(text=text, file_path=output_path)
    return output_path

def batch_synthesize(texts, output_dir, max_workers=4):
    """Synthesize *texts* in parallel threads, one WAV file per entry.

    Args:
        texts: iterable of strings to synthesize.
        output_dir: directory for the generated ``audio_<i>.wav`` files;
            created (including parents) if it does not exist.
        max_workers: thread-pool size.

    Returns:
        list[str]: the output paths, in input order.
    """
    # NOTE(review): a single model instance is shared across worker
    # threads; this assumes the model is safe for concurrent inference —
    # confirm for the chosen backend before raising max_workers.
    tts = TTS("tts_models/en/ljspeech/vits")

    output_dir = Path(output_dir)
    # parents=True so nested output directories do not raise FileNotFoundError.
    output_dir.mkdir(parents=True, exist_ok=True)

    args_list = [
        (text, str(output_dir / f"audio_{i}.wav"), tts)
        for i, text in enumerate(texts)
    ]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(process_text, args_list))

    return results

# 使用
texts = [
    "First sentence.",
    "Second sentence.",
    "Third sentence.",
]
results = batch_synthesize(texts, "output")

音频后处理 #

添加背景音乐 #

python
import numpy as np
import soundfile as sf
from TTS.api import TTS

def add_background_music(speech_path, music_path, output_path, music_volume=0.3):
    """Mix a background-music track under a speech recording.

    Args:
        speech_path: path to the speech audio file.
        music_path: path to the background-music audio file.
        output_path: path for the mixed result.
        music_volume: gain applied to the music before mixing.
    """
    speech, speech_sr = sf.read(speech_path)
    music, music_sr = sf.read(music_path)

    # Resample the music to the speech sample rate if they differ.
    if speech_sr != music_sr:
        import librosa
        # librosa >= 0.10 requires the sample rates as keyword arguments;
        # the old positional form raises TypeError.
        music = librosa.resample(music, orig_sr=music_sr, target_sr=speech_sr)

    # Loop the music if it is shorter than the speech, then trim to length.
    if len(music) < len(speech):
        music = np.tile(music, int(np.ceil(len(speech) / len(music))))
    music = music[:len(speech)]

    # Mix and clip to the valid [-1, 1] range.
    mixed = np.clip(speech + music * music_volume, -1.0, 1.0)

    sf.write(output_path, mixed, speech_sr)

# 使用
tts = TTS("tts_models/en/ljspeech/vits")
tts.tts_to_file(text="Welcome to our podcast!", file_path="speech.wav")
add_background_music("speech.wav", "background.mp3", "mixed.wav")

音频标准化 #

python
import numpy as np
import soundfile as sf

def normalize_audio(audio_path, output_path, target_db=-20):
    """Normalize an audio file to a target RMS level in dB full scale.

    Args:
        audio_path: input audio file.
        output_path: where the normalized audio is written.
        target_db: target RMS level in dBFS (default -20).
    """
    audio, sr = sf.read(audio_path)

    # RMS of the signal; zero means digital silence.
    rms = np.sqrt(np.mean(audio ** 2))
    if rms == 0:
        # Silent input: log10(0) would yield -inf and an infinite gain,
        # producing NaNs.  Write the audio through unchanged instead.
        sf.write(output_path, audio, sr)
        return

    current_db = 20 * np.log10(rms)

    # Gain needed to reach the target level.
    gain = 10 ** ((target_db - current_db) / 20)

    # Apply gain and clip to the valid [-1, 1] range.
    normalized = np.clip(audio * gain, -1.0, 1.0)

    sf.write(output_path, normalized, sr)

# 使用
normalize_audio("output.wav", "normalized.wav")

实用示例 #

示例 1:文本朗读器 #

python
from TTS.api import TTS
from pathlib import Path

class TextReader:
    """Read a text file aloud, producing one WAV file per non-empty line."""

    def __init__(self, model_name="tts_models/en/ljspeech/vits"):
        self.tts = TTS(model_name)

    def read_file(self, file_path, output_dir="output"):
        """Synthesize every non-empty line of *file_path* into *output_dir*.

        Output files are named ``line_<NNNN>.wav`` by source line index.
        The output directory (and any missing parents) is created.
        """
        output_dir = Path(output_dir)
        # parents=True so a nested output path does not raise FileNotFoundError.
        output_dir.mkdir(parents=True, exist_ok=True)

        with open(file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        for i, line in enumerate(lines):
            text = line.strip()
            if text:
                output_path = output_dir / f"line_{i:04d}.wav"
                self.tts.tts_to_file(text=text, file_path=str(output_path))
                print(f"已处理: {output_path}")

# 使用
reader = TextReader()
reader.read_file("document.txt")

示例 2:多语言新闻播报 #

python
from TTS.api import TTS
import torch

class NewsBroadcaster:
    """Generate one audio file per news item with the multilingual XTTS model."""

    def __init__(self):
        # Run on GPU when one is available.
        use_cuda = torch.cuda.is_available()
        self.device = "cuda" if use_cuda else "cpu"
        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)

    def broadcast(self, news_items):
        """Synthesize each item (``{"text": ..., "language": ...}``) to a WAV file."""
        for i, item in enumerate(news_items):
            lang = item["language"]
            output = f"news_{i}_{lang}.wav"
            self.tts.tts_to_file(
                text=item["text"],
                language=lang,
                file_path=output
            )
            print(f"已生成: {output}")

# 使用
broadcaster = NewsBroadcaster()
news = [
    {"text": "Good morning, here is today's news.", "language": "en"},
    {"text": "早上好,这是今天的新闻。", "language": "zh-cn"},
    {"text": "おはようございます、今日のニュースです。", "language": "ja"},
]
broadcaster.broadcast(news)

下一步 #

掌握了语音合成后,继续学习 声音克隆,了解如何使用参考音频克隆任意声音!

最后更新:2026-04-05