快速开始 #

第一个语音合成 #

使用命令行（CLI） #

最简单的方式是使用命令行工具：

bash

# Basic usage: synthesize one sentence with the default model
tts \
    --out_path output.wav \
    --text "Hello, this is my first speech synthesis."

# Select a specific model by its full name
tts --model_name tts_models/en/ljspeech/vits \
    --text "Hello world" \
    --out_path output.wav

# Print every model that can be downloaded
tts --list_models

使用 Python API #

python

import torch
from TTS.api import TTS

MODEL_NAME = "tts_models/en/ljspeech/vits"

# Pick the device automatically: CUDA when usable, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the model, then move it to the chosen device
tts = TTS(MODEL_NAME)
tts = tts.to(device)

# Synthesize the sentence straight into a WAV file
tts.tts_to_file(file_path="output.wav", text="Hello, this is a test.")

CLI 命令详解 #

基本命令格式 #

bash

tts --text <文本> --model_name <模型名> --out_path <输出路径>

常用参数 #

text

┌─────────────────────────────────────────────────────────────┐
│                     CLI 参数说明                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  --text              要合成的文本                            │
│  --model_name        模型名称（格式：tts_models/语言/数据集/模型）│
│  --out_path          输出文件路径                            │
│  --speaker_idx       说话人 ID（多说话人模型，如 p225）       │
│  --speaker_wav       参考音频（声音克隆）                     │
│  --language_idx      语言索引（多语言模型）                   │
│  --list_models       列出所有可用模型                        │
│  --model_info_by_name 按名称显示模型详细信息                 │
│  --use_cuda          启用 CUDA（GPU）推理                   │
│                                                             │
└─────────────────────────────────────────────────────────────┘

CLI 示例 #

bash

# Use the default model
tts --text "Hello world" --out_path hello.wav

# Use the VITS model
tts --text "Hello world" \
    --model_name tts_models/en/ljspeech/vits \
    --out_path hello_vits.wav

# Use the Tacotron2 model
tts --text "Hello world" \
    --model_name tts_models/en/ljspeech/tacotron2-DDC \
    --out_path hello_tacotron.wav

# Multi-speaker model.
# --speaker_idx takes a speaker ID (a name such as p225), not a numeric index.
# List the valid IDs with:
#   tts --model_name tts_models/en/vctk/vits --list_speaker_idxs
tts --text "Hello world" \
    --model_name tts_models/en/vctk/vits \
    --speaker_idx p225 \
    --out_path speaker_0.wav

# Voice cloning from a reference recording
tts --text "This is cloned voice" \
    --model_name tts_models/multilingual/multi-dataset/xtts_v2 \
    --speaker_wav reference.wav \
    --language_idx en \
    --out_path cloned.wav

Python API 详解 #

基础用法 #

python

from TTS.api import TTS
import torch

# Initialise: choose the device and load the model once
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/en/ljspeech/vits").to(device)

# Option 1: write the result to a file
tts.tts_to_file(text="Hello world", file_path="output.wav")

# Option 2: get the audio samples back in memory
wav = tts.tts(text="Hello world")

# Option 3: streaming.
# The high-level TTS class has no `tts_stream` method, so calling it raises
# AttributeError. Chunked generation is only offered by the XTTS model class
# itself (`Xtts.inference_stream`), which must be loaded from its config and
# checkpoint rather than through TTS.api.TTS.

多说话人模型 #

python

from TTS.api import TTS

tts = TTS("tts_models/en/vctk/vits")

# Show every speaker the model knows
available_speakers = tts.speakers
print(available_speakers)

# Synthesize one sentence per chosen speaker (position in the speaker list)
demo_lines = (
    (0, "Hello, I am speaker one"),
    (5, "Hello, I am speaker two"),
)
for position, sentence in demo_lines:
    tts.tts_to_file(
        text=sentence,
        speaker=tts.speakers[position],
        file_path=f"speaker_{position}.wav",
    )

多语言模型 #

python

from TTS.api import TTS

tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

# List the supported languages
print(tts.languages)

# XTTS v2 is a multi-speaker model: every call needs either a built-in
# `speaker` name or a `speaker_wav` reference recording, otherwise the API
# raises ValueError. "Ana Florence" is one of the voices bundled with XTTS v2.
speaker = "Ana Florence"

# Chinese synthesis
tts.tts_to_file(
    text="你好，这是中文语音合成测试。",
    speaker=speaker,
    language="zh-cn",
    file_path="chinese.wav"
)

# English synthesis
tts.tts_to_file(
    text="Hello, this is English speech synthesis.",
    speaker=speaker,
    language="en",
    file_path="english.wav"
)

# Japanese synthesis
tts.tts_to_file(
    text="こんにちは、これは日本語の音声合成テストです。",
    speaker=speaker,
    language="ja",
    file_path="japanese.wav"
)

声音克隆 #

python

import torch
from TTS.api import TTS

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
tts = TTS(MODEL_NAME).to(device)

# Clone the voice heard in the reference recording
clone_request = {
    "text": "这是克隆的声音，听起来和原声音很像。",
    "speaker_wav": "reference_audio.wav",
    "language": "zh-cn",
    "file_path": "cloned_voice.wav",
}
tts.tts_to_file(**clone_request)

模型选择指南 #

按语言选择 #

python

# English models
english_models = [
    "tts_models/en/ljspeech/vits",           # high quality, recommended
    "tts_models/en/ljspeech/tacotron2-DDC",  # classic model
    "tts_models/en/vctk/vits",               # multi-speaker
    "tts_models/en/jenny/jenny",             # female voice
]

# Chinese models
chinese_models = [
    # The released model ID uses hyphens throughout: tacotron2-DDC-GST
    "tts_models/zh-CN/baker/tacotron2-DDC-GST",
    "tts_models/multilingual/multi-dataset/xtts_v2",  # multilingual
]

# Multilingual models
multilingual_models = [
    "tts_models/multilingual/multi-dataset/xtts_v2",
    "tts_models/multilingual/multi-dataset/your_tts",
]

按用途选择 #

text

┌─────────────────────────────────────────────────────────────┐
│                     模型选择建议                             │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  快速原型开发：                                              │
│  └── tts_models/en/ljspeech/vits                           │
│                                                             │
│  高质量合成：                                                │
│  └── tts_models/en/ljspeech/vits                           │
│                                                             │
│  多说话人：                                                  │
│  └── tts_models/en/vctk/vits                               │
│                                                             │
│  声音克隆：                                                  │
│  └── tts_models/multilingual/multi-dataset/xtts_v2         │
│                                                             │
│  多语言支持：                                                │
│  └── tts_models/multilingual/multi-dataset/xtts_v2         │
│                                                             │
│  实时应用：                                                  │
│  └── tts_models/en/ljspeech/fast_pitch                     │
│                                                             │
└─────────────────────────────────────────────────────────────┘

音频输出配置 #

采样率设置 #

python

import librosa
import numpy as np
import soundfile as sf
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")

# Sample rate the model produces natively
print(f"默认采样率: {tts.synthesizer.output_sample_rate}")

# Custom sample rate (requires resampling)
target_sr = 44100

# tts.tts() hands back a plain Python list of samples, while
# librosa.resample only accepts a floating-point ndarray, so convert first.
wav = np.asarray(tts.tts(text="Hello world"), dtype=np.float32)
wav_resampled = librosa.resample(
    wav,
    orig_sr=tts.synthesizer.output_sample_rate,
    target_sr=target_sr
)
sf.write("output_44k.wav", wav_resampled, target_sr)

音频格式 #

python

import soundfile as sf
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")
wav = tts.tts(text="Hello world")

# Write with the model's own sample rate instead of a hard-coded 22050, so
# the files still play at the right speed if a different model is loaded.
sample_rate = tts.synthesizer.output_sample_rate

# WAV (default)
sf.write("output.wav", wav, sample_rate)

# FLAC (lossless compression)
sf.write("output.flac", wav, sample_rate)

# OGG (lossy compression)
sf.write("output.ogg", wav, sample_rate)

批量处理 #

批量文本合成 #

python

from pathlib import Path
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")

texts = [
    "First sentence to synthesize.",
    "Second sentence to synthesize.",
    "Third sentence to synthesize.",
]

# Make sure the destination folder exists before writing into it
output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# One numbered WAV file per sentence, in list order
targets = [output_dir / f"audio_{i}.wav" for i in range(len(texts))]
for sentence, output_path in zip(texts, targets):
    tts.tts_to_file(text=sentence, file_path=str(output_path))
    print(f"已生成: {output_path}")

从文件读取 #

python

from pathlib import Path
from TTS.api import TTS

tts = TTS("tts_models/en/ljspeech/vits")

# Read the text file and trim surrounding whitespace from every line
with open("input.txt", "r", encoding="utf-8") as f:
    stripped_lines = [raw.strip() for raw in f]

output_dir = Path("output")
output_dir.mkdir(exist_ok=True)

# File names keep the original line index, so blank lines leave gaps
for i, text in enumerate(stripped_lines):
    if not text:
        continue
    output_path = output_dir / f"line_{i}.wav"
    tts.tts_to_file(text=text, file_path=str(output_path))

性能优化 #

GPU 加速 #

python

import time

import torch
from TTS.api import TTS

MODEL_NAME = "tts_models/en/ljspeech/vits"

text = "This is a performance test sentence."


def timed_synthesis(model, file_path):
    """Return the wall-clock seconds one `tts_to_file` call takes for `text`."""
    start = time.perf_counter()
    model.tts_to_file(text=text, file_path=file_path)
    return time.perf_counter() - start


# GPU inference: only measured when CUDA is really available, otherwise the
# "GPU" figure would silently be a second CPU measurement.
gpu_time = None
if torch.cuda.is_available():
    tts = TTS(MODEL_NAME).to("cuda")
    # Untimed warm-up run so one-time initialisation does not skew the result
    timed_synthesis(tts, "gpu.wav")
    gpu_time = timed_synthesis(tts, "gpu.wav")
    print(f"GPU 时间: {gpu_time:.2f}s")
else:
    print("未检测到 CUDA，跳过 GPU 测试")

# CPU inference
tts_cpu = TTS(MODEL_NAME).to("cpu")
timed_synthesis(tts_cpu, "cpu.wav")  # warm-up, same treatment as the GPU run
cpu_time = timed_synthesis(tts_cpu, "cpu.wav")
print(f"CPU 时间: {cpu_time:.2f}s")

# The speed-up only makes sense when both measurements exist
if gpu_time is not None:
    print(f"加速比: {cpu_time/gpu_time:.2f}x")

批量推理 #

python

from TTS.api import TTS
import torch

# Use CUDA when it is available and fall back to the CPU otherwise, so the
# example also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/en/ljspeech/vits").to(device)

# Sentences to synthesize in one session
texts = ["Sentence one.", "Sentence two.", "Sentence three."]

# Option 1: process the sentences one after another
for i, text in enumerate(texts):
    tts.tts_to_file(text=text, file_path=f"output_{i}.wav")

# Option 2: keep the model loaded
# The model is loaded only once above, so every later call reuses it.

实用示例 #

示例 1：简单文本朗读器 #

python

from TTS.api import TTS
import sys

def text_to_speech(text, output_path="output.wav"):
    """Synthesize `text` with the LJSpeech VITS model and save it as a file.

    Args:
        text: Sentence(s) to speak.
        output_path: Destination audio file; defaults to "output.wav".

    The model is constructed on every call, and a confirmation line with the
    destination path is printed once the file has been written.
    """
    model_name = "tts_models/en/ljspeech/vits"
    TTS(model_name).tts_to_file(file_path=output_path, text=text)
    print(f"音频已保存到: {output_path}")

if __name__ == "__main__":
    # Words given on the command line are joined into one sentence;
    # with no arguments the text is requested interactively instead.
    cli_words = sys.argv[1:]
    text = " ".join(cli_words) if cli_words else input("请输入要合成的文本: ")

    text_to_speech(text)

示例 2：多语言朗读器 #

python

from TTS.api import TTS
import torch

class MultilingualTTS:
    """Small wrapper around the multilingual XTTS v2 model.

    The model is loaded once in the constructor and placed on CUDA when it is
    available, otherwise on the CPU.
    """

    # Voice bundled with XTTS v2, used when the caller does not pick one.
    DEFAULT_SPEAKER = "Ana Florence"

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)

    def synthesize(self, text, language="en", output_path="output.wav",
                   speaker=DEFAULT_SPEAKER, speaker_wav=None):
        """Synthesize `text` in `language` and write it to `output_path`.

        Args:
            text: Text to speak.
            language: Language code understood by the model (e.g. "en", "zh-cn").
            output_path: Destination audio file.
            speaker: Name of a built-in voice; ignored when `speaker_wav` is set.
            speaker_wav: Optional reference recording to clone the voice from.

        Returns:
            `output_path`, unchanged.
        """
        # XTTS v2 is multi-speaker: a call without any voice is rejected by
        # the API, so exactly one voice source is always passed along.
        if speaker_wav is not None:
            voice = {"speaker_wav": speaker_wav}
        else:
            voice = {"speaker": speaker}

        self.tts.tts_to_file(
            text=text,
            language=language,
            file_path=output_path,
            **voice
        )
        return output_path

# Usage: one model instance, one output file per language
tts = MultilingualTTS()
samples = (
    ("Hello world", "en", "english.wav"),
    ("你好世界", "zh-cn", "chinese.wav"),
    ("こんにちは世界", "ja", "japanese.wav"),
)
for sentence, language_code, wav_name in samples:
    tts.synthesize(sentence, language_code, wav_name)

示例 3：声音克隆工具 #

python

from TTS.api import TTS
import torch
from pathlib import Path

class VoiceCloner:
    """Clone a voice from a reference recording using the XTTS v2 model."""

    def __init__(self):
        # Run on CUDA when it is usable, otherwise on the CPU
        if torch.cuda.is_available():
            self.device = "cuda"
        else:
            self.device = "cpu"
        model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        self.tts = model.to(self.device)

    def clone(self, reference_audio, text, output_path, language="en"):
        """Speak `text` in the voice of `reference_audio` and save the result.

        Args:
            reference_audio: Recording passed to the model as `speaker_wav`.
            text: Text to synthesize.
            output_path: Destination audio file.
            language: Language code for the synthesis; defaults to "en".
        """
        request = {
            "text": text,
            "speaker_wav": reference_audio,
            "language": language,
            "file_path": output_path,
        }
        self.tts.tts_to_file(**request)
        print(f"克隆语音已保存: {output_path}")

# Usage: clone the voice in my_voice.wav and speak an English sentence with it
cloner = VoiceCloner()
clone_job = {
    "reference_audio": "my_voice.wav",
    "text": "This is my cloned voice speaking.",
    "output_path": "cloned.wav",
    "language": "en",
}
cloner.clone(**clone_job)

下一步 #

现在你已经掌握了 Coqui TTS 的基本用法，接下来学习预训练模型，深入了解各种模型的特点和使用方法！