实战案例 #
案例一:有声读物制作系统 #
项目概述 #
创建一个将电子书转换为有声读物的完整系统。
text
┌─────────────────────────────────────────────────────────────┐
│ 有声读物制作流程 │
├─────────────────────────────────────────────────────────────┤
│ │
│ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │
│ │ 电子书 │ → │ 文本提取 │ → │ 语音合成 │ → │ 音频合并 │ │
│ │ 输入 │ │ 分段处理 │ │ 批量生成 │ │ 输出文件 │ │
│ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │
│ │
└─────────────────────────────────────────────────────────────┘
完整实现 #
python
from TTS.api import TTS
import torch
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import soundfile as sf
import numpy as np
from tqdm import tqdm
import re
class AudiobookGenerator:
    """Turn an e-book (EPUB or plain text) into per-chapter WAV files plus
    one merged audiobook file, using a Coqui TTS model."""

    def __init__(self, model_name="tts_models/en/ljspeech/vits"):
        # Prefer GPU when available; CPU synthesis is much slower.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(model_name).to(self.device)
        # NOTE(review): 22050 Hz matches the LJSpeech models; confirm the
        # native rate before swapping in a different model.
        self.sample_rate = 22050

    def extract_text_from_epub(self, epub_path):
        """Extract cleaned chapter texts from an EPUB file.

        Returns a list of non-empty strings, one per HTML document item.
        """
        book = epub.read_epub(epub_path)
        chapters = []
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                soup = BeautifulSoup(item.get_content(), 'html.parser')
                text = self._clean_text(soup.get_text())
                if text.strip():
                    chapters.append(text)
        return chapters

    def extract_text_from_txt(self, txt_path):
        """Extract cleaned paragraphs from a UTF-8 text file.

        Paragraphs are delimited by blank lines; empty ones are dropped.
        """
        with open(txt_path, 'r', encoding='utf-8') as f:
            text = f.read()
        return [self._clean_text(p) for p in text.split('\n\n') if p.strip()]

    def _clean_text(self, text):
        """Collapse whitespace and strip characters outside word chars and
        basic punctuation (things the TTS model cannot pronounce)."""
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s.,!?;:\'"-]', '', text)
        return text.strip()

    def split_into_sentences(self, text, max_length=200):
        """Split *text* into sentence groups of roughly max_length chars.

        Short sentences are merged so each TTS call gets a reasonably sized
        chunk; a single sentence longer than max_length is kept whole
        rather than cut mid-sentence.
        """
        sentences = re.split(r'(?<=[.!?])\s+', text)
        result = []
        current = ""
        for sentence in sentences:
            if len(current) + len(sentence) < max_length:
                current += " " + sentence if current else sentence
            else:
                if current:
                    result.append(current.strip())
                current = sentence
        if current:
            result.append(current.strip())
        return result

    def synthesize_chapter(self, text, output_path):
        """Synthesize one chapter to *output_path*.

        Returns the output path, or None when the chapter produced no audio
        (e.g. empty text).
        """
        sentences = self.split_into_sentences(text)
        audio_chunks = []
        for sentence in tqdm(sentences, desc="合成中"):
            if sentence.strip():
                audio_chunks.append(self.tts.tts(text=sentence))
                # 0.3 s of silence between sentences for natural pacing.
                audio_chunks.append(np.zeros(int(self.sample_rate * 0.3)))
        if audio_chunks:
            sf.write(output_path, np.concatenate(audio_chunks), self.sample_rate)
            return output_path
        return None

    def generate_audiobook(self, input_path, output_dir="audiobook"):
        """End-to-end pipeline: extract text, synthesize each chapter,
        then merge everything into one file inside *output_dir*."""
        output_dir = Path(output_dir)
        # parents=True so a nested output path does not crash the run.
        output_dir.mkdir(parents=True, exist_ok=True)
        input_path = Path(input_path)
        # Case-insensitive so "BOOK.EPUB" is recognized as EPUB too.
        if input_path.suffix.lower() == '.epub':
            chapters = self.extract_text_from_epub(str(input_path))
        else:
            chapters = self.extract_text_from_txt(str(input_path))
        print(f"共 {len(chapters)} 个章节")
        for i, chapter in enumerate(chapters):
            output_path = output_dir / f"chapter_{i:03d}.wav"
            print(f"\n处理章节 {i+1}/{len(chapters)}")
            self.synthesize_chapter(chapter, str(output_path))
        self._merge_chapters(output_dir)
        print(f"\n有声读物已生成: {output_dir}")

    def _merge_chapters(self, output_dir):
        """Concatenate all chapter WAVs (sorted by name) into
        complete_audiobook.wav with a 2-second gap between chapters."""
        chapter_files = sorted(Path(output_dir).glob("chapter_*.wav"))
        if not chapter_files:
            return
        audio_chunks = []
        for chapter_file in chapter_files:
            # Chapters were written at self.sample_rate, so the read rate
            # is not needed here.
            audio, _ = sf.read(str(chapter_file))
            audio_chunks.append(audio)
            audio_chunks.append(np.zeros(int(self.sample_rate * 2)))
        full_audio = np.concatenate(audio_chunks)
        output_path = output_dir / "complete_audiobook.wav"
        sf.write(str(output_path), full_audio, self.sample_rate)
        print(f"完整有声读物: {output_path}")
# Usage example: convert an EPUB into per-chapter WAVs plus a merged file.
generator = AudiobookGenerator()
generator.generate_audiobook("book.epub", "my_audiobook")
案例二:语音助手 #
项目概述 #
创建一个支持语音交互的智能助手。
python
import torch
from TTS.api import TTS
import speech_recognition as sr
import soundfile as sf
import numpy as np
import tempfile
import os
class VoiceAssistant:
    """Microphone-in / speaker-out assistant: Google STT for recognition,
    Coqui TTS for spoken replies, and simple keyword commands."""

    def __init__(self, tts_model="tts_models/en/ljspeech/vits"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(tts_model).to(self.device)
        self.recognizer = sr.Recognizer()
        self.sample_rate = 22050

    def listen(self):
        """Record one utterance and return its transcript, or None when
        recognition fails or the STT service errors out."""
        with sr.Microphone() as source:
            print("正在监听...")
            self.recognizer.adjust_for_ambient_noise(source)
            audio = self.recognizer.listen(source)
        try:
            text = self.recognizer.recognize_google(audio)
            print(f"识别结果: {text}")
            return text
        except sr.UnknownValueError:
            print("无法识别")
            return None
        except sr.RequestError as e:
            print(f"服务错误: {e}")
            return None

    @staticmethod
    def _words(text):
        """Lower-cased, punctuation-stripped word set for keyword matching."""
        return {w.strip(".,!?;:'\"") for w in text.lower().split()}

    def speak(self, text):
        """Synthesize *text* and play it through a platform audio player."""
        import sys
        print(f"助手: {text}")
        wav = self.tts.tts(text=text)
        # Create the temp file name first, then write/play/delete: reopening
        # a still-open NamedTemporaryFile fails on Windows.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            path = f.name
        sf.write(path, wav, self.sample_rate)
        if sys.platform == "darwin":
            os.system(f"afplay {path}")  # macOS
        elif os.name == "nt":
            # /wait keeps "start" synchronous so the file is not deleted
            # while it is still playing.
            os.system(f'start /wait "" "{path}"')
        else:
            os.system(f"aplay {path}")  # Linux (ALSA)
        os.unlink(path)

    def process_command(self, text):
        """Map a recognized utterance to a reply string.

        Matches whole words (not substrings) so that e.g. "this" does not
        trigger the "hi" greeting and "sometimes" does not trigger "time".
        """
        words = self._words(text)
        if words & {"hello", "hi"}:
            return "Hello! How can I help you today?"
        elif "time" in words:
            from datetime import datetime
            now = datetime.now().strftime("%H:%M")
            return f"The current time is {now}"
        elif "date" in words:
            from datetime import datetime
            today = datetime.now().strftime("%B %d, %Y")
            return f"Today is {today}"
        elif "weather" in words:
            return "I'm sorry, I don't have access to weather data right now."
        elif words & {"bye", "goodbye"}:
            return "Goodbye! Have a nice day!"
        else:
            return "I'm not sure how to help with that. Can you try again?"

    def run(self):
        """Interactive loop: greet, then listen and respond until the user
        says goodbye."""
        self.speak("Hello! I'm your voice assistant. How can I help you?")
        while True:
            text = self.listen()
            if not text:
                continue
            if self._words(text) & {"bye", "goodbye"}:
                self.speak("Goodbye!")
                break
            self.speak(self.process_command(text))
# Usage example: start an interactive voice session (needs mic + speakers).
assistant = VoiceAssistant()
assistant.run()
案例三:多语言播报系统 #
项目概述 #
创建一个支持多语言的新闻播报系统。
python
import torch
from TTS.api import TTS
from pathlib import Path
import soundfile as sf
import numpy as np
from datetime import datetime
import json
class MultilingualBroadcaster:
    """Render multilingual news items to WAV files with XTTS v2 and emit a
    JSON playlist describing the broadcast."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(self.device)
        # XTTS v2 natively outputs 24 kHz audio; writing it at 22050 Hz
        # slows playback and lowers pitch.
        self.sample_rate = 24000

    def broadcast_news(self, news_items, output_dir="broadcast"):
        """Synthesize each item and write a playlist.

        news_items: list of dicts like
            {"text": "...", "language": "en", "title": "..."}
        Files are named <timestamp>_<index>_<language>.wav so the playlist
        built by _create_playlist can reconstruct them.
        """
        output_dir = Path(output_dir)
        # parents=True so a nested output path does not crash the run.
        output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        for i, item in enumerate(news_items):
            text = item.get("text", "")
            language = item.get("language", "en")
            title = item.get("title", f"News {i+1}")
            print(f"播报: {title} ({language})")
            wav = self.tts.tts(text=text, language=language)
            filename = f"{timestamp}_{i:02d}_{language}.wav"
            output_path = output_dir / filename
            sf.write(str(output_path), wav, self.sample_rate)
            print(f"已保存: {output_path}")
        self._create_playlist(news_items, output_dir, timestamp)

    def _create_playlist(self, news_items, output_dir, timestamp):
        """Write playlist_<timestamp>.json listing index, title, language,
        and filename for every item (must mirror broadcast_news naming)."""
        playlist = []
        for i, item in enumerate(news_items):
            filename = f"{timestamp}_{i:02d}_{item.get('language', 'en')}.wav"
            playlist.append({
                "index": i,
                "title": item.get("title", f"News {i+1}"),
                "language": item.get("language", "en"),
                "file": filename,
            })
        playlist_path = output_dir / f"playlist_{timestamp}.json"
        with open(playlist_path, "w", encoding="utf-8") as f:
            json.dump(playlist, f, ensure_ascii=False, indent=2)
        print(f"播放列表: {playlist_path}")

    def create_daily_broadcast(self, news_by_language):
        """Build a daily broadcast: per-language intro followed by that
        language's news items, then synthesize it all via broadcast_news."""
        intro_templates = {
            "en": "Good morning. Here is today's news.",
            "zh-cn": "早上好,这是今天的新闻。",
            "ja": "おはようございます。今日のニュースです。",
            "fr": "Bonjour. Voici les nouvelles d'aujourd'hui.",
            "de": "Guten Morgen. Hier sind die heutigen Nachrichten.",
            "es": "Buenos días. Aquí están las noticias de hoy.",
        }
        all_news = []
        for language, news_list in news_by_language.items():
            # Intro first; fall back to English for unknown languages.
            intro = intro_templates.get(language, intro_templates["en"])
            all_news.append({
                "text": intro,
                "language": language,
                "title": f"Intro ({language})",
            })
            for j, news in enumerate(news_list):
                all_news.append({
                    "text": news,
                    "language": language,
                    "title": f"News {j+1} ({language})",
                })
        self.broadcast_news(all_news, "daily_broadcast")
# Usage example: synthesize a trilingual daily broadcast into ./daily_broadcast.
broadcaster = MultilingualBroadcaster()
news = {
    "en": [
        "The stock market reached a new high today.",
        "Scientists discovered a new species in the Amazon."
    ],
    "zh-cn": [
        "今天股市创下新高。",
        "科学家在亚马逊发现了新物种。"
    ],
    "ja": [
        "今日、株式市場は新高値を記録しました。",
        "科学者がアマゾンで新種を発見しました。"
    ]
}
broadcaster.create_daily_broadcast(news)
案例四:语音通知系统 #
项目概述 #
创建一个自动语音通知系统,用于发送提醒和通知。
python
import torch
from TTS.api import TTS
import soundfile as sf
from datetime import datetime, timedelta
import schedule
import time
from pathlib import Path
class VoiceNotificationSystem:
    """Schedule spoken notifications: audio is pre-generated per message,
    cached on disk, and played at the configured times via `schedule`."""

    def __init__(self, model_name="tts_models/en/ljspeech/vits"):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(model_name).to(self.device)
        self.notifications = []
        self.audio_dir = Path("notification_audio")
        self.audio_dir.mkdir(parents=True, exist_ok=True)

    def add_notification(self, message, time_str, repeat=False):
        """Register *message* to be spoken at *time_str* ("HH:MM").

        repeat=False means one-shot (job is cancelled after it first
        fires); repeat=True fires every day.
        """
        self.notifications.append({
            "message": message,
            "time": time_str,
            "repeat": repeat,
        })
        # Pre-generate audio now so playback at trigger time is instant.
        self._generate_audio(message)
        print(f"通知已添加: {message} at {time_str}")

    def _audio_filename(self, message):
        """Deterministic per-message filename.

        Uses a content digest rather than builtin hash(), whose value is
        randomized per process — hash() filenames would defeat the on-disk
        cache across restarts.
        """
        import hashlib
        return hashlib.sha1(message.encode("utf-8")).hexdigest()[:16] + ".wav"

    def _generate_audio(self, message):
        """Synthesize *message* to a cached WAV file; return its path."""
        output_path = self.audio_dir / self._audio_filename(message)
        if not output_path.exists():
            wav = self.tts.tts(text=message)
            sf.write(str(output_path), wav, 22050)
        return str(output_path)

    def _play_notification(self, message):
        """Play the (cached) audio for *message* with an OS audio player."""
        import os
        audio_path = self._generate_audio(message)
        print(f"\n[{datetime.now().strftime('%H:%M:%S')}] 通知: {message}")
        if os.name == "nt":
            os.system(f'start "" "{audio_path}"')
        else:
            # NOTE(review): afplay is macOS-only; Linux hosts need a
            # different player (e.g. aplay).
            os.system(f"afplay {audio_path}")

    def _play_once(self, message):
        """One-shot wrapper: returning CancelJob unschedules the job."""
        self._play_notification(message)
        return schedule.CancelJob

    def start(self):
        """Block forever, firing scheduled notifications once per day at
        their configured times; one-shot jobs fire only once."""
        for notif in self.notifications:
            job = schedule.every().day.at(notif["time"])
            if notif["repeat"]:
                job.do(self._play_notification, notif["message"])
            else:
                job.do(self._play_once, notif["message"])
        print("通知系统已启动...")
        while True:
            schedule.run_pending()
            time.sleep(1)
# Usage example
system = VoiceNotificationSystem()
# Register daily reminders (repeat=True keeps them firing every day).
system.add_notification("Good morning! Time to start your day.", "08:00", repeat=True)
system.add_notification("It's time for lunch break.", "12:00", repeat=True)
system.add_notification("Don't forget your meeting in 15 minutes.", "14:45", repeat=True)
system.add_notification("Time to wrap up for the day.", "17:30", repeat=True)
# Start the blocking scheduler loop (uncomment to run).
# system.start()
案例五:批量音频生成工具 #
项目概述 #
创建一个批量处理文本文件并生成音频的工具。
python
import torch
from TTS.api import TTS
from pathlib import Path
import soundfile as sf
import concurrent.futures
from tqdm import tqdm
import json
class BatchAudioGenerator:
    """Batch text-to-speech: turn text files (one utterance per line) into
    numbered WAV files plus JSON index/manifest metadata."""

    def __init__(self, model_name="tts_models/en/ljspeech/vits", max_workers=4):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tts = TTS(model_name).to(self.device)
        # NOTE(review): max_workers is currently unused — synthesis runs
        # sequentially because a single TTS model instance is not safe to
        # share across threads. Kept for interface compatibility.
        self.max_workers = max_workers

    def process_file(self, input_file, output_dir="output"):
        """Synthesize every non-empty line of *input_file* into
        output_dir/audio_NNNN.wav and write an index.json mapping.

        Returns the list of index entries ({"index", "text", "file"}).
        """
        output_dir = Path(output_dir)
        # parents=True: the parent may not exist yet when called from
        # process_directory with a fresh output root.
        output_dir.mkdir(parents=True, exist_ok=True)
        with open(input_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        results = []
        for i, line in enumerate(tqdm(lines, desc="处理中")):
            text = line.strip()
            if not text:
                continue
            output_path = output_dir / f"audio_{i:04d}.wav"
            wav = self.tts.tts(text=text)
            sf.write(str(output_path), wav, 22050)
            results.append({
                "index": i,
                "text": text,
                "file": str(output_path),
            })
        index_path = output_dir / "index.json"
        with open(index_path, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"处理完成: {len(results)} 个音频文件")
        return results

    def process_directory(self, input_dir, output_dir="output"):
        """Process every *.txt in *input_dir* (sorted for deterministic
        order); each file gets its own sub-directory under *output_dir*.

        Returns {source filename: index entries}.
        """
        input_dir = Path(input_dir)
        output_dir = Path(output_dir)
        txt_files = sorted(input_dir.glob("*.txt"))
        print(f"找到 {len(txt_files)} 个文件")
        all_results = {}
        for txt_file in txt_files:
            file_output_dir = output_dir / txt_file.stem
            all_results[txt_file.name] = self.process_file(
                str(txt_file), str(file_output_dir)
            )
        return all_results

    def generate_manifest(self, output_dir):
        """Write a JSON-lines manifest (audio_filepath, duration, text) for
        every WAV in *output_dir*; transcripts come from index.json when it
        exists. Returns the manifest path."""
        output_dir = Path(output_dir)
        # Recover per-file transcripts from the index written by process_file.
        text_by_name = {}
        index_path = output_dir / "index.json"
        if index_path.exists():
            with open(index_path, encoding="utf-8") as f:
                for entry in json.load(f):
                    text_by_name[Path(entry["file"]).name] = entry["text"]
        manifest = []
        for audio_file in sorted(output_dir.glob("*.wav")):
            info = sf.info(str(audio_file))
            manifest.append({
                "audio_filepath": str(audio_file),
                "duration": info.duration,
                "text": text_by_name.get(audio_file.name, ""),
            })
        manifest_path = output_dir / "manifest.json"
        with open(manifest_path, "w") as f:
            for item in manifest:
                f.write(json.dumps(item) + "\n")
        return manifest_path
# Usage example
generator = BatchAudioGenerator()
# Process a single text file.
generator.process_file("input.txt", "audio_output")
# Process every .txt file in a directory (uncomment to run).
# generator.process_directory("input_texts", "batch_output")
下一步 #
完成实战案例学习后,继续学习 生产部署,了解如何将 TTS 系统部署到生产环境!
最后更新:2026-04-05