"""
bots/shorts/tts_engine.py

Role: convert a Shorts script into speech (WAV) plus per-word timestamps (JSON).

Engine priority (shorts_config.json -> tts.engine_priority); the built-in
default order used by generate_tts() is:
  1. ElevenLabs       - best quality, requires ELEVENLABS_API_KEY
  2. OpenAI TTS       - requires OPENAI_API_KEY, no word-level timestamps
  3. Google Cloud TTS - medium quality, requires GOOGLE_TTS_API_KEY
  4. Edge TTS         - free fallback, no API key required

Output:
  data/shorts/tts/{timestamp}.wav
  data/shorts/tts/{timestamp}_timestamps.json
    [{word: str, start: float, end: float}, ...]
"""
import asyncio
import base64
import concurrent.futures
import json
import logging
import os
import re
import struct
import subprocess
import tempfile
import wave
from pathlib import Path
from typing import Optional
from xml.sax.saxutils import escape as _xml_escape

logger = logging.getLogger(__name__)


# ─── SmartTTSRouter ───────────────────────────────────────────

class SmartTTSRouter:
    """
    Budget-aware TTS engine selection with graceful fallback.

    Engine priority order (best to cheapest):
      1. elevenlabs  — best quality, paid
      2. openai_tts  — good quality, paid (uses existing OpenAI key)
      3. cosyvoice2  — local, free, Korean native speaker voice
      4. kokoro      — local, free, 82M params
      5. edge_tts    — free fallback, always available

    The router only picks an engine *name*; it does not run synthesis.
    Usage counters and the failed-engine set live on the instance, so they
    are reset whenever a new router is created.
    """

    ENGINE_PRIORITY = ['elevenlabs', 'openai_tts', 'cosyvoice2', 'kokoro', 'edge_tts']

    # Usage limits per engine; `threshold` is the fraction of the limit at
    # which the engine is considered exhausted.
    ENGINE_LIMITS = {
        'elevenlabs': {'chars_per_month': 10000, 'threshold': 0.8},
        'openai_tts': {'chars_per_day': 500000, 'threshold': 0.9},
    }

    # Environment variable holding the API key for each paid engine.
    # cosyvoice2, kokoro, edge_tts are local — no API key needed.
    ENGINE_API_KEYS = {
        'elevenlabs': 'ELEVENLABS_API_KEY',
        'openai_tts': 'OPENAI_API_KEY',
    }

    def __init__(self, resolved_config: dict):
        """
        Args:
            resolved_config: output from ConfigResolver.resolve(); the keys
                read here are 'budget' (default 'free') and 'tts'
                (default 'edge_tts').
        """
        self.budget = resolved_config.get('budget', 'free')
        self.tts_engine = resolved_config.get('tts', 'edge_tts')
        self._usage = {}      # {engine_name: chars recorded via record_usage()}
        self._failed = set()  # engines that failed this session

    def _has_credentials(self, engine: str) -> bool:
        """Return True if `engine` needs no API key or its key env var is set."""
        env_name = self.ENGINE_API_KEYS.get(engine, '')
        return not env_name or bool(os.environ.get(env_name, ''))

    def select(self, text_length: int) -> str:
        """
        Select the best available TTS engine for the given text length.

        1. If the configured engine is something other than 'auto',
           'edge_tts' or '', use it when its API key is present and it has
           not failed this session (usage limits are not checked here).
        2. Otherwise walk the budget-appropriate engines in priority order,
        3. skipping engines that failed this session,
        4. skipping engines whose API key is missing,
        5. skipping engines that would exceed their usage threshold.
        6. Always fall back to 'edge_tts'.

        Args:
            text_length: number of characters about to be synthesized.

        Returns:
            The selected engine name.
        """
        # Explicit user choice takes precedence over budget-based routing.
        if self.tts_engine not in ('auto', 'edge_tts', ''):
            engine = self.tts_engine
            if self._has_credentials(engine) and engine not in self._failed:
                return engine

        # Budget-based priority selection.
        if self.budget == 'free':
            priority = ['kokoro', 'edge_tts']
        elif self.budget == 'low':
            priority = ['openai_tts', 'kokoro', 'edge_tts']
        else:  # medium, premium
            priority = self.ENGINE_PRIORITY

        for engine in priority:
            if engine in self._failed:
                continue
            if not self._has_credentials(engine):
                continue  # no API key
            if self._is_over_limit(engine, text_length):
                continue
            return engine

        return 'edge_tts'  # always available

    def on_failure(self, engine: str, error: str) -> str:
        """
        Record an engine failure and return the next available engine.

        The failed engine is never retried in this session, so no credits
        are wasted on it.

        Args:
            engine: name of the engine that failed.
            error: human-readable failure description (only logged).

        Returns:
            The engine name chosen by select(0) after marking the failure.
        """
        logger.warning(f'TTS 엔진 실패: {engine} — {error}, 다음 엔진으로 전환')
        self._failed.add(engine)
        return self.select(0)  # select next engine

    def record_usage(self, engine: str, char_count: int) -> None:
        """Add `char_count` characters to the usage counter of `engine`."""
        self._usage[engine] = self._usage.get(engine, 0) + char_count

    def _is_over_limit(self, engine: str, text_length: int) -> bool:
        """
        Check whether synthesizing `text_length` more characters would push
        `engine` past its usage threshold.

        Engines without an entry in ENGINE_LIMITS are never over limit. The
        daily limit is preferred; the monthly limit is used when no daily
        limit is configured. Both are compared against the same in-memory
        counter.
        """
        limits = self.ENGINE_LIMITS.get(engine, {})
        if not limits:
            return False
        threshold = limits.get('threshold', 0.9)
        char_limit = limits.get('chars_per_day', limits.get('chars_per_month', 0))
        if not char_limit:
            return False
        used = self._usage.get(engine, 0)
        return (used + text_length) / char_limit > threshold


# ─── Common utilities ─────────────────────────────────────────

def _load_config() -> dict:
    """Load config/shorts_config.json (three directories up); {} if missing."""
    cfg_path = Path(__file__).parent.parent.parent / 'config' / 'shorts_config.json'
    if cfg_path.exists():
        return json.loads(cfg_path.read_text(encoding='utf-8'))
    return {}


def _concat_script(script: dict) -> str:
    """
    Flatten a script dict into a single text for reading aloud.

    Joins 'hook', every entry of 'body', and 'closer' with single spaces,
    dropping empty parts. A missing/None body is treated as empty and a
    plain-string body as a single sentence (instead of being split into
    characters).
    """
    body = script.get('body') or []
    if isinstance(body, str):
        body = [body]
    parts = [script.get('hook', ''), *body, script.get('closer', '')]
    return ' '.join(p for p in parts if p)


def _add_pause(wav_path: Path, pause_ms: int = 300) -> None:
    """
    Append `pause_ms` milliseconds of silence to the end of a WAV file
    (in place).

    Args:
        wav_path: existing WAV file; it is rewritten with the same params.
        pause_ms: silence length in milliseconds; values <= 0 leave the
            audio untouched.
    """
    with wave.open(str(wav_path), 'rb') as wf:
        params = wf.getparams()
        frames = wf.readframes(wf.getnframes())

    silence_frames = int(params.framerate * pause_ms / 1000)
    if silence_frames <= 0:
        return  # nothing to append; avoid a pointless rewrite

    # One zero byte per sample byte per channel for every silent frame.
    silence = b'\x00' * (silence_frames * params.nchannels * params.sampwidth)

    with wave.open(str(wav_path), 'wb') as wf:
        wf.setparams(params)
        wf.writeframes(frames + silence)


def _get_wav_duration(wav_path: Path) -> float:
    """Return the duration of a WAV file in seconds (frames / frame rate)."""
    with wave.open(str(wav_path), 'rb') as wf:
        return wf.getnframes() / wf.getframerate()


def _save_mp3_as_wav(audio_bytes: bytes, output_path: Path) -> None:
    """
    Write MP3 bytes next to `output_path` (same name, .mp3 suffix), convert
    them to WAV at `output_path`, and always remove the temporary MP3 —
    even when the conversion fails.
    """
    mp3_tmp = output_path.with_suffix('.mp3')
    try:
        mp3_tmp.write_bytes(audio_bytes)
        _mp3_to_wav(mp3_tmp, output_path)
    finally:
        mp3_tmp.unlink(missing_ok=True)


# ─── ElevenLabs ───────────────────────────────────────────────

def _tts_elevenlabs(text: str, output_path: Path, cfg: dict) -> list[dict]:
    """
    ElevenLabs TTS with per-word timestamps.

    Calls the `/with-timestamps` endpoint, converts the returned MP3 to WAV
    at `output_path`, and folds the character-level alignment into words.

    Args:
        text: text to synthesize.
        output_path: destination WAV path.
        cfg: config dict; settings are read from cfg['tts']['elevenlabs'].

    Returns:
        [{word, start, end}, ...]

    Raises:
        RuntimeError: ELEVENLABS_API_KEY is not set, or the response
            carries no audio.
        requests.HTTPError: the API returned an error status.
    """
    import requests

    api_key = os.environ.get('ELEVENLABS_API_KEY', '')
    if not api_key:
        raise RuntimeError('ELEVENLABS_API_KEY not set')

    el_cfg = cfg.get('tts', {}).get('elevenlabs', {})
    voice_id = el_cfg.get('voice_id', 'pNInz6obpgDQGcFmaJgB')
    model_id = el_cfg.get('model', 'eleven_multilingual_v2')
    stability = el_cfg.get('stability', 0.5)
    similarity = el_cfg.get('similarity_boost', 0.8)
    speed = el_cfg.get('speed', 1.1)

    url = f'https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/with-timestamps'
    headers = {'xi-api-key': api_key, 'Content-Type': 'application/json'}
    payload = {
        'text': text,
        'model_id': model_id,
        'voice_settings': {
            'stability': stability,
            'similarity_boost': similarity,
            'speed': speed,
        },
    }

    resp = requests.post(url, headers=headers, json=payload, timeout=60)
    resp.raise_for_status()
    data = resp.json()

    # Decode the audio; fail early with a clear message rather than handing
    # an empty file to the MP3 converter.
    audio_b64 = data.get('audio_base64', '')
    if not audio_b64:
        raise RuntimeError('ElevenLabs response contained no audio_base64')
    audio_bytes = base64.b64decode(audio_b64)

    # The payload is handled as MP3 and converted to WAV.
    _save_mp3_as_wav(audio_bytes, output_path)

    # Parse timestamps (`or {}` guards against an explicit null alignment).
    alignment = data.get('alignment') or {}
    chars = alignment.get('characters', [])
    starts = alignment.get('character_start_times_seconds', [])
    ends = alignment.get('character_end_times_seconds', [])
    return _chars_to_words(chars, starts, ends)


def _chars_to_words(chars: list, starts: list, ends: list) -> list[dict]:
    """
    Collapse ElevenLabs character-level timestamps into word-level ones.

    Any whitespace character ends the current word. A word starts at the
    start time of its first character and ends at the end time of its last
    character; times are rounded to milliseconds.

    Args:
        chars: characters of the synthesized text.
        starts: start time (seconds) per character.
        ends: end time (seconds) per character.

    Returns:
        [{word, start, end}, ...] in reading order.
    """
    words: list[dict] = []
    cur_word = ''
    cur_start = 0.0
    cur_end = 0.0

    def _flush() -> None:
        if cur_word:
            words.append({
                'word': cur_word,
                'start': round(cur_start, 3),
                'end': round(cur_end, 3),
            })

    for ch, st, en in zip(chars, starts, ends):
        if ch.isspace():
            _flush()
            cur_word = ''
            continue
        if not cur_word:
            cur_start = st
        cur_word += ch
        cur_end = en

    _flush()  # trailing word without a terminating space
    return words


def _mp3_to_wav(mp3_path: Path, wav_path: Path) -> None:
    """
    Convert an MP3 file to WAV, preferring pydub and falling back to the
    ffmpeg executable when pydub is unavailable or fails.

    Raises:
        subprocess.CalledProcessError: the ffmpeg fallback exited non-zero.
        FileNotFoundError: the ffmpeg executable could not be found.
    """
    try:
        from pydub import AudioSegment
        AudioSegment.from_mp3(str(mp3_path)).export(str(wav_path), format='wav')
        return
    except Exception as e:
        # Deliberate best effort: any pydub problem falls through to ffmpeg.
        logger.debug(f'pydub conversion failed, falling back to ffmpeg: {e}')

    # ffmpeg fallback
    ffmpeg = _get_ffmpeg()
    subprocess.run(
        [ffmpeg, '-y', '-i', str(mp3_path), str(wav_path)],
        check=True, capture_output=True,
    )


def _get_ffmpeg() -> str:
    """Return FFMPEG_PATH if it points at an existing file, else 'ffmpeg'."""
    ffmpeg_env = os.environ.get('FFMPEG_PATH', '')
    if ffmpeg_env and Path(ffmpeg_env).exists():
        return ffmpeg_env
    return 'ffmpeg'


# ─── OpenAI TTS ───────────────────────────────────────────────

def _tts_openai(text: str, output_path: Path, cfg: dict) -> list[dict]:
    """
    OpenAI TTS (default model tts-1-hd).

    Requests MP3 audio and converts it to WAV at `output_path`.

    Args:
        text: text to synthesize.
        output_path: destination WAV path.
        cfg: config dict; settings are read from cfg['tts']['openai'].

    Returns:
        Always [] — this endpoint provides no word-level timestamps, so the
        caption renderer is expected to use its uniform fallback.

    Raises:
        RuntimeError: OPENAI_API_KEY is not set.
        requests.HTTPError: the API returned an error status.
    """
    import requests

    api_key = os.environ.get('OPENAI_API_KEY', '')
    if not api_key:
        raise RuntimeError('OPENAI_API_KEY not set')

    openai_cfg = cfg.get('tts', {}).get('openai', {})
    model = openai_cfg.get('model', 'tts-1-hd')
    voice = openai_cfg.get('voice', 'alloy')
    speed = openai_cfg.get('speed', 1.0)

    url = 'https://api.openai.com/v1/audio/speech'
    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
    payload = {
        'model': model,
        'input': text,
        'voice': voice,
        'speed': speed,
        'response_format': 'mp3',
    }

    resp = requests.post(url, headers=headers, json=payload, timeout=60)
    resp.raise_for_status()

    _save_mp3_as_wav(resp.content, output_path)

    # OpenAI TTS has no word-level timestamps — use uniform distribution.
    return []  # caption_renderer will use uniform fallback


# ─── Google Cloud TTS ─────────────────────────────────────────

def _build_ssml(words: list[str]) -> str:
    """
    Build SSML with a <mark name="w{i}"/> before word i and a closing
    <mark name="end"/>, matching the names _gcloud_marks_to_words() reads.

    Words are XML-escaped so characters such as '&' or '<' in the script
    cannot produce invalid SSML.
    """
    marked = ' '.join(
        f'<mark name="w{i}"/>{_xml_escape(w)}' for i, w in enumerate(words)
    )
    return f'<speak>{marked}<mark name="end"/></speak>'


def _tts_google_cloud(text: str, output_path: Path, cfg: dict) -> list[dict]:
    """
    Google Cloud TTS (REST API) with timestamps from SSML time pointing.

    Args:
        text: text to synthesize; split on whitespace into words.
        output_path: destination WAV path (LINEAR16 audio is written as
            returned by the API).
        cfg: config dict; settings are read from cfg['tts']['google_cloud'].

    Returns:
        [{word, start, end}, ...]

    Raises:
        RuntimeError: GOOGLE_TTS_API_KEY is not set.
        requests.HTTPError: the API returned an error status.
    """
    import requests

    api_key = os.environ.get('GOOGLE_TTS_API_KEY', '')
    if not api_key:
        raise RuntimeError('GOOGLE_TTS_API_KEY not set')

    gc_cfg = cfg.get('tts', {}).get('google_cloud', {})
    voice_name = gc_cfg.get('voice_name', 'ko-KR-Neural2-C')
    speaking_rate = gc_cfg.get('speaking_rate', 1.1)

    # Language code is the first two dash-separated parts of the voice name
    # ('ko-KR-Neural2-C' -> 'ko-KR'); also correct for 3-letter codes.
    language_code = '-'.join(voice_name.split('-')[:2])

    # SSML: insert one mark per word.
    words = text.split()
    ssml = _build_ssml(words)

    # The key goes in a header, not the URL, so it cannot end up in the
    # HTTPError messages that generate_tts() logs.
    url = 'https://texttospeech.googleapis.com/v1beta1/text:synthesize'
    headers = {'X-Goog-Api-Key': api_key}
    payload = {
        'input': {'ssml': ssml},
        'voice': {'languageCode': language_code, 'name': voice_name},
        'audioConfig': {
            'audioEncoding': 'LINEAR16',
            'speakingRate': speaking_rate,
            'sampleRateHertz': 44100,
        },
        'enableTimePointing': ['SSML_MARK'],
    }

    resp = requests.post(url, headers=headers, json=payload, timeout=60)
    resp.raise_for_status()
    data = resp.json()

    audio_bytes = base64.b64decode(data['audioContent'])
    output_path.write_bytes(audio_bytes)

    # Fallback end time for the last word when the 'end' mark is missing.
    try:
        total_duration: Optional[float] = _get_wav_duration(output_path)
    except Exception as e:
        logger.debug(f'could not read duration of {output_path.name}: {e}')
        total_duration = None

    # Parse timestamps.
    timepoints = data.get('timepoints', [])
    return _gcloud_marks_to_words(words, timepoints, total_duration)


def _gcloud_marks_to_words(
    words: list[str],
    timepoints: list[dict],
    total_duration: Optional[float] = None,
) -> list[dict]:
    """
    Convert Google Cloud TTS mark timepoints into per-word {word, start, end}.

    Word i starts at mark 'w{i}' and ends at mark 'w{i+1}'; the last word
    ends at mark 'end'.

    Args:
        words: the words in the order they were marked.
        timepoints: [{markName, timeSeconds}, ...] from the API response.
            A timepoint without 'timeSeconds' is treated as 0.0; one
            without 'markName' is ignored.
        total_duration: fallback for the 'end' mark when the response does
            not contain it; when neither is available 0.0 is used.

    Returns:
        [{word, start, end}, ...] with times rounded to milliseconds and
        end never earlier than start.
    """
    mark_map = {
        tp['markName']: tp.get('timeSeconds', 0.0)
        for tp in timepoints
        if 'markName' in tp
    }
    fallback_end = total_duration if total_duration is not None else 0.0
    total_dur = mark_map.get('end', fallback_end)

    result = []
    for i, w in enumerate(words):
        start = mark_map.get(f'w{i}', 0.0)
        end = max(start, mark_map.get(f'w{i+1}', total_dur))
        result.append({'word': w, 'start': round(start, 3), 'end': round(end, 3)})
    return result


# ─── Edge TTS + Whisper ───────────────────────────────────────

def _tts_edge(text: str, output_path: Path, cfg: dict) -> list[dict]:
    """
    Edge TTS (free): synthesize to WAV, then extract per-word timestamps
    with Whisper.

    Args:
        text: text to synthesize.
        output_path: destination WAV path.
        cfg: config dict; settings are read from cfg['tts']['edge_tts'].

    Returns:
        [{word, start, end}, ...] (may be [] when Whisper is unavailable).
    """
    import edge_tts

    edge_cfg = cfg.get('tts', {}).get('edge_tts', {})
    voice = edge_cfg.get('voice', 'ko-KR-SunHiNeural')
    rate = edge_cfg.get('rate', '+10%')

    mp3_tmp = output_path.with_suffix('.mp3')

    async def _generate():
        communicate = edge_tts.Communicate(text, voice, rate=rate)
        await communicate.save(str(mp3_tmp))

    # Only the loop probe sits inside the try: a RuntimeError raised by the
    # synthesis itself must not be mistaken for "no running loop".
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        in_running_loop = False
    else:
        in_running_loop = True

    try:
        if in_running_loop:
            # Already inside an event loop — run in a separate thread.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                pool.submit(lambda: asyncio.run(_generate())).result()
        else:
            # No loop — run directly.
            asyncio.run(_generate())

        # mp3 → wav
        _mp3_to_wav(mp3_tmp, output_path)
    finally:
        mp3_tmp.unlink(missing_ok=True)

    # Extract timestamps with Whisper.
    return _whisper_timestamps(output_path)


def _whisper_timestamps(wav_path: Path) -> list[dict]:
    """
    Extract per-word timestamps with openai-whisper ('tiny' model, Korean).

    Falls back to _uniform_timestamps() when Whisper is not installed,
    fails, or returns no words.
    """
    try:
        import whisper  # type: ignore

        model = whisper.load_model('tiny')
        result = model.transcribe(str(wav_path), word_timestamps=True, language='ko')
        words = [
            {
                'word': w['word'].strip(),
                'start': round(w['start'], 3),
                'end': round(w['end'], 3),
            }
            for seg in result.get('segments', [])
            for w in seg.get('words', [])
        ]
        if words:
            return words
    except Exception as e:
        # Deliberate best effort: captions degrade but synthesis succeeds.
        logger.warning(f'Whisper 타임스탬프 실패: {e} — 균등 분할 사용')

    return _uniform_timestamps(wav_path)


def _uniform_timestamps(wav_path: Path) -> list[dict]:
    """
    Fallback used when Whisper is unavailable (caption quality degrades).

    The spoken text cannot be recovered from the WAV file, so an empty
    list is returned and caption_renderer performs the uniform split.
    The file is still opened once so an unreadable WAV raises here.
    """
    _get_wav_duration(wav_path)  # readability check only; value unused
    return []


# ─── Main entry point ─────────────────────────────────────────

def generate_tts(
    script: dict,
    output_dir: Path,
    timestamp: str,
    cfg: Optional[dict] = None,
) -> tuple[Path, list[dict]]:
    """
    Script dict → WAV + per-word timestamps.

    Engines listed in cfg['tts']['engine_priority'] are tried in order
    until one succeeds. Trailing silence is then appended (best effort)
    and the timestamps are written as JSON next to the WAV.

    Args:
        script: {hook, body, closer, ...}
        output_dir: data/shorts/tts/ (created if missing)
        timestamp: file name prefix (e.g. "20260328_120000")
        cfg: shorts_config.json dict (loaded automatically when None)

    Returns:
        (wav_path, timestamps) — timestamps: [{word, start, end}, ...]

    Raises:
        RuntimeError: no engine produced audio (the last engine error is
            chained as the cause).
    """
    if cfg is None:
        cfg = _load_config()

    output_dir.mkdir(parents=True, exist_ok=True)
    wav_path = output_dir / f'{timestamp}.wav'
    ts_path = output_dir / f'{timestamp}_timestamps.json'

    text = _concat_script(script)

    # Apply Korean preprocessing if available.
    try:
        from bots.prompt_layer.korean_preprocessor import preprocess_korean
        text = preprocess_korean(text)
    except ImportError:
        pass  # Korean preprocessing not available, use raw text

    tts_cfg = cfg.get('tts', {})
    pause_ms = tts_cfg.get('inter_sentence_pause_ms', 300)
    priority = tts_cfg.get(
        'engine_priority',
        ['elevenlabs', 'openai_tts', 'google_cloud', 'edge_tts'],
    )

    # Engine map: elevenlabs → openai_tts → google_cloud → edge_tts
    engine_map = {
        'elevenlabs': _tts_elevenlabs,
        'openai_tts': _tts_openai,
        'google_cloud': _tts_google_cloud,
        'edge_tts': _tts_edge,
    }

    timestamps: list[dict] = []
    last_error: Optional[Exception] = None
    # Tracked explicitly so a stale WAV left by an earlier run with the
    # same timestamp can never be mistaken for fresh output.
    succeeded = False

    for engine_name in priority:
        fn = engine_map.get(engine_name)
        if fn is None:
            logger.debug(f'unknown TTS engine in engine_priority, skipped: {engine_name}')
            continue
        try:
            logger.info(f'TTS 엔진 시도: {engine_name}')
            timestamps = fn(text, wav_path, cfg)
            if not wav_path.exists():
                # Treat "returned without writing audio" as a failure so the
                # next engine gets a chance.
                raise RuntimeError(f'{engine_name} produced no audio file')
            logger.info(f'TTS 완료 ({engine_name}): {wav_path.name}')
            succeeded = True
            break
        except Exception as e:
            logger.warning(f'TTS 엔진 실패 ({engine_name}): {e}')
            last_error = e
            # Remove any partial output before trying the next engine.
            if wav_path.exists():
                wav_path.unlink()

    if not succeeded or not wav_path.exists():
        raise RuntimeError(f'모든 TTS 엔진 실패. 마지막 오류: {last_error}') from last_error

    # Append trailing silence (best effort — audio is usable without it).
    try:
        _add_pause(wav_path, pause_ms)
    except Exception as e:
        logger.warning(f'무음 추가 실패: {e}')

    # Save timestamps.
    ts_path.write_text(
        json.dumps(timestamps, ensure_ascii=False, indent=2), encoding='utf-8'
    )
    logger.info(f'타임스탬프 저장: {ts_path.name} ({len(timestamps)}단어)')

    return wav_path, timestamps


def load_timestamps(ts_path: Path) -> list[dict]:
    """Load a timestamps JSON file previously written by generate_tts()."""
    return json.loads(ts_path.read_text(encoding='utf-8'))


# ── Standalone test ──────────────────────────────────────────────

if __name__ == '__main__':
    import sys

    if '--test' not in sys.argv:
        print("사용법: python -m bots.shorts.tts_engine --test")
        sys.exit(0)

    print("=== TTS Engine Test ===")

    # Test SmartTTSRouter initialization
    print("\n[1] SmartTTSRouter 초기화:")
    router = SmartTTSRouter({'budget': 'free'})
    print(f" budget: {router.budget}")
    engine = router.select(text_length=100)
    print(f" select(100chars) → {engine}")
    assert isinstance(engine, str) and engine, "엔진 선택 실패"

    # Test with medium budget (no API keys → falls back to free engine)
    router_med = SmartTTSRouter({'budget': 'medium'})
    engine_med = router_med.select(text_length=500)
    print(f" medium budget select(500chars) → {engine_med}")
    assert isinstance(engine_med, str) and engine_med, "medium 엔진 선택 실패"

    # Test usage recording + over-limit detection
    print("\n[2] 사용량 제한 로직:")
    router3 = SmartTTSRouter({'budget': 'free'})
    router3.record_usage('elevenlabs', 9000)  # near limit
    over = router3._is_over_limit('elevenlabs', 900)  # 9000+900 > 8000 threshold
    print(f" elevenlabs 9000자 기록 후 900자 추가 → 한도 초과: {over}")
    assert over, "한도 초과 감지 실패"

    # Test Edge TTS (always-available free engine) with short text
    print("\n[3] Edge TTS 음성 생성 (네트워크 필요):")
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            wav, timestamps = generate_tts(
                script={'hook': '테스트입니다', 'body': [], 'closer': ''},
                output_dir=Path(tmpdir),
                timestamp='test_20260329',
            )
            print(f" WAV 생성: {wav.exists()}, 타임스탬프: {len(timestamps)}단어")
            assert wav.exists(), "WAV 파일 생성 실패"
        except Exception as e:
            print(f" [경고] TTS 실패 (네트워크/의존성 없을 수 있음): {e}")

    print("\n✅ 모든 테스트 통과")