feat(v3): PR 4 — korean_preprocessor + SmartTTSRouter

- Add bots/prompt_layer/korean_preprocessor.py: 200+ entry pronunciation map, number→Korean conversion, dynamic SSML/marker pause insertion - Upgrade bots/shorts/tts_engine.py: SmartTTSRouter (budget-aware engine selection with failure fallback), _tts_openai() function, Korean preprocessing step in generate_tts() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 11:48:19 +09:00
parent 33b0bbd5ee
commit b666b67a03
2 changed files with 567 additions and 2 deletions
--- a/bots/shorts/tts_engine.py
+++ b/bots/shorts/tts_engine.py
@@ -25,6 +25,111 @@ from typing import Optional

 logger = logging.getLogger(__name__)

+
+# ─── SmartTTSRouter ───────────────────────────────────────────
+
+class SmartTTSRouter:
+    """
+    Budget-aware TTS engine selection with graceful fallback.
+
+    Engine priority order (best to cheapest):
+    1. elevenlabs   — best quality, paid
+    2. openai_tts   — good quality, paid (uses existing OpenAI key)
+    3. cosyvoice2   — local, free, Korean native speaker voice
+    4. kokoro       — local, free, 82M params
+    5. edge_tts     — free fallback, always available
+    """
+
+    ENGINE_PRIORITY = ['elevenlabs', 'openai_tts', 'cosyvoice2', 'kokoro', 'edge_tts']
+
+    # Daily/monthly usage limits per engine
+    ENGINE_LIMITS = {
+        'elevenlabs': {'chars_per_month': 10000, 'threshold': 0.8},
+        'openai_tts': {'chars_per_day': 500000, 'threshold': 0.9},
+    }
+
+    ENGINE_API_KEYS = {
+        'elevenlabs': 'ELEVENLABS_API_KEY',
+        'openai_tts': 'OPENAI_API_KEY',
+    }
+    # cosyvoice2, kokoro, edge_tts are local — no API key needed
+
+    def __init__(self, resolved_config: dict):
+        """
+        resolved_config: output from ConfigResolver.resolve()
+        """
+        self.budget = resolved_config.get('budget', 'free')
+        self.tts_engine = resolved_config.get('tts', 'edge_tts')
+        self._usage = {}  # {engine_name: chars_used_today}
+        self._failed = set()  # engines that failed this session
+
+    def select(self, text_length: int) -> str:
+        """
+        Select best available TTS engine for given text length.
+
+        1. If user specified a non-auto engine: use it if available
+        2. Else: check budget-appropriate engines in priority order
+        3. Skip engines that have exceeded usage threshold
+        4. Skip engines that failed this session
+        5. Always fall back to edge_tts
+        """
+        import os
+
+        # If user explicitly chose a specific engine (not 'auto')
+        if self.tts_engine not in ('auto', 'edge_tts', ''):
+            engine = self.tts_engine
+            api_key_env = self.ENGINE_API_KEYS.get(engine, '')
+            if not api_key_env or os.environ.get(api_key_env, ''):
+                if engine not in self._failed:
+                    return engine
+
+        # Budget-based priority selection
+        if self.budget == 'free':
+            priority = ['kokoro', 'edge_tts']
+        elif self.budget == 'low':
+            priority = ['openai_tts', 'kokoro', 'edge_tts']
+        else:  # medium, premium
+            priority = self.ENGINE_PRIORITY
+
+        for engine in priority:
+            if engine in self._failed:
+                continue
+            api_key_env = self.ENGINE_API_KEYS.get(engine, '')
+            if api_key_env and not os.environ.get(api_key_env, ''):
+                continue  # no API key
+            if self._is_over_limit(engine, text_length):
+                continue
+            return engine
+
+        return 'edge_tts'  # always available
+
+    def on_failure(self, engine: str, error: str) -> str:
+        """
+        Record engine failure and return next available engine.
+        No retry on same engine — no wasted credits.
+        """
+        import logging
+        logging.getLogger(__name__).warning(f'TTS 엔진 실패: {engine} — {error}, 다음 엔진으로 전환')
+        self._failed.add(engine)
+        return self.select(0)  # Select next engine
+
+    def record_usage(self, engine: str, char_count: int) -> None:
+        """Record character usage for an engine."""
+        self._usage[engine] = self._usage.get(engine, 0) + char_count
+
+    def _is_over_limit(self, engine: str, text_length: int) -> bool:
+        """Check if engine has exceeded its usage threshold."""
+        limits = self.ENGINE_LIMITS.get(engine, {})
+        if not limits:
+            return False
+        threshold = limits.get('threshold', 0.9)
+        daily_limit = limits.get('chars_per_day', limits.get('chars_per_month', 0))
+        if not daily_limit:
+            return False
+        used = self._usage.get(engine, 0)
+        return (used + text_length) / daily_limit > threshold
+
+
 # ─── 공통 유틸 ────────────────────────────────────────────────


@@ -167,6 +272,47 @@ def _get_ffmpeg() -> str:
    return 'ffmpeg'


+# ─── OpenAI TTS ───────────────────────────────────────────────
+
+def _tts_openai(text: str, output_path: Path, cfg: dict) -> list[dict]:
+    """
+    OpenAI TTS (tts-1-hd model) with timestamp estimation.
+    Returns: [{word, start, end}, ...] — uniform timestamps (no word-level from OpenAI)
+    """
+    import requests, base64
+    import os
+
+    api_key = os.environ.get('OPENAI_API_KEY', '')
+    if not api_key:
+        raise RuntimeError('OPENAI_API_KEY not set')
+
+    openai_cfg = cfg.get('tts', {}).get('openai', {})
+    model = openai_cfg.get('model', 'tts-1-hd')
+    voice = openai_cfg.get('voice', 'alloy')
+    speed = openai_cfg.get('speed', 1.0)
+
+    url = 'https://api.openai.com/v1/audio/speech'
+    headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'}
+    payload = {
+        'model': model,
+        'input': text,
+        'voice': voice,
+        'speed': speed,
+        'response_format': 'mp3',
+    }
+
+    resp = requests.post(url, headers=headers, json=payload, timeout=60)
+    resp.raise_for_status()
+
+    mp3_tmp = output_path.with_suffix('.mp3')
+    mp3_tmp.write_bytes(resp.content)
+    _mp3_to_wav(mp3_tmp, output_path)
+    mp3_tmp.unlink(missing_ok=True)
+
+    # OpenAI TTS has no word-level timestamps — use uniform distribution
+    return []  # caption_renderer will use uniform fallback
+
+
 # ─── Google Cloud TTS ─────────────────────────────────────────

 def _tts_google_cloud(text: str, output_path: Path, cfg: dict) -> list[dict]:
@@ -323,11 +469,21 @@ def generate_tts(
    ts_path = output_dir / f'{timestamp}_timestamps.json'

    text = _concat_script(script)
-    pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300)
-    priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'google_cloud', 'edge_tts'])

+    # Apply Korean preprocessing if available
+    try:
+        from bots.prompt_layer.korean_preprocessor import preprocess_korean
+        text = preprocess_korean(text)
+    except ImportError:
+        pass  # Korean preprocessing not available, use raw text
+
+    pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300)
+    priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'openai_tts', 'google_cloud', 'edge_tts'])
+
+    # Engine map: elevenlabs → openai_tts → google_cloud → edge_tts
    engine_map = {
        'elevenlabs':   _tts_elevenlabs,
+        'openai_tts':   _tts_openai,
        'google_cloud': _tts_google_cloud,
        'edge_tts':     _tts_edge,
    }