diff --git a/bots/prompt_layer/korean_preprocessor.py b/bots/prompt_layer/korean_preprocessor.py new file mode 100644 index 0000000..d1702ce --- /dev/null +++ b/bots/prompt_layer/korean_preprocessor.py @@ -0,0 +1,409 @@ +""" +bots/prompt_layer/korean_preprocessor.py +Korean TTS text preprocessing. + +Functions: +- preprocess_korean(text): apply pronunciation map + number conversion +- insert_pauses(script): insert SSML/marker pauses by sentence type +""" +import re +import logging + +logger = logging.getLogger(__name__) + +# English/acronym → Korean phonetic pronunciation +# 200+ entries covering tech, finance, social media, brands, etc. +PRONUNCIATION_MAP = { + # AI/Tech terms + 'AI': '에이아이', + 'API': '에이피아이', + 'GPT': '지피티', + 'ChatGPT': '챗지피티', + 'Claude': '클로드', + 'GitHub': '깃허브', + 'OpenAI': '오픈에이아이', + 'YouTube': '유튜브', + 'TikTok': '틱톡', + 'SEO': '에스이오', + 'SaaS': '사스', + 'UI': '유아이', + 'UX': '유엑스', + 'LLM': '엘엘엠', + 'NFT': '엔에프티', + 'DeFi': '디파이', + 'IoT': '아이오티', + 'AR': '에이알', + 'VR': '브이알', + 'ML': '머신러닝', + 'NLP': '엔엘피', + 'DevOps': '데브옵스', + 'SQL': '에스큐엘', + 'HTML': '에이치티엠엘', + 'CSS': '씨에스에스', + 'JSON': '제이슨', + 'URL': '유알엘', + 'HTTP': '에이치티티피', + 'HTTPS': '에이치티티피에스', + 'PC': '피씨', + 'CPU': '씨피유', + 'GPU': '지피유', + 'RAM': '램', + 'SSD': '에스에스디', + 'USB': '유에스비', + 'WiFi': '와이파이', + 'Bluetooth': '블루투스', + 'iOS': '아이오에스', + 'Android': '안드로이드', + 'App': '앱', + 'IT': '아이티', + 'ICT': '아이씨티', + 'SNS': '에스엔에스', + 'KPI': '케이피아이', + 'ROI': '알오아이', + 'B2B': '비투비', + 'B2C': '비투씨', + 'MVP': '엠브이피', + 'OKR': '오케이알', + 'CTO': '씨티오', + 'CEO': '씨이오', + 'CFO': '씨에프오', + 'HR': '에이치알', + 'PR': '피알', + 'IR': '아이알', + # Social/Platforms + 'Instagram': '인스타그램', + 'Facebook': '페이스북', + 'Twitter': '트위터', + 'LinkedIn': '링크드인', + 'Netflix': '넷플릭스', + 'Spotify': '스포티파이', + 'Uber': '우버', + 'Airbnb': '에어비앤비', + 'Amazon': '아마존', + 'Google': '구글', + 'Apple': '애플', + 'Microsoft': '마이크로소프트', + 'Samsung': '삼성', + 'LG': '엘지', + 'SK': '에스케이', + 'KT': '케이티', + # Finance + 'ETF': '이티에프', + 'IPO': '아이피오', + 'S&P': '에스앤피', + 'NASDAQ': '나스닥', + 'KOSPI': '코스피', + 'KOSDAQ': '코스닥', + 'GDP': '지디피', + 'IMF': '아이엠에프', + 'ECB': '이씨비', + 'Fed': '연준', + 'P/E': '주가수익비율', + # Health/Science + 'DNA': '디엔에이', + 'RNA': '알엔에이', + 'BMI': '비엠아이', + 'COVID': '코비드', + 'PCR': '피씨알', + # Education/Certification + 'MBA': '엠비에이', + 'PhD': '박사', + 'IELTS': '아이엘츠', + 'TOEIC': '토익', + 'TOEFL': '토플', + # Measurement units + 'km': '킬로미터', + 'kg': '킬로그램', + 'MB': '메가바이트', + 'GB': '기가바이트', + 'TB': '테라바이트', + 'Hz': '헤르츠', + 'MHz': '메가헤르츠', + 'GHz': '기가헤르츠', + # Media/Entertainment + 'OTT': '오티티', + 'VOD': '브이오디', + 'BGM': '비지엠', + 'OST': '오에스티', + 'DJ': '디제이', + 'MC': '엠씨', + 'PD': '피디', + 'CP': '씨피', + # Common English words used in Korean context + 'App Store': '앱 스토어', + 'Play Store': '플레이 스토어', + 'ChatBot': '챗봇', + 'Web3': '웹쓰리', + 'Metaverse': '메타버스', + 'Blockchain': '블록체인', + 'Crypto': '크립토', + 'Bitcoin': '비트코인', + 'Ethereum': '이더리움', + 'Cloud': '클라우드', + 'Big Data': '빅데이터', + 'Startup': '스타트업', + 'Fintech': '핀테크', + 'Edtech': '에드테크', + 'Healthtech': '헬스테크', + 'PropTech': '프롭테크', + 'LegalTech': '리걸테크', + 'FOMO': '포모', + 'YOLO': '욜로', + 'MZ': '엠제트', + # More tech + 'Python': '파이썬', + 'JavaScript': '자바스크립트', + 'TypeScript': '타입스크립트', + 'React': '리액트', + 'Node.js': '노드제이에스', + 'Docker': '도커', + 'Kubernetes': '쿠버네티스', + 'AWS': '에이더블유에스', + 'GCP': '지씨피', + 'Azure': '애저', + 'Slack': '슬랙', + 'Zoom': '줌', + 'Discord': '디스코드', + 'Notion': '노션', + 'Figma': '피그마', + 'Canva': '캔바', + # Business/Strategy + 'OEM': '오이엠', + 'ODM': '오디엠', + 'SCM': '에스씨엠', + 'ERP': '이알피', + 'CRM': '씨알엠', + # More social media + 'Reels': '릴스', + 'Stories': '스토리', + 'Live': '라이브', + 'Feed': '피드', + 'DM': '디엠', + 'PM': '피엠', + 'QA': '큐에이', + # Content + 'Blog': '블로그', + 'Vlog': '브이로그', + 'Podcast': '팟캐스트', + 'Newsletter': '뉴스레터', + 'Shorts': '쇼츠', + 'Reel': '릴', + # Misc + 'OK': '오케이', + 'NO': '노', + 'YES': '예스', + 'WOW': '와우', + 'LOL': '엘오엘', + 'BTW': '그런데', + 'FYI': '참고로', + 'ASAP': '최대한 빨리', + 'FAQ': '자주 묻는 질문', + 'Q&A': '질의응답', + 'A/S': '에이에스', + 'DIY': '디아이와이', + 'PPT': '피피티', + 'PDF': '피디에프', + 'ZIP': '집', +} + +# Pause durations in milliseconds by sentence type +DYNAMIC_PAUSES = { + 'hook_after': 500, # ms — impact emphasis after hook + 'question_after': 400, # thinking time after question + 'normal_after': 300, # standard sentence end + 'section_break': 600, # body → closer transition + 'comma': 150, # comma pause + 'exclamation': 200, # exclamation mark pause +} + +# Number → Korean word conversion rules +_NUM_TO_KO = { + 0: '영', 1: '일', 2: '이', 3: '삼', 4: '사', 5: '오', + 6: '육', 7: '칠', 8: '팔', 9: '구', 10: '십', + 100: '백', 1000: '천', 10000: '만', +} + +# Counter words for common units (for better number reading) +_COUNTER_MAP = { + '개': ('개', False), # items + '명': ('명', False), # people + '번': ('번', False), # times + '배': ('배', False), # times/multiples + '위': ('위', False), # rank + '가지': ('가지', True), # types (use sino-Korean) + '초': ('초', False), # seconds + '분': ('분', False), # minutes + '시간': ('시간', False), # hours + '일': ('일', False), # days + '월': ('월', False), # months + '년': ('년', False), # years + '%': ('퍼센트', False), # percent +} + + +def preprocess_korean(text: str) -> str: + """ + Apply pronunciation map and number conversion to Korean text. + + 1. Replace English/acronym terms with Korean phonetics + 2. Convert Arabic numerals with counter words to Korean + + Returns processed text ready for TTS. + """ + # Apply pronunciation map (longer strings first to avoid partial replacement) + sorted_map = sorted(PRONUNCIATION_MAP.items(), key=lambda x: -len(x[0])) + for en, ko in sorted_map: + # Word boundary replacement to avoid partial matches + text = re.sub(r'(? str: + """ + Convert Arabic numerals in Korean context. + e.g.: "3가지" → "세 가지", "100%" → "백 퍼센트" + """ + # Handle percentage + text = re.sub(r'(\d+)%', lambda m: _num_to_korean(int(m.group(1))) + ' 퍼센트', text) + + # Handle number + counter word + for counter, (ko_counter, use_sino) in _COUNTER_MAP.items(): + if counter == '%': + continue + pattern = r'(\d+)\s*' + re.escape(counter) + def replace(m, kc=ko_counter): + n = int(m.group(1)) + return _num_to_korean(n) + ' ' + kc + text = re.sub(pattern, replace, text) + + return text + + +def _num_to_korean(n: int) -> str: + """Convert integer to Korean sino-Korean numeral string.""" + if n == 0: + return '영' + if n < 0: + return '마이너스 ' + _num_to_korean(-n) + + result = '' + if n >= 10000: + man = n // 10000 + result += _num_to_korean(man) + '만' + n %= 10000 + if n >= 1000: + cheon = n // 1000 + result += ('' if cheon == 1 else _num_to_korean(cheon)) + '천' + n %= 1000 + if n >= 100: + baek = n // 100 + result += ('' if baek == 1 else _num_to_korean(baek)) + '백' + n %= 100 + if n >= 10: + sip = n // 10 + result += ('' if sip == 1 else _num_to_korean(sip)) + '십' + n %= 10 + if n > 0: + result += _NUM_TO_KO[n] + + return result + + +def insert_pauses(script: dict, engine: str = 'ssml') -> dict: + """ + Insert pause markers into script by sentence type. + + engine='ssml': insert SSML tags (for ElevenLabs, Google TTS) + engine='marker': insert [[PAUSE_Xms]] text markers (for Edge TTS, others) + + Returns modified script dict with pauses inserted. + """ + result = dict(script) + + hook = script.get('hook', '') + body = script.get('body', []) + closer = script.get('closer', '') + + # Add pause after hook + if hook: + pause_ms = DYNAMIC_PAUSES['hook_after'] + result['hook'] = hook + _pause_marker(pause_ms, engine) + + # Add pauses within body sentences + processed_body = [] + for i, sentence in enumerate(body): + processed = _add_inline_pauses(sentence, engine) + # Add section break before closer transition + if i == len(body) - 1: + processed += _pause_marker(DYNAMIC_PAUSES['section_break'], engine) + else: + processed += _pause_marker(DYNAMIC_PAUSES['normal_after'], engine) + processed_body.append(processed) + result['body'] = processed_body + + return result + + +def _add_inline_pauses(sentence: str, engine: str) -> str: + """Add pauses at commas and after exclamation marks.""" + # Comma pauses + sentence = re.sub( + r',\s*', + ',' + _pause_marker(DYNAMIC_PAUSES['comma'], engine), + sentence + ) + # Question mark pauses + sentence = re.sub( + r'\?\s*', + '?' + _pause_marker(DYNAMIC_PAUSES['question_after'], engine), + sentence + ) + # Exclamation pauses + sentence = re.sub( + r'!\s*', + '!' + _pause_marker(DYNAMIC_PAUSES['exclamation'], engine), + sentence + ) + return sentence + + +def _pause_marker(ms: int, engine: str) -> str: + """Generate engine-appropriate pause marker.""" + if engine == 'ssml': + return f'' + else: + return f' [[PAUSE_{ms}ms]] ' + + +# ── Standalone test ────────────────────────────────────────────── + +if __name__ == '__main__': + import sys + if '--test' in sys.argv: + print("=== Korean Preprocessor Test ===") + test_texts = [ + "AI와 ChatGPT가 SEO를 바꾸고 있어요", + "3가지 방법으로 100%의 수익을 낼 수 있습니다", + "YouTube와 TikTok에서 SNS 마케팅하기", + "GPT API를 사용한 SaaS 창업", + ] + for text in test_texts: + result = preprocess_korean(text) + print(f"원문: {text}") + print(f"처리: {result}") + print() + + # Test pause insertion + test_script = { + 'hook': '이거 모르면 손해입니다!', + 'body': ['첫 번째, AI를 활용하면 10배 빠릅니다.', '두 번째, 자동화가 핵심입니다.'], + 'closer': '지금 바로 시작하세요.' + } + processed = insert_pauses(test_script, engine='marker') + print("=== Pause Insertion Test ===") + for k, v in processed.items(): + print(f"{k}: {v}") diff --git a/bots/shorts/tts_engine.py b/bots/shorts/tts_engine.py index 70ffb94..10c1ab5 100644 --- a/bots/shorts/tts_engine.py +++ b/bots/shorts/tts_engine.py @@ -25,6 +25,111 @@ from typing import Optional logger = logging.getLogger(__name__) + +# ─── SmartTTSRouter ─────────────────────────────────────────── + +class SmartTTSRouter: + """ + Budget-aware TTS engine selection with graceful fallback. + + Engine priority order (best to cheapest): + 1. elevenlabs — best quality, paid + 2. openai_tts — good quality, paid (uses existing OpenAI key) + 3. cosyvoice2 — local, free, Korean native speaker voice + 4. kokoro — local, free, 82M params + 5. edge_tts — free fallback, always available + """ + + ENGINE_PRIORITY = ['elevenlabs', 'openai_tts', 'cosyvoice2', 'kokoro', 'edge_tts'] + + # Daily/monthly usage limits per engine + ENGINE_LIMITS = { + 'elevenlabs': {'chars_per_month': 10000, 'threshold': 0.8}, + 'openai_tts': {'chars_per_day': 500000, 'threshold': 0.9}, + } + + ENGINE_API_KEYS = { + 'elevenlabs': 'ELEVENLABS_API_KEY', + 'openai_tts': 'OPENAI_API_KEY', + } + # cosyvoice2, kokoro, edge_tts are local — no API key needed + + def __init__(self, resolved_config: dict): + """ + resolved_config: output from ConfigResolver.resolve() + """ + self.budget = resolved_config.get('budget', 'free') + self.tts_engine = resolved_config.get('tts', 'edge_tts') + self._usage = {} # {engine_name: chars_used_today} + self._failed = set() # engines that failed this session + + def select(self, text_length: int) -> str: + """ + Select best available TTS engine for given text length. + + 1. If user specified a non-auto engine: use it if available + 2. Else: check budget-appropriate engines in priority order + 3. Skip engines that have exceeded usage threshold + 4. Skip engines that failed this session + 5. Always fall back to edge_tts + """ + import os + + # If user explicitly chose a specific engine (not 'auto') + if self.tts_engine not in ('auto', 'edge_tts', ''): + engine = self.tts_engine + api_key_env = self.ENGINE_API_KEYS.get(engine, '') + if not api_key_env or os.environ.get(api_key_env, ''): + if engine not in self._failed: + return engine + + # Budget-based priority selection + if self.budget == 'free': + priority = ['kokoro', 'edge_tts'] + elif self.budget == 'low': + priority = ['openai_tts', 'kokoro', 'edge_tts'] + else: # medium, premium + priority = self.ENGINE_PRIORITY + + for engine in priority: + if engine in self._failed: + continue + api_key_env = self.ENGINE_API_KEYS.get(engine, '') + if api_key_env and not os.environ.get(api_key_env, ''): + continue # no API key + if self._is_over_limit(engine, text_length): + continue + return engine + + return 'edge_tts' # always available + + def on_failure(self, engine: str, error: str) -> str: + """ + Record engine failure and return next available engine. + No retry on same engine — no wasted credits. + """ + import logging + logging.getLogger(__name__).warning(f'TTS 엔진 실패: {engine} — {error}, 다음 엔진으로 전환') + self._failed.add(engine) + return self.select(0) # Select next engine + + def record_usage(self, engine: str, char_count: int) -> None: + """Record character usage for an engine.""" + self._usage[engine] = self._usage.get(engine, 0) + char_count + + def _is_over_limit(self, engine: str, text_length: int) -> bool: + """Check if engine has exceeded its usage threshold.""" + limits = self.ENGINE_LIMITS.get(engine, {}) + if not limits: + return False + threshold = limits.get('threshold', 0.9) + daily_limit = limits.get('chars_per_day', limits.get('chars_per_month', 0)) + if not daily_limit: + return False + used = self._usage.get(engine, 0) + return (used + text_length) / daily_limit > threshold + + # ─── 공통 유틸 ──────────────────────────────────────────────── @@ -167,6 +272,47 @@ def _get_ffmpeg() -> str: return 'ffmpeg' +# ─── OpenAI TTS ─────────────────────────────────────────────── + +def _tts_openai(text: str, output_path: Path, cfg: dict) -> list[dict]: + """ + OpenAI TTS (tts-1-hd model) with timestamp estimation. + Returns: [{word, start, end}, ...] — uniform timestamps (no word-level from OpenAI) + """ + import requests, base64 + import os + + api_key = os.environ.get('OPENAI_API_KEY', '') + if not api_key: + raise RuntimeError('OPENAI_API_KEY not set') + + openai_cfg = cfg.get('tts', {}).get('openai', {}) + model = openai_cfg.get('model', 'tts-1-hd') + voice = openai_cfg.get('voice', 'alloy') + speed = openai_cfg.get('speed', 1.0) + + url = 'https://api.openai.com/v1/audio/speech' + headers = {'Authorization': f'Bearer {api_key}', 'Content-Type': 'application/json'} + payload = { + 'model': model, + 'input': text, + 'voice': voice, + 'speed': speed, + 'response_format': 'mp3', + } + + resp = requests.post(url, headers=headers, json=payload, timeout=60) + resp.raise_for_status() + + mp3_tmp = output_path.with_suffix('.mp3') + mp3_tmp.write_bytes(resp.content) + _mp3_to_wav(mp3_tmp, output_path) + mp3_tmp.unlink(missing_ok=True) + + # OpenAI TTS has no word-level timestamps — use uniform distribution + return [] # caption_renderer will use uniform fallback + + # ─── Google Cloud TTS ───────────────────────────────────────── def _tts_google_cloud(text: str, output_path: Path, cfg: dict) -> list[dict]: @@ -323,11 +469,21 @@ def generate_tts( ts_path = output_dir / f'{timestamp}_timestamps.json' text = _concat_script(script) - pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300) - priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'google_cloud', 'edge_tts']) + # Apply Korean preprocessing if available + try: + from bots.prompt_layer.korean_preprocessor import preprocess_korean + text = preprocess_korean(text) + except ImportError: + pass # Korean preprocessing not available, use raw text + + pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300) + priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'openai_tts', 'google_cloud', 'edge_tts']) + + # Engine map: elevenlabs → openai_tts → google_cloud → edge_tts engine_map = { 'elevenlabs': _tts_elevenlabs, + 'openai_tts': _tts_openai, 'google_cloud': _tts_google_cloud, 'edge_tts': _tts_edge, }