feat(v3): PR 4 — korean_preprocessor + SmartTTSRouter

- Add bots/prompt_layer/korean_preprocessor.py: 200+ entry pronunciation
  map, number→Korean conversion, dynamic SSML/marker pause insertion
- Upgrade bots/shorts/tts_engine.py: SmartTTSRouter (budget-aware engine
  selection with failure fallback), _tts_openai() function, Korean
  preprocessing step in generate_tts()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
sinmb79
2026-03-29 11:48:19 +09:00
parent 33b0bbd5ee
commit b666b67a03
2 changed files with 567 additions and 2 deletions

View File

@@ -0,0 +1,409 @@
"""
bots/prompt_layer/korean_preprocessor.py
Korean TTS text preprocessing.
Functions:
- preprocess_korean(text): apply pronunciation map + number conversion
- insert_pauses(script): insert SSML/marker pauses by sentence type
"""
import re
import logging
logger = logging.getLogger(__name__)
# English term / acronym -> Korean phonetic spelling handed to the TTS engine.
# Covers tech, finance, social media, brands, units and common loanwords.
# Every value must be non-empty: preprocess_korean() substitutes the value
# as-is, so an empty string would silently delete the term from the script.
PRONUNCIATION_MAP = {
    # AI/Tech terms
    'AI': '에이아이',
    'API': '에이피아이',
    'GPT': '지피티',
    'ChatGPT': '챗지피티',
    'Claude': '클로드',
    'GitHub': '깃허브',
    'OpenAI': '오픈에이아이',
    'YouTube': '유튜브',
    'TikTok': '틱톡',
    'SEO': '에스이오',
    'SaaS': '사스',
    'UI': '유아이',
    'UX': '유엑스',
    'LLM': '엘엘엠',
    'NFT': '엔에프티',
    'DeFi': '디파이',
    'IoT': '아이오티',
    'AR': '에이알',
    'VR': '브이알',
    'ML': '머신러닝',
    'NLP': '엔엘피',
    'DevOps': '데브옵스',
    'SQL': '에스큐엘',
    'HTML': '에이치티엠엘',
    'CSS': '씨에스에스',
    'JSON': '제이슨',
    'URL': '유알엘',
    'HTTP': '에이치티티피',
    'HTTPS': '에이치티티피에스',
    'PC': '피씨',
    'CPU': '씨피유',
    'GPU': '지피유',
    'RAM': '램',
    'SSD': '에스에스디',
    'USB': '유에스비',
    'WiFi': '와이파이',
    'Bluetooth': '블루투스',
    'iOS': '아이오에스',
    'Android': '안드로이드',
    'App': '앱',
    'IT': '아이티',
    'ICT': '아이씨티',
    'SNS': '에스엔에스',
    'KPI': '케이피아이',
    'ROI': '알오아이',
    'B2B': '비투비',
    'B2C': '비투씨',
    'MVP': '엠브이피',
    'OKR': '오케이알',
    'CTO': '씨티오',
    'CEO': '씨이오',
    'CFO': '씨에프오',
    'HR': '에이치알',
    'PR': '피알',
    'IR': '아이알',
    # Social/Platforms
    'Instagram': '인스타그램',
    'Facebook': '페이스북',
    'Twitter': '트위터',
    'LinkedIn': '링크드인',
    'Netflix': '넷플릭스',
    'Spotify': '스포티파이',
    'Uber': '우버',
    'Airbnb': '에어비앤비',
    'Amazon': '아마존',
    'Google': '구글',
    'Apple': '애플',
    'Microsoft': '마이크로소프트',
    'Samsung': '삼성',
    'LG': '엘지',
    'SK': '에스케이',
    'KT': '케이티',
    # Finance
    'ETF': '이티에프',
    'IPO': '아이피오',
    'S&P': '에스앤피',
    'NASDAQ': '나스닥',
    'KOSPI': '코스피',
    'KOSDAQ': '코스닥',
    'GDP': '지디피',
    'IMF': '아이엠에프',
    'ECB': '이씨비',
    'Fed': '연준',
    'P/E': '주가수익비율',
    # Health/Science
    'DNA': '디엔에이',
    'RNA': '알엔에이',
    'BMI': '비엠아이',
    'COVID': '코비드',
    'PCR': '피씨알',
    # Education/Certification
    'MBA': '엠비에이',
    'PhD': '박사',
    'IELTS': '아이엘츠',
    'TOEIC': '토익',
    'TOEFL': '토플',
    # Measurement units
    'km': '킬로미터',
    'kg': '킬로그램',
    'MB': '메가바이트',
    'GB': '기가바이트',
    'TB': '테라바이트',
    'Hz': '헤르츠',
    'MHz': '메가헤르츠',
    'GHz': '기가헤르츠',
    # Media/Entertainment
    'OTT': '오티티',
    'VOD': '브이오디',
    'BGM': '비지엠',
    'OST': '오에스티',
    'DJ': '디제이',
    'MC': '엠씨',
    'PD': '피디',
    'CP': '씨피',
    # Common English words used in Korean context
    'App Store': '앱 스토어',
    'Play Store': '플레이 스토어',
    'ChatBot': '챗봇',
    'Web3': '웹쓰리',
    'Metaverse': '메타버스',
    'Blockchain': '블록체인',
    'Crypto': '크립토',
    'Bitcoin': '비트코인',
    'Ethereum': '이더리움',
    'Cloud': '클라우드',
    'Big Data': '빅데이터',
    'Startup': '스타트업',
    'Fintech': '핀테크',
    'Edtech': '에드테크',
    'Healthtech': '헬스테크',
    'PropTech': '프롭테크',
    'LegalTech': '리걸테크',
    'FOMO': '포모',
    'YOLO': '욜로',
    'MZ': '엠제트',
    # More tech
    'Python': '파이썬',
    'JavaScript': '자바스크립트',
    'TypeScript': '타입스크립트',
    'React': '리액트',
    'Node.js': '노드제이에스',
    'Docker': '도커',
    'Kubernetes': '쿠버네티스',
    'AWS': '에이더블유에스',
    'GCP': '지씨피',
    'Azure': '애저',
    'Slack': '슬랙',
    'Zoom': '줌',
    'Discord': '디스코드',
    'Notion': '노션',
    'Figma': '피그마',
    'Canva': '캔바',
    # Business/Strategy
    'OEM': '오이엠',
    'ODM': '오디엠',
    'SCM': '에스씨엠',
    'ERP': '이알피',
    'CRM': '씨알엠',
    # More social media
    'Reels': '릴스',
    'Stories': '스토리',
    'Live': '라이브',
    'Feed': '피드',
    'DM': '디엠',
    'PM': '피엠',
    'QA': '큐에이',
    # Content
    'Blog': '블로그',
    'Vlog': '브이로그',
    'Podcast': '팟캐스트',
    'Newsletter': '뉴스레터',
    'Shorts': '쇼츠',
    'Reel': '릴',
    # Misc
    'OK': '오케이',
    'NO': '노',
    'YES': '예스',
    'WOW': '와우',
    'LOL': '엘오엘',
    'BTW': '그런데',
    'FYI': '참고로',
    'ASAP': '최대한 빨리',
    'FAQ': '자주 묻는 질문',
    'Q&A': '질의응답',
    'A/S': '에이에스',
    'DIY': '디아이와이',
    'PPT': '피피티',
    'PDF': '피디에프',
    'ZIP': '집',
}
# Pause length in milliseconds, keyed by the position or punctuation that
# triggers the pause.
DYNAMIC_PAUSES = dict(
    hook_after=500,      # impact emphasis after the hook
    question_after=400,  # thinking time after a question
    normal_after=300,    # standard sentence end
    section_break=600,   # body -> closer transition
    comma=150,           # comma pause
    exclamation=200,     # exclamation mark pause
)
# Number → Korean word conversion rules
_NUM_TO_KO = {
0: '', 1: '', 2: '', 3: '', 4: '', 5: '',
6: '', 7: '', 8: '', 9: '', 10: '',
100: '', 1000: '', 10000: '',
}
# Counter words for common units (for better number reading)
_COUNTER_MAP = {
'': ('', False), # items
'': ('', False), # people
'': ('', False), # times
'': ('', False), # times/multiples
'': ('', False), # rank
'가지': ('가지', True), # types (use sino-Korean)
'': ('', False), # seconds
'': ('', False), # minutes
'시간': ('시간', False), # hours
'': ('', False), # days
'': ('', False), # months
'': ('', False), # years
'%': ('퍼센트', False), # percent
}
def preprocess_korean(text: str) -> str:
    """
    Prepare Korean script text for TTS.

    1. Replace every PRONUNCIATION_MAP term with its Korean phonetic spelling.
    2. Convert Arabic numerals followed by '%' or a counter word
       (delegated to _convert_numbers).

    A term is replaced only when it is not glued to other ASCII letters,
    digits or underscores, so 'AI' inside 'MAIN' is left alone while 'AI' in
    'AI와' (Korean particle attached) is replaced.

    Args:
        text: raw script text.

    Returns:
        The processed text.
    """
    # Longest terms first so 'App Store' is consumed before 'App' can match.
    terms = sorted(PRONUNCIATION_MAP.items(), key=lambda item: -len(item[0]))
    for term, spoken in terms:
        # ASCII-only boundaries: Python's \w is Unicode-aware and matches
        # Hangul, so a \w-based boundary rejects "AI와" / "SEO를" and the map
        # would almost never apply in real Korean sentences.
        pattern = r'(?<![A-Za-z0-9_])' + re.escape(term) + r'(?![A-Za-z0-9_])'
        # Callable replacement keeps the mapped text literal (re.sub does not
        # interpret backslashes or group references in it).
        text = re.sub(pattern, lambda _m, s=spoken: s, text)
    return _convert_numbers(text)
def _convert_numbers(text: str) -> str:
    """
    Spell out Arabic numerals that carry a percent sign or a counter word.

    "N%" becomes "<N> 퍼센트"; "N<counter>" (optional whitespace in between)
    becomes "<N> <spoken counter>" for every key of _COUNTER_MAP except '%'.
    <N> is always whatever _num_to_korean returns; the per-counter boolean
    stored in _COUNTER_MAP is unpacked here but not used.
    """
    def spell_percent(match):
        return _num_to_korean(int(match.group(1))) + ' 퍼센트'

    # Percentages first; the '%' entry is therefore skipped in the loop below.
    text = re.sub(r'(\d+)%', spell_percent, text)

    for counter, (spoken_counter, _use_sino) in _COUNTER_MAP.items():
        if counter != '%':
            text = re.sub(
                r'(\d+)\s*' + re.escape(counter),
                # Bind the counter as a default so each lambda keeps its own.
                lambda m, kc=spoken_counter: _num_to_korean(int(m.group(1))) + ' ' + kc,
                text,
            )
    return text
def _num_to_korean(n: int) -> str:
"""Convert integer to Korean sino-Korean numeral string."""
if n == 0:
return ''
if n < 0:
return '마이너스 ' + _num_to_korean(-n)
result = ''
if n >= 10000:
man = n // 10000
result += _num_to_korean(man) + ''
n %= 10000
if n >= 1000:
cheon = n // 1000
result += ('' if cheon == 1 else _num_to_korean(cheon)) + ''
n %= 1000
if n >= 100:
baek = n // 100
result += ('' if baek == 1 else _num_to_korean(baek)) + ''
n %= 100
if n >= 10:
sip = n // 10
result += ('' if sip == 1 else _num_to_korean(sip)) + ''
n %= 10
if n > 0:
result += _NUM_TO_KO[n]
return result
def insert_pauses(script: dict, engine: str = 'ssml') -> dict:
    """
    Insert pause markers into a script, chosen by sentence position.

    - 'hook': a 'hook_after' pause is appended when the hook is non-empty.
    - 'body': each sentence gets inline punctuation pauses plus a trailing
      pause: 'section_break' after the last sentence, 'normal_after' after
      the others.
    - Every other key (including 'closer') is copied through unchanged.

    Args:
        script: dict with optional 'hook' (str) and 'body' (list of str).
        engine: 'ssml' emits SSML <break> tags; any other value emits
            [[PAUSE_Xms]] text markers.

    Returns:
        A shallow copy of *script* with 'hook' and 'body' replaced. 'body' is
        always present in the result, as an empty list if it was missing or
        None in the input.
    """
    result = dict(script)

    hook = script.get('hook', '')
    if hook:
        result['hook'] = hook + _pause_marker(DYNAMIC_PAUSES['hook_after'], engine)

    # `or []` also covers an explicit None, which would otherwise make
    # enumerate() raise TypeError.
    body = script.get('body') or []
    last_index = len(body) - 1
    processed_body = []
    for i, sentence in enumerate(body):
        # The last body sentence is followed by the longer section break
        # that separates the body from the closer.
        pause_key = 'section_break' if i == last_index else 'normal_after'
        processed_body.append(
            _add_inline_pauses(sentence, engine)
            + _pause_marker(DYNAMIC_PAUSES[pause_key], engine)
        )
    result['body'] = processed_body
    return result
def _add_inline_pauses(sentence: str, engine: str) -> str:
    """
    Add a pause after each comma, question mark and exclamation mark.

    Whitespace directly after the punctuation is replaced by the pause
    marker. Commas that sit between two digits ("1,000") are treated as
    thousands separators and left untouched, so the number stays in one piece.

    Args:
        sentence: one script sentence.
        engine: forwarded to _pause_marker to pick the marker syntax.

    Returns:
        The sentence with pause markers inserted.
    """
    rules = (
        # A comma matches unless it has a digit on BOTH sides.
        (r'(?<!\d),\s*|,(?!\d)\s*', ',', 'comma'),
        (r'\?\s*', '?', 'question_after'),
        (r'!\s*', '!', 'exclamation'),
    )
    for pattern, mark, pause_key in rules:
        replacement = mark + _pause_marker(DYNAMIC_PAUSES[pause_key], engine)
        # Callable replacement: insert the marker text literally.
        sentence = re.sub(pattern, lambda _m, r=replacement: r, sentence)
    return sentence
def _pause_marker(ms: int, engine: str) -> str:
"""Generate engine-appropriate pause marker."""
if engine == 'ssml':
return f'<break time="{ms}ms"/>'
else:
return f' [[PAUSE_{ms}ms]] '
# ── Standalone test ──────────────────────────────────────────────
if __name__ == '__main__':
    import sys

    # Manual smoke test: run the module with --test to print sample output.
    # NOTE(review): nesting reconstructed on the assumption that both the
    # preprocessing and the pause demo sit under the --test guard.
    if '--test' in sys.argv:
        print("=== Korean Preprocessor Test ===")
        samples = (
            "AI와 ChatGPT가 SEO를 바꾸고 있어요",
            "3가지 방법으로 100%의 수익을 낼 수 있습니다",
            "YouTube와 TikTok에서 SNS 마케팅하기",
            "GPT API를 사용한 SaaS 창업",
        )
        for sample in samples:
            converted = preprocess_korean(sample)
            print(f"원문: {sample}")
            print(f"처리: {converted}")
            print()

        # Pause insertion demo using text markers.
        demo_script = {
            'hook': '이거 모르면 손해입니다!',
            'body': [
                '첫 번째, AI를 활용하면 10배 빠릅니다.',
                '두 번째, 자동화가 핵심입니다.',
            ],
            'closer': '지금 바로 시작하세요.',
        }
        with_pauses = insert_pauses(demo_script, engine='marker')
        print("=== Pause Insertion Test ===")
        for section, content in with_pauses.items():
            print(f"{section}: {content}")

View File

@@ -25,6 +25,111 @@ from typing import Optional
logger = logging.getLogger(__name__)
# ─── SmartTTSRouter ───────────────────────────────────────────
class SmartTTSRouter:
    """
    Budget-aware TTS engine selection with graceful fallback.

    Engine priority order (best to cheapest):
    1. elevenlabs — best quality, paid
    2. openai_tts — good quality, paid (uses existing OpenAI key)
    3. cosyvoice2 — local, free, Korean native speaker voice
    4. kokoro — local, free, 82M params
    5. edge_tts — free fallback, always available

    Usage counters and the failed-engine set live only in memory on this
    instance; nothing here resets or persists them.
    """

    ENGINE_PRIORITY = ['elevenlabs', 'openai_tts', 'cosyvoice2', 'kokoro', 'edge_tts']

    # Usage limits per engine. 'threshold' is the fraction of the limit above
    # which the engine is skipped.
    ENGINE_LIMITS = {
        'elevenlabs': {'chars_per_month': 10000, 'threshold': 0.8},
        'openai_tts': {'chars_per_day': 500000, 'threshold': 0.9},
    }

    # Environment variable that must be set for an engine to be usable.
    # cosyvoice2, kokoro, edge_tts are local — no API key needed.
    ENGINE_API_KEYS = {
        'elevenlabs': 'ELEVENLABS_API_KEY',
        'openai_tts': 'OPENAI_API_KEY',
    }

    def __init__(self, resolved_config: dict):
        """
        Args:
            resolved_config: output from ConfigResolver.resolve(). Only the
                'budget' (default 'free') and 'tts' (default 'edge_tts')
                keys are read.
        """
        self.budget = resolved_config.get('budget', 'free')
        self.tts_engine = resolved_config.get('tts', 'edge_tts')
        self._usage = {}      # {engine_name: characters recorded so far}
        self._failed = set()  # engines that failed this session

    def select(self, text_length: int) -> str:
        """
        Select the best available TTS engine for a text of *text_length* chars.

        1. An explicitly configured engine (anything other than 'auto',
           'edge_tts' or an empty/None value) is returned if its API key is
           present (or it needs none) and it has not failed this session.
           Usage limits are not checked for an explicit choice.
        2. Otherwise engines are tried in a budget-dependent priority order,
           skipping those that failed, lack an API key, or would exceed
           their usage threshold.
        3. 'edge_tts' is returned when nothing else qualifies.
        """
        import os

        requested = self.tts_engine
        # A falsy value (None / '') means "no explicit choice"; without this
        # guard a config with tts=None would make select() return None.
        if requested and requested not in ('auto', 'edge_tts'):
            key_env = self.ENGINE_API_KEYS.get(requested, '')
            key_ok = not key_env or bool(os.environ.get(key_env, ''))
            if key_ok and requested not in self._failed:
                return requested

        # Budget-based priority selection.
        if self.budget == 'free':
            priority = ['kokoro', 'edge_tts']
        elif self.budget == 'low':
            priority = ['openai_tts', 'kokoro', 'edge_tts']
        else:  # medium, premium
            priority = self.ENGINE_PRIORITY

        for engine in priority:
            if engine in self._failed:
                continue
            key_env = self.ENGINE_API_KEYS.get(engine, '')
            if key_env and not os.environ.get(key_env, ''):
                continue  # no API key
            if self._is_over_limit(engine, text_length):
                continue
            return engine
        return 'edge_tts'  # always available

    def on_failure(self, engine: str, error: str) -> str:
        """
        Record an engine failure and return the next engine to try.

        The failed engine is excluded from every later select() call on this
        instance, so it is never retried.
        """
        import logging

        # Lazy %-formatting, with a separator between engine and error so the
        # two do not run together in the log line.
        logging.getLogger(__name__).warning(
            'TTS 엔진 실패: %s — %s, 다음 엔진으로 전환', engine, error
        )
        self._failed.add(engine)
        return self.select(0)  # select next engine

    def record_usage(self, engine: str, char_count: int) -> None:
        """Add *char_count* to the running character total for *engine*."""
        self._usage[engine] = self._usage.get(engine, 0) + char_count

    def _is_over_limit(self, engine: str, text_length: int) -> bool:
        """
        Return True if sending *text_length* more characters would push
        *engine* past its configured threshold.

        Engines without an ENGINE_LIMITS entry are never over limit. The
        daily limit is used when present, otherwise the monthly one; both
        are compared against the same running total.
        """
        limits = self.ENGINE_LIMITS.get(engine, {})
        if not limits:
            return False
        threshold = limits.get('threshold', 0.9)
        quota = limits.get('chars_per_day', limits.get('chars_per_month', 0))
        if not quota:
            return False
        used = self._usage.get(engine, 0)
        return (used + text_length) / quota > threshold
# ─── 공통 유틸 ────────────────────────────────────────────────
@@ -167,6 +272,47 @@ def _get_ffmpeg() -> str:
return 'ffmpeg'
# ─── OpenAI TTS ───────────────────────────────────────────────
def _tts_openai(text: str, output_path: Path, cfg: dict) -> list[dict]:
    """
    Synthesize *text* with the OpenAI speech endpoint and write it to
    *output_path*.

    The MP3 response is saved next to *output_path* (same name, '.mp3'
    suffix), converted with _mp3_to_wav, and the temporary MP3 is removed
    even if the conversion fails.

    Args:
        text: text to synthesize.
        output_path: destination path passed to _mp3_to_wav.
        cfg: configuration dict; cfg['tts']['openai'] may set 'model'
            (default 'tts-1-hd'), 'voice' (default 'alloy') and 'speed'
            (default 1.0).

    Returns:
        Always an empty list: no word-level timestamps are produced here.
        The original note says the caption renderer falls back to uniform
        timing in that case.

    Raises:
        RuntimeError: OPENAI_API_KEY is not set.
        requests.HTTPError: the API answered with an error status.
    """
    import os

    import requests

    api_key = os.environ.get('OPENAI_API_KEY', '')
    if not api_key:
        raise RuntimeError('OPENAI_API_KEY not set')

    openai_cfg = cfg.get('tts', {}).get('openai', {})
    payload = {
        'model': openai_cfg.get('model', 'tts-1-hd'),
        'input': text,
        'voice': openai_cfg.get('voice', 'alloy'),
        'speed': openai_cfg.get('speed', 1.0),
        'response_format': 'mp3',
    }
    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json',
    }
    resp = requests.post(
        'https://api.openai.com/v1/audio/speech',
        headers=headers,
        json=payload,
        timeout=60,
    )
    resp.raise_for_status()

    mp3_tmp = output_path.with_suffix('.mp3')
    try:
        mp3_tmp.write_bytes(resp.content)
        _mp3_to_wav(mp3_tmp, output_path)
    finally:
        # Do not leave the intermediate MP3 behind when conversion raises.
        mp3_tmp.unlink(missing_ok=True)

    return []
# ─── Google Cloud TTS ─────────────────────────────────────────
def _tts_google_cloud(text: str, output_path: Path, cfg: dict) -> list[dict]:
@@ -323,11 +469,21 @@ def generate_tts(
ts_path = output_dir / f'{timestamp}_timestamps.json'
text = _concat_script(script)
pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300)
priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'google_cloud', 'edge_tts'])
# Apply Korean preprocessing if available
try:
from bots.prompt_layer.korean_preprocessor import preprocess_korean
text = preprocess_korean(text)
except ImportError:
pass # Korean preprocessing not available, use raw text
pause_ms = cfg.get('tts', {}).get('inter_sentence_pause_ms', 300)
priority = cfg.get('tts', {}).get('engine_priority', ['elevenlabs', 'openai_tts', 'google_cloud', 'edge_tts'])
# Engine map: elevenlabs → openai_tts → google_cloud → edge_tts
engine_map = {
'elevenlabs': _tts_elevenlabs,
'openai_tts': _tts_openai,
'google_cloud': _tts_google_cloud,
'edge_tts': _tts_edge,
}