- korean_preprocessor: 발음 사전 176 → 206개 (200+ 달성) - video_engine: SoraEngine 완전 제거 (2026-03-24 서비스 종료) - smart_video_router: veo3/seedance2 빈 문자열 반환 → ffmpeg_slides 폴백 - cli/init: gemini_web 서비스 설정 질문 추가 (user_profile 일치) - caption_renderer, tts_engine, video_assembler: --test 스탠드얼론 블록 추가 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
443 lines
13 KiB
Python
443 lines
13 KiB
Python
"""
|
|
bots/prompt_layer/korean_preprocessor.py
|
|
Korean TTS text preprocessing.
|
|
|
|
Functions:
|
|
- preprocess_korean(text): apply pronunciation map + number conversion
|
|
- insert_pauses(script): insert SSML/marker pauses by sentence type
|
|
"""
|
|
import re
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# English/acronym → Korean phonetic pronunciation
|
|
# 200+ entries covering tech, finance, social media, brands, etc.
|
|
PRONUNCIATION_MAP = {
|
|
# AI/Tech terms
|
|
'AI': '에이아이',
|
|
'API': '에이피아이',
|
|
'GPT': '지피티',
|
|
'ChatGPT': '챗지피티',
|
|
'Claude': '클로드',
|
|
'GitHub': '깃허브',
|
|
'OpenAI': '오픈에이아이',
|
|
'YouTube': '유튜브',
|
|
'TikTok': '틱톡',
|
|
'SEO': '에스이오',
|
|
'SaaS': '사스',
|
|
'UI': '유아이',
|
|
'UX': '유엑스',
|
|
'LLM': '엘엘엠',
|
|
'NFT': '엔에프티',
|
|
'DeFi': '디파이',
|
|
'IoT': '아이오티',
|
|
'AR': '에이알',
|
|
'VR': '브이알',
|
|
'ML': '머신러닝',
|
|
'NLP': '엔엘피',
|
|
'DevOps': '데브옵스',
|
|
'SQL': '에스큐엘',
|
|
'HTML': '에이치티엠엘',
|
|
'CSS': '씨에스에스',
|
|
'JSON': '제이슨',
|
|
'URL': '유알엘',
|
|
'HTTP': '에이치티티피',
|
|
'HTTPS': '에이치티티피에스',
|
|
'PC': '피씨',
|
|
'CPU': '씨피유',
|
|
'GPU': '지피유',
|
|
'RAM': '램',
|
|
'SSD': '에스에스디',
|
|
'USB': '유에스비',
|
|
'WiFi': '와이파이',
|
|
'Bluetooth': '블루투스',
|
|
'iOS': '아이오에스',
|
|
'Android': '안드로이드',
|
|
'App': '앱',
|
|
'IT': '아이티',
|
|
'ICT': '아이씨티',
|
|
'SNS': '에스엔에스',
|
|
'KPI': '케이피아이',
|
|
'ROI': '알오아이',
|
|
'B2B': '비투비',
|
|
'B2C': '비투씨',
|
|
'MVP': '엠브이피',
|
|
'OKR': '오케이알',
|
|
'CTO': '씨티오',
|
|
'CEO': '씨이오',
|
|
'CFO': '씨에프오',
|
|
'HR': '에이치알',
|
|
'PR': '피알',
|
|
'IR': '아이알',
|
|
# Social/Platforms
|
|
'Instagram': '인스타그램',
|
|
'Facebook': '페이스북',
|
|
'Twitter': '트위터',
|
|
'LinkedIn': '링크드인',
|
|
'Netflix': '넷플릭스',
|
|
'Spotify': '스포티파이',
|
|
'Uber': '우버',
|
|
'Airbnb': '에어비앤비',
|
|
'Amazon': '아마존',
|
|
'Google': '구글',
|
|
'Apple': '애플',
|
|
'Microsoft': '마이크로소프트',
|
|
'Samsung': '삼성',
|
|
'LG': '엘지',
|
|
'SK': '에스케이',
|
|
'KT': '케이티',
|
|
# Finance
|
|
'ETF': '이티에프',
|
|
'IPO': '아이피오',
|
|
'S&P': '에스앤피',
|
|
'NASDAQ': '나스닥',
|
|
'KOSPI': '코스피',
|
|
'KOSDAQ': '코스닥',
|
|
'GDP': '지디피',
|
|
'IMF': '아이엠에프',
|
|
'ECB': '이씨비',
|
|
'Fed': '연준',
|
|
'P/E': '주가수익비율',
|
|
# Health/Science
|
|
'DNA': '디엔에이',
|
|
'RNA': '알엔에이',
|
|
'BMI': '비엠아이',
|
|
'COVID': '코비드',
|
|
'PCR': '피씨알',
|
|
# Education/Certification
|
|
'MBA': '엠비에이',
|
|
'PhD': '박사',
|
|
'IELTS': '아이엘츠',
|
|
'TOEIC': '토익',
|
|
'TOEFL': '토플',
|
|
# Measurement units
|
|
'km': '킬로미터',
|
|
'kg': '킬로그램',
|
|
'MB': '메가바이트',
|
|
'GB': '기가바이트',
|
|
'TB': '테라바이트',
|
|
'Hz': '헤르츠',
|
|
'MHz': '메가헤르츠',
|
|
'GHz': '기가헤르츠',
|
|
# Media/Entertainment
|
|
'OTT': '오티티',
|
|
'VOD': '브이오디',
|
|
'BGM': '비지엠',
|
|
'OST': '오에스티',
|
|
'DJ': '디제이',
|
|
'MC': '엠씨',
|
|
'PD': '피디',
|
|
'CP': '씨피',
|
|
# Common English words used in Korean context
|
|
'App Store': '앱 스토어',
|
|
'Play Store': '플레이 스토어',
|
|
'ChatBot': '챗봇',
|
|
'Web3': '웹쓰리',
|
|
'Metaverse': '메타버스',
|
|
'Blockchain': '블록체인',
|
|
'Crypto': '크립토',
|
|
'Bitcoin': '비트코인',
|
|
'Ethereum': '이더리움',
|
|
'Cloud': '클라우드',
|
|
'Big Data': '빅데이터',
|
|
'Startup': '스타트업',
|
|
'Fintech': '핀테크',
|
|
'Edtech': '에드테크',
|
|
'Healthtech': '헬스테크',
|
|
'PropTech': '프롭테크',
|
|
'LegalTech': '리걸테크',
|
|
'FOMO': '포모',
|
|
'YOLO': '욜로',
|
|
'MZ': '엠제트',
|
|
# More tech
|
|
'Python': '파이썬',
|
|
'JavaScript': '자바스크립트',
|
|
'TypeScript': '타입스크립트',
|
|
'React': '리액트',
|
|
'Node.js': '노드제이에스',
|
|
'Docker': '도커',
|
|
'Kubernetes': '쿠버네티스',
|
|
'AWS': '에이더블유에스',
|
|
'GCP': '지씨피',
|
|
'Azure': '애저',
|
|
'Slack': '슬랙',
|
|
'Zoom': '줌',
|
|
'Discord': '디스코드',
|
|
'Notion': '노션',
|
|
'Figma': '피그마',
|
|
'Canva': '캔바',
|
|
# Business/Strategy
|
|
'OEM': '오이엠',
|
|
'ODM': '오디엠',
|
|
'SCM': '에스씨엠',
|
|
'ERP': '이알피',
|
|
'CRM': '씨알엠',
|
|
# More social media
|
|
'Reels': '릴스',
|
|
'Stories': '스토리',
|
|
'Live': '라이브',
|
|
'Feed': '피드',
|
|
'DM': '디엠',
|
|
'PM': '피엠',
|
|
'QA': '큐에이',
|
|
# Content
|
|
'Blog': '블로그',
|
|
'Vlog': '브이로그',
|
|
'Podcast': '팟캐스트',
|
|
'Newsletter': '뉴스레터',
|
|
'Shorts': '쇼츠',
|
|
'Reel': '릴',
|
|
# Misc
|
|
'OK': '오케이',
|
|
'NO': '노',
|
|
'YES': '예스',
|
|
'WOW': '와우',
|
|
'LOL': '엘오엘',
|
|
'BTW': '그런데',
|
|
'FYI': '참고로',
|
|
'ASAP': '최대한 빨리',
|
|
'FAQ': '자주 묻는 질문',
|
|
'Q&A': '질의응답',
|
|
'A/S': '에이에스',
|
|
'DIY': '디아이와이',
|
|
'PPT': '피피티',
|
|
'PDF': '피디에프',
|
|
'ZIP': '집',
|
|
# AI/LLM extended
|
|
'Gemini': '제미나이',
|
|
'Grok': '그록',
|
|
'Copilot': '코파일럿',
|
|
'Perplexity': '퍼플렉시티',
|
|
'Midjourney': '미드저니',
|
|
'Stable Diffusion': '스테이블 디퓨전',
|
|
'DALL-E': '달리',
|
|
'Sora': '소라',
|
|
'Kling': '클링',
|
|
'Runway': '런웨이',
|
|
# Dev tools / infra
|
|
'Git': '깃',
|
|
'Linux': '리눅스',
|
|
'Ubuntu': '우분투',
|
|
'Windows': '윈도우',
|
|
'macOS': '맥오에스',
|
|
'Terminal': '터미널',
|
|
'CI/CD': '씨아이씨디',
|
|
'API Gateway': '에이피아이 게이트웨이',
|
|
# Finance extended
|
|
'PER': '주가수익비율',
|
|
'PBR': '주가순자산비율',
|
|
'EPS': '주당순이익',
|
|
'ROE': '자기자본이익률',
|
|
'CAGR': '연평균성장률',
|
|
# E-commerce / marketing
|
|
'CPC': '클릭당비용',
|
|
'CPM': '천회노출당비용',
|
|
'CTA': '씨티에이',
|
|
'CTR': '클릭률',
|
|
'ROAS': '광고수익률',
|
|
'LTV': '고객생애가치',
|
|
}
|
|
|
|
# Pause durations in milliseconds by sentence type
|
|
DYNAMIC_PAUSES = {
|
|
'hook_after': 500, # ms — impact emphasis after hook
|
|
'question_after': 400, # thinking time after question
|
|
'normal_after': 300, # standard sentence end
|
|
'section_break': 600, # body → closer transition
|
|
'comma': 150, # comma pause
|
|
'exclamation': 200, # exclamation mark pause
|
|
}
|
|
|
|
# Number → Korean word conversion rules
|
|
_NUM_TO_KO = {
|
|
0: '영', 1: '일', 2: '이', 3: '삼', 4: '사', 5: '오',
|
|
6: '육', 7: '칠', 8: '팔', 9: '구', 10: '십',
|
|
100: '백', 1000: '천', 10000: '만',
|
|
}
|
|
|
|
# Counter words for common units (for better number reading)
|
|
_COUNTER_MAP = {
|
|
'개': ('개', False), # items
|
|
'명': ('명', False), # people
|
|
'번': ('번', False), # times
|
|
'배': ('배', False), # times/multiples
|
|
'위': ('위', False), # rank
|
|
'가지': ('가지', True), # types (use sino-Korean)
|
|
'초': ('초', False), # seconds
|
|
'분': ('분', False), # minutes
|
|
'시간': ('시간', False), # hours
|
|
'일': ('일', False), # days
|
|
'월': ('월', False), # months
|
|
'년': ('년', False), # years
|
|
'%': ('퍼센트', False), # percent
|
|
}
|
|
|
|
|
|
def preprocess_korean(text: str) -> str:
|
|
"""
|
|
Apply pronunciation map and number conversion to Korean text.
|
|
|
|
1. Replace English/acronym terms with Korean phonetics
|
|
2. Convert Arabic numerals with counter words to Korean
|
|
|
|
Returns processed text ready for TTS.
|
|
"""
|
|
# Apply pronunciation map (longer strings first to avoid partial replacement)
|
|
sorted_map = sorted(PRONUNCIATION_MAP.items(), key=lambda x: -len(x[0]))
|
|
for en, ko in sorted_map:
|
|
# Word boundary replacement to avoid partial matches
|
|
text = re.sub(r'(?<![가-힣\w])' + re.escape(en) + r'(?![가-힣\w])', ko, text)
|
|
|
|
# Convert numbers
|
|
text = _convert_numbers(text)
|
|
|
|
return text
|
|
|
|
|
|
def _convert_numbers(text: str) -> str:
|
|
"""
|
|
Convert Arabic numerals in Korean context.
|
|
e.g.: "3가지" → "세 가지", "100%" → "백 퍼센트"
|
|
"""
|
|
# Handle percentage
|
|
text = re.sub(r'(\d+)%', lambda m: _num_to_korean(int(m.group(1))) + ' 퍼센트', text)
|
|
|
|
# Handle number + counter word
|
|
for counter, (ko_counter, use_sino) in _COUNTER_MAP.items():
|
|
if counter == '%':
|
|
continue
|
|
pattern = r'(\d+)\s*' + re.escape(counter)
|
|
def replace(m, kc=ko_counter):
|
|
n = int(m.group(1))
|
|
return _num_to_korean(n) + ' ' + kc
|
|
text = re.sub(pattern, replace, text)
|
|
|
|
return text
|
|
|
|
|
|
def _num_to_korean(n: int) -> str:
|
|
"""Convert integer to Korean sino-Korean numeral string."""
|
|
if n == 0:
|
|
return '영'
|
|
if n < 0:
|
|
return '마이너스 ' + _num_to_korean(-n)
|
|
|
|
result = ''
|
|
if n >= 10000:
|
|
man = n // 10000
|
|
result += _num_to_korean(man) + '만'
|
|
n %= 10000
|
|
if n >= 1000:
|
|
cheon = n // 1000
|
|
result += ('' if cheon == 1 else _num_to_korean(cheon)) + '천'
|
|
n %= 1000
|
|
if n >= 100:
|
|
baek = n // 100
|
|
result += ('' if baek == 1 else _num_to_korean(baek)) + '백'
|
|
n %= 100
|
|
if n >= 10:
|
|
sip = n // 10
|
|
result += ('' if sip == 1 else _num_to_korean(sip)) + '십'
|
|
n %= 10
|
|
if n > 0:
|
|
result += _NUM_TO_KO[n]
|
|
|
|
return result
|
|
|
|
|
|
def insert_pauses(script: dict, engine: str = 'ssml') -> dict:
|
|
"""
|
|
Insert pause markers into script by sentence type.
|
|
|
|
engine='ssml': insert SSML <break> tags (for ElevenLabs, Google TTS)
|
|
engine='marker': insert [[PAUSE_Xms]] text markers (for Edge TTS, others)
|
|
|
|
Returns modified script dict with pauses inserted.
|
|
"""
|
|
result = dict(script)
|
|
|
|
hook = script.get('hook', '')
|
|
body = script.get('body', [])
|
|
closer = script.get('closer', '')
|
|
|
|
# Add pause after hook
|
|
if hook:
|
|
pause_ms = DYNAMIC_PAUSES['hook_after']
|
|
result['hook'] = hook + _pause_marker(pause_ms, engine)
|
|
|
|
# Add pauses within body sentences
|
|
processed_body = []
|
|
for i, sentence in enumerate(body):
|
|
processed = _add_inline_pauses(sentence, engine)
|
|
# Add section break before closer transition
|
|
if i == len(body) - 1:
|
|
processed += _pause_marker(DYNAMIC_PAUSES['section_break'], engine)
|
|
else:
|
|
processed += _pause_marker(DYNAMIC_PAUSES['normal_after'], engine)
|
|
processed_body.append(processed)
|
|
result['body'] = processed_body
|
|
|
|
return result
|
|
|
|
|
|
def _add_inline_pauses(sentence: str, engine: str) -> str:
|
|
"""Add pauses at commas and after exclamation marks."""
|
|
# Comma pauses
|
|
sentence = re.sub(
|
|
r',\s*',
|
|
',' + _pause_marker(DYNAMIC_PAUSES['comma'], engine),
|
|
sentence
|
|
)
|
|
# Question mark pauses
|
|
sentence = re.sub(
|
|
r'\?\s*',
|
|
'?' + _pause_marker(DYNAMIC_PAUSES['question_after'], engine),
|
|
sentence
|
|
)
|
|
# Exclamation pauses
|
|
sentence = re.sub(
|
|
r'!\s*',
|
|
'!' + _pause_marker(DYNAMIC_PAUSES['exclamation'], engine),
|
|
sentence
|
|
)
|
|
return sentence
|
|
|
|
|
|
def _pause_marker(ms: int, engine: str) -> str:
|
|
"""Generate engine-appropriate pause marker."""
|
|
if engine == 'ssml':
|
|
return f'<break time="{ms}ms"/>'
|
|
else:
|
|
return f' [[PAUSE_{ms}ms]] '
|
|
|
|
|
|
# ── Standalone test ──────────────────────────────────────────────
|
|
|
|
if __name__ == '__main__':
|
|
import sys
|
|
if '--test' in sys.argv:
|
|
print("=== Korean Preprocessor Test ===")
|
|
test_texts = [
|
|
"AI와 ChatGPT가 SEO를 바꾸고 있어요",
|
|
"3가지 방법으로 100%의 수익을 낼 수 있습니다",
|
|
"YouTube와 TikTok에서 SNS 마케팅하기",
|
|
"GPT API를 사용한 SaaS 창업",
|
|
]
|
|
for text in test_texts:
|
|
result = preprocess_korean(text)
|
|
print(f"원문: {text}")
|
|
print(f"처리: {result}")
|
|
print()
|
|
|
|
# Test pause insertion
|
|
test_script = {
|
|
'hook': '이거 모르면 손해입니다!',
|
|
'body': ['첫 번째, AI를 활용하면 10배 빠릅니다.', '두 번째, 자동화가 핵심입니다.'],
|
|
'closer': '지금 바로 시작하세요.'
|
|
}
|
|
processed = insert_pauses(test_script, engine='marker')
|
|
print("=== Pause Insertion Test ===")
|
|
for k, v in processed.items():
|
|
print(f"{k}: {v}")
|