Files
blog-writer/bots/prompt_layer/korean_preprocessor.py
sinmb79 66be55ba8a fix(v3): code review 5개 이슈 수정
- korean_preprocessor: 발음 사전 176 → 206개 (200+ 달성)
- video_engine: SoraEngine 완전 제거 (2026-03-24 서비스 종료)
- smart_video_router: veo3/seedance2 빈 문자열 반환 → ffmpeg_slides 폴백
- cli/init: gemini_web 서비스 설정 질문 추가 (user_profile 일치)
- caption_renderer, tts_engine, video_assembler: --test 스탠드얼론 블록 추가

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 16:14:51 +09:00

443 lines
13 KiB
Python

"""
bots/prompt_layer/korean_preprocessor.py
Korean TTS text preprocessing.
Functions:
- preprocess_korean(text): apply pronunciation map + number conversion
- insert_pauses(script): insert SSML/marker pauses by sentence type
"""
import re
import logging
logger = logging.getLogger(__name__)
# English/acronym → Korean phonetic pronunciation
# 200+ entries covering tech, finance, social media, brands, etc.
#
# Each value replaces its key verbatim in the text handed to TTS, so a value
# must never be empty: an empty string would silently delete the word from the
# narration. Keys are matched exactly as written (case-sensitive).
PRONUNCIATION_MAP = {
    # AI/Tech terms
    'AI': '에이아이',
    'API': '에이피아이',
    'GPT': '지피티',
    'ChatGPT': '챗지피티',
    'Claude': '클로드',
    'GitHub': '깃허브',
    'OpenAI': '오픈에이아이',
    'YouTube': '유튜브',
    'TikTok': '틱톡',
    'SEO': '에스이오',
    'SaaS': '사스',
    'UI': '유아이',
    'UX': '유엑스',
    'LLM': '엘엘엠',
    'NFT': '엔에프티',
    'DeFi': '디파이',
    'IoT': '아이오티',
    'AR': '에이알',
    'VR': '브이알',
    'ML': '머신러닝',
    'NLP': '엔엘피',
    'DevOps': '데브옵스',
    'SQL': '에스큐엘',
    'HTML': '에이치티엠엘',
    'CSS': '씨에스에스',
    'JSON': '제이슨',
    'URL': '유알엘',
    'HTTP': '에이치티티피',
    'HTTPS': '에이치티티피에스',
    'PC': '피씨',
    'CPU': '씨피유',
    'GPU': '지피유',
    'RAM': '램',
    'SSD': '에스에스디',
    'USB': '유에스비',
    'WiFi': '와이파이',
    'Bluetooth': '블루투스',
    'iOS': '아이오에스',
    'Android': '안드로이드',
    'App': '앱',
    'IT': '아이티',
    'ICT': '아이씨티',
    'SNS': '에스엔에스',
    'KPI': '케이피아이',
    'ROI': '알오아이',
    'B2B': '비투비',
    'B2C': '비투씨',
    'MVP': '엠브이피',
    'OKR': '오케이알',
    'CTO': '씨티오',
    'CEO': '씨이오',
    'CFO': '씨에프오',
    'HR': '에이치알',
    'PR': '피알',
    'IR': '아이알',
    # Social/Platforms
    'Instagram': '인스타그램',
    'Facebook': '페이스북',
    'Twitter': '트위터',
    'LinkedIn': '링크드인',
    'Netflix': '넷플릭스',
    'Spotify': '스포티파이',
    'Uber': '우버',
    'Airbnb': '에어비앤비',
    'Amazon': '아마존',
    'Google': '구글',
    'Apple': '애플',
    'Microsoft': '마이크로소프트',
    'Samsung': '삼성',
    'LG': '엘지',
    'SK': '에스케이',
    'KT': '케이티',
    # Finance
    'ETF': '이티에프',
    'IPO': '아이피오',
    'S&P': '에스앤피',
    'NASDAQ': '나스닥',
    'KOSPI': '코스피',
    'KOSDAQ': '코스닥',
    'GDP': '지디피',
    'IMF': '아이엠에프',
    'ECB': '이씨비',
    'Fed': '연준',
    'P/E': '주가수익비율',
    # Health/Science
    'DNA': '디엔에이',
    'RNA': '알엔에이',
    'BMI': '비엠아이',
    'COVID': '코비드',
    'PCR': '피씨알',
    # Education/Certification
    'MBA': '엠비에이',
    'PhD': '박사',
    'IELTS': '아이엘츠',
    'TOEIC': '토익',
    'TOEFL': '토플',
    # Measurement units
    'km': '킬로미터',
    'kg': '킬로그램',
    'MB': '메가바이트',
    'GB': '기가바이트',
    'TB': '테라바이트',
    'Hz': '헤르츠',
    'MHz': '메가헤르츠',
    'GHz': '기가헤르츠',
    # Media/Entertainment
    'OTT': '오티티',
    'VOD': '브이오디',
    'BGM': '비지엠',
    'OST': '오에스티',
    'DJ': '디제이',
    'MC': '엠씨',
    'PD': '피디',
    'CP': '씨피',
    # Common English words used in Korean context
    'App Store': '앱 스토어',
    'Play Store': '플레이 스토어',
    'ChatBot': '챗봇',
    'Web3': '웹쓰리',
    'Metaverse': '메타버스',
    'Blockchain': '블록체인',
    'Crypto': '크립토',
    'Bitcoin': '비트코인',
    'Ethereum': '이더리움',
    'Cloud': '클라우드',
    'Big Data': '빅데이터',
    'Startup': '스타트업',
    'Fintech': '핀테크',
    'Edtech': '에드테크',
    'Healthtech': '헬스테크',
    'PropTech': '프롭테크',
    'LegalTech': '리걸테크',
    'FOMO': '포모',
    'YOLO': '욜로',
    'MZ': '엠제트',
    # More tech
    'Python': '파이썬',
    'JavaScript': '자바스크립트',
    'TypeScript': '타입스크립트',
    'React': '리액트',
    'Node.js': '노드제이에스',
    'Docker': '도커',
    'Kubernetes': '쿠버네티스',
    'AWS': '에이더블유에스',
    'GCP': '지씨피',
    'Azure': '애저',
    'Slack': '슬랙',
    'Zoom': '줌',
    'Discord': '디스코드',
    'Notion': '노션',
    'Figma': '피그마',
    'Canva': '캔바',
    # Business/Strategy
    'OEM': '오이엠',
    'ODM': '오디엠',
    'SCM': '에스씨엠',
    'ERP': '이알피',
    'CRM': '씨알엠',
    # More social media
    'Reels': '릴스',
    'Stories': '스토리',
    'Live': '라이브',
    'Feed': '피드',
    'DM': '디엠',
    'PM': '피엠',
    'QA': '큐에이',
    # Content
    'Blog': '블로그',
    'Vlog': '브이로그',
    'Podcast': '팟캐스트',
    'Newsletter': '뉴스레터',
    'Shorts': '쇼츠',
    'Reel': '릴',
    # Misc
    'OK': '오케이',
    'NO': '노',
    'YES': '예스',
    'WOW': '와우',
    'LOL': '엘오엘',
    'BTW': '그런데',
    'FYI': '참고로',
    'ASAP': '최대한 빨리',
    'FAQ': '자주 묻는 질문',
    'Q&A': '질의응답',
    'A/S': '에이에스',
    'DIY': '디아이와이',
    'PPT': '피피티',
    'PDF': '피디에프',
    'ZIP': '집',
    # AI/LLM extended
    'Gemini': '제미나이',
    'Grok': '그록',
    'Copilot': '코파일럿',
    'Perplexity': '퍼플렉시티',
    'Midjourney': '미드저니',
    'Stable Diffusion': '스테이블 디퓨전',
    'DALL-E': '달리',
    'Sora': '소라',
    'Kling': '클링',
    'Runway': '런웨이',
    # Dev tools / infra
    'Git': '깃',
    'Linux': '리눅스',
    'Ubuntu': '우분투',
    'Windows': '윈도우',
    'macOS': '맥오에스',
    'Terminal': '터미널',
    'CI/CD': '씨아이씨디',
    'API Gateway': '에이피아이 게이트웨이',
    # Finance extended
    'PER': '주가수익비율',
    'PBR': '주가순자산비율',
    'EPS': '주당순이익',
    'ROE': '자기자본이익률',
    'CAGR': '연평균성장률',
    # E-commerce / marketing
    'CPC': '클릭당비용',
    'CPM': '천회노출당비용',
    'CTA': '씨티에이',
    'CTR': '클릭률',
    'ROAS': '광고수익률',
    'LTV': '고객생애가치',
}
# Pause lengths in milliseconds, keyed by the position or punctuation they follow
DYNAMIC_PAUSES = dict(
    hook_after=500,      # emphasis beat once the hook has been delivered
    question_after=400,  # gives the listener time to think after a question
    normal_after=300,    # ordinary end-of-sentence gap
    section_break=600,   # transition from the body into the closer
    comma=150,           # short breath at a comma
    exclamation=200,     # beat after an exclamation mark
)
# Number → Korean word conversion rules
_NUM_TO_KO = {
0: '', 1: '', 2: '', 3: '', 4: '', 5: '',
6: '', 7: '', 8: '', 9: '', 10: '',
100: '', 1000: '', 10000: '',
}
# Counter words for common units (for better number reading)
_COUNTER_MAP = {
'': ('', False), # items
'': ('', False), # people
'': ('', False), # times
'': ('', False), # times/multiples
'': ('', False), # rank
'가지': ('가지', True), # types (use sino-Korean)
'': ('', False), # seconds
'': ('', False), # minutes
'시간': ('시간', False), # hours
'': ('', False), # days
'': ('', False), # months
'': ('', False), # years
'%': ('퍼센트', False), # percent
}
def preprocess_korean(text: str) -> str:
    """
    Apply pronunciation map and number conversion to Korean text.

    1. Replace English/acronym terms from PRONUNCIATION_MAP with Korean phonetics
    2. Convert Arabic numerals with counter words to Korean (_convert_numbers)

    Terms are matched case-sensitively and only when they are not glued to
    another ASCII letter or underscore. Hangul and digits may touch the term,
    so "AI와", "SEO를" and "16GB" are all converted, while the "AI" inside
    "PAID" is left alone.

    Args:
        text: Raw script text.

    Returns:
        Processed text ready for TTS.
    """
    # Longest terms first so 'App Store' wins over 'App' and 'HTTPS' over 'HTTP'.
    ordered_terms = sorted(PRONUNCIATION_MAP.items(), key=lambda item: -len(item[0]))
    for en, ko in ordered_terms:
        if not ko:
            # An empty pronunciation would delete the word from the narration;
            # leaving the original term in place is the safer fallback.
            continue
        # Korean particles attach directly to loanwords, so the boundary check
        # must only reject neighbouring ASCII letters. (Python's \w is
        # Unicode-aware and would treat Hangul as part of the same word.)
        pattern = r'(?<![A-Za-z_])' + re.escape(en) + r'(?![A-Za-z_])'
        # A callable replacement inserts the pronunciation literally, with no
        # backslash/group-reference processing.
        text = re.sub(pattern, lambda _match, spoken=ko: spoken, text)
    return _convert_numbers(text)
def _convert_numbers(text: str) -> str:
    """
    Convert Arabic numerals followed by a counter word into Korean readings.

    The number is spelled with _num_to_korean (always the sino-Korean reading;
    the use_sino flag stored in _COUNTER_MAP is not consulted) and joined to
    the spoken counter with a single space.
    e.g.: "3가지" → "삼 가지", "100%" → "백 퍼센트", "1,000개" → "천 개"

    Numbers without a recognised counter are left as digits. Decimal numbers
    such as "3.5%" are also left untouched rather than converting only the
    fractional digits.

    Args:
        text: Text that may contain "<number><counter>" sequences.

    Returns:
        Text with those sequences spelled out in Korean.
    """
    # Empty counter keys are dropped: an empty key would turn the pattern into
    # "any number", spelling out every digit run in the text.
    counters = {
        counter: spoken
        for counter, (spoken, _use_sino) in _COUNTER_MAP.items()
        if counter
    }
    # Percent is always read as '퍼센트', independent of the table contents.
    counters['%'] = '퍼센트'

    alternation = '|'.join(
        re.escape(counter) for counter in sorted(counters, key=len, reverse=True)
    )
    pattern = re.compile(
        r'(?<!\d)(?<!\d[.,])'          # not in the middle/fraction of a number
        r'(\d{1,3}(?:,\d{3})+|\d+)'    # 1,234,567 style or plain digits
        r'\s*(' + alternation + r')'
    )

    def spell_out(match):
        value = int(match.group(1).replace(',', ''))
        return _num_to_korean(value) + ' ' + counters[match.group(2)]

    # One pass over the text: output of a replacement (e.g. the "일" of 1) can
    # never be re-read as a counter by a later pass.
    return pattern.sub(spell_out, text)
def _num_to_korean(n: int) -> str:
"""Convert integer to Korean sino-Korean numeral string."""
if n == 0:
return ''
if n < 0:
return '마이너스 ' + _num_to_korean(-n)
result = ''
if n >= 10000:
man = n // 10000
result += _num_to_korean(man) + ''
n %= 10000
if n >= 1000:
cheon = n // 1000
result += ('' if cheon == 1 else _num_to_korean(cheon)) + ''
n %= 1000
if n >= 100:
baek = n // 100
result += ('' if baek == 1 else _num_to_korean(baek)) + ''
n %= 100
if n >= 10:
sip = n // 10
result += ('' if sip == 1 else _num_to_korean(sip)) + ''
n %= 10
if n > 0:
result += _NUM_TO_KO[n]
return result
def insert_pauses(script: dict, engine: str = 'ssml') -> dict:
    """
    Insert pause markers into a script.

    engine='ssml': insert SSML <break> tags (for ElevenLabs, Google TTS)
    engine='marker': insert [[PAUSE_Xms]] text markers (for Edge TTS, others)

    Pauses added:
    - after the hook ('hook_after'), when the hook is non-empty
    - inside every body sentence at commas, question marks and exclamation marks
    - after every body sentence ('normal_after'), except the last one, which
      gets the longer 'section_break' pause

    Args:
        script: Dict read via the keys 'hook' (str) and 'body' (list of str).
            Any other keys are carried over unchanged.
        engine: 'ssml' for <break> tags; any other value yields text markers.

    Returns:
        A shallow copy of script with 'hook' (if present and non-empty) and
        'body' replaced; the input dict and its body list are not mutated.
        'body' is always set, to an empty list when missing or None.
    """
    result = dict(script)

    hook = script.get('hook', '')
    if hook:
        result['hook'] = hook + _pause_marker(DYNAMIC_PAUSES['hook_after'], engine)

    # A missing or None body is treated as "no sentences" instead of crashing.
    body = script.get('body') or []
    last_index = len(body) - 1
    processed_body = []
    for i, sentence in enumerate(body):
        # The final body sentence is followed by the longer section break.
        trailing = 'section_break' if i == last_index else 'normal_after'
        processed_body.append(
            _add_inline_pauses(sentence, engine)
            + _pause_marker(DYNAMIC_PAUSES[trailing], engine)
        )
    result['body'] = processed_body
    return result
def _add_inline_pauses(sentence: str, engine: str) -> str:
    """
    Add pauses after commas, question marks and exclamation marks.

    Whitespace following the punctuation mark is swallowed and replaced by the
    pause marker. A comma used as a thousands separator ("10,000") is left
    alone so the number is not split by a pause.

    Args:
        sentence: One sentence of script text.
        engine: Passed through to _pause_marker ('ssml' or a marker engine).

    Returns:
        The sentence with pause markers inserted.
    """
    rules = (
        # Skip a comma that sits between a digit and exactly three digits.
        (r',(?!(?<=\d,)\d{3}(?!\d))\s*', ',', 'comma'),
        (r'\?\s*', '?', 'question_after'),
        (r'!\s*', '!', 'exclamation'),
    )
    for pattern, mark, pause_key in rules:
        replacement = mark + _pause_marker(DYNAMIC_PAUSES[pause_key], engine)
        # Callable replacement: the marker is inserted literally.
        sentence = re.sub(pattern, lambda _match, text=replacement: text, sentence)
    return sentence
def _pause_marker(ms: int, engine: str) -> str:
"""Generate engine-appropriate pause marker."""
if engine == 'ssml':
return f'<break time="{ms}ms"/>'
else:
return f' [[PAUSE_{ms}ms]] '
# ── Standalone test ──────────────────────────────────────────────
if __name__ == '__main__':
    import sys

    def _run_demo() -> None:
        """Print sample preprocessing and pause-insertion results."""
        print("=== Korean Preprocessor Test ===")
        samples = (
            "AI와 ChatGPT가 SEO를 바꾸고 있어요",
            "3가지 방법으로 100%의 수익을 낼 수 있습니다",
            "YouTube와 TikTok에서 SNS 마케팅하기",
            "GPT API를 사용한 SaaS 창업",
        )
        for sample in samples:
            converted = preprocess_korean(sample)
            print(f"원문: {sample}")
            print(f"처리: {converted}")
            print()

        # Pause insertion, using text markers rather than SSML
        demo_script = {
            'hook': '이거 모르면 손해입니다!',
            'body': ['첫 번째, AI를 활용하면 10배 빠릅니다.', '두 번째, 자동화가 핵심입니다.'],
            'closer': '지금 바로 시작하세요.'
        }
        with_pauses = insert_pauses(demo_script, engine='marker')
        print("=== Pause Insertion Test ===")
        for key, value in with_pauses.items():
            print(f"{key}: {value}")

    # Nothing happens unless the --test flag is given.
    if '--test' in sys.argv:
        _run_demo()