""" bots/prompt_layer/korean_preprocessor.py Korean TTS text preprocessing. Functions: - preprocess_korean(text): apply pronunciation map + number conversion - insert_pauses(script): insert SSML/marker pauses by sentence type """ import re import logging logger = logging.getLogger(__name__) # English/acronym → Korean phonetic pronunciation # 200+ entries covering tech, finance, social media, brands, etc. PRONUNCIATION_MAP = { # AI/Tech terms 'AI': '에이아이', 'API': '에이피아이', 'GPT': '지피티', 'ChatGPT': '챗지피티', 'Claude': '클로드', 'GitHub': '깃허브', 'OpenAI': '오픈에이아이', 'YouTube': '유튜브', 'TikTok': '틱톡', 'SEO': '에스이오', 'SaaS': '사스', 'UI': '유아이', 'UX': '유엑스', 'LLM': '엘엘엠', 'NFT': '엔에프티', 'DeFi': '디파이', 'IoT': '아이오티', 'AR': '에이알', 'VR': '브이알', 'ML': '머신러닝', 'NLP': '엔엘피', 'DevOps': '데브옵스', 'SQL': '에스큐엘', 'HTML': '에이치티엠엘', 'CSS': '씨에스에스', 'JSON': '제이슨', 'URL': '유알엘', 'HTTP': '에이치티티피', 'HTTPS': '에이치티티피에스', 'PC': '피씨', 'CPU': '씨피유', 'GPU': '지피유', 'RAM': '램', 'SSD': '에스에스디', 'USB': '유에스비', 'WiFi': '와이파이', 'Bluetooth': '블루투스', 'iOS': '아이오에스', 'Android': '안드로이드', 'App': '앱', 'IT': '아이티', 'ICT': '아이씨티', 'SNS': '에스엔에스', 'KPI': '케이피아이', 'ROI': '알오아이', 'B2B': '비투비', 'B2C': '비투씨', 'MVP': '엠브이피', 'OKR': '오케이알', 'CTO': '씨티오', 'CEO': '씨이오', 'CFO': '씨에프오', 'HR': '에이치알', 'PR': '피알', 'IR': '아이알', # Social/Platforms 'Instagram': '인스타그램', 'Facebook': '페이스북', 'Twitter': '트위터', 'LinkedIn': '링크드인', 'Netflix': '넷플릭스', 'Spotify': '스포티파이', 'Uber': '우버', 'Airbnb': '에어비앤비', 'Amazon': '아마존', 'Google': '구글', 'Apple': '애플', 'Microsoft': '마이크로소프트', 'Samsung': '삼성', 'LG': '엘지', 'SK': '에스케이', 'KT': '케이티', # Finance 'ETF': '이티에프', 'IPO': '아이피오', 'S&P': '에스앤피', 'NASDAQ': '나스닥', 'KOSPI': '코스피', 'KOSDAQ': '코스닥', 'GDP': '지디피', 'IMF': '아이엠에프', 'ECB': '이씨비', 'Fed': '연준', 'P/E': '주가수익비율', # Health/Science 'DNA': '디엔에이', 'RNA': '알엔에이', 'BMI': '비엠아이', 'COVID': '코비드', 'PCR': '피씨알', # Education/Certification 'MBA': '엠비에이', 'PhD': '박사', 'IELTS': '아이엘츠', 'TOEIC': '토익', 'TOEFL': '토플', # Measurement units 'km': '킬로미터', 'kg': '킬로그램', 'MB': '메가바이트', 'GB': '기가바이트', 'TB': '테라바이트', 'Hz': '헤르츠', 'MHz': '메가헤르츠', 'GHz': '기가헤르츠', # Media/Entertainment 'OTT': '오티티', 'VOD': '브이오디', 'BGM': '비지엠', 'OST': '오에스티', 'DJ': '디제이', 'MC': '엠씨', 'PD': '피디', 'CP': '씨피', # Common English words used in Korean context 'App Store': '앱 스토어', 'Play Store': '플레이 스토어', 'ChatBot': '챗봇', 'Web3': '웹쓰리', 'Metaverse': '메타버스', 'Blockchain': '블록체인', 'Crypto': '크립토', 'Bitcoin': '비트코인', 'Ethereum': '이더리움', 'Cloud': '클라우드', 'Big Data': '빅데이터', 'Startup': '스타트업', 'Fintech': '핀테크', 'Edtech': '에드테크', 'Healthtech': '헬스테크', 'PropTech': '프롭테크', 'LegalTech': '리걸테크', 'FOMO': '포모', 'YOLO': '욜로', 'MZ': '엠제트', # More tech 'Python': '파이썬', 'JavaScript': '자바스크립트', 'TypeScript': '타입스크립트', 'React': '리액트', 'Node.js': '노드제이에스', 'Docker': '도커', 'Kubernetes': '쿠버네티스', 'AWS': '에이더블유에스', 'GCP': '지씨피', 'Azure': '애저', 'Slack': '슬랙', 'Zoom': '줌', 'Discord': '디스코드', 'Notion': '노션', 'Figma': '피그마', 'Canva': '캔바', # Business/Strategy 'OEM': '오이엠', 'ODM': '오디엠', 'SCM': '에스씨엠', 'ERP': '이알피', 'CRM': '씨알엠', # More social media 'Reels': '릴스', 'Stories': '스토리', 'Live': '라이브', 'Feed': '피드', 'DM': '디엠', 'PM': '피엠', 'QA': '큐에이', # Content 'Blog': '블로그', 'Vlog': '브이로그', 'Podcast': '팟캐스트', 'Newsletter': '뉴스레터', 'Shorts': '쇼츠', 'Reel': '릴', # Misc 'OK': '오케이', 'NO': '노', 'YES': '예스', 'WOW': '와우', 'LOL': '엘오엘', 'BTW': '그런데', 'FYI': '참고로', 'ASAP': '최대한 빨리', 'FAQ': '자주 묻는 질문', 'Q&A': '질의응답', 'A/S': '에이에스', 'DIY': '디아이와이', 'PPT': '피피티', 'PDF': '피디에프', 'ZIP': '집', # AI/LLM extended 'Gemini': '제미나이', 'Grok': '그록', 'Copilot': '코파일럿', 'Perplexity': '퍼플렉시티', 'Midjourney': '미드저니', 'Stable Diffusion': '스테이블 디퓨전', 'DALL-E': '달리', 'Sora': '소라', 'Kling': '클링', 'Runway': '런웨이', # Dev tools / infra 'Git': '깃', 'Linux': '리눅스', 'Ubuntu': '우분투', 'Windows': '윈도우', 'macOS': '맥오에스', 'Terminal': '터미널', 'CI/CD': '씨아이씨디', 'API Gateway': '에이피아이 게이트웨이', # Finance extended 'PER': '주가수익비율', 'PBR': '주가순자산비율', 'EPS': '주당순이익', 'ROE': '자기자본이익률', 'CAGR': '연평균성장률', # E-commerce / marketing 'CPC': '클릭당비용', 'CPM': '천회노출당비용', 'CTA': '씨티에이', 'CTR': '클릭률', 'ROAS': '광고수익률', 'LTV': '고객생애가치', } # Pause durations in milliseconds by sentence type DYNAMIC_PAUSES = { 'hook_after': 500, # ms — impact emphasis after hook 'question_after': 400, # thinking time after question 'normal_after': 300, # standard sentence end 'section_break': 600, # body → closer transition 'comma': 150, # comma pause 'exclamation': 200, # exclamation mark pause } # Number → Korean word conversion rules _NUM_TO_KO = { 0: '영', 1: '일', 2: '이', 3: '삼', 4: '사', 5: '오', 6: '육', 7: '칠', 8: '팔', 9: '구', 10: '십', 100: '백', 1000: '천', 10000: '만', } # Counter words for common units (for better number reading) _COUNTER_MAP = { '개': ('개', False), # items '명': ('명', False), # people '번': ('번', False), # times '배': ('배', False), # times/multiples '위': ('위', False), # rank '가지': ('가지', True), # types (use sino-Korean) '초': ('초', False), # seconds '분': ('분', False), # minutes '시간': ('시간', False), # hours '일': ('일', False), # days '월': ('월', False), # months '년': ('년', False), # years '%': ('퍼센트', False), # percent } def preprocess_korean(text: str) -> str: """ Apply pronunciation map and number conversion to Korean text. 1. Replace English/acronym terms with Korean phonetics 2. Convert Arabic numerals with counter words to Korean Returns processed text ready for TTS. """ # Apply pronunciation map (longer strings first to avoid partial replacement) sorted_map = sorted(PRONUNCIATION_MAP.items(), key=lambda x: -len(x[0])) for en, ko in sorted_map: # Word boundary replacement to avoid partial matches text = re.sub(r'(? str: """ Convert Arabic numerals in Korean context. e.g.: "3가지" → "세 가지", "100%" → "백 퍼센트" """ # Handle percentage text = re.sub(r'(\d+)%', lambda m: _num_to_korean(int(m.group(1))) + ' 퍼센트', text) # Handle number + counter word for counter, (ko_counter, use_sino) in _COUNTER_MAP.items(): if counter == '%': continue pattern = r'(\d+)\s*' + re.escape(counter) def replace(m, kc=ko_counter): n = int(m.group(1)) return _num_to_korean(n) + ' ' + kc text = re.sub(pattern, replace, text) return text def _num_to_korean(n: int) -> str: """Convert integer to Korean sino-Korean numeral string.""" if n == 0: return '영' if n < 0: return '마이너스 ' + _num_to_korean(-n) result = '' if n >= 10000: man = n // 10000 result += _num_to_korean(man) + '만' n %= 10000 if n >= 1000: cheon = n // 1000 result += ('' if cheon == 1 else _num_to_korean(cheon)) + '천' n %= 1000 if n >= 100: baek = n // 100 result += ('' if baek == 1 else _num_to_korean(baek)) + '백' n %= 100 if n >= 10: sip = n // 10 result += ('' if sip == 1 else _num_to_korean(sip)) + '십' n %= 10 if n > 0: result += _NUM_TO_KO[n] return result def insert_pauses(script: dict, engine: str = 'ssml') -> dict: """ Insert pause markers into script by sentence type. engine='ssml': insert SSML tags (for ElevenLabs, Google TTS) engine='marker': insert [[PAUSE_Xms]] text markers (for Edge TTS, others) Returns modified script dict with pauses inserted. """ result = dict(script) hook = script.get('hook', '') body = script.get('body', []) closer = script.get('closer', '') # Add pause after hook if hook: pause_ms = DYNAMIC_PAUSES['hook_after'] result['hook'] = hook + _pause_marker(pause_ms, engine) # Add pauses within body sentences processed_body = [] for i, sentence in enumerate(body): processed = _add_inline_pauses(sentence, engine) # Add section break before closer transition if i == len(body) - 1: processed += _pause_marker(DYNAMIC_PAUSES['section_break'], engine) else: processed += _pause_marker(DYNAMIC_PAUSES['normal_after'], engine) processed_body.append(processed) result['body'] = processed_body return result def _add_inline_pauses(sentence: str, engine: str) -> str: """Add pauses at commas and after exclamation marks.""" # Comma pauses sentence = re.sub( r',\s*', ',' + _pause_marker(DYNAMIC_PAUSES['comma'], engine), sentence ) # Question mark pauses sentence = re.sub( r'\?\s*', '?' + _pause_marker(DYNAMIC_PAUSES['question_after'], engine), sentence ) # Exclamation pauses sentence = re.sub( r'!\s*', '!' + _pause_marker(DYNAMIC_PAUSES['exclamation'], engine), sentence ) return sentence def _pause_marker(ms: int, engine: str) -> str: """Generate engine-appropriate pause marker.""" if engine == 'ssml': return f'' else: return f' [[PAUSE_{ms}ms]] ' # ── Standalone test ────────────────────────────────────────────── if __name__ == '__main__': import sys if '--test' in sys.argv: print("=== Korean Preprocessor Test ===") test_texts = [ "AI와 ChatGPT가 SEO를 바꾸고 있어요", "3가지 방법으로 100%의 수익을 낼 수 있습니다", "YouTube와 TikTok에서 SNS 마케팅하기", "GPT API를 사용한 SaaS 창업", ] for text in test_texts: result = preprocess_korean(text) print(f"원문: {text}") print(f"처리: {result}") print() # Test pause insertion test_script = { 'hook': '이거 모르면 손해입니다!', 'body': ['첫 번째, AI를 활용하면 10배 빠릅니다.', '두 번째, 자동화가 핵심입니다.'], 'closer': '지금 바로 시작하세요.' } processed = insert_pauses(test_script, engine='marker') print("=== Pause Insertion Test ===") for k, v in processed.items(): print(f"{k}: {v}")