""" 블로그 변환봇 (converters/blog_converter.py) 역할: 원본 마크다운 → 블로그 HTML 변환 (LAYER 2) - 마크다운 → HTML (목차, 테이블, 코드블록) - AdSense 플레이스홀더 삽입 - Schema.org Article JSON-LD - 쿠팡 링크봇 호출 출력: data/outputs/{date}_{slug}_blog.html """ import json import logging import sys from datetime import datetime, timezone from pathlib import Path import markdown from bs4 import BeautifulSoup BASE_DIR = Path(__file__).parent.parent.parent sys.path.insert(0, str(BASE_DIR / 'bots')) LOG_DIR = BASE_DIR / 'logs' LOG_DIR.mkdir(exist_ok=True) OUTPUT_DIR = BASE_DIR / 'data' / 'outputs' OUTPUT_DIR.mkdir(exist_ok=True) logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[ logging.FileHandler(LOG_DIR / 'converter.log', encoding='utf-8'), logging.StreamHandler(), ] ) logger = logging.getLogger(__name__) BLOG_BASE_URL = 'https://the4thpath.com' def markdown_to_html(md_text: str) -> tuple[str, str]: """마크다운 → HTML (목차 포함)""" md = markdown.Markdown( extensions=['toc', 'tables', 'fenced_code', 'attr_list'], extension_configs={ 'toc': {'title': '목차', 'toc_depth': '2-3'} } ) html = md.convert(md_text) toc = md.toc return html, toc def insert_adsense_placeholders(html: str) -> str: """두 번째 H2 뒤 + 결론 H2 앞에 AdSense 슬롯 삽입""" AD_SLOT_1 = '\n\n' AD_SLOT_2 = '\n\n' soup = BeautifulSoup(html, 'lxml') h2_tags = soup.find_all('h2') if len(h2_tags) >= 2: ad_tag = BeautifulSoup(AD_SLOT_1, 'html.parser') h2_tags[1].insert_after(ad_tag) for h2 in soup.find_all('h2'): if any(kw in h2.get_text() for kw in ['결론', '마무리', '정리', '요약', 'conclusion']): ad_tag2 = BeautifulSoup(AD_SLOT_2, 'html.parser') h2.insert_before(ad_tag2) break return str(soup) def build_json_ld(article: dict, post_url: str = '') -> str: """Schema.org Article JSON-LD""" schema = { "@context": "https://schema.org", "@type": "Article", "headline": article.get('title', ''), "description": article.get('meta', ''), "datePublished": datetime.now(timezone.utc).isoformat(), "dateModified": datetime.now(timezone.utc).isoformat(), "author": {"@type": "Person", "name": "테크인사이더"}, "publisher": { "@type": "Organization", "name": "The 4th Path", "logo": {"@type": "ImageObject", "url": f"{BLOG_BASE_URL}/logo.png"} }, "mainEntityOfPage": {"@type": "WebPage", "@id": post_url or BLOG_BASE_URL}, } return f'' def build_full_html(article: dict, body_html: str, toc_html: str, post_url: str = '') -> str: """JSON-LD + 목차 + 본문 + 면책 조합""" json_ld = build_json_ld(article, post_url) disclaimer = article.get('disclaimer', '') parts = [json_ld] if toc_html: parts.append(f'
{disclaimer}
') return '\n'.join(parts) def _is_html_body(body: str) -> bool: """AI가 이미 HTML을 출력했는지 감지 (마크다운 변환 건너뜀)""" stripped = body.lstrip() return stripped.startswith('<') and any( tag in stripped[:200].lower() for tag in ['