fix: Wikipedia 이미지 태그 전체 시도, TOC h2>=3 스마트 복원

- fetch_featured_image: 태그 전체(최대 8개) 시도, 제목 제외(너무 길어 매칭 안됨)
  px 크기 regex로 일괄 800px 교체
- TOC: h2>=3 조건부 표시 복원 (완전제거→스마트 표시)
  두 파일(publisher_bot, blog_converter) 동일하게 적용

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 18:04:21 +09:00
parent b98d694b65
commit d9f932b333
2 changed files with 21 additions and 14 deletions
@@ -96,7 +96,10 @@ def build_full_html(article: dict, body_html: str, toc_html: str,
    json_ld = build_json_ld(article, post_url)
    disclaimer = article.get('disclaimer', '')
    parts = [json_ld]
-    # 목차 비활성화 — 독자 경험 개선
+    # 목차: h2가 3개 이상인 긴 글에서만 표시
    h2_count = body_html.lower().count('<h2')
    if toc_html and toc_html.strip() not in ('', '\n') and h2_count >= 3:
        parts.append(f'<div class="toc-wrapper">{toc_html}</div>')
    parts.append(body_html)
    if disclaimer:
        parts.append(f'<hr/><p class="disclaimer"><small>{disclaimer}</small></p>')
@@ -299,38 +299,39 @@ def fetch_featured_image(article: dict) -> str:
        except Exception as e:
            logger.warning(f"Pexels 이미지 검색 실패: {e}")
-    # 4) Wikipedia 썸네일 (무료, API 키 불필요)
+    # 4) Wikipedia 썸네일 (무료, API 키 불필요) — 태그 전체 시도
    title = article.get('title', '')
    tags = article.get('tags', [])
    if isinstance(tags, str):
        tags = [t.strip() for t in tags.split(',')]
-    for kw in ([title] + tags)[:4]:
+    # 태그만 사용 (제목은 너무 길어 Wikipedia에서 매칭 안됨)
-        if not kw:
+    search_keywords = [t for t in tags if t and len(t) <= 15][:8]
-            continue
+    from urllib.parse import quote as _quote
    for kw in search_keywords:
        # 한국어 Wikipedia
        try:
-            from urllib.parse import quote
+            wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
            wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{quote(kw)}'
            resp = requests.get(wiki_url, timeout=6,
                                headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
            if resp.status_code == 200:
                data = resp.json()
                thumb = data.get('thumbnail', {}).get('source', '')
                if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
-                    # 더 큰 해상도로 변환 (200px → 800px)
+                    thumb = re.sub(r'/\d+px-', '/800px-', thumb)
-                    thumb = thumb.replace('/200px-', '/800px-').replace('/320px-', '/800px-')
+                    logger.info(f"Wikipedia 이미지 사용: {kw} → {thumb[:60]}")
                    return thumb
        except Exception:
            pass
-        # 영문 Wikipedia fallback
+        # 영문 Wikipedia
        try:
-            wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{quote(kw)}'
+            wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
            resp = requests.get(wiki_url, timeout=6,
                                headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
            if resp.status_code == 200:
                data = resp.json()
                thumb = data.get('thumbnail', {}).get('source', '')
                if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
-                    thumb = thumb.replace('/200px-', '/800px-').replace('/320px-', '/800px-')
+                    thumb = re.sub(r'/\d+px-', '/800px-', thumb)
                    logger.info(f"Wikipedia(EN) 이미지 사용: {kw} → {thumb[:60]}")
                    return thumb
        except Exception:
            pass
@@ -359,7 +360,10 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:
            )
    html_parts.append(json_ld)
-    # 목차 비활성화 — 독자 경험 개선 (사진 아래 목차 제거)
+    # 목차: h2가 3개 이상인 긴 글에서만 표시
    h2_count = body_html.lower().count('<h2')
    if toc_html and toc_html.strip() not in ('', '\n') and h2_count >= 3:
        html_parts.append(f'<div class="toc-wrapper">{toc_html}</div>')
    html_parts.append(body_html)
    # 원문 출처 링크