fix: Wikipedia 이미지 태그 전체 시도, TOC h2>=3 스마트 복원

- fetch_featured_image: Wikipedia 검색어로 태그만 사용(15자 이하, 최대 8개 시도).
  제목은 너무 길어 Wikipedia에서 매칭되지 않으므로 검색어에서 제외
  썸네일 URL의 px 크기(/\d+px-)를 regex로 일괄 800px로 교체
- TOC: 본문에 <h2>가 3개 이상이고 toc_html이 비어 있지 않을 때만 목차를 표시하도록 복원
  (완전 제거 → 조건부 표시)
  두 파일(publisher_bot, blog_converter)에 동일하게 적용

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
JOUNGWOOK KWON
2026-03-30 18:04:21 +09:00
parent b98d694b65
commit d9f932b333
2 changed files with 21 additions and 14 deletions

View File

@@ -96,7 +96,10 @@ def build_full_html(article: dict, body_html: str, toc_html: str,
json_ld = build_json_ld(article, post_url) json_ld = build_json_ld(article, post_url)
disclaimer = article.get('disclaimer', '') disclaimer = article.get('disclaimer', '')
parts = [json_ld] parts = [json_ld]
# 목차 비활성화 — 독자 경험 개선 # 목차: h2가 3개 이상인 긴 글에서만 표시
h2_count = body_html.lower().count('<h2')
if toc_html and toc_html.strip() not in ('', '\n') and h2_count >= 3:
parts.append(f'<div class="toc-wrapper">{toc_html}</div>')
parts.append(body_html) parts.append(body_html)
if disclaimer: if disclaimer:
parts.append(f'<hr/><p class="disclaimer"><small>{disclaimer}</small></p>') parts.append(f'<hr/><p class="disclaimer"><small>{disclaimer}</small></p>')

View File

@@ -299,38 +299,39 @@ def fetch_featured_image(article: dict) -> str:
except Exception as e: except Exception as e:
logger.warning(f"Pexels 이미지 검색 실패: {e}") logger.warning(f"Pexels 이미지 검색 실패: {e}")
# 4) Wikipedia 썸네일 (무료, API 키 불필요) # 4) Wikipedia 썸네일 (무료, API 키 불필요) — 태그 전체 시도
title = article.get('title', '')
tags = article.get('tags', []) tags = article.get('tags', [])
if isinstance(tags, str): if isinstance(tags, str):
tags = [t.strip() for t in tags.split(',')] tags = [t.strip() for t in tags.split(',')]
for kw in ([title] + tags)[:4]: # 태그만 사용 (제목은 너무 길어 Wikipedia에서 매칭 안됨)
if not kw: search_keywords = [t for t in tags if t and len(t) <= 15][:8]
continue from urllib.parse import quote as _quote
for kw in search_keywords:
# 한국어 Wikipedia
try: try:
from urllib.parse import quote wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{quote(kw)}'
resp = requests.get(wiki_url, timeout=6, resp = requests.get(wiki_url, timeout=6,
headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}) headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
if resp.status_code == 200: if resp.status_code == 200:
data = resp.json() data = resp.json()
thumb = data.get('thumbnail', {}).get('source', '') thumb = data.get('thumbnail', {}).get('source', '')
if thumb and thumb.startswith('http') and not _is_platform_logo(thumb): if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
# 더 큰 해상도로 변환 (200px → 800px) thumb = re.sub(r'/\d+px-', '/800px-', thumb)
thumb = thumb.replace('/200px-', '/800px-').replace('/320px-', '/800px-') logger.info(f"Wikipedia 이미지 사용: {kw}{thumb[:60]}")
return thumb return thumb
except Exception: except Exception:
pass pass
# 영문 Wikipedia fallback # 영문 Wikipedia
try: try:
wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{quote(kw)}' wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
resp = requests.get(wiki_url, timeout=6, resp = requests.get(wiki_url, timeout=6,
headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}) headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
if resp.status_code == 200: if resp.status_code == 200:
data = resp.json() data = resp.json()
thumb = data.get('thumbnail', {}).get('source', '') thumb = data.get('thumbnail', {}).get('source', '')
if thumb and thumb.startswith('http') and not _is_platform_logo(thumb): if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
thumb = thumb.replace('/200px-', '/800px-').replace('/320px-', '/800px-') thumb = re.sub(r'/\d+px-', '/800px-', thumb)
logger.info(f"Wikipedia(EN) 이미지 사용: {kw}{thumb[:60]}")
return thumb return thumb
except Exception: except Exception:
pass pass
@@ -359,7 +360,10 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:
) )
html_parts.append(json_ld) html_parts.append(json_ld)
# 목차 비활성화 — 독자 경험 개선 (사진 아래 목차 제거) # 목차: h2가 3개 이상인 긴 글에서만 표시
h2_count = body_html.lower().count('<h2')
if toc_html and toc_html.strip() not in ('', '\n') and h2_count >= 3:
html_parts.append(f'<div class="toc-wrapper">{toc_html}</div>')
html_parts.append(body_html) html_parts.append(body_html)
# 원문 출처 링크 # 원문 출처 링크