From 178caade3f15b10fa69b47f451a0d25f99757a9f Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON <elikwon@JOUNGWOOKui-MacBookAir.local>
Date: Mon, 30 Mar 2026 18:19:13 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20=EC=9B=90=EB=AC=B8=20=EA=B8=B0=EC=82=AC?=
 =?UTF-8?q?=20=EC=9D=B4=EB=AF=B8=EC=A7=80=20DuckDuckGo=20=EA=B2=80?=
 =?UTF-8?q?=EC=83=89=20+=20Blogger=20img=20=EC=82=BD=EC=9E=85=20=EB=B0=A9?=
 =?UTF-8?q?=EC=8B=9D=20=EA=B0=9C=EC=84=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. _search_real_article_image(): DuckDuckGo HTML 검색으로 원문 기사 URL 찾기
   - Google News 소스 제목 → DDG 검색 → 실제 URL → og:image
   - DDG redirect URL(uddg 파라미터)에서 실제 URL 추출
2. build_full_html(): 이미지를 div 래핑 없이 body_html 맨 앞에 직접 삽입
   - Blogger가 div class를 제거하는 문제 해결
3. fetch_featured_image() 우선순위 변경:
   RSS이미지 → og:image → DDG검색(원문) → Pexels → Wikipedia

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bots/publisher_bot.py | 113 ++++++++++++++++++++++++++++--------------
 1 file changed, 76 insertions(+), 37 deletions(-)

diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py
index a5eb87b..6aefe3b 100644
--- a/bots/publisher_bot.py
+++ b/bots/publisher_bot.py
@@ -260,8 +260,53 @@ def _fetch_og_image(url: str) -> str:
     return ''
 
 
+def _search_real_article_image(sources: list, topic: str = '') -> str:
+    """Search DuckDuckGo for each source title and return an article image.
+
+    Tries up to three sources and three results per search, returning the first
+    non-empty _fetch_og_image() result, or '' if none. `topic` is unused.
+    """
+    from urllib.parse import quote as _quote, urlparse, parse_qs
+    from bs4 import BeautifulSoup as _BS
+
+    for src in sources[:3]:
+        title = src.get('title', '')
+        if not title:
+            continue
+        # "Article title - Publisher" -> drop the trailing publisher token
+        clean_title = re.sub(r'\s*[-–—]\s*\S+$', '', title).strip()
+        if len(clean_title) < 5:
+            clean_title = title
+        try:
+            ddg_url = f'https://html.duckduckgo.com/html/?q={_quote(clean_title)}'
+            resp = requests.get(ddg_url, timeout=10, headers={
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
+            })
+            if resp.status_code != 200:
+                continue
+            soup = _BS(resp.text, 'lxml')
+            for a_tag in soup.select('a.result__a')[:3]:
+                href = a_tag.get('href', '')
+                # Redirect link: //duckduckgo.com/l/?uddg=<url>. parse_qs() already
+                # percent-decodes, so a second unquote() would corrupt the URL.
+                uddg = parse_qs(urlparse(href).query).get('uddg', [''])[0]
+                real_url = uddg or href
+                if not real_url.startswith('http'):
+                    continue
+                # Skip Google News links by host, not by substring of the whole URL.
+                if (urlparse(real_url).hostname or '').endswith('news.google.com'):
+                    continue
+                img = _fetch_og_image(real_url)
+                if img:
+                    logger.info(f"원문 기사 이미지 발견: {real_url[:50]} → {img[:60]}")
+                    return img
+        except Exception as e:
+            logger.debug(f"DuckDuckGo 검색 실패: {e}")
+    return ''
+
+
 def fetch_featured_image(article: dict) -> str:
-    """대표 이미지: RSS 이미지 → og:image 크롤링 → Pexels 순으로 시도"""
+    """Featured image: try RSS image → og:image → DuckDuckGo article search → Pexels → Wikipedia."""
     # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 제외)
     source_image = article.get('source_image', '')
     if source_image and source_image.startswith('http') and not _is_platform_logo(source_image):
@@ -278,7 +323,14 @@ def fetch_featured_image(article: dict) -> str:
     if og_image:
         return og_image
 
-    # 3) Pexels API (키가 있을 때)
+    # 3) Google News 소스 → DuckDuckGo로 실제 기사 검색 → og:image
+    sources = article.get('sources', [])
+    if sources:
+        real_image = _search_real_article_image(sources, article.get('title', ''))
+        if real_image:
+            return real_image
+
+    # 4) Pexels API (키가 있을 때)
     pexels_key = os.getenv('PEXELS_API_KEY', '')
     if pexels_key:
         tags = article.get('tags', [])
@@ -299,42 +351,27 @@ def fetch_featured_image(article: dict) -> str:
         except Exception as e:
             logger.warning(f"Pexels 이미지 검색 실패: {e}")
 
-    # 4) Wikipedia 썸네일 (무료, API 키 불필요) — 태그 전체 시도
+    # 5) Wikipedia 썸네일 (무료, API 키 불필요)
     tags = article.get('tags', [])
     if isinstance(tags, str):
         tags = [t.strip() for t in tags.split(',')]
-    # 태그만 사용 (제목은 너무 길어 Wikipedia에서 매칭 안됨)
     search_keywords = [t for t in tags if t and len(t) <= 15][:8]
     from urllib.parse import quote as _quote
     for kw in search_keywords:
-        # 한국어 Wikipedia
-        try:
-            wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
-            resp = requests.get(wiki_url, timeout=6,
-                                headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
-            if resp.status_code == 200:
-                data = resp.json()
-                thumb = data.get('thumbnail', {}).get('source', '')
-                if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
-                    thumb = re.sub(r'/\d+px-', '/800px-', thumb)
-                    logger.info(f"Wikipedia 이미지 사용: {kw} → {thumb[:60]}")
-                    return thumb
-        except Exception:
-            pass
-        # 영문 Wikipedia
-        try:
-            wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
-            resp = requests.get(wiki_url, timeout=6,
-                                headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
-            if resp.status_code == 200:
-                data = resp.json()
-                thumb = data.get('thumbnail', {}).get('source', '')
-                if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
-                    thumb = re.sub(r'/\d+px-', '/800px-', thumb)
-                    logger.info(f"Wikipedia(EN) 이미지 사용: {kw} → {thumb[:60]}")
-                    return thumb
-        except Exception:
-            pass
+        for lang in ['ko', 'en']:
+            try:
+                wiki_url = f'https://{lang}.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}'
+                resp = requests.get(wiki_url, timeout=6,
+                                    headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
+                if resp.status_code == 200:
+                    data = resp.json()
+                    thumb = data.get('thumbnail', {}).get('source', '')
+                    if thumb and thumb.startswith('http') and not _is_platform_logo(thumb):
+                        thumb = re.sub(r'/\d+px-', '/800px-', thumb)
+                        logger.info(f"Wikipedia({lang}) 이미지 사용: {kw} → {thumb[:60]}")
+                        return thumb
+            except Exception:
+                pass
 
     return ''
 
@@ -351,13 +388,15 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:
     if not has_image:
         image_url = fetch_featured_image(article)
         if image_url:
-            title = article.get('title', '')
-            html_parts.append(
-                f'<div class="featured-image" style="margin-bottom:1.5em;">'
+            title = article.get('title', '').replace('"', '&quot;')
+            # Blogger 호환: div 래핑 없이 직접 img 삽입 (본문 첫 줄에 배치)
+            img_tag = (
                 f'<img src="{image_url}" alt="{title}" '
-                f'style="width:100%;max-height:400px;object-fit:cover;border-radius:8px;" />'
-                f'</div>'
+                f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                f'margin-bottom:1.2em;" />'
             )
+            # body_html 맨 앞에 이미지 삽입 (Blogger가 div를 제거하는 문제 방지)
+            body_html = img_tag + '\n' + body_html
 
     html_parts.append(json_ld)
     # 목차: h2가 3개 이상인 긴 글에서만 표시