From 53393a6354900ffb2f8313e2ff7ea5edd9c3ea79 Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON <elikwon@JOUNGWOOKui-MacBookAir.local>
Date: Mon, 30 Mar 2026 12:35:03 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20=EB=8C=80=ED=91=9C=20=EC=9D=B4=EB=AF=B8?=
 =?UTF-8?q?=EC=A7=80=EB=A5=BC=20=EC=9B=90=EB=B3=B8=20=EA=B8=B0=EC=82=AC=20?=
 =?UTF-8?q?og:image=20=ED=81=AC=EB=A1=A4=EB=A7=81=EC=9C=BC=EB=A1=9C=20?=
 =?UTF-8?q?=EB=B3=80=EA=B2=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Unsplash Source API 중단으로 기존 폴백 작동 안 함
- 원본 기사 URL에서 og:image / twitter:image 크롤링 (가장 확실)
- 우선순위: RSS 이미지 → og:image 크롤링 → Pexels API
- lxml 파서 사용 (이미 Docker에 설치됨)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bots/publisher_bot.py | 57 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 43 insertions(+), 14 deletions(-)
diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py
index 87c5a68..03c8146 100644
--- a/bots/publisher_bot.py
+++ b/bots/publisher_bot.py
@@ -206,9 +206,38 @@ def build_json_ld(article: dict, blog_url: str = '') -> str:
     return f'<script type="application/ld+json">\n{json.dumps(schema, ensure_ascii=False, indent=2)}\n</script>'
 
 
+def _fetch_og_image(url: str) -> str:
+    """Scrape a representative image URL from the article page at ``url``.
+
+    Tries og:image, then twitter:image, then the first absolute <img> src that
+    does not look like site chrome or an ad. Returns '' when nothing is found.
+    """
+    if not url or not url.startswith('http'):
+        return ''
+    try:
+        resp = requests.get(url, timeout=10, headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
+        if resp.status_code != 200:
+            return ''
+        soup = BeautifulSoup(resp.text, 'lxml')
+        # Social-card meta tags, in priority order; only absolute http(s) URLs are accepted.
+        for meta in (soup.find('meta', property='og:image'),
+                     soup.find('meta', attrs={'name': 'twitter:image'})):
+            if meta and meta.get('content', '').startswith('http'):
+                return meta['content']
+        # Fallback: first <img> that is not obvious site chrome. Ad markers are matched as
+        # path segments because a bare 'ad' substring also rejects "uploads", "header", etc.
+        for img in soup.find_all('img', src=True):
+            src = img['src']
+            if src.startswith('http') and not any(x in src.lower() for x in ['logo', 'icon', 'avatar', 'banner', '/ad/', '/ads/']):
+                return src
+    except Exception as e:
+        logger.warning(f"og:image 크롤링 실패 ({url}): {e}")
+    return ''
+
+
 def fetch_featured_image(article: dict) -> str:
-    """대표 이미지: 원본 소스 이미지 → Pexels → Unsplash 순으로 시도"""
-    # 1) 원본 RSS 소스 이미지 (가장 우선)
+    """대표 이미지: RSS 이미지 → og:image 크롤링 → Pexels 순으로 시도"""
+    # 1) RSS 수집 시 가져온 소스 이미지
     source_image = article.get('source_image', '')
     if source_image and source_image.startswith('http'):
         try:
@@ -216,18 +245,21 @@ def fetch_featured_image(article: dict) -> str:
             if resp.status_code == 200:
                 return source_image
         except Exception:
-            pass  # 접근 불가면 다음 방법으로
+            pass
 
-    # 2) 검색 키워드: 태그 또는 코너 기반
-    tags = article.get('tags', [])
-    if isinstance(tags, str):
-        tags = [t.strip() for t in tags.split(',')]
-    corner = article.get('corner', '')
-    query = tags[0] if tags else corner or 'technology'
+    # 2) 원본 기사 URL에서 og:image 크롤링
+    source_url = article.get('source_url', '')
+    og_image = _fetch_og_image(source_url)
+    if og_image:
+        return og_image
 
-    # Pexels API (키가 있을 때)
+    # 3) Pexels API (키가 있을 때)
     pexels_key = os.getenv('PEXELS_API_KEY', '')
     if pexels_key:
+        tags = article.get('tags', [])
+        if isinstance(tags, str):
+            tags = [t.strip() for t in tags.split(',')]
+        query = tags[0] if tags else article.get('corner', 'technology')
         try:
             resp = requests.get(
                 'https://api.pexels.com/v1/search',
@@ -242,10 +274,7 @@ def fetch_featured_image(article: dict) -> str:
         except Exception as e:
             logger.warning(f"Pexels 이미지 검색 실패: {e}")
 
-    # Unsplash Source (API 키 불필요, 직접 URL)
-    import urllib.parse
-    encoded = urllib.parse.quote(query)
-    return f'https://source.unsplash.com/1200x630/?{encoded}'
+    return ''
 
 
 def build_full_html(article: dict, body_html: str, toc_html: str) -> str: