From 07703a0f6b5339bbfd80308de2fc7274d3e87a5f Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON <elikwon@JOUNGWOOKui-MacBookAir.local>
Date: Mon, 30 Mar 2026 17:28:00 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20Google=20=EB=89=B4=EC=8A=A4=20URL=20?=
 =?UTF-8?q?=EB=A6=AC=EB=8B=A4=EC=9D=B4=EB=A0=89=ED=8A=B8=20=EC=B6=94?=
 =?UTF-8?q?=EC=A0=81=20+=20sources=20=EC=B5=9C=EC=A2=85=20=EB=B0=98?=
 =?UTF-8?q?=EC=98=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- requests.head() → requests.get()으로 변경: HEAD로는 정상 추적되지 않던 Google 뉴스 리다이렉트를 GET 한 번으로 추적하고, 같은 응답으로 기사 크롤링까지 수행 (timeout 5초 → 8초, User-Agent 구체화)
- enriched sources(실제 기사 URL)를 article에 덮어써서 출처 박스에 반영
- source_image도 크롤링된 og:image로 채워짐

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 bots/scheduler.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/bots/scheduler.py b/bots/scheduler.py
index 3186e12..777b2c9 100644
--- a/bots/scheduler.py
+++ b/bots/scheduler.py
@@ -283,15 +283,12 @@ def _fetch_sources_content(topic_data: dict) -> dict:
         real_url = url
 
         try:
-            # Google 뉴스 리다이렉트 → 실제 URL
-            if 'news.google.com' in url:
-                resp = requests.head(url, timeout=5, allow_redirects=True,
-                                     headers={'User-Agent': 'Mozilla/5.0'})
-                if resp.url and 'news.google.com' not in resp.url:
-                    real_url = resp.url
-
-            # 기사 내용 크롤링
-            page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
+            # Google 뉴스 리다이렉트 → 실제 URL (get으로 따라가야 함)
+            page = requests.get(url, timeout=8, allow_redirects=True, headers={
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
+            })
+            if page.url and 'news.google.com' not in page.url:
+                real_url = page.url
             if page.status_code == 200:
                 soup = BeautifulSoup(page.text, 'lxml')
                 # og:description 또는 본문 첫 단락
@@ -362,6 +359,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''):
     article['source_url'] = topic_data.get('source_url') or topic_data.get('source') or ''
     article['published_at'] = topic_data.get('published_at', '')
     article['source_image'] = topic_data.get('source_image', '')
+    # enriched sources가 있으면 덮어씀 (실제 기사 URL 반영)
+    if topic_data.get('sources'):
+        article['sources'] = topic_data['sources']
     article['created_at'] = datetime.now().isoformat()
 
     output_path.parent.mkdir(parents=True, exist_ok=True)