fix: Google 뉴스 URL 리다이렉트 추적 + sources 최종 반영

- requests.head() → requests.get()으로 변경 (Google 뉴스 리다이렉트 정상 추적)
- enriched sources(실제 기사 URL)를 article에 덮어써서 출처 박스에 반영
- source_image도 크롤링된 og:image로 채워짐

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 17:28:00 +09:00
parent 3119823128
commit 07703a0f6b
1 changed file with 9 additions and 9 deletions
@@ -283,15 +283,12 @@ def _fetch_sources_content(topic_data: dict) -> dict:
        real_url = url

        try:
-            # Google 뉴스 리다이렉트 → 실제 URL
-            if 'news.google.com' in url:
-                resp = requests.head(url, timeout=5, allow_redirects=True,
-                                     headers={'User-Agent': 'Mozilla/5.0'})
-                if resp.url and 'news.google.com' not in resp.url:
-                    real_url = resp.url
-
-            # 기사 내용 크롤링
-            page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
+            # Google 뉴스 리다이렉트 → 실제 URL (get으로 따라가야 함)
+            page = requests.get(url, timeout=8, allow_redirects=True, headers={
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
+            })
+            if page.url and 'news.google.com' not in page.url:
+                real_url = page.url
            if page.status_code == 200:
                soup = BeautifulSoup(page.text, 'lxml')
                # og:description 또는 본문 첫 단락
@@ -362,6 +359,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''):
    article['source_url'] = topic_data.get('source_url') or topic_data.get('source') or ''
    article['published_at'] = topic_data.get('published_at', '')
    article['source_image'] = topic_data.get('source_image', '')
+    # enriched sources가 있으면 덮어씀 (실제 기사 URL 반영)
+    if topic_data.get('sources'):
+        article['sources'] = topic_data['sources']
    article['created_at'] = datetime.now().isoformat()

    output_path.parent.mkdir(parents=True, exist_ok=True)