From 07703a0f6b5339bbfd80308de2fc7274d3e87a5f Mon Sep 17 00:00:00 2001 From: JOUNGWOOK KWON Date: Mon, 30 Mar 2026 17:28:00 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20Google=20=EB=89=B4=EC=8A=A4=20URL=20?= =?UTF-8?q?=EB=A6=AC=EB=8B=A4=EC=9D=B4=EB=A0=89=ED=8A=B8=20=EC=B6=94?= =?UTF-8?q?=EC=A0=81=20+=20sources=20=EC=B5=9C=EC=A2=85=20=EB=B0=98?= =?UTF-8?q?=EC=98=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - requests.head() → requests.get()으로 변경 (Google 뉴스 리다이렉트 정상 추적) - enriched sources(실제 기사 URL)를 article에 덮어써서 출처 박스에 반영 - source_image도 크롤링된 og:image로 채워짐 Co-Authored-By: Claude Sonnet 4.6 --- bots/scheduler.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bots/scheduler.py b/bots/scheduler.py index 3186e12..777b2c9 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -283,15 +283,12 @@ def _fetch_sources_content(topic_data: dict) -> dict: real_url = url try: - # Google 뉴스 리다이렉트 → 실제 URL - if 'news.google.com' in url: - resp = requests.head(url, timeout=5, allow_redirects=True, - headers={'User-Agent': 'Mozilla/5.0'}) - if resp.url and 'news.google.com' not in resp.url: - real_url = resp.url - - # 기사 내용 크롤링 - page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'}) + # Google 뉴스 리다이렉트 → 실제 URL (get으로 따라가야 함) + page = requests.get(url, timeout=8, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' + }) + if page.url and 'news.google.com' not in page.url: + real_url = page.url if page.status_code == 200: soup = BeautifulSoup(page.text, 'lxml') # og:description 또는 본문 첫 단락 @@ -362,6 +359,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''): article['source_url'] = topic_data.get('source_url') or topic_data.get('source') or '' article['published_at'] = topic_data.get('published_at', '') article['source_image'] = topic_data.get('source_image', '') + # enriched sources가 있으면 덮어씀 (실제 기사 URL 반영) + if topic_data.get('sources'): + article['sources'] = topic_data['sources'] article['created_at'] = datetime.now().isoformat() output_path.parent.mkdir(parents=True, exist_ok=True)