diff --git a/bots/scheduler.py b/bots/scheduler.py index 3186e12..777b2c9 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -283,15 +283,12 @@ def _fetch_sources_content(topic_data: dict) -> dict: real_url = url try: - # Google 뉴스 리다이렉트 → 실제 URL - if 'news.google.com' in url: - resp = requests.head(url, timeout=5, allow_redirects=True, - headers={'User-Agent': 'Mozilla/5.0'}) - if resp.url and 'news.google.com' not in resp.url: - real_url = resp.url - - # 기사 내용 크롤링 - page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'}) + # Google 뉴스 리다이렉트 → 실제 URL (get으로 따라가야 함) + page = requests.get(url, timeout=8, allow_redirects=True, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' + }) + if page.url and 'news.google.com' not in page.url: + real_url = page.url if page.status_code == 200: soup = BeautifulSoup(page.text, 'lxml') # og:description 또는 본문 첫 단락 @@ -362,6 +359,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''): article['source_url'] = topic_data.get('source_url') or topic_data.get('source') or '' article['published_at'] = topic_data.get('published_at', '') article['source_image'] = topic_data.get('source_image', '') + # enriched sources가 있으면 덮어씀 (실제 기사 URL 반영) + if topic_data.get('sources'): + article['sources'] = topic_data['sources'] article['created_at'] = datetime.now().isoformat() output_path.parent.mkdir(parents=True, exist_ok=True)