From c836c720da5577773c535c19e366d87e452b3ca9 Mon Sep 17 00:00:00 2001 From: JOUNGWOOK KWON Date: Mon, 30 Mar 2026 15:26:15 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20/idea=20=EB=AA=85=EB=A0=B9=20=ED=83=80?= =?UTF-8?q?=EC=9E=84=EC=95=84=EC=9B=83=20=E2=80=94=20=EC=B2=AB=20=EA=B8=B0?= =?UTF-8?q?=EC=82=AC=EB=A7=8C=20=ED=81=AC=EB=A1=A4=EB=A7=81=EC=9C=BC?= =?UTF-8?q?=EB=A1=9C=20=EC=86=8D=EB=8F=84=20=EC=B5=9C=EC=A0=81=ED=99=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 5개 기사 모두 리다이렉트 추적하면 50초+ 걸려서 Telegram 타임아웃 발생. 첫 번째 기사만 URL 변환+크롤링하고 나머지는 RSS 제목만 저장. Co-Authored-By: Claude Opus 4.6 --- bots/scheduler.py | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/bots/scheduler.py b/bots/scheduler.py index 604aae5..c4ede9a 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -801,39 +801,34 @@ def _search_and_build_topic(keyword: str, corner: str = '') -> dict: link = entry.get('link', '') pub_date = entry.get('published', '') - # Google 뉴스 URL → 실제 기사 URL 변환 - real_url = link - if 'news.google.com' in link: - try: - resp = requests.head(link, timeout=8, allow_redirects=True, - headers={'User-Agent': 'Mozilla/5.0'}) - if resp.url and 'news.google.com' not in resp.url: - real_url = resp.url - except Exception: - pass - + # Google 뉴스 RSS 제목에서 "- 매체명" 분리 sources.append({ - 'url': real_url, + 'url': link, 'title': title, 'date': pub_date, }) - # 첫 번째 기사에서 설명/이미지 크롤링 - if not best_description and real_url != link: - try: + # 첫 번째 기사만 리다이렉트 추적 + 크롤링 (속도 최적화) + if sources and 'news.google.com' in sources[0]['url']: + try: + resp = requests.head(sources[0]['url'], timeout=5, allow_redirects=True, + headers={'User-Agent': 'Mozilla/5.0'}) + if resp.url and 'news.google.com' not in resp.url: + sources[0]['url'] = resp.url + # og:description, og:image 크롤링 from bs4 import BeautifulSoup - resp = requests.get(real_url, timeout=10, + page = requests.get(resp.url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'}) - if resp.status_code == 200: - soup = BeautifulSoup(resp.text, 'lxml') + if page.status_code == 200: + soup = BeautifulSoup(page.text, 'lxml') og_desc = soup.find('meta', property='og:description') if og_desc and og_desc.get('content'): best_description = og_desc['content'].strip()[:300] og_img = soup.find('meta', property='og:image') if og_img and og_img.get('content', '').startswith('http'): best_image = og_img['content'] - except Exception: - pass + except Exception: + pass except Exception: pass