From c836c720da5577773c535c19e366d87e452b3ca9 Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON <elikwon@JOUNGWOOKui-MacBookAir.local>
Date: Mon, 30 Mar 2026 15:26:15 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20/idea=20=EB=AA=85=EB=A0=B9=20=ED=83=80?=
 =?UTF-8?q?=EC=9E=84=EC=95=84=EC=9B=83=20=E2=80=94=20=EC=B2=AB=20=EA=B8=B0?=
 =?UTF-8?q?=EC=82=AC=EB=A7=8C=20=ED=81=AC=EB=A1=A4=EB=A7=81=EC=9C=BC?=
 =?UTF-8?q?=EB=A1=9C=20=EC=86=8D=EB=8F=84=20=EC=B5=9C=EC=A0=81=ED=99=94?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

5개 기사 모두 리다이렉트 추적하면 50초+ 걸려서 Telegram 타임아웃 발생.
첫 번째 기사만 URL 변환+크롤링하고 나머지는 RSS 링크·제목·날짜만 그대로 저장.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bots/scheduler.py | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/bots/scheduler.py b/bots/scheduler.py
index 604aae5..c4ede9a 100644
--- a/bots/scheduler.py
+++ b/bots/scheduler.py
@@ -801,39 +801,34 @@ def _search_and_build_topic(keyword: str, corner: str = '') -> dict:
             link = entry.get('link', '')
             pub_date = entry.get('published', '')
 
-            # Google 뉴스 URL → 실제 기사 URL 변환
-            real_url = link
-            if 'news.google.com' in link:
-                try:
-                    resp = requests.head(link, timeout=8, allow_redirects=True,
-                                         headers={'User-Agent': 'Mozilla/5.0'})
-                    if resp.url and 'news.google.com' not in resp.url:
-                        real_url = resp.url
-                except Exception:
-                    pass
-
+            # Keep the raw RSS link here; only the first source's redirect is resolved later
             sources.append({
-                'url': real_url,
+                'url': link,
                 'title': title,
                 'date': pub_date,
             })
 
-            # 첫 번째 기사에서 설명/이미지 크롤링
-            if not best_description and real_url != link:
-                try:
+        # 첫 번째 기사만 리다이렉트 추적 + 크롤링 (속도 최적화)
+        if sources and 'news.google.com' in sources[0]['url']:
+            try:
+                resp = requests.head(sources[0]['url'], timeout=5, allow_redirects=True,
+                                     headers={'User-Agent': 'Mozilla/5.0'})
+                if resp.url and 'news.google.com' not in resp.url:
+                    sources[0]['url'] = resp.url
+                    # og:description, og:image 크롤링
                     from bs4 import BeautifulSoup
-                    resp = requests.get(real_url, timeout=10,
+                    page = requests.get(resp.url, timeout=5,
                                         headers={'User-Agent': 'Mozilla/5.0'})
-                    if resp.status_code == 200:
-                        soup = BeautifulSoup(resp.text, 'lxml')
+                    if page.status_code == 200:
+                        soup = BeautifulSoup(page.text, 'lxml')
                         og_desc = soup.find('meta', property='og:description')
                         if og_desc and og_desc.get('content'):
                             best_description = og_desc['content'].strip()[:300]
                         og_img = soup.find('meta', property='og:image')
                         if og_img and og_img.get('content', '').startswith('http'):
                             best_image = og_img['content']
-                except Exception:
-                    pass
+            except Exception:
+                pass
     except Exception:
         pass