From f25a95440a20b0cc937a3c7396f82489382294d4 Mon Sep 17 00:00:00 2001 From: JOUNGWOOK KWON Date: Mon, 30 Mar 2026 15:36:01 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20/idea=20=ED=83=80=EC=9E=84=EC=95=84?= =?UTF-8?q?=EC=9B=83=20=E2=80=94=20=EB=A6=AC=EB=8B=A4=EC=9D=B4=EB=A0=89?= =?UTF-8?q?=ED=8A=B8/=ED=81=AC=EB=A1=A4=EB=A7=81=20=EC=A0=9C=EA=B1=B0?= =?UTF-8?q?=ED=95=98=EA=B3=A0=20RSS=EB=A7=8C=20=ED=8C=8C=EC=8B=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NAS→Google 뉴스 리다이렉트 추적이 매우 느려서 Telegram 타임아웃 발생. RSS 피드 파싱만으로 제목/설명 수집, URL 변환은 글 작성 시점에 처리. Co-Authored-By: Claude Opus 4.6 --- bots/scheduler.py | 34 ++++++++-------------------------- 1 file changed, 8 insertions(+), 26 deletions(-) diff --git a/bots/scheduler.py b/bots/scheduler.py index c4ede9a..3b385eb 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -783,12 +783,11 @@ async def cmd_idea(update: Update, context: ContextTypes.DEFAULT_TYPE): def _search_and_build_topic(keyword: str, corner: str = '') -> dict: - """키워드로 Google 뉴스 검색 → 관련 기사 수집 → topic_data 생성""" - import requests + """키워드로 Google 뉴스 검색 → 관련 기사 수집 → topic_data 생성 (빠른 버전)""" import feedparser from urllib.parse import quote - # Google 뉴스 RSS로 검색 + # Google 뉴스 RSS로 검색 (리다이렉트 추적 없이 빠르게) search_url = f"https://news.google.com/rss/search?q={quote(keyword)}&hl=ko&gl=KR&ceid=KR:ko" sources = [] best_description = '' @@ -800,35 +799,18 @@ def _search_and_build_topic(keyword: str, corner: str = '') -> dict: title = entry.get('title', '') link = entry.get('link', '') pub_date = entry.get('published', '') + # RSS description에서 설명 추출 + desc = entry.get('summary', '') or entry.get('description', '') + if desc and not best_description: + # HTML 태그 제거 + import re as _re + best_description = _re.sub(r'<[^>]+>', '', desc).strip()[:300] - # Google 뉴스 RSS 제목에서 "- 매체명" 분리 sources.append({ 'url': link, 'title': title, 'date': pub_date, }) - - # 첫 번째 기사만 리다이렉트 추적 + 크롤링 (속도 최적화) - if sources and 'news.google.com' in sources[0]['url']: - try: - resp = requests.head(sources[0]['url'], timeout=5, allow_redirects=True, - headers={'User-Agent': 'Mozilla/5.0'}) - if resp.url and 'news.google.com' not in resp.url: - sources[0]['url'] = resp.url - # og:description, og:image 크롤링 - from bs4 import BeautifulSoup - page = requests.get(resp.url, timeout=5, - headers={'User-Agent': 'Mozilla/5.0'}) - if page.status_code == 200: - soup = BeautifulSoup(page.text, 'lxml') - og_desc = soup.find('meta', property='og:description') - if og_desc and og_desc.get('content'): - best_description = og_desc['content'].strip()[:300] - og_img = soup.find('meta', property='og:image') - if og_img and og_img.get('content', '').startswith('http'): - best_image = og_img['content'] - except Exception: - pass except Exception: pass