From b3ccbba491f2c1a7997356c63a367cc5abca967d Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON <elikwon@JOUNGWOOKui-MacBookAir.local>
Date: Mon, 30 Mar 2026 18:28:13 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20Google=20News=20og:title=20=EB=8D=AE?=
 =?UTF-8?q?=EC=96=B4=EC=93=B0=EA=B8=B0=20=EB=B0=A9=EC=A7=80=20-=20?=
 =?UTF-8?q?=EC=9B=90=EB=B3=B8=20RSS=20=EC=A0=9C=EB=AA=A9=20=EB=B3=B4?=
 =?UTF-8?q?=EC=A1=B4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_fetch_sources_content()에서 Google News 페이지의 og:title='Google News'로
원래 RSS 기사 제목이 덮어씌워지던 문제 수정.
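- og:title이 'Google News', 'Google', 'Naver', 'Daum', 'Yahoo' 같은 플랫폼 이름과
  (대소문자 무시) 일치하거나 5자 이하이면 무시하고 원래 RSS 제목 유지
- og:description도 'google news'를 포함하거나 20자 이하이면 무시하고
  본문 <p> 단락 텍스트로 대체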
이로써 DuckDuckGo 원문 기사 이미지 검색이 제대로 작동

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 bots/scheduler.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/bots/scheduler.py b/bots/scheduler.py
index fe10a3a..eb10edc 100644
--- a/bots/scheduler.py
+++ b/bots/scheduler.py
@@ -291,17 +291,22 @@ def _fetch_sources_content(topic_data: dict) -> dict:
                 real_url = page.url
             if page.status_code == 200:
                 soup = BeautifulSoup(page.text, 'lxml')
-                # og:description 또는 본문 첫 단락
+                # og:description 또는 본문 첫 단락 (Google News 페이지 설명 무시)
                 og_desc = soup.find('meta', property='og:description')
-                if og_desc and og_desc.get('content'):
-                    content = og_desc['content'].strip()[:500]
+                desc_text = og_desc.get('content', '').strip() if og_desc else ''
+                if desc_text and 'google news' not in desc_text.lower() and len(desc_text) > 20:
+                    content = desc_text[:500]
                 else:
                     paras = soup.find_all('p')
                     content = ' '.join(p.get_text() for p in paras[:3])[:500]
-                # og:title로 제목 업데이트
+                # og:title로 제목 업데이트 (단, 원본 제목보다 유용한 경우만)
                 og_title = soup.find('meta', property='og:title')
                 if og_title and og_title.get('content'):
-                    title = og_title['content'].strip()
+                    new_title = og_title['content'].strip()
+                    # "Google News" 등 플랫폼 이름은 무시 — 원래 RSS 제목 유지
+                    generic_titles = ['google news', 'google', 'naver', 'daum', 'yahoo']
+                    if new_title and new_title.lower() not in generic_titles and len(new_title) > 5:
+                        title = new_title
                 # og:image (플랫폼 로고/Google News 썸네일 제외)
                 if not topic_data.get('source_image'):
                     og_img = soup.find('meta', property='og:image')