diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py index edfaf47..36dbfb7 100644 --- a/bots/publisher_bot.py +++ b/bots/publisher_bot.py @@ -262,9 +262,9 @@ def _fetch_og_image(url: str) -> str: def fetch_featured_image(article: dict) -> str: """대표 이미지: RSS 이미지 → og:image 크롤링 → Pexels 순으로 시도""" - # 1) RSS 수집 시 가져온 소스 이미지 + # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 제외) source_image = article.get('source_image', '') - if source_image and source_image.startswith('http'): + if source_image and source_image.startswith('http') and not _is_platform_logo(source_image): try: resp = requests.head(source_image, timeout=5, allow_redirects=True) if resp.status_code == 200: diff --git a/bots/scheduler.py b/bots/scheduler.py index 777b2c9..fe10a3a 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -302,11 +302,15 @@ def _fetch_sources_content(topic_data: dict) -> dict: og_title = soup.find('meta', property='og:title') if og_title and og_title.get('content'): title = og_title['content'].strip() - # og:image + # og:image (플랫폼 로고/Google News 썸네일 제외) if not topic_data.get('source_image'): og_img = soup.find('meta', property='og:image') - if og_img and og_img.get('content', '').startswith('http'): - topic_data['source_image'] = og_img['content'] + img_url = og_img.get('content', '') if og_img else '' + skip_patterns = ['lh3.googleusercontent', 'google.com/images', 'logo', 'icon', + 'googlenews', 'google-news', 'placeholder', 'noimage'] + is_platform = any(p in img_url.lower() for p in skip_patterns) + if img_url.startswith('http') and not is_platform: + topic_data['source_image'] = img_url except Exception: pass