From d0cabc3f13147a7a337c16eb177f4612675b2c32 Mon Sep 17 00:00:00 2001
From: JOUNGWOOK KWON
Date: Mon, 30 Mar 2026 13:44:53 +0900
Subject: [PATCH] =?UTF-8?q?fix:=20og:image=EC=97=90=EC=84=9C=20=ED=94=8C?=
 =?UTF-8?q?=EB=9E=AB=ED=8F=BC=20=EB=A1=9C=EA=B3=A0(Google=EB=89=B4?=
 =?UTF-8?q?=EC=8A=A4=20=EB=93=B1)=20=ED=95=84=ED=84=B0=EB=A7=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _is_platform_logo(): 로고/아이콘/기본이미지 패턴 감지
- Google 뉴스 URL인 경우 실제 기사 URL로 리다이렉트 추적
- 로고 이미지 걸러지면 Pexels 폴백으로 진행

Co-Authored-By: Claude Opus 4.6
---
Review notes (below the "---" separator, so not part of the commit message).

English gloss of the commit message:
  Subject: fix: filter platform logos (Google News etc.) out of og:image
  * _is_platform_logo(): detect logo / icon / default-image patterns
  * for Google News URLs, follow redirects to the real article URL
  * when a logo image is filtered out, continue to the Pexels fallback
    (that fallback is not visible in this diff)

What the change does, as shown by the hunks:
  * _is_platform_logo() lowercases the URL and returns True when any entry
    of skip_patterns occurs anywhere in it as a substring.
  * All three call sites pass a value that has already passed a
    startswith('http') check, so the helper always receives a str here.
  * For URLs containing 'news.google.com', a HEAD request is sent with
    allow_redirects=True; resp.url replaces url only when it no longer
    contains 'news.google.com'.
  * og:image and twitter:image are returned only if the helper accepts
    them; otherwise control falls through to the next candidate.
  * The <img> fallback used to exclude the bare substring 'ad'; through the
    shared helper it now excludes '/ad/' plus the other listed patterns.

Points to confirm before merging:
  * NOTE(review): short entries such as 'icon', 'logo', 'banner' and
    'common/' also match inside longer words or path segments (for example
    'silicon' or 'catalogo'). Confirm this breadth is intended; matching on
    path segments or file names would be narrower.
  * NOTE(review): 'facebook.com' and 'twitter.com' are matched against the
    whole URL, so every image whose URL contains either string is rejected.
    Confirm that no wanted article images are served from such URLs.
  * NOTE(review): the HEAD request is wrapped in "except Exception: pass".
    Failures are silent and the original URL is used for the GET that
    follows. Presumably best-effort by design; consider a debug log.
  * NOTE(review): verify that Google News article links actually resolve
    through HTTP redirects. If they do not, resp.url still contains
    'news.google.com' and this branch never changes the URL.
  * NOTE(review): the first context line of hunk 1 reads "return f''",
    which looks truncated. Check it against the target file, since context
    lines must match for the patch to apply.

 bots/publisher_bot.py | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py
index 117d39d..d3f3193 100644
--- a/bots/publisher_bot.py
+++ b/bots/publisher_bot.py
@@ -206,10 +206,32 @@ def build_json_ld(article: dict, blog_url: str = '') -> str:
     return f''
 
 
+def _is_platform_logo(image_url: str) -> bool:
+    """플랫폼 로고/아이콘 이미지인지 판별 — 대표 이미지로 부적합"""
+    skip_patterns = [
+        'logo', 'icon', 'avatar', 'banner', '/ad/',
+        'google.com/images/branding', 'googlenews', 'google-news',
+        'facebook.com', 'twitter.com', 'naver.com/favicon',
+        'default_image', 'placeholder', 'noimage', 'no-image',
+        'og-default', 'share-default', 'sns_', 'common/',
+    ]
+    url_lower = image_url.lower()
+    return any(p in url_lower for p in skip_patterns)
+
+
 def _fetch_og_image(url: str) -> str:
     """원본 기사 URL에서 og:image 메타태그 크롤링"""
     if not url or not url.startswith('http'):
         return ''
+    # Google 뉴스 리다이렉트인 경우 실제 기사 URL 추출 시도
+    if 'news.google.com' in url:
+        try:
+            resp = requests.head(url, timeout=10, allow_redirects=True,
+                                 headers={'User-Agent': 'Mozilla/5.0'})
+            if resp.url and 'news.google.com' not in resp.url:
+                url = resp.url
+        except Exception:
+            pass
     try:
         resp = requests.get(url, timeout=10, headers={
             'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)',
@@ -220,15 +242,17 @@ def _fetch_og_image(url: str) -> str:
         # og:image
         og = soup.find('meta', property='og:image')
         if og and og.get('content', '').startswith('http'):
-            return og['content']
+            if not _is_platform_logo(og['content']):
+                return og['content']
         # twitter:image
         tw = soup.find('meta', attrs={'name': 'twitter:image'})
         if tw and tw.get('content', '').startswith('http'):
-            return tw['content']
+            if not _is_platform_logo(tw['content']):
+                return tw['content']
         # 본문 첫 번째 큰 이미지
         for img in soup.find_all('img', src=True):
             src = img['src']
-            if src.startswith('http') and not any(x in src.lower() for x in ['logo', 'icon', 'avatar', 'banner', 'ad']):
+            if src.startswith('http') and not _is_platform_logo(src):
                 return src
     except Exception as e:
         logger.warning(f"og:image 크롤링 실패 ({url}): {e}")