From 6ad912a053e787b72278979265336d814afa2410 Mon Sep 17 00:00:00 2001 From: JOUNGWOOK KWON Date: Wed, 1 Apr 2026 09:17:34 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20=EA=B8=80=20=EC=A3=BC=EC=A0=9C=EC=99=80?= =?UTF-8?q?=20=EB=AC=B4=EA=B4=80=ED=95=9C=20=EC=9D=B4=EB=AF=B8=EC=A7=80(?= =?UTF-8?q?=EC=95=A0=EB=8B=88/=EA=B2=8C=EC=9E=84/=EC=97=94=ED=84=B0)=20?= =?UTF-8?q?=ED=95=84=ED=84=B0=EB=A7=81=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- bots/publisher_bot.py | 63 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py index 9facae1..3b7098c 100644 --- a/bots/publisher_bot.py +++ b/bots/publisher_bot.py @@ -207,7 +207,7 @@ def build_json_ld(article: dict, blog_url: str = '') -> str: def _is_platform_logo(image_url: str) -> bool: - """플랫폼 로고/아이콘 이미지인지 판별 — 대표 이미지로 부적합""" + """플랫폼 로고/아이콘/광고 이미지인지 판별 — 대표 이미지로 부적합""" skip_patterns = [ 'logo', 'icon', 'avatar', 'banner', '/ad/', 'google.com/images/branding', 'googlenews', 'google-news', @@ -215,11 +215,53 @@ def _is_platform_logo(image_url: str) -> bool: 'facebook.com', 'twitter.com', 'naver.com/favicon', 'default_image', 'placeholder', 'noimage', 'no-image', 'og-default', 'share-default', 'sns_', 'common/', + # 광고/게임/이벤트 관련 패턴 + 'ad.', 'ads.', '/adv/', '/promo/', '/event/', '/game/', + 'adimg', 'adserver', 'doubleclick', 'googlesyndication', + 'akamaihd.net', 'cdn.ad', 'click.', 'tracking.', ] url_lower = image_url.lower() return any(p in url_lower for p in skip_patterns) +def _is_relevant_image(image_url: str, article: dict) -> bool: + """이미지가 글 주제와 관련 있는지 판별""" + if not image_url: + return False + url_lower = image_url.lower() + + # 엔터테인먼트/애니메이션/게임 관련 URL 패턴 — 글 주제와 무관할 가능성 높음 + entertainment_patterns = [ + 'game', 'gaming', 'casino', 'slot', 'poker', 'lottery', + 'anime', 'animation', 'cartoon', 'drama', 'movie', 'film', + 'entertainment', 'kpop', 'idol', 'singer', 'actor', + 'breadbarbershop', 'bread', 'character', 'webtoon', + 'advert', 'sponsor', 'promo', 'event_banner', 'event/', + '/show/', '/program/', '/tv/', '/ott/', + ] + + # 글 코너/태그 추출 + corner = article.get('corner', '').lower() + tags = article.get('tags', []) + if isinstance(tags, str): + tags = [t.strip().lower() for t in tags.split(',')] + else: + tags = [t.lower() for t in tags] + topic = article.get('topic', '').lower() + ' ' + article.get('title', '').lower() + + # 경제/IT/사회 관련 글인데 엔터테인먼트 이미지면 거부 + serious_corners = ['ai인사이트', '스타트업', '재테크', '경제', '사회', '정치', '국제'] + is_serious = any(c in corner for c in serious_corners) or any( + kw in topic for kw in ['경제', '투자', '금융', '정책', '기술', 'ai', '스타트업'] + ) + + if is_serious and any(p in url_lower for p in entertainment_patterns): + logger.info(f"이미지 관련성 불일치로 제외: {image_url[:80]}") + return False + + return True + + def _fetch_og_image(url: str) -> str: """원본 기사 URL에서 og:image 메타태그 크롤링""" if not url or not url.startswith('http'): @@ -307,27 +349,28 @@ def _search_real_article_image(sources: list, topic: str = '') -> str: def fetch_featured_image(article: dict) -> str: """대표 이미지: RSS 이미지 → 원문 기사 → og:image → Wikipedia 순으로 시도""" - # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 제외) + # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 + 관련성 검사) source_image = article.get('source_image', '') if source_image and source_image.startswith('http') and not _is_platform_logo(source_image): - try: - resp = requests.head(source_image, timeout=5, allow_redirects=True) - if resp.status_code == 200: - return source_image - except Exception: - pass + if _is_relevant_image(source_image, article): + try: + resp = requests.head(source_image, timeout=5, allow_redirects=True) + if resp.status_code == 200: + return source_image + except Exception: + pass # 2) 원본 기사 URL에서 og:image 크롤링 source_url = article.get('source_url', '') og_image = _fetch_og_image(source_url) - if og_image: + if og_image and _is_relevant_image(og_image, article): return og_image # 3) Google News 소스 → DuckDuckGo로 실제 기사 검색 → og:image sources = article.get('sources', []) if sources: real_image = _search_real_article_image(sources, article.get('title', '')) - if real_image: + if real_image and _is_relevant_image(real_image, article): return real_image # 4) Pexels API (키가 있을 때)