fix: og:image 도메인 검증 추가 및 외부 검색 이미지 제거

- og:image가 본문 이미지와 같은 도메인인지 검증하여 사이트 기본 이미지(KBS 브레드이발소 등) 차단
- DuckDuckGo 외부 검색 이미지 수집 제거, 참조 기사 소스 URL에서만 이미지 추출
- _search_real_article_image() 함수 삭제
- Pexels API 이미지 검색 폴백 제거

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-01 10:11:24 +09:00
parent 6ad912a053
commit 08e5bfc915
1 changed files with 54 additions and 89 deletions
@@ -263,7 +263,7 @@ def _is_relevant_image(image_url: str, article: dict) -> bool:


 def _fetch_og_image(url: str) -> str:
-    """원본 기사 URL에서 og:image 메타태그 크롤링"""
+    """원본 기사 URL에서 og:image 메타태그 크롤링 (본문 이미지 검증 포함)"""
    if not url or not url.startswith('http'):
        return ''
    # Google 뉴스 리다이렉트인 경우 실제 기사 URL 추출 시도 (head는 리다이렉트 안됨 → get 사용)
@@ -282,73 +282,51 @@ def _fetch_og_image(url: str) -> str:
        if resp.status_code != 200:
            return ''
        soup = BeautifulSoup(resp.text, 'lxml')
-        # og:image
-        og = soup.find('meta', property='og:image')
-        if og and og.get('content', '').startswith('http'):
-            if not _is_platform_logo(og['content']):
-                return og['content']
-        # twitter:image
-        tw = soup.find('meta', attrs={'name': 'twitter:image'})
-        if tw and tw.get('content', '').startswith('http'):
-            if not _is_platform_logo(tw['content']):
-                return tw['content']
-        # 본문 첫 번째 큰 이미지
+
+        # 본문 내 실제 이미지 수집 (기사 내용과 관련된 이미지만 신뢰)
+        body_images = []
        for img in soup.find_all('img', src=True):
            src = img['src']
            if src.startswith('http') and not _is_platform_logo(src):
-                return src
+                body_images.append(src)
+
+        # og:image가 본문 이미지와 동일 도메인이면 신뢰, 아니면 사이트 기본 이미지 가능성
+        og_url = ''
+        og = soup.find('meta', property='og:image')
+        if og and og.get('content', '').startswith('http'):
+            if not _is_platform_logo(og['content']):
+                og_url = og['content']
+        if not og_url:
+            tw = soup.find('meta', attrs={'name': 'twitter:image'})
+            if tw and tw.get('content', '').startswith('http'):
+                if not _is_platform_logo(tw['content']):
+                    og_url = tw['content']
+
+        if og_url and body_images:
+            # og:image 도메인이 본문 이미지 도메인과 일치하면 신뢰
+            from urllib.parse import urlparse
+            og_domain = urlparse(og_url).netloc.replace('www.', '')
+            body_domains = {urlparse(u).netloc.replace('www.', '') for u in body_images}
+            if og_domain in body_domains:
+                return og_url
+            # 도메인 불일치 → 사이트 기본 og:image일 가능성 → 본문 이미지 우선 사용
+            logger.info(f"og:image 도메인({og_domain}) ≠ 본문 이미지 도메인 → 본문 이미지 사용")
+            return body_images[0]
+        elif og_url and not body_images:
+            # 본문에 이미지가 없으면 og:image를 일단 사용 (검증 불가)
+            return og_url
+        elif body_images:
+            return body_images[0]
+
    except Exception as e:
        logger.warning(f"og:image 크롤링 실패 ({url}): {e}")
    return ''


-def _search_real_article_image(sources: list, topic: str = '') -> str:
-    """Google News 소스 → DuckDuckGo 검색으로 실제 기사 URL 찾기 → og:image 크롤링"""
-    from urllib.parse import quote as _quote, urlparse, parse_qs, unquote
-    from bs4 import BeautifulSoup as _BS
-
-    for src in sources[:3]:
-        title = src.get('title', '')
-        if not title:
-            continue
-        # "기사 제목 - 매체명" → 매체명 제거
-        clean_title = re.sub(r'\s*[-–—]\s*\S+$', '', title).strip()
-        if len(clean_title) < 5:
-            clean_title = title
-        try:
-            ddg_url = f'https://html.duckduckgo.com/html/?q={_quote(clean_title)}'
-            resp = requests.get(ddg_url, timeout=10, headers={
-                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
-            })
-            if resp.status_code != 200:
-                continue
-            soup = _BS(resp.text, 'lxml')
-            # DuckDuckGo는 redirect URL 사용: //duckduckgo.com/l/?uddg=실제URL
-            for a_tag in soup.select('a.result__a')[:3]:
-                href = a_tag.get('href', '')
-                # uddg 파라미터에서 실제 URL 추출
-                real_url = href
-                if 'uddg=' in href:
-                    parsed = parse_qs(urlparse(href).query)
-                    uddg = parsed.get('uddg', [''])[0]
-                    if uddg:
-                        real_url = unquote(uddg)
-                if not real_url.startswith('http'):
-                    continue
-                if 'news.google.com' in real_url:
-                    continue
-                # 실제 기사 URL에서 og:image 크롤링
-                img = _fetch_og_image(real_url)
-                if img:
-                    logger.info(f"원문 기사 이미지 발견: {real_url[:50]} → {img[:60]}")
-                    return img
-        except Exception as e:
-            logger.debug(f"DuckDuckGo 검색 실패: {e}")
-    return ''
-

 def fetch_featured_image(article: dict) -> str:
-    """대표 이미지: RSS 이미지 → 원문 기사 → og:image → Wikipedia 순으로 시도"""
+    """대표 이미지: RSS 이미지 → 참조 기사 og:image → Wikipedia 순으로 시도
+    참조된 기사 내 이미지만 사용하여 무관한 이미지 유입을 방지한다."""
    # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 + 관련성 검사)
    source_image = article.get('source_image', '')
    if source_image and source_image.startswith('http') and not _is_platform_logo(source_image):
@@ -360,41 +338,28 @@ def fetch_featured_image(article: dict) -> str:
            except Exception:
                pass

-    # 2) 원본 기사 URL에서 og:image 크롤링
+    # 2) 참조 기사(sources) URL에서 og:image/본문 이미지 크롤링
+    #    source_url 및 sources 리스트의 URL만 사용 (외부 검색 X)
+    tried_urls = set()
+
    source_url = article.get('source_url', '')
-    og_image = _fetch_og_image(source_url)
-    if og_image and _is_relevant_image(og_image, article):
-        return og_image
+    if source_url:
+        tried_urls.add(source_url)
+        og_image = _fetch_og_image(source_url)
+        if og_image and _is_relevant_image(og_image, article):
+            return og_image

-    # 3) Google News 소스 → DuckDuckGo로 실제 기사 검색 → og:image
    sources = article.get('sources', [])
-    if sources:
-        real_image = _search_real_article_image(sources, article.get('title', ''))
-        if real_image and _is_relevant_image(real_image, article):
-            return real_image
+    for src in sources[:5]:
+        src_url = src.get('url', '') or src.get('link', '')
+        if not src_url or src_url in tried_urls:
+            continue
+        tried_urls.add(src_url)
+        og_image = _fetch_og_image(src_url)
+        if og_image and _is_relevant_image(og_image, article):
+            return og_image

-    # 4) Pexels API (키가 있을 때)
-    pexels_key = os.getenv('PEXELS_API_KEY', '')
-    if pexels_key:
-        tags = article.get('tags', [])
-        if isinstance(tags, str):
-            tags = [t.strip() for t in tags.split(',')]
-        query = tags[0] if tags else article.get('corner', 'technology')
-        try:
-            resp = requests.get(
-                'https://api.pexels.com/v1/search',
-                headers={'Authorization': pexels_key},
-                params={'query': query, 'per_page': 1, 'orientation': 'landscape'},
-                timeout=10,
-            )
-            if resp.status_code == 200:
-                photos = resp.json().get('photos', [])
-                if photos:
-                    return photos[0]['src']['large']
-        except Exception as e:
-            logger.warning(f"Pexels 이미지 검색 실패: {e}")
-
-    # 5) Wikipedia 썸네일 (무료, API 키 불필요)
+    # 3) Wikipedia 썸네일 (무료, API 키 불필요)
    tags = article.get('tags', [])
    if isinstance(tags, str):
        tags = [t.strip() for t in tags.split(',')]