fix: og:image 도메인 검증 추가 및 외부 검색 이미지 제거

- og:image가 본문 이미지와 같은 도메인인지 검증하여 사이트 기본 이미지(KBS 브레드이발소 등) 차단
- DuckDuckGo 외부 검색 이미지 수집 제거, 참조 기사 소스 URL에서만 이미지 추출
- _search_real_article_image() 함수 삭제

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-01 10:11:24 +09:00
parent 6ad912a053
commit 08e5bfc915
1 changed files with 54 additions and 89 deletions
@@ -263,7 +263,7 @@ def _is_relevant_image(image_url: str, article: dict) -> bool:
 def _fetch_og_image(url: str) -> str:
-    """원본 기사 URL에서 og:image 메타태그 크롤링"""
+    """원본 기사 URL에서 og:image 메타태그 크롤링 (본문 이미지 검증 포함)"""
    if not url or not url.startswith('http'):
        return ''
    # Google 뉴스 리다이렉트인 경우 실제 기사 URL 추출 시도 (head는 리다이렉트 안됨 → get 사용)
@@ -282,73 +282,51 @@ def _fetch_og_image(url: str) -> str:
        if resp.status_code != 200:
            return ''
        soup = BeautifulSoup(resp.text, 'lxml')
-        # og:image
+
-        og = soup.find('meta', property='og:image')
+        # 본문 내 실제 이미지 수집 (기사 내용과 관련된 이미지만 신뢰)
-        if og and og.get('content', '').startswith('http'):
+        body_images = []
            if not _is_platform_logo(og['content']):
                return og['content']
        # twitter:image
        tw = soup.find('meta', attrs={'name': 'twitter:image'})
        if tw and tw.get('content', '').startswith('http'):
            if not _is_platform_logo(tw['content']):
                return tw['content']
        # 본문 첫 번째 큰 이미지
        for img in soup.find_all('img', src=True):
            src = img['src']
            if src.startswith('http') and not _is_platform_logo(src):
-                return src
+                body_images.append(src)
        # og:image가 본문 이미지와 동일 도메인이면 신뢰, 아니면 사이트 기본 이미지 가능성
        og_url = ''
        og = soup.find('meta', property='og:image')
        if og and og.get('content', '').startswith('http'):
            if not _is_platform_logo(og['content']):
                og_url = og['content']
        if not og_url:
            tw = soup.find('meta', attrs={'name': 'twitter:image'})
            if tw and tw.get('content', '').startswith('http'):
                if not _is_platform_logo(tw['content']):
                    og_url = tw['content']
        if og_url and body_images:
            # og:image 도메인이 본문 이미지 도메인과 일치하면 신뢰
            from urllib.parse import urlparse
            og_domain = urlparse(og_url).netloc.replace('www.', '')
            body_domains = {urlparse(u).netloc.replace('www.', '') for u in body_images}
            if og_domain in body_domains:
                return og_url
            # 도메인 불일치 → 사이트 기본 og:image일 가능성 → 본문 이미지 우선 사용
            logger.info(f"og:image 도메인({og_domain}) ≠ 본문 이미지 도메인 → 본문 이미지 사용")
            return body_images[0]
        elif og_url and not body_images:
            # 본문에 이미지가 없으면 og:image를 일단 사용 (검증 불가)
            return og_url
        elif body_images:
            return body_images[0]
    except Exception as e:
        logger.warning(f"og:image 크롤링 실패 ({url}): {e}")
    return ''
 def _search_real_article_image(sources: list, topic: str = '') -> str:
    """Search DuckDuckGo for each source title and return an image from a hit.

    For each of the first three entries in ``sources``, the entry's ``title``
    is searched on DuckDuckGo's HTML endpoint. Up to three result links per
    search are resolved to their target URLs and handed to
    ``_fetch_og_image``; the first non-empty image URL wins.

    Args:
        sources: Source entries; each one is read via ``.get('title', '')``.
        topic: Not used by this function; kept so existing callers stay valid.

    Returns:
        The first non-empty value returned by ``_fetch_og_image``, or ``''``
        when no search result yields an image.
    """
    from urllib.parse import quote as _quote, urlparse, parse_qs
    from bs4 import BeautifulSoup as _BS
    for src in sources[:3]:
        title = src.get('title', '')
        if not title:
            continue
        # "Headline - Outlet" -> drop the trailing outlet name. Only a single
        # whitespace-free token after the dash is removed.
        clean_title = re.sub(r'\s*[-–—]\s*\S+$', '', title).strip()
        if len(clean_title) < 5:
            # Stripping left too little to search for; use the full title.
            clean_title = title
        try:
            ddg_url = f'https://html.duckduckgo.com/html/?q={_quote(clean_title)}'
            resp = requests.get(ddg_url, timeout=10, headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
            })
            if resp.status_code != 200:
                continue
            soup = _BS(resp.text, 'lxml')
            # DuckDuckGo wraps results in a redirect link:
            # //duckduckgo.com/l/?uddg=<real URL>
            for a_tag in soup.select('a.result__a')[:3]:
                href = a_tag.get('href', '')
                real_url = href
                if 'uddg=' in href:
                    # parse_qs already percent-decodes the value. Decoding it
                    # a second time would corrupt target URLs that contain
                    # their own encoded sequences (e.g. "%2520" -> "%20" -> " ").
                    uddg = parse_qs(urlparse(href).query).get('uddg', [''])[0]
                    if uddg:
                        real_url = uddg
                if not real_url.startswith('http'):
                    continue
                if 'news.google.com' in real_url:
                    # Google News links are redirects, not the article itself.
                    continue
                # Crawl the actual article page for its representative image.
                img = _fetch_og_image(real_url)
                if img:
                    logger.info(f"원문 기사 이미지 발견: {real_url[:50]} → {img[:60]}")
                    return img
        except Exception as e:
            # Best effort: a failed search for one title must not stop the
            # remaining sources from being tried.
            logger.debug(f"DuckDuckGo 검색 실패: {e}")
    return ''
 def fetch_featured_image(article: dict) -> str:
-    """대표 이미지: RSS 이미지 → 원문 기사 → og:image → Wikipedia 순으로 시도"""
+    """대표 이미지: RSS 이미지 → 참조 기사 og:image → Wikipedia 순으로 시도
    참조된 기사 내 이미지만 사용하여 무관한 이미지 유입을 방지한다."""
    # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 + 관련성 검사)
    source_image = article.get('source_image', '')
    if source_image and source_image.startswith('http') and not _is_platform_logo(source_image):
@@ -360,41 +338,28 @@ def fetch_featured_image(article: dict) -> str:
            except Exception:
                pass
-    # 2) 원본 기사 URL에서 og:image 크롤링
+    # 2) 참조 기사(sources) URL에서 og:image/본문 이미지 크롤링
    #    source_url 및 sources 리스트의 URL만 사용 (외부 검색 X)
    tried_urls = set()
    source_url = article.get('source_url', '')
-    og_image = _fetch_og_image(source_url)
+    if source_url:
-    if og_image and _is_relevant_image(og_image, article):
+        tried_urls.add(source_url)
-        return og_image
+        og_image = _fetch_og_image(source_url)
        if og_image and _is_relevant_image(og_image, article):
            return og_image
    # 3) Google News 소스 → DuckDuckGo로 실제 기사 검색 → og:image
    sources = article.get('sources', [])
-    if sources:
+    for src in sources[:5]:
-        real_image = _search_real_article_image(sources, article.get('title', ''))
+        src_url = src.get('url', '') or src.get('link', '')
-        if real_image and _is_relevant_image(real_image, article):
+        if not src_url or src_url in tried_urls:
-            return real_image
+            continue
        tried_urls.add(src_url)
        og_image = _fetch_og_image(src_url)
        if og_image and _is_relevant_image(og_image, article):
            return og_image
-    # 4) Pexels API (키가 있을 때)
+    # 3) Wikipedia 썸네일 (무료, API 키 불필요)
    pexels_key = os.getenv('PEXELS_API_KEY', '')
    if pexels_key:
        tags = article.get('tags', [])
        if isinstance(tags, str):
            tags = [t.strip() for t in tags.split(',')]
        query = tags[0] if tags else article.get('corner', 'technology')
        try:
            resp = requests.get(
                'https://api.pexels.com/v1/search',
                headers={'Authorization': pexels_key},
                params={'query': query, 'per_page': 1, 'orientation': 'landscape'},
                timeout=10,
            )
            if resp.status_code == 200:
                photos = resp.json().get('photos', [])
                if photos:
                    return photos[0]['src']['large']
        except Exception as e:
            logger.warning(f"Pexels 이미지 검색 실패: {e}")
    # 5) Wikipedia 썸네일 (무료, API 키 불필요)
    tags = article.get('tags', [])
    if isinstance(tags, str):
        tags = [t.strip() for t in tags.split(',')]