feat: 텔레그램 이미지 첨부 기능 및 이미지 처리 개선

- /idea, /topic 명령어에 최대 3장 이미지 첨부 기능 추가
- 1장: 본문 최상단 배치, 2~3장: 본문 중간 균등 분산 배치
- base64 data URI 임베딩으로 핫링크 차단 문제 해결
- Claude API timeout=120s, max_retries=0 설정 (401 무한대기 방지)
- DuckDuckGo 제목 검색 폴백 및 문화/엔터 섹션 이미지 필터링

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-01 18:28:19 +09:00
parent 08e5bfc915
commit 15dfc39f0f
3 changed files with 268 additions and 31 deletions
--- a/bots/publisher_bot.py
+++ b/bots/publisher_bot.py
@@ -262,24 +262,38 @@ def _is_relevant_image(image_url: str, article: dict) -> bool:
    return True


-def _fetch_og_image(url: str) -> str:
+def _fetch_og_image(url: str, skip_irrelevant_section: bool = True) -> str:
    """원본 기사 URL에서 og:image 메타태그 크롤링 (본문 이미지 검증 포함)"""
    if not url or not url.startswith('http'):
        return ''
-    # Google 뉴스 리다이렉트인 경우 실제 기사 URL 추출 시도 (head는 리다이렉트 안됨 → get 사용)
+    # 문화/엔터/스포츠 섹션 기사는 이미지 추출 건너뜀
+    if skip_irrelevant_section and _is_irrelevant_article_url(url):
+        logger.info(f"무관한 섹션 기사 이미지 건너뜀: {url[:80]}")
+        return ''
+    # Google 뉴스 리다이렉트인 경우 실제 기사 URL 추출 시도
    if 'news.google.com' in url:
        try:
            resp = requests.get(url, timeout=15, allow_redirects=True,
                                headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
            if resp.url and 'news.google.com' not in resp.url:
                url = resp.url
-        except Exception:
-            pass
+                logger.info(f"Google News 리다이렉트 성공: {url[:80]}")
+                # 리다이렉트된 실제 기사 URL도 섹션 검증
+                if skip_irrelevant_section and _is_irrelevant_article_url(url):
+                    logger.info(f"리다이렉트된 기사가 무관한 섹션: {url[:80]}")
+                    return ''
+            else:
+                logger.info(f"Google News 리다이렉트 실패 — 여전히 news.google.com")
+                return ''
+        except Exception as e:
+            logger.warning(f"Google News 리다이렉트 실패: {e}")
+            return ''
    try:
        resp = requests.get(url, timeout=10, headers={
-            'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)',
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
        })
        if resp.status_code != 200:
+            logger.info(f"기사 페이지 접근 실패 (HTTP {resp.status_code}): {url[:80]}")
            return ''
        soup = BeautifulSoup(resp.text, 'lxml')

@@ -290,7 +304,7 @@ def _fetch_og_image(url: str) -> str:
            if src.startswith('http') and not _is_platform_logo(src):
                body_images.append(src)

-        # og:image가 본문 이미지와 동일 도메인이면 신뢰, 아니면 사이트 기본 이미지 가능성
+        # og:image 추출
        og_url = ''
        og = soup.find('meta', property='og:image')
        if og and og.get('content', '').startswith('http'):
@@ -303,23 +317,84 @@ def _fetch_og_image(url: str) -> str:
                    og_url = tw['content']

        if og_url and body_images:
-            # og:image 도메인이 본문 이미지 도메인과 일치하면 신뢰
            from urllib.parse import urlparse
            og_domain = urlparse(og_url).netloc.replace('www.', '')
            body_domains = {urlparse(u).netloc.replace('www.', '') for u in body_images}
            if og_domain in body_domains:
+                logger.info(f"og:image 도메인 일치 → 사용: {og_url[:80]}")
                return og_url
-            # 도메인 불일치 → 사이트 기본 og:image일 가능성 → 본문 이미지 우선 사용
+            # 도메인 불일치 → 사이트 기본 og:image일 가능성 → 본문 이미지 우선
            logger.info(f"og:image 도메인({og_domain}) ≠ 본문 이미지 도메인 → 본문 이미지 사용")
            return body_images[0]
-        elif og_url and not body_images:
-            # 본문에 이미지가 없으면 og:image를 일단 사용 (검증 불가)
+        elif og_url:
+            logger.info(f"og:image 사용 (본문 이미지 없음): {og_url[:80]}")
            return og_url
        elif body_images:
+            logger.info(f"본문 이미지 사용: {body_images[0][:80]}")
            return body_images[0]

+        logger.info(f"이미지 없음: {url[:80]}")
    except Exception as e:
-        logger.warning(f"og:image 크롤링 실패 ({url}): {e}")
+        logger.warning(f"og:image 크롤링 실패 ({url[:60]}): {e}")
+    return ''
+
+
+def _is_irrelevant_article_url(url: str) -> bool:
+    """Return True when the URL path points at an off-topic news section.
+
+    Off-topic means culture, entertainment, sports and similar sections
+    listed below. Only the path component is inspected, so a section name
+    that merely appears in the query string or fragment (for example
+    ``?from=/sports/``) does not disqualify an otherwise relevant article.
+
+    Args:
+        url: Article URL to classify. An empty value is treated as relevant.
+
+    Returns:
+        True if the lower-cased path contains one of the section markers.
+    """
+    from urllib.parse import urlparse
+
+    if not url:
+        return False
+    irrelevant_paths = (
+        '/culture/', '/entertainment/', '/sport/', '/sports/',
+        '/lifestyle/', '/celebrity/', '/drama/', '/movie/',
+        '/game/', '/gaming/', '/webtoon/', '/comic/',
+        '/tv/', '/ott/', '/show/', '/program/',
+        '/fun/', '/photo/', '/video/', '/gallery/',
+    )
+    try:
+        path = urlparse(url).path.lower()
+    except ValueError:
+        # urlparse rejects some malformed URLs (e.g. unbalanced IPv6
+        # brackets); fall back to matching the raw string instead of raising.
+        path = url.lower()
+    return any(marker in path for marker in irrelevant_paths)
+
+
+def _search_article_image_by_title(sources: list) -> str:
+    """Find an article image by searching DuckDuckGo for the source titles.
+
+    Used as a fallback when Google News redirect URLs cannot be resolved.
+    For each of the first three sources, the title (minus a trailing
+    "- Publisher" suffix) is searched on DuckDuckGo's HTML endpoint and the
+    first three result links are crawled with ``_fetch_og_image``.
+
+    Args:
+        sources: Source dicts; only the ``'title'`` key is read. An empty
+            or ``None`` value yields ``''``.
+
+    Returns:
+        The first image URL found, or ``''`` when nothing usable turns up.
+        Request/parsing errors are logged at debug level and skipped.
+    """
+    from urllib.parse import quote as _quote, urlparse, parse_qs
+
+    for src in (sources or [])[:3]:
+        title = src.get('title', '')
+        if not title:
+            continue
+        # "Article title - Publisher" -> drop the trailing publisher name
+        clean_title = re.sub(r'\s*[-–—]\s*\S+$', '', title).strip()
+        if len(clean_title) < 5:
+            # Stripping left too little to search on; use the full title.
+            clean_title = title
+        try:
+            ddg_url = f'https://html.duckduckgo.com/html/?q={_quote(clean_title)}'
+            resp = requests.get(ddg_url, timeout=10, headers={
+                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
+            })
+            if resp.status_code != 200:
+                continue
+            soup = BeautifulSoup(resp.text, 'lxml')
+            for a_tag in soup.select('a.result__a')[:3]:
+                href = a_tag.get('href', '')
+                real_url = href
+                if 'uddg=' in href:
+                    # Result links carry the target in the `uddg` parameter.
+                    # parse_qs already percent-decodes the value, so it must
+                    # not be unquoted a second time: that would corrupt
+                    # targets containing escapes such as %2F, %3F or %25.
+                    uddg = parse_qs(urlparse(href).query).get('uddg', [''])[0]
+                    if uddg:
+                        real_url = uddg
+                if not real_url.startswith('http'):
+                    continue
+                if 'news.google.com' in real_url:
+                    continue
+                # Skip culture/entertainment/sports section articles
+                if _is_irrelevant_article_url(real_url):
+                    logger.info(f"무관한 섹션 기사 건너뜀: {real_url[:80]}")
+                    continue
+                img = _fetch_og_image(real_url)
+                if img:
+                    logger.info(f"제목 검색으로 이미지 발견: {clean_title[:30]} → {img[:60]}")
+                    return img
+        except Exception as e:
+            # Best-effort fallback: a failed search must not abort publishing.
+            logger.debug(f"제목 검색 실패: {e}")
+    return ''


@@ -327,6 +402,7 @@ def _fetch_og_image(url: str) -> str:
 def fetch_featured_image(article: dict) -> str:
    """대표 이미지: RSS 이미지 → 참조 기사 og:image → Wikipedia 순으로 시도
    참조된 기사 내 이미지만 사용하여 무관한 이미지 유입을 방지한다."""
+    logger.info(f"대표 이미지 검색 시작: {article.get('title', '')[:40]}")
    # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 + 관련성 검사)
    source_image = article.get('source_image', '')
    if source_image and source_image.startswith('http') and not _is_platform_logo(source_image):
@@ -359,7 +435,14 @@ def fetch_featured_image(article: dict) -> str:
        if og_image and _is_relevant_image(og_image, article):
            return og_image

-    # 3) Wikipedia 썸네일 (무료, API 키 불필요)
+    # 3) Google News 리다이렉트 실패 시 → 기사 제목으로 DuckDuckGo 검색 폴백
+    if sources:
+        logger.info("소스 URL 직접 접근 실패 → 기사 제목으로 검색 폴백")
+        title_image = _search_article_image_by_title(sources)
+        if title_image and _is_relevant_image(title_image, article):
+            return title_image
+
+    # 4) Wikipedia 썸네일 (무료, API 키 불필요)
    tags = article.get('tags', [])
    if isinstance(tags, str):
        tags = [t.strip() for t in tags.split(',')]
@@ -404,17 +487,62 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:

    html_parts = []
    if not has_image:
-        image_url = fetch_featured_image(article)
-        if image_url:
-            title = article.get('title', '').replace('"', '&quot;')
-            # Blogger 호환: div 래핑 없이 직접 img 삽입 (본문 첫 줄에 배치)
-            img_tag = (
-                f'<img src="{image_url}" alt="{title}" '
-                f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
-                f'margin-bottom:1.2em;" />'
-            )
-            # body_html 맨 앞에 이미지 삽입 (Blogger가 div를 제거하는 문제 방지)
-            body_html = img_tag + '\n' + body_html
+        title = article.get('title', '').replace('"', '&quot;')
+        user_images = article.get('user_images', [])
+        # 하위호환: user_image 단일 필드도 지원
+        if not user_images:
+            single = article.get('user_image', '')
+            if single:
+                user_images = [single]
+        # 유효한 이미지 파일만 필터링
+        valid_user_images = [p for p in user_images if Path(p).exists()]
+        if valid_user_images:
+            # 사용자가 텔레그램으로 첨부한 이미지 → base64 data URI (핫링크 문제 없음)
+            import base64 as _b64
+            import re as _re
+            img_tags = []
+            for img_path in valid_user_images:
+                img_bytes = Path(img_path).read_bytes()
+                ext = Path(img_path).suffix.lower()
+                mime = {'.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
+                        '.gif': 'image/gif', '.webp': 'image/webp'}.get(ext, 'image/jpeg')
+                data_uri = f"data:{mime};base64,{_b64.b64encode(img_bytes).decode()}"
+                img_tags.append(
+                    f'<img src="{data_uri}" alt="{title}" '
+                    f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                    f'margin-bottom:1.2em;" />'
+                )
+            if n_imgs == 1:
+                # 1장: 본문 최상단에 배치
+                body_html = img_tags[0] + '\n' + body_html
+            else:
+                # 2~3장: 본문 블록 사이에 균등 분산 배치
+                block_pattern = _re.compile(
+                    r'(<(?:p|h[1-6]|div|ul|ol|blockquote|table|section|article|figure)'
+                    r'[\s>])',
+                    _re.IGNORECASE,
+                )
+                blocks = block_pattern.split(body_html)
+                boundary_indices = [i for i in range(1, len(blocks), 2)]
+                if len(boundary_indices) >= n_imgs + 1:
+                    spacing = len(boundary_indices) // (n_imgs + 1)
+                    insert_positions = [spacing * (k + 1) for k in range(n_imgs)]
+                    for img_idx, pos in enumerate(reversed(insert_positions)):
+                        bi = boundary_indices[min(pos, len(boundary_indices) - 1)]
+                        blocks.insert(bi, '\n' + img_tags[n_imgs - 1 - img_idx] + '\n')
+                    body_html = ''.join(blocks)
+                else:
+                    body_html = '\n'.join(img_tags) + '\n' + body_html
+            logger.info(f"사용자 첨부 이미지 {len(valid_user_images)}장 본문 분산 배치")
+        else:
+            image_url = fetch_featured_image(article)
+            if image_url:
+                img_tag = (
+                    f'<img src="{image_url}" alt="{title}" '
+                    f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                    f'margin-bottom:1.2em;" />'
+                )
+                body_html = img_tag + '\n' + body_html

    html_parts.append(json_ld)
    # 목차: h2가 3개 이상이고 TOC에 실제 링크가 있을 때만 표시