feat: Reddit 수집, 쇼츠 텔레그램 미리보기, 코너 9개 체계 정비

- Reddit 트렌딩 수집기 추가 (/reddit collect, /pick 명령어)
- 쇼츠 영상 텔레그램 미리보기 후 승인 기반 YouTube 업로드
- 코너 9개로 통합 (앱추천→제품리뷰, 재테크절약→재테크, TV로보는세상/건강정보 추가)
- RSS 피드 73개로 확대 (9개 코너 전체 커버)
- 블로그 중복 검토 알림 수정, 글 잘림 방지 (max_tokens 8192)
- 제품리뷰 다중 이미지 지원, 저품질 이미지 필터링 강화
- HookOptimizer LLM 연동, 인스타/X/틱톡 스케줄러 비활성화

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-07 13:56:20 +09:00
parent 93b2d3a264
commit 726c593e85
15 changed files with 1357 additions and 190 deletions
@@ -339,6 +339,143 @@ def _fetch_og_image(url: str, skip_irrelevant_section: bool = True) -> str:
    return ''


+def _fetch_article_images(article: dict, max_images: int = 5) -> list[str]:
+    """Collect several usable body images from an article's source pages.
+
+    Used for product-review posts. Fetches ``source_url`` plus up to three
+    entries of ``sources`` (each entry's ``url`` or ``link``), scans the
+    article body for ``<img>`` tags, drops author/avatar/tracking images,
+    images with small size hints and near-duplicates, then confirms every
+    remaining candidate with a HEAD request.
+
+    Args:
+        article: Article dict; only ``source_url`` and ``sources`` are read.
+        max_images: Maximum number of image URLs to return.
+
+    Returns:
+        Verified image URLs in discovery order (possibly empty). Per-page
+        fetch/parse failures are logged and skipped rather than raised.
+    """
+    from urllib.parse import urlparse
+
+    bot_headers = {'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}
+    browser_headers = {
+        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
+    }
+
+    def _px_hint(value):
+        """Return a width/height attribute as an int, or None when not numeric."""
+        try:
+            return int(str(value).replace('px', ''))
+        except (ValueError, TypeError):
+            return None
+
+    urls_to_try = []
+    source_url = article.get('source_url', '')
+    if source_url and source_url.startswith('http'):
+        urls_to_try.append(source_url)
+    # ``sources`` may be None or hold non-dict entries; skip those instead of
+    # letting an AttributeError/TypeError escape to the caller.
+    for entry in (article.get('sources') or [])[:3]:
+        if not isinstance(entry, dict):
+            continue
+        u = entry.get('url', '') or entry.get('link', '')
+        if isinstance(u, str) and u.startswith('http') and u not in urls_to_try:
+            urls_to_try.append(u)
+
+    collected = []
+    seen_keys = set()
+
+    # URL fragments that mark profile/avatar/author photos, UI icons and trackers.
+    skip_patterns = (
+        'avatar', 'author', 'profile', 'headshot', 'byline', 'gravatar',
+        'contributor', 'writer', 'staff', 'reporter', 'journalist',
+        'user-photo', 'user_photo', 'user-image', 'user_image',
+        'thumbnail-small', 'thumb-small', '/people/', '/person/',
+        'social-icon', 'share-', 'btn-', 'button', '/emoji/',
+        'badge', 'rating', 'star', 'pixel.', 'spacer', 'blank.',
+        '/1x1', 'tracking', 'analytics', 'beacon',
+    )
+
+    for page_url in urls_to_try:
+        if len(collected) >= max_images:
+            break
+        # Google News links are redirect stubs; resolve them to the publisher URL.
+        if 'news.google.com' in page_url:
+            try:
+                resp = requests.get(page_url, timeout=15, allow_redirects=True,
+                                    headers=bot_headers)
+            except Exception as e:
+                logger.debug(f"Google News 리다이렉트 실패 ({page_url[:60]}): {e}")
+                continue
+            if not resp.url or 'news.google.com' in resp.url:
+                continue
+            page_url = resp.url
+
+        try:
+            resp = requests.get(page_url, timeout=10, headers=browser_headers)
+            if resp.status_code != 200:
+                continue
+            soup = BeautifulSoup(resp.text, 'lxml')
+
+            # Prefer the main article container so sidebar/footer images are ignored.
+            article_body = (
+                soup.find('article') or
+                soup.find('div', class_=re.compile(r'article|post|entry|content|body', re.I)) or
+                soup.find('main') or
+                soup
+            )
+
+            for img in article_body.find_all('img', src=True):
+                if len(collected) >= max_images:
+                    break
+                src = img['src']
+                if not src.startswith('http') or _is_platform_logo(src):
+                    continue
+
+                # Skip profile/avatar/tracking images identified by their URL.
+                src_lower = src.lower()
+                if any(p in src_lower for p in skip_patterns):
+                    logger.debug(f"프로필/아바타 이미지 제외: {src[:80]}")
+                    continue
+
+                # Also filter profile photos by the tag's own alt/class attributes.
+                img_alt = (img.get('alt', '') or '').lower()
+                img_class = ' '.join(img.get('class', []) or []).lower()
+                if any(p in img_alt for p in ('author', 'avatar', 'profile', 'headshot', 'byline')):
+                    continue
+                if any(p in img_class for p in ('avatar', 'author', 'profile', 'byline', 'social')):
+                    continue
+
+                # Skip images nested in an author/byline/sidebar/related container.
+                parent = img.find_parent(['div', 'span', 'figure', 'a', 'section'])
+                if parent:
+                    parent_class = ' '.join(parent.get('class', []) or []).lower()
+                    parent_id = (parent.get('id', '') or '').lower()
+                    if any(p in (parent_class + parent_id)
+                           for p in ('author', 'byline', 'avatar', 'profile', 'sidebar', 'related')):
+                        continue
+
+                # Size hints: width and height are parsed independently so that a
+                # non-numeric width such as "100%" does not disable the height check.
+                width = _px_hint(img.get('width'))
+                height = _px_hint(img.get('height'))
+                if width is not None and width < 300:
+                    continue
+                if height is not None and height < 150:
+                    continue
+
+                # De-duplicate on host + path without the extension, so the same
+                # path with different query strings or extensions counts once.
+                parsed = urlparse(src)
+                if '.' in parsed.path:
+                    base_key = parsed.netloc + parsed.path.rsplit('.', 1)[0]
+                else:
+                    base_key = src
+                if base_key in seen_keys:
+                    continue
+                seen_keys.add(base_key)
+
+                # HEAD request: the image must be reachable, be an image, and not be tiny.
+                try:
+                    head = requests.head(src, timeout=5, allow_redirects=True,
+                                         headers=bot_headers)
+                except Exception as e:
+                    logger.debug(f"이미지 HEAD 요청 실패: {src[:80]} ({e})")
+                    continue
+                if head.status_code != 200:
+                    continue
+                ct = head.headers.get('Content-Type', '')
+                if ct and 'image' not in ct.lower():
+                    continue
+                # Under 10 KB is most likely an icon or thumbnail. A missing or
+                # malformed Content-Length is treated as "size unknown", not as a failure.
+                cl = head.headers.get('Content-Length', '')
+                try:
+                    size = int(cl) if cl else None
+                except ValueError:
+                    size = None
+                if size is not None and size < 10000:
+                    logger.debug(f"작은 이미지 제외 ({cl} bytes): {src[:80]}")
+                    continue
+                collected.append(src)
+                logger.info(f"원문 이미지 수집 [{len(collected)}/{max_images}]: {src[:80]}")
+
+        except Exception as e:
+            # Best-effort boundary: one bad page must not abort the remaining sources.
+            logger.warning(f"원문 이미지 크롤링 실패 ({page_url[:60]}): {e}")
+
+    logger.info(f"원문 이미지 수집 완료: {len(collected)}장 (최대 {max_images})")
+    return collected
+
+
 def _is_irrelevant_article_url(url: str) -> bool:
    """기사 URL 경로가 문화/엔터/스포츠 등 무관한 섹션인지 판별"""
    url_lower = url.lower()
@@ -495,7 +632,26 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:
    json_ld = build_json_ld(article)
    disclaimer = article.get('disclaimer', '')

-    # 본문에 이미 <img> 태그가 있으면 대표 이미지 삽입 건너뜀
+    # 본문에 이미 <img> 태그가 있는지 확인 — 깨진 외부 이미지는 제거
+    import re as _re_img
+    _img_pattern = _re_img.compile(r'<img\s+[^>]*src=["\']([^"\']+)["\'][^>]*/?\s*>', _re_img.IGNORECASE)
+    _img_matches = list(_img_pattern.finditer(body_html))
+    if _img_matches:
+        # 외부 이미지 URL 접근 가능 여부 체크 — 깨진 이미지 제거
+        for m in reversed(_img_matches):
+            src = m.group(1)
+            if src.startswith('data:'):
+                continue  # base64는 항상 유효
+            try:
+                resp = requests.head(src, timeout=5, allow_redirects=True,
+                                     headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'})
+                if resp.status_code != 200:
+                    logger.warning(f"깨진 이미지 제거: {src[:80]} (HTTP {resp.status_code})")
+                    body_html = body_html[:m.start()] + body_html[m.end():]
+            except Exception:
+                logger.warning(f"깨진 이미지 제거 (접속 실패): {src[:80]}")
+                body_html = body_html[:m.start()] + body_html[m.end():]
+
    has_image = '<img ' in body_html.lower()

    html_parts = []
@@ -555,14 +711,64 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str:
                    body_html = '\n'.join(img_tags) + '\n' + body_html
            logger.info(f"사용자 첨부 이미지 {len(valid_user_images)}장 본문 분산 배치")
        else:
-            image_url = fetch_featured_image(article)
-            if image_url:
-                img_tag = (
-                    f'<img src="{image_url}" alt="{title}" '
-                    f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
-                    f'margin-bottom:1.2em;" />'
-                )
-                body_html = img_tag + '\n' + body_html
+            corner = article.get('corner', '')
+            # 제품리뷰: 원문 이미지 다수 수집하여 본문에 분산 배치
+            if corner == '제품리뷰':
+                source_images = _fetch_article_images(article, max_images=5)
+                if source_images:
+                    import re as _re2
+                    img_tags = []
+                    for src_url in source_images:
+                        img_tags.append(
+                            f'<img src="{src_url}" alt="{title}" '
+                            f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                            f'margin-bottom:1.2em;" loading="lazy" />'
+                        )
+                    n_imgs = len(img_tags)
+                    if n_imgs == 1:
+                        body_html = img_tags[0] + '\n' + body_html
+                    else:
+                        block_pattern = _re2.compile(
+                            r'(<(?:p|h[1-6]|div|ul|ol|blockquote|table|section|article|figure)'
+                            r'[\s>])',
+                            _re2.IGNORECASE,
+                        )
+                        blocks = block_pattern.split(body_html)
+                        boundary_indices = [i for i in range(1, len(blocks), 2)]
+                        if len(boundary_indices) >= n_imgs + 1:
+                            insert_positions = [
+                                int(len(boundary_indices) * (k + 1) / (n_imgs + 1))
+                                for k in range(n_imgs)
+                            ]
+                            insert_positions = sorted(set(insert_positions))
+                            for img_idx, pos in enumerate(reversed(insert_positions)):
+                                bi = boundary_indices[min(pos, len(boundary_indices) - 1)]
+                                img_tag_idx = min(len(img_tags) - 1, len(insert_positions) - 1 - img_idx)
+                                blocks.insert(bi, '\n' + img_tags[img_tag_idx] + '\n')
+                            body_html = ''.join(blocks)
+                        else:
+                            body_html = '\n'.join(img_tags) + '\n' + body_html
+                    logger.info(f"제품리뷰 원문 이미지 {n_imgs}장 본문 분산 배치")
+                else:
+                    # 원문 이미지 없으면 기존 대표이미지 1장
+                    image_url = fetch_featured_image(article)
+                    if image_url:
+                        img_tag = (
+                            f'<img src="{image_url}" alt="{title}" '
+                            f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                            f'margin-bottom:1.2em;" />'
+                        )
+                        body_html = img_tag + '\n' + body_html
+            else:
+                # 제품리뷰 외: 기존처럼 대표이미지 1장
+                image_url = fetch_featured_image(article)
+                if image_url:
+                    img_tag = (
+                        f'<img src="{image_url}" alt="{title}" '
+                        f'width="100%" style="max-height:420px;object-fit:cover;border-radius:8px;'
+                        f'margin-bottom:1.2em;" />'
+                    )
+                    body_html = img_tag + '\n' + body_html

    html_parts.append(json_ld)
    # 목차: h2가 3개 이상이고 TOC에 실제 링크가 있을 때만 표시