diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py index a5eb87b..6aefe3b 100644 --- a/bots/publisher_bot.py +++ b/bots/publisher_bot.py @@ -260,8 +260,53 @@ def _fetch_og_image(url: str) -> str: return '' +def _search_real_article_image(sources: list, topic: str = '') -> str: + """Google News 소스 → DuckDuckGo 검색으로 실제 기사 URL 찾기 → og:image 크롤링""" + from urllib.parse import quote as _quote, urlparse, parse_qs, unquote + from bs4 import BeautifulSoup as _BS + + for src in sources[:3]: + title = src.get('title', '') + if not title: + continue + # "기사 제목 - 매체명" → 매체명 제거 + clean_title = re.sub(r'\s*[-–—]\s*\S+$', '', title).strip() + if len(clean_title) < 5: + clean_title = title + try: + ddg_url = f'https://html.duckduckgo.com/html/?q={_quote(clean_title)}' + resp = requests.get(ddg_url, timeout=10, headers={ + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36' + }) + if resp.status_code != 200: + continue + soup = _BS(resp.text, 'lxml') + # DuckDuckGo는 redirect URL 사용: //duckduckgo.com/l/?uddg=실제URL + for a_tag in soup.select('a.result__a')[:3]: + href = a_tag.get('href', '') + # uddg 파라미터에서 실제 URL 추출 + real_url = href + if 'uddg=' in href: + parsed = parse_qs(urlparse(href).query) + uddg = parsed.get('uddg', [''])[0] + if uddg: + real_url = unquote(uddg) + if not real_url.startswith('http'): + continue + if 'news.google.com' in real_url: + continue + # 실제 기사 URL에서 og:image 크롤링 + img = _fetch_og_image(real_url) + if img: + logger.info(f"원문 기사 이미지 발견: {real_url[:50]} → {img[:60]}") + return img + except Exception as e: + logger.debug(f"DuckDuckGo 검색 실패: {e}") + return '' + + def fetch_featured_image(article: dict) -> str: - """대표 이미지: RSS 이미지 → og:image 크롤링 → Pexels 순으로 시도""" + """대표 이미지: RSS 이미지 → 원문 기사 → og:image → Wikipedia 순으로 시도""" # 1) RSS 수집 시 가져온 소스 이미지 (플랫폼 로고 제외) source_image = article.get('source_image', '') if source_image and source_image.startswith('http') and not _is_platform_logo(source_image): @@ -278,7 +323,14 @@ def fetch_featured_image(article: dict) -> str: if og_image: return og_image - # 3) Pexels API (키가 있을 때) + # 3) Google News 소스 → DuckDuckGo로 실제 기사 검색 → og:image + sources = article.get('sources', []) + if sources: + real_image = _search_real_article_image(sources, article.get('title', '')) + if real_image: + return real_image + + # 4) Pexels API (키가 있을 때) pexels_key = os.getenv('PEXELS_API_KEY', '') if pexels_key: tags = article.get('tags', []) @@ -299,42 +351,27 @@ def fetch_featured_image(article: dict) -> str: except Exception as e: logger.warning(f"Pexels 이미지 검색 실패: {e}") - # 4) Wikipedia 썸네일 (무료, API 키 불필요) — 태그 전체 시도 + # 5) Wikipedia 썸네일 (무료, API 키 불필요) tags = article.get('tags', []) if isinstance(tags, str): tags = [t.strip() for t in tags.split(',')] - # 태그만 사용 (제목은 너무 길어 Wikipedia에서 매칭 안됨) search_keywords = [t for t in tags if t and len(t) <= 15][:8] from urllib.parse import quote as _quote for kw in search_keywords: - # 한국어 Wikipedia - try: - wiki_url = f'https://ko.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}' - resp = requests.get(wiki_url, timeout=6, - headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}) - if resp.status_code == 200: - data = resp.json() - thumb = data.get('thumbnail', {}).get('source', '') - if thumb and thumb.startswith('http') and not _is_platform_logo(thumb): - thumb = re.sub(r'/\d+px-', '/800px-', thumb) - logger.info(f"Wikipedia 이미지 사용: {kw} → {thumb[:60]}") - return thumb - except Exception: - pass - # 영문 Wikipedia - try: - wiki_url = f'https://en.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}' - resp = requests.get(wiki_url, timeout=6, - headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}) - if resp.status_code == 200: - data = resp.json() - thumb = data.get('thumbnail', {}).get('source', '') - if thumb and thumb.startswith('http') and not _is_platform_logo(thumb): - thumb = re.sub(r'/\d+px-', '/800px-', thumb) - logger.info(f"Wikipedia(EN) 이미지 사용: {kw} → {thumb[:60]}") - return thumb - except Exception: - pass + for lang in ['ko', 'en']: + try: + wiki_url = f'https://{lang}.wikipedia.org/api/rest_v1/page/summary/{_quote(kw)}' + resp = requests.get(wiki_url, timeout=6, + headers={'User-Agent': 'Mozilla/5.0 (compatible; BlogBot/1.0)'}) + if resp.status_code == 200: + data = resp.json() + thumb = data.get('thumbnail', {}).get('source', '') + if thumb and thumb.startswith('http') and not _is_platform_logo(thumb): + thumb = re.sub(r'/\d+px-', '/800px-', thumb) + logger.info(f"Wikipedia({lang}) 이미지 사용: {kw} → {thumb[:60]}") + return thumb + except Exception: + pass return '' @@ -351,13 +388,15 @@ def build_full_html(article: dict, body_html: str, toc_html: str) -> str: if not has_image: image_url = fetch_featured_image(article) if image_url: - title = article.get('title', '') - html_parts.append( - f'