From 9280be7e52506636c0a401a7451e13fbc2078a7c Mon Sep 17 00:00:00 2001 From: JOUNGWOOK KWON Date: Mon, 30 Mar 2026 12:22:48 +0900 Subject: [PATCH] =?UTF-8?q?feat:=20=EC=9B=90=EB=B3=B8=20RSS=20=EC=86=8C?= =?UTF-8?q?=EC=8A=A4=20=EC=9D=B4=EB=AF=B8=EC=A7=80=EB=A5=BC=20=EB=8C=80?= =?UTF-8?q?=ED=91=9C=20=EC=9D=B4=EB=AF=B8=EC=A7=80=EB=A1=9C=20=EC=9A=B0?= =?UTF-8?q?=EC=84=A0=20=EC=82=AC=EC=9A=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - RSS 수집 시 media:thumbnail, media:content, enclosure, <img> 태그에서 이미지 추출 - source_image를 topic → article → publisher로 전달 - 발행 시 우선순위: 원본 소스 이미지 → Pexels → Unsplash Co-Authored-By: Claude Opus 4.6 --- bots/collector_bot.py | 32 ++++++++++++++++++++++++++++++++ bots/publisher_bot.py | 14 ++++++++++++-- bots/scheduler.py | 1 + 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/bots/collector_bot.py b/bots/collector_bot.py index 69b68d6..586a295 100644 --- a/bots/collector_bot.py +++ b/bots/collector_bot.py @@ -373,6 +373,35 @@ def collect_product_hunt(sources_cfg: dict) -> list[dict]: return items +def _extract_rss_image(entry) -> str: + """RSS entry에서 대표 이미지 URL 추출""" + # 1) media:thumbnail + if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail: + return entry.media_thumbnail[0].get('url', '') + # 2) media:content (type이 image인 것) + if hasattr(entry, 'media_content') and entry.media_content: + for mc in entry.media_content: + if 'image' in mc.get('type', '') or mc.get('medium') == 'image': + return mc.get('url', '') + # type 없어도 url이 이미지 확장자면 + url = entry.media_content[0].get('url', '') + if any(ext in url.lower() for ext in ['.jpg', '.jpeg', '.png', '.webp']): + return url + # 3) enclosures + if hasattr(entry, 'enclosures') and entry.enclosures: + for enc in entry.enclosures: + if 'image' in enc.get('type', ''): + return enc.get('href', '') or enc.get('url', '') + # 4) summary/description 안의 <img> 태그 + desc = entry.get('summary', '') or entry.get('description', '') + if 
'<img' in desc: + match = re.search(r'<img[^>]+src=["\']([^"\']+)["\']', desc) + if match: + return match.group(1) + return '' + + def collect_rss_feeds(sources_cfg: dict) -> list[dict]: """설정된 RSS 피드 수집""" items = [] @@ -392,6 +421,8 @@ def collect_rss_feeds(sources_cfg: dict) -> list[dict]: combined = title_text + desc_text kr_chars = sum(1 for c in combined if '\uac00' <= c <= '\ud7a3') is_english = kr_chars / max(len(combined), 1) < 0.05 + # 원본 기사 대표 이미지 추출 + image_url = _extract_rss_image(entry) items.append({ 'topic': title_text, 'description': desc_text, @@ -404,6 +435,7 @@ def collect_rss_feeds(sources_cfg: dict) -> list[dict]: '_trust_override': trust, '_rss_category': feed_cfg.get('category', ''), 'is_english': is_english, + 'source_image': image_url, }) except Exception as e: logger.warning(f"RSS 수집 실패 ({url}): {e}") diff --git a/bots/publisher_bot.py b/bots/publisher_bot.py index 3daacfa..87c5a68 100644 --- a/bots/publisher_bot.py +++ b/bots/publisher_bot.py @@ -207,8 +207,18 @@ def build_json_ld(article: dict, blog_url: str = '') -> str: def fetch_featured_image(article: dict) -> str: - """글 키워드 기반 무료 대표 이미지 URL 가져오기 (Pexels → Unsplash fallback)""" - # 검색 키워드: 태그 또는 코너 기반 + """대표 이미지: 원본 소스 이미지 → Pexels → Unsplash 순으로 시도""" + # 1) 원본 RSS 소스 이미지 (가장 우선) + source_image = article.get('source_image', '') + if source_image and source_image.startswith('http'): + try: + resp = requests.head(source_image, timeout=5, allow_redirects=True) + if resp.status_code == 200: + return source_image + except Exception: + pass # 접근 불가면 다음 방법으로 + + # 2) 검색 키워드: 태그 또는 코너 기반 tags = article.get('tags', []) if isinstance(tags, str): tags = [t.strip() for t in tags.split(',')] diff --git a/bots/scheduler.py b/bots/scheduler.py index 8f0cba7..f7bb54b 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -243,6 +243,7 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''): article['source'] = topic_data.get('source', '') article['source_url'] = topic_data.get('source_url') or 
topic_data.get('source') or '' article['published_at'] = topic_data.get('published_at', '') + article['source_image'] = topic_data.get('source_image', '') article['created_at'] = datetime.now().isoformat() output_path.parent.mkdir(parents=True, exist_ok=True)