feat: /idea 글 작성 시 소스 크롤링 → Gemini에 실제 내용 전달

/write 시점에 Google 뉴스 URL → 실제 기사 URL 변환 후 내용 크롤링.
Gemini 프롬프트에 기사 제목+요약+URL 전달 → 실제 소스 기반 글 작성.
출처 박스에 실제 기사 링크 표시.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-30 16:20:26 +09:00
parent 6d9cf8d6da
commit e077b593c9
1 changed files with 94 additions and 1 deletions
@@ -171,7 +171,18 @@ def _build_openclaw_prompt(topic_data: dict) -> tuple[str, str]:
    # 단일 source_url도 포함
    if source and not any(source in l for l in sources_section_lines):
        sources_section_lines.append(f"{source} | 참고 출처 | {published_at}")
-    sources_prompt_text = '\n'.join(sources_prompt_lines) if sources_prompt_lines else '없음 (AI 자체 지식 활용)'
+    # 소스 내용(content)이 있으면 함께 포함
+    sources_prompt_lines_with_content = []
+    for s in sources_list:
+        url = s.get('url', '')
+        title = s.get('title', '')
+        content = s.get('content', '')
+        if url:
+            line = f"- [{title}]({url})"
+            if content:
+                line += f"\n  요약: {content[:300]}"
+            sources_prompt_lines_with_content.append(line)
+    sources_prompt_text = '\n'.join(sources_prompt_lines_with_content) if sources_prompt_lines_with_content else '\n'.join(sources_prompt_lines) if sources_prompt_lines else '없음 (AI 자체 지식 활용)'
    sources_section_text = '\n'.join(sources_section_lines) if sources_section_lines else f"{source} | 참고 출처 | {published_at}"

    system = (
@@ -232,6 +243,85 @@ def _build_openclaw_prompt(topic_data: dict) -> tuple[str, str]:
    return system, prompt


+def _fetch_sources_content(topic_data: dict) -> dict:
+    """idea/manual 소스의 경우 글 작성 전 실제 기사 내용 크롤링"""
+    if topic_data.get('source') not in ('idea', 'manual'):
+        return topic_data
+
+    import requests
+    import feedparser
+    from urllib.parse import quote
+    from bs4 import BeautifulSoup
+
+    topic = topic_data.get('topic', '')
+    existing_sources = topic_data.get('sources', [])
+
+    # 소스가 없거나 Google 뉴스 URL만 있는 경우 → 키워드로 재검색
+    need_search = not existing_sources or all('news.google.com' in s.get('url', '') for s in existing_sources)
+    if need_search:
+        try:
+            search_url = f"https://news.google.com/rss/search?q={quote(topic)}&hl=ko&gl=KR&ceid=KR:ko"
+            feed = feedparser.parse(search_url)
+            existing_sources = [{'url': e.get('link', ''), 'title': e.get('title', ''), 'date': e.get('published', '')}
+                                 for e in feed.entries[:5]]
+        except Exception:
+            pass
+
+    # 각 소스 URL 변환 + 내용 크롤링 (최대 3개, 각 5초 타임아웃)
+    enriched_sources = []
+    for s in existing_sources[:3]:
+        url = s.get('url', '')
+        title = s.get('title', '')
+        content = ''
+        real_url = url
+
+        try:
+            # Google 뉴스 리다이렉트 → 실제 URL
+            if 'news.google.com' in url:
+                resp = requests.head(url, timeout=5, allow_redirects=True,
+                                     headers={'User-Agent': 'Mozilla/5.0'})
+                if resp.url and 'news.google.com' not in resp.url:
+                    real_url = resp.url
+
+            # 기사 내용 크롤링
+            page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
+            if page.status_code == 200:
+                soup = BeautifulSoup(page.text, 'lxml')
+                # og:description 또는 본문 첫 단락
+                og_desc = soup.find('meta', property='og:description')
+                if og_desc and og_desc.get('content'):
+                    content = og_desc['content'].strip()[:500]
+                else:
+                    paras = soup.find_all('p')
+                    content = ' '.join(p.get_text() for p in paras[:3])[:500]
+                # og:title로 제목 업데이트
+                og_title = soup.find('meta', property='og:title')
+                if og_title and og_title.get('content'):
+                    title = og_title['content'].strip()
+                # og:image
+                if not topic_data.get('source_image'):
+                    og_img = soup.find('meta', property='og:image')
+                    if og_img and og_img.get('content', '').startswith('http'):
+                        topic_data['source_image'] = og_img['content']
+        except Exception:
+            pass
+
+        enriched_sources.append({
+            'url': real_url,
+            'title': title,
+            'content': content,
+            'date': s.get('date', ''),
+        })
+        logger.info(f"소스 크롤링: {title[:40]} ({real_url[:60]})")
+
+    updated = dict(topic_data)
+    updated['sources'] = enriched_sources
+    if enriched_sources:
+        updated['source_url'] = enriched_sources[0]['url']
+        updated['source_name'] = enriched_sources[0]['title']
+    return updated
+
+
 def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''):
    logger.info(f"글 작성 요청: {topic_data.get('topic', '')}")
    sys.path.insert(0, str(BASE_DIR))
@@ -240,6 +330,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''):
    from engine_loader import EngineLoader
    from article_parser import parse_output

+    # idea/manual 소스: 글 작성 전 실제 기사 내용 크롤링
+    topic_data = _fetch_sources_content(topic_data)
+
    system, prompt = _build_openclaw_prompt(topic_data)
    if direction:
        prompt += f"\n\n운영자 지시사항: {direction}"