diff --git a/bots/scheduler.py b/bots/scheduler.py index 08fc182..519b7a5 100644 --- a/bots/scheduler.py +++ b/bots/scheduler.py @@ -171,7 +171,18 @@ def _build_openclaw_prompt(topic_data: dict) -> tuple[str, str]: # 단일 source_url도 포함 if source and not any(source in l for l in sources_section_lines): sources_section_lines.append(f"{source} | 참고 출처 | {published_at}") - sources_prompt_text = '\n'.join(sources_prompt_lines) if sources_prompt_lines else '없음 (AI 자체 지식 활용)' + # 소스 내용(content)이 있으면 함께 포함 + sources_prompt_lines_with_content = [] + for s in sources_list: + url = s.get('url', '') + title = s.get('title', '') + content = s.get('content', '') + if url: + line = f"- [{title}]({url})" + if content: + line += f"\n 요약: {content[:300]}" + sources_prompt_lines_with_content.append(line) + sources_prompt_text = '\n'.join(sources_prompt_lines_with_content) if sources_prompt_lines_with_content else '\n'.join(sources_prompt_lines) if sources_prompt_lines else '없음 (AI 자체 지식 활용)' sources_section_text = '\n'.join(sources_section_lines) if sources_section_lines else f"{source} | 참고 출처 | {published_at}" system = ( @@ -232,6 +243,85 @@ def _build_openclaw_prompt(topic_data: dict) -> tuple[str, str]: return system, prompt +def _fetch_sources_content(topic_data: dict) -> dict: + """idea/manual 소스의 경우 글 작성 전 실제 기사 내용 크롤링""" + if topic_data.get('source') not in ('idea', 'manual'): + return topic_data + + import requests + import feedparser + from urllib.parse import quote + from bs4 import BeautifulSoup + + topic = topic_data.get('topic', '') + existing_sources = topic_data.get('sources', []) + + # 소스가 없거나 Google 뉴스 URL만 있는 경우 → 키워드로 재검색 + need_search = not existing_sources or all('news.google.com' in s.get('url', '') for s in existing_sources) + if need_search: + try: + search_url = f"https://news.google.com/rss/search?q={quote(topic)}&hl=ko&gl=KR&ceid=KR:ko" + feed = feedparser.parse(search_url) + existing_sources = [{'url': e.get('link', ''), 'title': e.get('title', ''), 'date': e.get('published', '')} + for e in feed.entries[:5]] + except Exception: + pass + + # 각 소스 URL 변환 + 내용 크롤링 (최대 3개, 각 5초 타임아웃) + enriched_sources = [] + for s in existing_sources[:3]: + url = s.get('url', '') + title = s.get('title', '') + content = '' + real_url = url + + try: + # Google 뉴스 리다이렉트 → 실제 URL + if 'news.google.com' in url: + resp = requests.head(url, timeout=5, allow_redirects=True, + headers={'User-Agent': 'Mozilla/5.0'}) + if resp.url and 'news.google.com' not in resp.url: + real_url = resp.url + + # 기사 내용 크롤링 + page = requests.get(real_url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'}) + if page.status_code == 200: + soup = BeautifulSoup(page.text, 'lxml') + # og:description 또는 본문 첫 단락 + og_desc = soup.find('meta', property='og:description') + if og_desc and og_desc.get('content'): + content = og_desc['content'].strip()[:500] + else: + paras = soup.find_all('p') + content = ' '.join(p.get_text() for p in paras[:3])[:500] + # og:title로 제목 업데이트 + og_title = soup.find('meta', property='og:title') + if og_title and og_title.get('content'): + title = og_title['content'].strip() + # og:image + if not topic_data.get('source_image'): + og_img = soup.find('meta', property='og:image') + if og_img and og_img.get('content', '').startswith('http'): + topic_data['source_image'] = og_img['content'] + except Exception: + pass + + enriched_sources.append({ + 'url': real_url, + 'title': title, + 'content': content, + 'date': s.get('date', ''), + }) + logger.info(f"소스 크롤링: {title[:40]} ({real_url[:60]})") + + updated = dict(topic_data) + updated['sources'] = enriched_sources + if enriched_sources: + updated['source_url'] = enriched_sources[0]['url'] + updated['source_name'] = enriched_sources[0]['title'] + return updated + + def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''): logger.info(f"글 작성 요청: {topic_data.get('topic', '')}") sys.path.insert(0, str(BASE_DIR)) @@ -240,6 +330,9 @@ def _call_openclaw(topic_data: dict, output_path: Path, direction: str = ''): from engine_loader import EngineLoader from article_parser import parse_output + # idea/manual 소스: 글 작성 전 실제 기사 내용 크롤링 + topic_data = _fetch_sources_content(topic_data) + system, prompt = _build_openclaw_prompt(topic_data) if direction: prompt += f"\n\n운영자 지시사항: {direction}"