fix: 발행 완료된 글감을 /collect, /write 목록에서 제외

- publish() 성공 시 topics/ 에서 해당 topic 파일 자동 삭제
- /write, /collect 목록에 발행 제목 유사도 80% 필터 추가
- _load_published_titles(), _filter_unpublished() 헬퍼 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-31 08:55:27 +09:00
parent e1fb6c954a
commit 1bdd212639
2 changed files with 61 additions and 0 deletions
@@ -547,6 +547,22 @@ def log_published(article: dict, post_result: dict):
    return record


+def _cleanup_published_topic(article: dict):
+    """Delete the topic file(s) in topics/ that belong to a published article.
+
+    The lookup key is the first 8 hex digits of the MD5 of the article's
+    'topic' value (falling back to 'title'). Every file in topics/ whose
+    name ends with ``_<key>.json`` is removed.
+
+    Deletion errors are logged, not raised, so they do not interrupt the
+    caller.
+
+    Args:
+        article: Published article dict; its 'topic' and 'title' keys are read.
+    """
+    import hashlib
+    topic_text = article.get('topic', '') or article.get('title', '')
+    # A missing or non-string value cannot be hashed into a file id; bail out
+    # instead of raising from .encode() in the middle of the publish flow.
+    if not topic_text or not isinstance(topic_text, str):
+        return
+    # NOTE(review): assumes topic files are named with md5(topic)[:8] by the
+    # code that writes topics/ -- confirm against the collector.
+    topic_id = hashlib.md5(topic_text.encode()).hexdigest()[:8]
+    topics_dir = DATA_DIR / 'topics'
+    for f in topics_dir.glob(f'*_{topic_id}.json'):
+        try:
+            f.unlink()
+        except OSError as e:
+            # Logged at warning level so a leftover topic file is visible
+            # in normal logs rather than only with debug logging enabled.
+            logger.warning(f"topic 파일 삭제 실패: {e}")
+        else:
+            logger.info(f"발행 완료 topic 파일 삭제: {f.name}")
+
+
 def save_pending_review(article: dict, reason: str):
    """수동 검토 대기 글 저장"""
    pending_dir = DATA_DIR / 'pending_review'
@@ -617,6 +633,9 @@ def publish(article: dict) -> bool:
    # 발행 이력 저장
    log_published(article, post_result)

+    # 발행 완료된 topic 파일 정리
+    _cleanup_published_topic(article)
+
    # Telegram 알림
    title = article.get('title', '')
    corner = article.get('corner', '')
@@ -734,6 +734,9 @@ async def cmd_collect(update: Update, context: ContextTypes.DEFAULT_TYPE):
        topics_dir = DATA_DIR / 'topics'
        today = datetime.now().strftime('%Y%m%d')
        files = sorted(topics_dir.glob(f'{today}_*.json'))
+        # 이미 발행된 글감 제외
+        published_titles = _load_published_titles()
+        files = _filter_unpublished(files, published_titles)
        if not files:
            await update.message.reply_text("✅ 수집 완료! 오늘 수집된 글감이 없습니다.")
            return
@@ -759,10 +762,49 @@ async def cmd_collect(update: Update, context: ContextTypes.DEFAULT_TYPE):
        await update.message.reply_text(f"❌ 수집 오류: {e}")


+def _load_published_titles() -> set[str]:
+    """Collect the titles of published articles for fast filtering.
+
+    Reads every ``*.json`` file in DATA_DIR/published and gathers its
+    'title' field. Files that cannot be read or parsed, records that are
+    not JSON objects, and titles that are empty or not strings are skipped.
+
+    Returns:
+        A set of non-empty title strings; empty when the published
+        directory does not exist.
+    """
+    titles: set[str] = set()
+    published_dir = DATA_DIR / 'published'
+    if not published_dir.exists():
+        return titles
+    for f in published_dir.glob('*.json'):
+        try:
+            data = json.loads(f.read_text(encoding='utf-8'))
+        except (OSError, ValueError):
+            # Unreadable or malformed record: JSONDecodeError and
+            # UnicodeDecodeError are both ValueError subclasses. One bad
+            # file must not prevent the remaining titles from loading.
+            continue
+        title = data.get('title') if isinstance(data, dict) else None
+        # Only real, non-empty strings are usable for similarity matching;
+        # a null or empty title would make the downstream comparison fail
+        # or match topic files that have no topic text.
+        if isinstance(title, str) and title:
+            titles.add(title)
+    return titles
+
+
+def _filter_unpublished(files: list, published_titles: set,
+                        threshold: float = 0.8) -> list:
+    """Return the topic files whose topic has not been published yet.
+
+    A file is dropped when its 'topic' text has a difflib similarity ratio
+    of at least ``threshold`` with any published title. Files that cannot
+    be read or parsed, and files without a usable topic string, are kept.
+
+    Args:
+        files: Topic file paths (objects providing ``read_text``).
+        published_titles: Titles of already published articles. Entries
+            that are empty or not strings are ignored.
+        threshold: Minimum similarity ratio (0.0-1.0) that counts as
+            "already published". Defaults to 0.8.
+
+    Returns:
+        The kept files, in their original order.
+    """
+    from difflib import SequenceMatcher
+    # Sanitise once: a non-string title would raise inside SequenceMatcher,
+    # and an empty title scores 1.0 against an empty topic.
+    titles = [t for t in published_titles if isinstance(t, str) and t]
+
+    def _already_published(topic: str) -> bool:
+        for title in titles:
+            # Argument order (topic, title) is kept because ratio() may
+            # depend on it.
+            matcher = SequenceMatcher(None, topic, title)
+            # real_quick_ratio() and quick_ratio() are cheap upper bounds
+            # on ratio(), so they only skip pairs that cannot reach the
+            # threshold; the outcome equals a plain ratio() check.
+            if (matcher.real_quick_ratio() >= threshold
+                    and matcher.quick_ratio() >= threshold
+                    and matcher.ratio() >= threshold):
+                return True
+        return False
+
+    result = []
+    for f in files:
+        try:
+            data = json.loads(f.read_text(encoding='utf-8'))
+        except (OSError, ValueError):
+            # Unreadable or malformed topic file: keep it listed rather
+            # than hiding it silently.
+            result.append(f)
+            continue
+        topic = data.get('topic', '') if isinstance(data, dict) else ''
+        # Without a non-empty topic string there is nothing to compare,
+        # so the file is treated as unpublished.
+        if not isinstance(topic, str) or not topic:
+            result.append(f)
+        elif not _already_published(topic):
+            result.append(f)
+    return result
+
+
 async def cmd_write(update: Update, context: ContextTypes.DEFAULT_TYPE):
    topics_dir = DATA_DIR / 'topics'
    today = datetime.now().strftime('%Y%m%d')
    files = sorted(topics_dir.glob(f'{today}_*.json'))
+    # 이미 발행된 글감 제외
+    published_titles = _load_published_titles()
+    files = _filter_unpublished(files, published_titles)
    if not files:
        await update.message.reply_text("오늘 수집된 글감이 없습니다. /collect 먼저 실행하세요.")
        return