fix: 발행 완료된 글감을 /collect, /write 목록에서 제외

- publish() 성공 시 topics/ 에서 해당 topic 파일 자동 삭제
- /write, /collect 목록에 발행 제목 유사도 80% 필터 추가
- _load_published_titles(), _filter_unpublished() 헬퍼 추가

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-31 08:55:27 +09:00
parent e1fb6c954a
commit 1bdd212639
2 changed files with 61 additions and 0 deletions
@@ -547,6 +547,22 @@ def log_published(article: dict, post_result: dict):
    return record
 def _cleanup_published_topic(article: dict):
    """Remove the topic file(s) in ``topics/`` that belong to a published article.

    The lookup key is the article's ``topic`` value, falling back to ``title``
    when ``topic`` is missing or empty. Files are matched by name: any
    ``*_<id>.json`` in ``DATA_DIR / 'topics'`` where ``<id>`` is the first
    8 hex characters of the MD5 digest of that text.
    NOTE(review): presumably the collector names topic files with the same
    hash scheme -- confirm against the code that writes them.

    Deletion is best effort: a failure to remove a file is logged at debug
    level and does not propagate to the caller.

    Args:
        article: Published article data; only ``topic`` / ``title`` are read.
    """
    import hashlib

    topics_dir = DATA_DIR / 'topics'
    source_text = article.get('topic', '') or article.get('title', '')
    if source_text:
        digest = hashlib.md5(source_text.encode()).hexdigest()
        id_suffix = digest[:8]
        for topic_file in topics_dir.glob(f'*_{id_suffix}.json'):
            try:
                topic_file.unlink()
                logger.info(f"발행 완료 topic 파일 삭제: {topic_file.name}")
            except Exception as err:
                # Cleanup must never break the publish flow; just record it.
                logger.debug(f"topic 파일 삭제 실패: {err}")
 def save_pending_review(article: dict, reason: str):
    """수동 검토 대기 글 저장"""
    pending_dir = DATA_DIR / 'pending_review'
@@ -617,6 +633,9 @@ def publish(article: dict) -> bool:
    # 발행 이력 저장
    log_published(article, post_result)
    # 발행 완료된 topic 파일 정리
    _cleanup_published_topic(article)
    # Telegram 알림
    title = article.get('title', '')
    corner = article.get('corner', '')
@@ -734,6 +734,9 @@ async def cmd_collect(update: Update, context: ContextTypes.DEFAULT_TYPE):
        topics_dir = DATA_DIR / 'topics'
        today = datetime.now().strftime('%Y%m%d')
        files = sorted(topics_dir.glob(f'{today}_*.json'))
        # 이미 발행된 글감 제외
        published_titles = _load_published_titles()
        files = _filter_unpublished(files, published_titles)
        if not files:
            await update.message.reply_text("✅ 수집 완료! 오늘 수집된 글감이 없습니다.")
            return
@@ -759,10 +762,49 @@ async def cmd_collect(update: Update, context: ContextTypes.DEFAULT_TYPE):
        await update.message.reply_text(f"❌ 수집 오류: {e}")
 def _load_published_titles() -> set[str]:
    """Load the titles of already-published articles from the publish history.

    Reads every ``*.json`` file in ``DATA_DIR / 'published'`` and collects the
    value stored under its top-level ``title`` key.

    Only non-empty string titles are returned. Anything else (missing key,
    ``null``, numbers, lists, blank strings) is ignored, because the titles
    are meant for string similarity matching: a non-string value would raise
    there, and an empty title would compare as identical to an empty topic.

    Returns:
        Set of published titles. Empty when the directory does not exist or
        contains no usable record.
    """
    titles: set[str] = set()
    published_dir = DATA_DIR / 'published'
    if not published_dir.exists():
        return titles
    for record_path in published_dir.glob('*.json'):
        try:
            record = json.loads(record_path.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            # Best effort: skip records that cannot be read, decoded or parsed
            # (UnicodeDecodeError and JSONDecodeError are both ValueErrors).
            continue
        if not isinstance(record, dict):
            # A history record is expected to be a JSON object.
            continue
        title = record.get('title')
        if isinstance(title, str) and title.strip():
            titles.add(title)
    return titles
 def _filter_unpublished(files: list, published_titles: set, threshold: float = 0.8) -> list:
    """Drop topic files whose topic closely matches an already-published title.

    Each file is parsed as JSON and its ``topic`` value is compared with every
    published title using ``difflib.SequenceMatcher`` (topic as the first
    sequence, title as the second). A file is excluded when any title reaches
    a similarity ratio of at least ``threshold``.

    A file is always kept when no decision can be made: it cannot be read or
    parsed, it is not a JSON object, or its ``topic`` is missing, empty or not
    a string. Empty and non-string entries in ``published_titles`` are
    ignored, so one malformed history record cannot disable the filter and an
    empty title cannot "match" an empty topic.

    Args:
        files: Topic file paths (objects providing ``read_text``).
        published_titles: Titles of articles that were already published.
        threshold: Minimum similarity ratio (0.0-1.0) that counts as
            "already published". Defaults to 0.8.

    Returns:
        The files to keep, in their original order.
    """
    from difflib import SequenceMatcher

    # One matcher per usable title. SequenceMatcher caches its analysis of the
    # second sequence, so each title is analysed once and reused for all topics.
    matchers = []
    for title in published_titles:
        if isinstance(title, str) and title:
            matcher = SequenceMatcher(None)
            matcher.set_seq2(title)
            matchers.append(matcher)

    if not matchers:
        # Nothing to compare against: every file is kept, no need to read them.
        return list(files)

    result = []
    for f in files:
        try:
            data = json.loads(f.read_text(encoding='utf-8'))
        except (OSError, ValueError):
            # Unreadable or malformed file: keep it rather than hide it.
            result.append(f)
            continue

        topic = data.get('topic', '') if isinstance(data, dict) else ''
        if not isinstance(topic, str) or not topic:
            # No usable topic text, so it cannot be judged as published.
            result.append(f)
            continue

        is_published = False
        for matcher in matchers:
            matcher.set_seq1(topic)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); checking them first skips the expensive computation
            # for clearly different strings without changing the outcome.
            if (matcher.real_quick_ratio() >= threshold
                    and matcher.quick_ratio() >= threshold
                    and matcher.ratio() >= threshold):
                is_published = True
                break
        if not is_published:
            result.append(f)
    return result
 async def cmd_write(update: Update, context: ContextTypes.DEFAULT_TYPE):
    topics_dir = DATA_DIR / 'topics'
    today = datetime.now().strftime('%Y%m%d')
    files = sorted(topics_dir.glob(f'{today}_*.json'))
    # 이미 발행된 글감 제외
    published_titles = _load_published_titles()
    files = _filter_unpublished(files, published_titles)
    if not files:
        await update.message.reply_text("오늘 수집된 글감이 없습니다. /collect 먼저 실행하세요.")
        return