feat(v3): PR 6 — HookOptimizer + MicroSignals (3 signals)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-29 11:56:34 +09:00
parent 834577fc07
commit 0dedb0d7f8
3 changed files with 484 additions and 0 deletions
--- a/bots/quality/init.py
+++ b/bots/quality/init.py
@@ -0,0 +1,17 @@
 """
 bots/quality
 Quality signal computation for shorts content.
 V3.0 signals:
  - motion_variation_score
  - script_diversity_score
  - tts_cost_efficiency
 V3.1+ additions:
  - semantic_visual_score
  - caption_overlap_score
  - pacing_variation_score
 """
 # Re-export the signal entry point and the V1 signal registry at package level.
 from .micro_signals import compute_signal, SIGNALS_V1
 # Names exposed by a star-import of this package; other helpers in
 # micro_signals (e.g. check_and_act) are not re-exported here.
 __all__ = ['compute_signal', 'SIGNALS_V1']
--- a/bots/quality/micro_signals.py
+++ b/bots/quality/micro_signals.py
@@ -0,0 +1,215 @@
 """
 bots/quality/micro_signals.py
 Micro-failure quality signals for shorts content.
 V3.0 scope: 3 signals
  - motion_variation_score: detects repetitive motion patterns
  - script_diversity_score: detects structural overlap with recent scripts
  - tts_cost_efficiency: monitors TTS credit usage
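 Each signal returns a float 0.0-1.0. Its direction depends on the signal's
 'higher_is_better' flag in SIGNALS_V1:
  - higher_is_better=True  (motion_variation_score, script_diversity_score):
    1.0 = perfect / no issue, 0.0 = critical problem
  - higher_is_better=False (tts_cost_efficiency): the value is usage/limit,
    so a higher value is worse
  - threshold = action trigger point (value below it when higher is better,
    value above it otherwise)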
 """
 import logging
 from pathlib import Path
 from typing import Callable, Any
 logger = logging.getLogger(__name__)
 # Registry of the V3.0 quality signals, keyed by signal name.
 # Each entry carries:
 #   description       - human-readable summary of what the signal measures
 #   threshold         - trigger point compared against the signal value
 #   action            - action name reported when the threshold is crossed
 #   higher_is_better  - True: trigger when value < threshold;
 #                       False: trigger when value > threshold
 SIGNALS_V1 = dict(
    motion_variation_score=dict(
        description='Consecutive clips using same motion pattern',
        threshold=0.6,
        action='auto_fix',  # pick a different pattern automatically
        higher_is_better=True,
    ),
    script_diversity_score=dict(
        description='Script structure overlap with last 7 days',
        threshold=0.5,
        action='regenerate',  # request a different structure from the LLM
        higher_is_better=True,
    ),
    tts_cost_efficiency=dict(
        description='TTS credit usage vs monthly limit',
        threshold=0.8,
        action='switch_engine',  # downgrade to local TTS
        higher_is_better=False,  # lower usage = better
    ),
 )
 def compute_signal(signal_name: str, **kwargs) -> float:
    """
    Compute a quality signal value.

    Dispatches to the private compute function registered for `signal_name`
    and forwards `**kwargs` to it unchanged.

    Args:
        signal_name: One of SIGNALS_V1 keys
        **kwargs: Signal-specific inputs (see individual compute functions)
    Returns: float 0.0-1.0. If the compute function raises, a neutral value
        is returned instead of propagating the error: 1.0 for signals where
        higher is better, 0.0 for signals where lower is better, so that a
        failed computation never crosses the signal's threshold.
    Raises: ValueError if signal_name unknown
    """
    if signal_name not in SIGNALS_V1:
        raise ValueError(f'Unknown signal: {signal_name}. Available: {list(SIGNALS_V1.keys())}')
    compute_fns = {
        'motion_variation_score': _compute_motion_variation,
        'script_diversity_score': _compute_script_diversity,
        'tts_cost_efficiency': _compute_tts_cost_efficiency,
    }
    fn = compute_fns[signal_name]
    try:
        value = fn(**kwargs)
        logger.debug(f'[품질] {signal_name} = {value:.3f}')
        return value
    except Exception as e:
        logger.warning(f'[품질] 신호 계산 실패 ({signal_name}): {e}')
        # The neutral value must sit on the "good" side of the threshold.
        # A flat 1.0 would exceed the 0.8 threshold of lower-is-better
        # signals (tts_cost_efficiency) and trigger their action on error.
        higher_is_better = SIGNALS_V1[signal_name].get('higher_is_better', True)
        return 1.0 if higher_is_better else 0.0
 def check_and_act(signal_name: str, value: float) -> dict:
    """
    Decide whether a signal value crosses its configured threshold.

    For signals flagged higher_is_better (the default when the flag is
    missing) the action fires when value < threshold; otherwise it fires
    when value > threshold. Unknown signal names never trigger and report
    a threshold of 0.

    Returns: {
        'triggered': bool,
        'action': str or None,
        'value': float,
        'threshold': float,
    }
    """
    try:
        settings = SIGNALS_V1[signal_name]
    except KeyError:
        # Unknown signal: report "not triggered" rather than raising.
        return {'triggered': False, 'action': None, 'value': value, 'threshold': 0}
    limit = settings['threshold']
    fired = value < limit if settings.get('higher_is_better', True) else value > limit
    outcome = {'triggered': fired}
    outcome['action'] = settings['action'] if fired else None
    outcome['value'] = value
    outcome['threshold'] = limit
    return outcome
 def _compute_motion_variation(clips: list, **kwargs) -> float:
    """
    Compute motion variation score.
    Args:
        clips: list of dicts with 'pattern' key, e.g. [{'pattern': 'ken_burns_in'}, ...]
    Returns: 0.0-1.0 diversity score
    """
    if not clips or len(clips) < 2:
        return 1.0
    patterns = [c.get('pattern', '') for c in clips if c.get('pattern')]
    if not patterns:
        return 1.0
    # Count consecutive same-pattern pairs
    consecutive_same = sum(
        1 for i in range(len(patterns) - 1)
        if patterns[i] == patterns[i+1]
    )
    # Unique patterns ratio
    unique_ratio = len(set(patterns)) / len(patterns)
    consecutive_penalty = consecutive_same / max(len(patterns) - 1, 1)
    score = unique_ratio * (1 - consecutive_penalty)
    return round(min(1.0, max(0.0, score)), 3)
 def _compute_script_diversity(script: dict, history: list = None, **kwargs) -> float:
    """
    Compute script structure diversity vs recent history.
    Args:
        script: Current script dict with 'hook', 'body', 'closer'
        history: List of recent scripts (last 7 days), each same format
    Returns: 0.0-1.0 diversity score (1.0 = very diverse)
    """
    if not history:
        return 1.0
    # Compare script structure fingerprints
    def _fingerprint(s: dict) -> tuple:
        hook = s.get('hook', '')
        body = s.get('body', [])
        closer = s.get('closer', '')
        return (
            len(hook) // 10,  # rough length bucket
            len(body),         # number of body sentences
            hook[:5] if hook else '',   # hook start
        )
    current_fp = _fingerprint(script)
    overlaps = sum(
        1 for h in history
        if _fingerprint(h) == current_fp
    )
    overlap_rate = overlaps / len(history)
    return round(1.0 - overlap_rate, 3)
 def _compute_tts_cost_efficiency(usage: float, limit: float, **kwargs) -> float:
    """
    Compute TTS cost efficiency.
    Args:
        usage: Characters used this period
        limit: Monthly/daily character limit
    Returns: ratio (usage/limit), where > threshold triggers engine switch
    """
    if limit <= 0:
        return 0.0
    return round(min(1.0, usage / limit), 3)
 # ── Standalone test ──────────────────────────────────────────────
 if __name__ == '__main__':
    import sys
    # Manual smoke test: only runs when the module is executed with --test.
    if '--test' in sys.argv:
        print("=== Micro Signals Test ===")

        # Motion variation: the first two clips repeat the same pattern.
        sample_clips = [
            {'pattern': name}
            for name in ('ken_burns_in', 'ken_burns_in', 'pan_left', 'pan_right')
        ]
        motion_value = compute_signal('motion_variation_score', clips=sample_clips)
        motion_outcome = check_and_act('motion_variation_score', motion_value)
        print(f"motion_variation_score = {motion_value:.3f} (triggered: {motion_outcome['triggered']}, action: {motion_outcome['action']})")

        # Script diversity: one history entry with a near-identical hook.
        script_now = {'hook': '이거 모르면 손해', 'body': ['첫째', '둘째', '셋째'], 'closer': '구독'}
        script_history = [
            {'hook': '이거 모르면 손해2', 'body': ['a', 'b', 'c'], 'closer': '팔로우'},
        ]
        diversity_value = compute_signal('script_diversity_score', script=script_now, history=script_history)
        diversity_outcome = check_and_act('script_diversity_score', diversity_value)
        print(f"script_diversity_score = {diversity_value:.3f} (triggered: {diversity_outcome['triggered']})")

        # TTS cost: 85% of the limit used.
        tts_value = compute_signal('tts_cost_efficiency', usage=8500, limit=10000)
        tts_outcome = check_and_act('tts_cost_efficiency', tts_value)
        print(f"tts_cost_efficiency = {tts_value:.3f} (triggered: {tts_outcome['triggered']}, action: {tts_outcome['action']})")
--- a/bots/shorts/hook_optimizer.py
+++ b/bots/shorts/hook_optimizer.py
@@ -0,0 +1,252 @@
 """
 bots/shorts/hook_optimizer.py
 Hook text quality scoring and optimization.
 HookOptimizer:
  - score(hook): 0-100 quality score based on pattern match + keyword strength
  - optimize(hook, article, max_attempts): regenerate if score < 70
 V3.0 scope: pattern matching + LLM regeneration via existing writer_bot
 """
 import logging
 import re
 from typing import Optional
 logger = logging.getLogger(__name__)
 # Hook templates grouped by pattern category. A literal "{N}" inside a
 # template is a placeholder for a number.
 HOOK_PATTERNS = dict(
    disbelief=[
        '이거 모르면 손해',
        '이게 무료라고?',
        '이걸 아직도 모른다고?',
        '믿기 힘들지만 사실입니다',
        '실화입니다',
    ],
    warning=[
        '절대 하지 마세요',
        '이것만은 피하세요',
        '지금 당장 멈추세요',
        '알면 충격받을 수 있습니다',
    ],
    number=[
        '단 {N}초면',
        '{N}%가 모르는',
        '{N}가지 방법',
        '{N}배 빠른',
        '상위 {N}%',
    ],
    question=[
        '왜 아무도 안 알려줄까?',
        '진짜일까?',
        '이게 가능한 이유',
        '어떻게 하는 걸까?',
    ],
    urgency=[
        '지금 당장',
        '오늘 안에',
        '지금 안 보면 후회',
        '당장 시작해야 하는 이유',
    ],
 )
 # High-value keywords that boost score (Korean viral hook words).
 # Assembled from themed groups; the concatenation order is the lookup order.
 HIGH_VALUE_KEYWORDS = (
    # free / secret / shock / real / illegal
    ['무료', '공짜', '비밀', '충격', '실화', '진짜', '불법']
    # unknown / hidden / unbelievable / loss
    + ['모르는', '숨겨진', '알려지지 않은', '믿기지 않는', '손해']
    # right now / must / never / essential
    + ['당장', '지금', '반드시', '절대', '꼭', '필수']
    # money / income / success / freedom
    + ['돈', '수익', '수입', '부자', '성공', '자유']
    # beginner / anyone / easy / simple
    + ['초보', '누구나', '쉬운', '간단한']
 )
 # Weak words that reduce score: announcement-style verb endings first,
 # then greeting / filler openers.
 WEAK_KEYWORDS = (
    ['알아보겠습니다', '살펴보겠습니다', '설명드리겠습니다']
    + ['안녕하세요', '오늘은', '이번에는', '먼저']
 )
 class HookOptimizer:
    """
    Scores and optimizes hook text for shorts videos.

    Score = pattern_score (0-50) + keyword_score (0-30) + length_score (0-20)
    Threshold: 70 by default — a score below it triggers regeneration.

    Scoring is stateful: a full pattern match records the matched category
    in a short "recently used" window, and a later full match on a
    category inside that window scores 30 instead of 50.
    """
    def __init__(self, threshold: int = 70):
        """
        Args:
            threshold: Minimum acceptable score; optimize() regenerates
                hooks that score below it.
        """
        self.threshold = threshold
        # Pattern categories of recent full matches (only the last 3 are
        # ever consulted) — used to avoid repetition.
        self._recently_used_patterns: list[str] = []
    def score(self, hook: str) -> int:
        """
        Score a hook text from 0-100.

        Components:
        - pattern_score (0-50): does it match a known viral pattern?
        - keyword_score (0-30): does it contain high-value keywords?
        - length_score (0-20): optimal length (15-30 chars = max)

        An empty or None hook scores 0. Note that a full pattern match is
        recorded as "recently used" (see _score_pattern), so scoring the
        same hook twice can yield a lower pattern score the second time.
        """
        if not hook:
            return 0
        pattern_score = self._score_pattern(hook)
        keyword_score = self._score_keywords(hook)
        length_score = self._score_length(hook)
        return min(100, pattern_score + keyword_score + length_score)
    def optimize(
        self,
        hook: str,
        article: dict,
        max_attempts: int = 3,
        llm_fn=None,
    ) -> str:
        """
        Score hook. If score < threshold, regenerate up to max_attempts times.

        Args:
            hook: Initial hook text (None is treated as an empty hook)
            article: Article dict with keys: title, body, corner, key_points
            max_attempts: Max regeneration attempts
            llm_fn: Optional callable(prompt) -> str for LLM regeneration.
                    If None, returns original hook (LLM not available).
        Returns: Best hook found (may still be below threshold if all attempts fail)
        """
        hook = hook or ''  # tolerate None so slicing/logging below cannot fail
        best = hook
        best_score = self.score(hook)
        logger.info(f'[훅] 초기 점수: {best_score}/100 — "{hook[:30]}..."')
        if best_score >= self.threshold:
            return hook
        if llm_fn is None:
            logger.warning(f'[훅] 점수 부족 ({best_score}/100) — LLM 없음, 원본 사용')
            return hook
        # The prompt shows the latest attempt together with that attempt's
        # own score, so the hook and the number in the prompt always agree.
        current, current_score = hook, best_score
        for attempt in range(max_attempts):
            prompt = self._build_regeneration_prompt(current, article, current_score)
            try:
                response = llm_fn(prompt)
                # Keep only the first line of the reply.
                new_hook = response.strip().split('\n')[0].strip() if response else ''
                if not new_hook:
                    # Empty / whitespace-only reply: keep the previous
                    # hook as the basis for the next attempt.
                    continue
                new_score = self.score(new_hook)
                logger.info(f'[훅] 시도 {attempt+1}: {new_score}/100 — "{new_hook[:30]}"')
                if new_score > best_score:
                    best = new_hook
                    best_score = new_score
                if best_score >= self.threshold:
                    break
                current, current_score = new_hook, new_score
            except Exception as e:
                # Best effort: a failing LLM call ends the retry loop and
                # the best hook found so far is returned.
                logger.warning(f'[훅] LLM 재생성 실패 (시도 {attempt+1}): {e}')
                break
        logger.info(f'[훅] 최종 점수: {best_score}/100 — "{best[:30]}"')
        return best
    def _score_pattern(self, hook: str) -> int:
        """
        Check if hook matches known viral patterns. Max 50 points.

        Returns:
            50 — a template matches in full ({N} filled with digits) and its
                 category is not among the last 3 recorded; the category is
                 then recorded as recently used.
            30 — a template matches in full but its category was recently used.
            35 — no template matches in full, but a template's text with the
                 {N} placeholder removed (longer than 3 chars) occurs in the hook.
            0  — no match.
        """
        recent = self._recently_used_patterns[-3:]
        partial_hit = False
        for pattern_name, templates in HOOK_PATTERNS.items():
            for template in templates:
                # Check for fuzzy match (template with {N} filled in)
                pattern_re = re.escape(template).replace(r'\{N\}', r'\d+')
                if re.search(pattern_re, hook):
                    # Recently used pattern gets reduced score
                    if pattern_name in recent:
                        return 30
                    self._recently_used_patterns.append(pattern_name)
                    # Only the last 3 entries are ever read; drop the rest
                    # so the list cannot grow without bound.
                    del self._recently_used_patterns[:-3]
                    return 50
                # Partial match: remember it but keep scanning, so a full
                # match on a later template is not shadowed by this one.
                core = template.replace('{N}', '').strip()
                if len(core) > 3 and core in hook:
                    partial_hit = True
        return 35 if partial_hit else 0
    def _score_keywords(self, hook: str) -> int:
        """
        Score based on high-value/weak keywords. Max 30 points.

        +10 per high-value keyword found (capped at 30), -15 per weak
        keyword found; the result is clamped to 0-30.
        """
        score = 0
        for kw in HIGH_VALUE_KEYWORDS:
            if kw in hook:
                score += 10
                if score >= 30:
                    break
        # Penalize weak words
        for kw in WEAK_KEYWORDS:
            if kw in hook:
                score -= 15
        return max(0, min(30, score))
    def _score_length(self, hook: str) -> int:
        """Score based on hook length. Max 20 points. Optimal: 15-30 chars."""
        length = len(hook)
        if 15 <= length <= 30:
            return 20
        if 10 <= length < 15 or 30 < length <= 40:
            return 10
        if length < 10:
            return 5
        return 0  # > 40 chars
    def _build_regeneration_prompt(self, hook: str, article: dict, current_score: int) -> str:
        """
        Build LLM prompt for hook regeneration.

        Args:
            hook: Hook text to improve
            article: Article dict; 'title', 'corner' and up to the first 3
                'key_points' are included. None is treated as an empty dict.
            current_score: Score of `hook`, shown to the LLM
        Returns: Prompt text (Korean) that also states the configured
            threshold and the last 3 recently used pattern categories.
        """
        article = article or {}
        title = article.get('title', '')
        corner = article.get('corner', '')
        key_points = article.get('key_points', [])
        recently_used = ', '.join(self._recently_used_patterns[-3:]) if self._recently_used_patterns else '없음'
        points_str = '\n'.join(f'- {p}' for p in key_points[:3]) if key_points else ''
        # The pass mark in the prompt follows self.threshold, so a custom
        # threshold is communicated to the LLM instead of a fixed 70.
        return f"""다음 쇼츠 영상의 훅 텍스트를 개선해주세요.
 현재 훅: {hook}
 현재 점수: {current_score}/100 (기준: {self.threshold}점 이상)
 콘텐츠 정보:
 - 제목: {title}
 - 코너: {corner}
 - 핵심 포인트: {points_str}
 요구사항:
 1. 15-30자 이내
 2. 다음 패턴 중 하나 사용: 충격/의심/경고/숫자/긴급
 3. 최근 사용된 패턴 제외: {recently_used}
 4. 한국어로 작성
 5. 훅 텍스트만 출력 (설명 없이)
 개선된 훅:"""
 # ── Standalone test ──────────────────────────────────────────────
 if __name__ == '__main__':
    import sys
    # Manual smoke test: only runs when the module is executed with --test.
    if '--test' in sys.argv:
        sample_hooks = (
            '이거 모르면 손해입니다!',
            '안녕하세요 오늘은 AI에 대해 설명드리겠습니다',
            '100%가 모르는 무료 도구',
            '지금 당장 이것만은 절대 하지 마세요',
            '어',
        )
        scorer = HookOptimizer()
        print("=== Hook Optimizer Test ===")
        # Hooks are scored in order on one shared optimizer instance.
        for text in sample_hooks:
            points = scorer.score(text)
            print(f'점수 {points:3d}/100: "{text}"')
        print()
        print("Pattern test:")
        for category, templates in HOOK_PATTERNS.items():
            print(f"  {category}: {len(templates)} patterns")