diff --git a/bots/quality/__init__.py b/bots/quality/__init__.py
new file mode 100644
index 0000000..baadedb
--- /dev/null
+++ b/bots/quality/__init__.py
@@ -0,0 +1,17 @@
+"""
+bots/quality
+Quality signal computation for shorts content.
+
+V3.0 signals:
+  - motion_variation_score
+  - script_diversity_score
+  - tts_cost_efficiency
+
+V3.1+ additions:
+  - semantic_visual_score
+  - caption_overlap_score
+  - pacing_variation_score
+"""
+from .micro_signals import compute_signal, check_and_act, SIGNALS_V1
+
+__all__ = ['compute_signal', 'check_and_act', 'SIGNALS_V1']
diff --git a/bots/quality/micro_signals.py b/bots/quality/micro_signals.py
new file mode 100644
index 0000000..3d103fe
--- /dev/null
+++ b/bots/quality/micro_signals.py
@@ -0,0 +1,235 @@
+"""
+bots/quality/micro_signals.py
+Micro-failure quality signals for shorts content.
+
+V3.0 scope: 3 signals
+  - motion_variation_score: detects repetitive motion patterns
+  - script_diversity_score: detects structural overlap with recent scripts
+  - tts_cost_efficiency: monitors TTS credit usage
+
+Each signal returns a float 0.0-1.0. Its direction is given by the signal's
+'higher_is_better' flag in SIGNALS_V1:
+  - True:  1.0 = no issue; the action triggers when value < threshold
+  - False: 0.0 = no issue; the action triggers when value > threshold
+"""
+import logging
+from pathlib import Path
+from typing import Callable, Any
+
+logger = logging.getLogger(__name__)
+
+SIGNALS_V1 = {
+    'motion_variation_score': {
+        'description': 'Consecutive clips using same motion pattern',
+        'threshold': 0.6,
+        'action': 'auto_fix',  # pick different pattern automatically
+        'higher_is_better': True,
+    },
+    'script_diversity_score': {
+        'description': 'Script structure overlap with last 7 days',
+        'threshold': 0.5,
+        'action': 'regenerate',  # request different structure from LLM
+        'higher_is_better': True,
+    },
+    'tts_cost_efficiency': {
+        'description': 'TTS credit usage vs monthly limit',
+        'threshold': 0.8,
+        'action': 'switch_engine',  # downgrade to local TTS
+        'higher_is_better': False,  # lower usage = better
+    },
+}
+
+
+def _neutral_value(signal_name: str) -> float:
+    """Return the value that cannot trigger the action of a known signal."""
+    higher_is_better = SIGNALS_V1[signal_name].get('higher_is_better', True)
+    return 1.0 if higher_is_better else 0.0
+
+
+def compute_signal(signal_name: str, **kwargs) -> float:
+    """
+    Compute a quality signal value.
+
+    Args:
+        signal_name: One of SIGNALS_V1 keys
+        **kwargs: Signal-specific inputs (see individual compute functions)
+
+    Returns: float 0.0-1.0. If the compute function raises, the failure is
+        logged and the signal's non-triggering value is returned instead
+        (1.0 when higher is better, 0.0 otherwise).
+
+    Raises: ValueError if signal_name unknown
+    """
+    if signal_name not in SIGNALS_V1:
+        raise ValueError(f'Unknown signal: {signal_name}. Available: {list(SIGNALS_V1.keys())}')
+
+    compute_fns = {
+        'motion_variation_score': _compute_motion_variation,
+        'script_diversity_score': _compute_script_diversity,
+        'tts_cost_efficiency': _compute_tts_cost_efficiency,
+    }
+
+    fn = compute_fns[signal_name]
+    try:
+        value = fn(**kwargs)
+        logger.debug(f'[품질] {signal_name} = {value:.3f}')
+        return value
+    except Exception as e:
+        # Best effort: a failed computation must not trigger the action. The
+        # neutral value depends on direction; a fixed 1.0 would trip signals
+        # where lower is better (e.g. tts_cost_efficiency).
+        logger.warning(f'[품질] 신호 계산 실패 ({signal_name}): {e}')
+        return _neutral_value(signal_name)
+
+
+def check_and_act(signal_name: str, value: float) -> dict:
+    """
+    Check if signal value crosses threshold and return action.
+
+    Args:
+        signal_name: One of SIGNALS_V1 keys; unknown names never trigger
+        value: Signal value, e.g. as returned by compute_signal
+
+    Returns: {
+        'triggered': bool,
+        'action': str or None,
+        'value': float,
+        'threshold': float,
+    }
+    """
+    if signal_name not in SIGNALS_V1:
+        return {'triggered': False, 'action': None, 'value': value, 'threshold': 0}
+
+    config = SIGNALS_V1[signal_name]
+    threshold = config['threshold']
+    higher_is_better = config.get('higher_is_better', True)
+
+    # A value exactly at the threshold does not trigger in either direction
+    if higher_is_better:
+        triggered = value < threshold
+    else:
+        triggered = value > threshold
+
+    return {
+        'triggered': triggered,
+        'action': config['action'] if triggered else None,
+        'value': value,
+        'threshold': threshold,
+    }
+
+
+def _compute_motion_variation(clips: list, **kwargs) -> float:
+    """
+    Compute motion variation score.
+
+    Args:
+        clips: list of dicts with 'pattern' key, e.g. [{'pattern': 'ken_burns_in'}, ...]
+            Clips with a missing or empty pattern are ignored.
+
+    Returns: 0.0-1.0 diversity score
+        (unique-pattern ratio scaled down by the share of adjacent repeats)
+    """
+    if not clips or len(clips) < 2:
+        return 1.0
+
+    patterns = [c.get('pattern', '') for c in clips if c.get('pattern')]
+    if not patterns:
+        return 1.0
+
+    # Count consecutive same-pattern pairs
+    consecutive_same = sum(
+        1 for prev, cur in zip(patterns, patterns[1:])
+        if prev == cur
+    )
+
+    # Unique patterns ratio
+    unique_ratio = len(set(patterns)) / len(patterns)
+    consecutive_penalty = consecutive_same / max(len(patterns) - 1, 1)
+
+    score = unique_ratio * (1 - consecutive_penalty)
+    return round(min(1.0, max(0.0, score)), 3)
+
+
+def _compute_script_diversity(script: dict, history: list = None, **kwargs) -> float:
+    """
+    Compute script structure diversity vs recent history.
+
+    Args:
+        script: Current script dict with 'hook', 'body', 'closer'
+        history: List of recent scripts (last 7 days), each same format
+
+    Returns: 0.0-1.0 diversity score (1.0 = very diverse)
+    """
+    if not history:
+        return 1.0
+
+    # Compare script structure fingerprints ('closer' is not part of it)
+    def _fingerprint(s: dict) -> tuple:
+        # "or" also covers keys that are present but None
+        hook = s.get('hook') or ''
+        body = s.get('body') or []
+        return (
+            len(hook) // 10,  # rough length bucket
+            len(body),  # number of body sentences
+            hook[:5],  # hook start
+        )
+
+    current_fp = _fingerprint(script)
+
+    overlaps = sum(
+        1 for h in history
+        if _fingerprint(h) == current_fp
+    )
+
+    overlap_rate = overlaps / len(history)
+    return round(1.0 - overlap_rate, 3)
+
+
+def _compute_tts_cost_efficiency(usage: float, limit: float, **kwargs) -> float:
+    """
+    Compute TTS cost efficiency.
+
+    Args:
+        usage: Characters used this period
+        limit: Monthly/daily character limit
+
+    Returns: ratio (usage/limit) clamped to 0.0-1.0, where > threshold
+        triggers engine switch. A non-positive limit yields 0.0.
+    """
+    if limit <= 0:
+        return 0.0
+    # Clamp both ends so a negative usage cannot escape the 0.0-1.0 range
+    return round(max(0.0, min(1.0, usage / limit)), 3)
+
+
+# ── Standalone test ──────────────────────────────────────────────
+
+if __name__ == '__main__':
+    import sys
+    if '--test' in sys.argv:
+        print("=== Micro Signals Test ===")
+
+        # Test motion variation
+        test_clips = [
+            {'pattern': 'ken_burns_in'},
+            {'pattern': 'ken_burns_in'},  # repeat!
+            {'pattern': 'pan_left'},
+            {'pattern': 'pan_right'},
+        ]
+        mv = compute_signal('motion_variation_score', clips=test_clips)
+        result = check_and_act('motion_variation_score', mv)
+        print(f"motion_variation_score = {mv:.3f} (triggered: {result['triggered']}, action: {result['action']})")
+
+        # Test script diversity
+        current_script = {'hook': '이거 모르면 손해', 'body': ['첫째', '둘째', '셋째'], 'closer': '구독'}
+        history = [
+            {'hook': '이거 모르면 손해2', 'body': ['a', 'b', 'c'], 'closer': '팔로우'},
+        ]
+        sd = compute_signal('script_diversity_score', script=current_script, history=history)
+        result2 = check_and_act('script_diversity_score', sd)
+        print(f"script_diversity_score = {sd:.3f} (triggered: {result2['triggered']})")
+
+        # Test TTS cost
+        tce = compute_signal('tts_cost_efficiency', usage=8500, limit=10000)
+        result3 = check_and_act('tts_cost_efficiency', tce)
+        print(f"tts_cost_efficiency = {tce:.3f} (triggered: {result3['triggered']}, action: {result3['action']})")
diff --git a/bots/shorts/hook_optimizer.py b/bots/shorts/hook_optimizer.py
new file mode 100644
index 0000000..c4a36b8
--- /dev/null
+++ b/bots/shorts/hook_optimizer.py
@@ -0,0 +1,300 @@
+"""
+bots/shorts/hook_optimizer.py
+Hook text quality scoring and optimization.
+
+HookOptimizer:
+  - score(hook): 0-100 quality score based on pattern match + keyword strength
+  - optimize(hook, article, max_attempts): regenerate if score < 70
+
+V3.0 scope: pattern matching + LLM regeneration via existing writer_bot
+"""
+import logging
+import re
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# Hook patterns mapped to template strings with {N} placeholder for numbers
+HOOK_PATTERNS = {
+    'disbelief': [
+        '이거 모르면 손해',
+        '이게 무료라고?',
+        '이걸 아직도 모른다고?',
+        '믿기 힘들지만 사실입니다',
+        '실화입니다',
+    ],
+    'warning': [
+        '절대 하지 마세요',
+        '이것만은 피하세요',
+        '지금 당장 멈추세요',
+        '알면 충격받을 수 있습니다',
+    ],
+    'number': [
+        '단 {N}초면',
+        '{N}%가 모르는',
+        '{N}가지 방법',
+        '{N}배 빠른',
+        '상위 {N}%',
+    ],
+    'question': [
+        '왜 아무도 안 알려줄까?',
+        '진짜일까?',
+        '이게 가능한 이유',
+        '어떻게 하는 걸까?',
+    ],
+    'urgency': [
+        '지금 당장',
+        '오늘 안에',
+        '지금 안 보면 후회',
+        '당장 시작해야 하는 이유',
+    ],
+}
+
+# High-value keywords that boost score (Korean viral hook words)
+HIGH_VALUE_KEYWORDS = [
+    '무료', '공짜', '비밀', '충격', '실화', '진짜', '불법',
+    '모르는', '숨겨진', '알려지지 않은', '믿기지 않는', '손해',
+    '당장', '지금', '반드시', '절대', '꼭', '필수',
+    '돈', '수익', '수입', '부자', '성공', '자유',
+    '초보', '누구나', '쉬운', '간단한',
+]
+
+# Weak words that reduce score
+WEAK_KEYWORDS = [
+    '알아보겠습니다', '살펴보겠습니다', '설명드리겠습니다',
+    '안녕하세요', '오늘은', '이번에는', '먼저',
+]
+
+
+class HookOptimizer:
+    """
+    Scores and optimizes hook text for shorts videos.
+
+    Score = pattern_score (0-50) + keyword_score (0-30) + length_score (0-20)
+    Threshold: 70 — below this triggers regeneration
+
+    Scoring never changes state. A pattern counts as recently used once
+    optimize() returns a hook that fully matches it, or after mark_used().
+    """
+
+    def __init__(self, threshold: int = 70):
+        self.threshold = threshold
+        self._recently_used_patterns: list[str] = []  # avoid repetition
+
+    def score(self, hook: str) -> int:
+        """
+        Score a hook text from 0-100.
+
+        Components:
+          - pattern_score (0-50): does it match a known viral pattern?
+          - keyword_score (0-30): does it contain high-value keywords?
+          - length_score (0-20): optimal length (15-30 chars = max)
+
+        The recently-used pattern list is read here but never modified, so
+        scoring the same hook twice gives the same result.
+        """
+        if not hook:
+            return 0
+
+        pattern_score = self._score_pattern(hook)
+        keyword_score = self._score_keywords(hook)
+        length_score = self._score_length(hook)
+
+        total = min(100, pattern_score + keyword_score + length_score)
+        return total
+
+    def mark_used(self, hook: str) -> None:
+        """
+        Record the pattern of a hook that was actually selected.
+
+        Only full template matches are recorded; they lower the pattern score
+        of later hooks that reuse the same pattern category.
+        """
+        pattern_name, is_full = self._match_pattern(hook or '')
+        if is_full:
+            self._recently_used_patterns.append(pattern_name)
+            # Only the last 3 entries are ever read, so drop older ones
+            del self._recently_used_patterns[:-3]
+
+    def optimize(
+        self,
+        hook: str,
+        article: dict,
+        max_attempts: int = 3,
+        llm_fn=None,
+    ) -> str:
+        """
+        Score hook. If score < threshold, regenerate up to max_attempts times.
+
+        Args:
+            hook: Initial hook text
+            article: Article dict with keys: title, body, corner, key_points
+            max_attempts: Max regeneration attempts
+            llm_fn: Optional callable(prompt) -> str for LLM regeneration.
+                If None, returns original hook (LLM not available).
+
+        Returns: Best hook found (may still be below threshold if all attempts fail).
+            The returned hook is passed to mark_used() before returning.
+        """
+        hook = hook or ''  # tolerate None so the slicing below cannot fail
+        current = hook
+        current_score = self.score(hook)
+        best = hook
+        best_score = current_score
+
+        logger.info(f'[훅] 초기 점수: {best_score}/100 — "{hook[:30]}..."')
+
+        if best_score >= self.threshold:
+            self.mark_used(hook)
+            return hook
+
+        if llm_fn is None:
+            logger.warning(f'[훅] 점수 부족 ({best_score}/100) — LLM 없음, 원본 사용')
+            self.mark_used(hook)
+            return hook
+
+        for attempt in range(max_attempts):
+            # Report the score of the hook shown in the prompt, not the best so far
+            prompt = self._build_regeneration_prompt(current, article, current_score)
+
+            try:
+                reply = llm_fn(prompt)
+                # Take first line; ''.split('\n') still yields one (empty) item
+                new_hook = (reply or '').strip().split('\n')[0].strip()
+                if new_hook:
+                    new_score = self.score(new_hook)
+                    logger.info(f'[훅] 시도 {attempt+1}: {new_score}/100 — "{new_hook[:30]}"')
+
+                    if new_score > best_score:
+                        best = new_hook
+                        best_score = new_score
+
+                    if best_score >= self.threshold:
+                        break
+
+                    current = new_hook
+                    current_score = new_score
+            except Exception as e:
+                # A failing LLM call is treated as unavailable: stop retrying
+                logger.warning(f'[훅] LLM 재생성 실패 (시도 {attempt+1}): {e}')
+                break
+
+        logger.info(f'[훅] 최종 점수: {best_score}/100 — "{best[:30]}"')
+        self.mark_used(best)
+        return best
+
+    def _match_pattern(self, hook: str) -> tuple:
+        """
+        Find the best HOOK_PATTERNS match for hook without changing state.
+
+        Returns: (pattern_name, is_full_match). The first full match wins over
+            any partial match; (None, False) when nothing matches.
+        """
+        partial_name = None
+        for pattern_name, templates in HOOK_PATTERNS.items():
+            for template in templates:
+                # Full match: the template with each {N} filled in by digits
+                pattern_re = re.escape(template).replace(r'\{N\}', r'\d+')
+                if re.search(pattern_re, hook):
+                    return pattern_name, True
+                # Partial match: the template text without its number slot
+                core = template.replace('{N}', '').strip()
+                if partial_name is None and len(core) > 3 and core in hook:
+                    partial_name = pattern_name
+        return partial_name, False
+
+    def _score_pattern(self, hook: str) -> int:
+        """
+        Check if hook matches known viral patterns. Max 50 points.
+
+        Returns: 50 for a full match, 30 for a full match whose pattern is
+            among the last 3 recently used, 35 for a partial match, else 0.
+        """
+        pattern_name, is_full = self._match_pattern(hook)
+        if pattern_name is None:
+            return 0
+        if not is_full:
+            return 35
+        # Recently used pattern gets reduced score
+        if pattern_name in self._recently_used_patterns[-3:]:
+            return 30
+        return 50
+
+    def _score_keywords(self, hook: str) -> int:
+        """Score based on high-value/weak keywords. Max 30 points."""
+        score = 0
+        for kw in HIGH_VALUE_KEYWORDS:
+            if kw in hook:
+                score += 10
+            if score >= 30:
+                break
+
+        # Penalize weak words
+        for kw in WEAK_KEYWORDS:
+            if kw in hook:
+                score -= 15
+
+        return max(0, min(30, score))
+
+    def _score_length(self, hook: str) -> int:
+        """Score based on hook length. Max 20 points. Optimal: 15-30 chars."""
+        length = len(hook)
+        if 15 <= length <= 30:
+            return 20
+        elif 10 <= length < 15 or 30 < length <= 40:
+            return 10
+        elif length < 10:
+            return 5
+        else:  # > 40 chars
+            return 0
+
+    def _build_regeneration_prompt(self, hook: str, article: dict, current_score: int) -> str:
+        """Build LLM prompt for hook regeneration."""
+        title = article.get('title', '')
+        corner = article.get('corner', '')
+        key_points = article.get('key_points', [])
+        recently_used = ', '.join(self._recently_used_patterns[-3:]) if self._recently_used_patterns else '없음'
+
+        points_str = '\n'.join(f'- {p}' for p in key_points[:3]) if key_points else ''
+
+        return f"""다음 쇼츠 영상의 훅 텍스트를 개선해주세요.
+
+현재 훅: {hook}
+현재 점수: {current_score}/100 (기준: {self.threshold}점 이상)
+
+콘텐츠 정보:
+- 제목: {title}
+- 코너: {corner}
+- 핵심 포인트: {points_str}
+
+요구사항:
+1. 15-30자 이내
+2. 다음 패턴 중 하나 사용: 충격/의심/경고/숫자/긴급
+3. 최근 사용된 패턴 제외: {recently_used}
+4. 한국어로 작성
+5. 훅 텍스트만 출력 (설명 없이)
+
+개선된 훅:"""
+
+
+# ── Standalone test ──────────────────────────────────────────────
+
+if __name__ == '__main__':
+    import sys
+    if '--test' in sys.argv:
+        optimizer = HookOptimizer()
+        test_hooks = [
+            '이거 모르면 손해입니다!',
+            '안녕하세요 오늘은 AI에 대해 설명드리겠습니다',
+            '100%가 모르는 무료 도구',
+            '지금 당장 이것만은 절대 하지 마세요',
+            '어',
+        ]
+        print("=== Hook Optimizer Test ===")
+        for hook in test_hooks:
+            s = optimizer.score(hook)
+            print(f'점수 {s:3d}/100: "{hook}"')
+        print()
+        print("Pattern test:")
+        for category in HOOK_PATTERNS:
+            print(f"  {category}: {len(HOOK_PATTERNS[category])} patterns")