feat: v3.2 — YouTube Shorts 봇 + 수동 어시스트 + 보안 개선

주요 추가 기능:
- bots/shorts/ 서브모듈 7개: tts_engine, script_extractor, asset_resolver, stock_fetcher, caption_renderer, video_assembler, youtube_uploader
- bots/shorts_bot.py: 6단계 Shorts 파이프라인 오케스트레이터 (auto/semi_auto 두 가지 생산 모드, CLI 지원)
- bots/writer_bot.py: 독립 실행형 AI 글쓰기 봇 (대시보드 연동)
- bots/assist_bot.py: URL 기반 수동 어시스트 파이프라인
- config/shorts_config.json: Shorts 전체 설정
- templates/shorts/extract_prompt.txt: LLM 스크립트 추출 프롬프트
- scheduler.py에 shorts 잡(10:35/16:00) + /shorts Telegram 명령 추가

보안 개선:
- .env 파일 외부 경로 참조로 변경 (load_dotenv dotenv_path, 24개 파일)
- .gitignore에 민감 파일/내부 문서/런타임 데이터 항목 추가

문서:
- README.md 전면 재작성 (상세 한글 설명, 설치/설정/사용법 포함)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 17:51:02 +09:00
parent 392c2e13f1
commit 9b44a07a44
39 changed files with 3455 additions and 641 deletions
@@ -0,0 +1,371 @@
+"""
+bots/shorts/tts_engine.py
+역할: 쇼츠 스크립트 텍스트 → 음성(WAV) + 단어별 타임스탬프(JSON) 생성
+
+엔진 우선순위 (shorts_config.json tts.engine_priority):
+  1. ElevenLabs    — 최고 품질, ELEVENLABS_API_KEY 필요
+  2. Google Cloud TTS — 중간 품질, GOOGLE_TTS_API_KEY 필요
+  3. Edge TTS      — 무료 폴백, API 키 불필요
+
+출력:
+  data/shorts/tts/{timestamp}.wav
+  data/shorts/tts/{timestamp}_timestamps.json
+    [{word: str, start: float, end: float}, ...]
+"""
+import asyncio
+import json
+import logging
+import os
+import re
+import struct
+import tempfile
+import wave
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# ─── 공통 유틸 ────────────────────────────────────────────────
+
+
+def _load_config() -> dict:
+    """Load ``config/shorts_config.json``, located relative to this file.
+
+    The path is built from the third parent of ``__file__`` plus
+    ``config/shorts_config.json``.
+
+    Returns:
+        The parsed JSON content, or an empty dict when the file is absent.
+    """
+    base_dir = Path(__file__).parent.parent.parent
+    config_file = base_dir / 'config' / 'shorts_config.json'
+    if not config_file.exists():
+        return {}
+    with config_file.open(encoding='utf-8') as fh:
+        return json.load(fh)
+
+
+def _concat_script(script: dict) -> str:
+    """Flatten a script dict into one space-separated string for narration.
+
+    The ``hook`` comes first, then every entry of ``body``, then the
+    ``closer``. Missing keys default to empty, and empty pieces are dropped
+    so no doubled spaces appear.
+    """
+    hook = script.get('hook', '')
+    body = script.get('body', [])
+    closer = script.get('closer', '')
+    return ' '.join(filter(None, [hook, *body, closer]))
+
+
+def _add_pause(wav_path: Path, pause_ms: int = 300) -> None:
+    """Append ``pause_ms`` milliseconds of silence to a WAV file, in place.
+
+    The whole file is read into memory, then rewritten with the same
+    parameters and the silence appended.
+
+    Args:
+        wav_path: WAV file to extend.
+        pause_ms: Length of the trailing silence in milliseconds. Values
+            of zero or less leave the file untouched.
+
+    Raises:
+        wave.Error / OSError: if the file cannot be read or rewritten.
+    """
+    if pause_ms <= 0:
+        # Nothing to append; avoid a pointless read/rewrite of the file.
+        return
+
+    with wave.open(str(wav_path), 'rb') as wf:
+        params = wf.getparams()
+        frames = wf.readframes(wf.getnframes())
+
+    silence_frames = int(params.framerate * pause_ms / 1000)
+    # 8-bit PCM in WAV is unsigned, so silence is the midpoint 0x80; wider
+    # sample widths are signed and silent at zero.
+    silence_byte = b'\x80' if params.sampwidth == 1 else b'\x00'
+    silence = silence_byte * (silence_frames * params.nchannels * params.sampwidth)
+
+    with wave.open(str(wav_path), 'wb') as wf:
+        wf.setparams(params)
+        wf.writeframes(frames + silence)
+
+
+def _get_wav_duration(wav_path: Path) -> float:
+    """Return the length of a WAV file in seconds (frame count / frame rate)."""
+    with wave.open(str(wav_path), 'rb') as reader:
+        frame_count = reader.getnframes()
+        frame_rate = reader.getframerate()
+        return frame_count / frame_rate
+
+
+# ─── ElevenLabs ───────────────────────────────────────────────
+
+def _tts_elevenlabs(text: str, output_path: Path, cfg: dict) -> list[dict]:
+    """
+    Synthesize ``text`` with ElevenLabs and return word-level timestamps.
+
+    Posts to the ``/v1/text-to-speech/{voice_id}/with-timestamps`` endpoint,
+    decodes the base64 audio from the response, converts it to WAV at
+    ``output_path`` and folds the character-level alignment into words.
+
+    Args:
+        text: Text to speak.
+        output_path: Destination WAV path. A sibling ``.mp3`` file is used
+            as a temporary and removed afterwards.
+        cfg: Config dict; voice settings are read from
+            ``cfg['tts']['elevenlabs']`` (voice_id, model, stability,
+            similarity_boost, speed), each with a built-in default.
+
+    Returns:
+        [{word, start, end}, ...] — empty when the response has no alignment.
+
+    Raises:
+        RuntimeError: if ELEVENLABS_API_KEY is not set or the response
+            carries no audio.
+        requests.HTTPError: on a non-2xx response.
+    """
+    api_key = os.environ.get('ELEVENLABS_API_KEY', '')
+    if not api_key:
+        raise RuntimeError('ELEVENLABS_API_KEY not set')
+
+    # Imported after the key check so a missing key fails fast without
+    # requiring the HTTP client to be importable.
+    import base64
+
+    import requests
+
+    el_cfg = cfg.get('tts', {}).get('elevenlabs', {})
+    voice_id = el_cfg.get('voice_id', 'pNInz6obpgDQGcFmaJgB')
+    model_id = el_cfg.get('model', 'eleven_multilingual_v2')
+    stability = el_cfg.get('stability', 0.5)
+    similarity = el_cfg.get('similarity_boost', 0.8)
+    speed = el_cfg.get('speed', 1.1)
+
+    url = f'https://api.elevenlabs.io/v1/text-to-speech/{voice_id}/with-timestamps'
+    headers = {'xi-api-key': api_key, 'Content-Type': 'application/json'}
+    payload = {
+        'text': text,
+        'model_id': model_id,
+        'voice_settings': {
+            'stability': stability,
+            'similarity_boost': similarity,
+            'speed': speed,
+        },
+    }
+
+    resp = requests.post(url, headers=headers, json=payload, timeout=60)
+    resp.raise_for_status()
+    data = resp.json()
+
+    # Decode the audio; `or ''` also covers an explicit JSON null.
+    audio_b64 = data.get('audio_base64') or ''
+    if not audio_b64:
+        raise RuntimeError('ElevenLabs response contained no audio')
+    audio_bytes = base64.b64decode(audio_b64)
+
+    # The payload is treated as MP3 and converted to WAV. The temporary
+    # MP3 is removed even when the conversion fails.
+    mp3_tmp = output_path.with_suffix('.mp3')
+    try:
+        mp3_tmp.write_bytes(audio_bytes)
+        _mp3_to_wav(mp3_tmp, output_path)
+    finally:
+        mp3_tmp.unlink(missing_ok=True)
+
+    # Parse the character-level alignment; tolerate a null/missing block.
+    alignment = data.get('alignment') or {}
+    chars = alignment.get('characters') or []
+    starts = alignment.get('character_start_times_seconds') or []
+    ends = alignment.get('character_end_times_seconds') or []
+
+    return _chars_to_words(chars, starts, ends)
+
+
+def _chars_to_words(chars: list, starts: list, ends: list) -> list[dict]:
+    """Fold ElevenLabs character-level timings into word-level timings.
+
+    Any whitespace character (space, newline, tab, carriage return, ...)
+    ends the current word, matching how ``str.split()`` tokenizes text.
+    A word starts at its first character's start time and ends at its last
+    character's end time; both are rounded to milliseconds.
+
+    Args:
+        chars: Characters, one per entry.
+        starts: Start time in seconds for each character.
+        ends: End time in seconds for each character.
+            The three lists are walked in lockstep; extra entries in a
+            longer list are ignored.
+
+    Returns:
+        [{word, start, end}, ...] in spoken order.
+    """
+    words = []
+    cur_word = ''
+    cur_start = 0.0
+    cur_end = 0.0
+
+    for ch, st, en in zip(chars, starts, ends):
+        if ch.isspace():
+            # Separator: close the word in progress, if any.
+            if cur_word:
+                words.append({'word': cur_word, 'start': round(cur_start, 3), 'end': round(cur_end, 3)})
+                cur_word = ''
+            continue
+        if not cur_word:
+            cur_start = st
+        cur_word += ch
+        cur_end = en
+
+    # Flush the trailing word when the text does not end in whitespace.
+    if cur_word:
+        words.append({'word': cur_word, 'start': round(cur_start, 3), 'end': round(cur_end, 3)})
+
+    return words
+
+
+def _mp3_to_wav(mp3_path: Path, wav_path: Path) -> None:
+    """Convert an MP3 file to WAV, trying pydub first and the ffmpeg CLI second.
+
+    Args:
+        mp3_path: Source MP3 file.
+        wav_path: Destination WAV file (overwritten).
+
+    Raises:
+        subprocess.CalledProcessError: if the ffmpeg fallback exits non-zero.
+        OSError: if the ffmpeg executable cannot be launched.
+    """
+    try:
+        from pydub import AudioSegment
+        AudioSegment.from_mp3(str(mp3_path)).export(str(wav_path), format='wav')
+        return
+    except Exception as e:
+        # Deliberately best-effort: pydub may be missing or unable to
+        # decode. Record why instead of swallowing it, then fall through.
+        logger.debug('pydub MP3->WAV conversion failed (%s); falling back to ffmpeg', e)
+
+    # ffmpeg fallback
+    import subprocess
+    ffmpeg = _get_ffmpeg()
+    try:
+        subprocess.run(
+            [ffmpeg, '-y', '-i', str(mp3_path), str(wav_path)],
+            check=True, capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        # stderr is captured, so surface it here; otherwise it is lost.
+        stderr = e.stderr.decode('utf-8', errors='replace').strip() if e.stderr else ''
+        logger.warning('ffmpeg MP3->WAV conversion failed: %s', stderr)
+        raise
+
+
+def _get_ffmpeg() -> str:
+    """Pick the ffmpeg executable to invoke.
+
+    Returns the ``FFMPEG_PATH`` environment variable when it is set and
+    names an existing path; otherwise the bare command ``'ffmpeg'``.
+    """
+    configured = os.environ.get('FFMPEG_PATH', '')
+    usable = bool(configured) and Path(configured).exists()
+    return configured if usable else 'ffmpeg'
+
+
+# ─── Google Cloud TTS ─────────────────────────────────────────
+
+def _tts_google_cloud(text: str, output_path: Path, cfg: dict) -> list[dict]:
+    """
+    Synthesize ``text`` with Google Cloud TTS (REST) and return word timings.
+
+    Each whitespace-separated word is preceded by an SSML ``<mark>``, and
+    the request enables SSML_MARK time pointing so the response reports
+    when each mark is reached.
+
+    Args:
+        text: Text to speak.
+        output_path: Destination path; the decoded ``audioContent`` bytes
+            are written there unchanged.
+            NOTE(review): this relies on the LINEAR16 payload already being
+            a complete WAV file — confirm against the API reference.
+        cfg: Config dict; ``voice_name`` and ``speaking_rate`` are read
+            from ``cfg['tts']['google_cloud']`` with built-in defaults.
+
+    Returns:
+        [{word, start, end}, ...]
+
+    Raises:
+        RuntimeError: if GOOGLE_TTS_API_KEY is not set.
+        requests.HTTPError: on a non-2xx response.
+        KeyError: if the response has no ``audioContent``.
+    """
+    api_key = os.environ.get('GOOGLE_TTS_API_KEY', '')
+    if not api_key:
+        raise RuntimeError('GOOGLE_TTS_API_KEY not set')
+
+    # Imported after the key check so a missing key fails fast.
+    import base64
+    from xml.sax.saxutils import escape
+
+    import requests
+
+    gc_cfg = cfg.get('tts', {}).get('google_cloud', {})
+    voice_name = gc_cfg.get('voice_name', 'ko-KR-Neural2-C')
+    speaking_rate = gc_cfg.get('speaking_rate', 1.1)
+    # The language code is the first two dash-separated parts of the voice
+    # name ("ko-KR-Neural2-C" -> "ko-KR"). Splitting rather than slicing
+    # five characters also handles three-letter codes such as "cmn-CN".
+    language_code = '-'.join(voice_name.split('-')[:2])
+
+    # SSML: one mark per word. Words are XML-escaped so '&', '<' and '>'
+    # in the script cannot break the document.
+    words = text.split()
+    ssml_text = ' '.join(
+        f'<mark name="w{i}"/>{escape(w)}' for i, w in enumerate(words)
+    )
+    ssml = f'<speak>{ssml_text}<mark name="end"/></speak>'
+
+    # The key travels in a header, not the query string, so it does not
+    # end up in the URL embedded in HTTP error messages that get logged.
+    url = 'https://texttospeech.googleapis.com/v1beta1/text:synthesize'
+    headers = {'X-Goog-Api-Key': api_key}
+    payload = {
+        'input': {'ssml': ssml},
+        'voice': {'languageCode': language_code, 'name': voice_name},
+        'audioConfig': {
+            'audioEncoding': 'LINEAR16',
+            'speakingRate': speaking_rate,
+            'sampleRateHertz': 44100,
+        },
+        'enableTimePointing': ['SSML_MARK'],
+    }
+
+    resp = requests.post(url, headers=headers, json=payload, timeout=60)
+    resp.raise_for_status()
+    data = resp.json()
+
+    audio_bytes = base64.b64decode(data['audioContent'])
+    output_path.write_bytes(audio_bytes)
+
+    # Map mark time points back onto the original (unescaped) words.
+    timepoints = data.get('timepoints') or []
+    return _gcloud_marks_to_words(words, timepoints)
+
+
+def _gcloud_marks_to_words(words: list[str], timepoints: list[dict]) -> list[dict]:
+    """Turn Google Cloud TTS mark time points into per-word {word, start, end}.
+
+    Word ``i`` starts at mark ``w{i}`` and ends at mark ``w{i+1}``; the last
+    word ends at the ``end`` mark. Times are rounded to milliseconds.
+
+    Incomplete data is tolerated:
+      * a time point without ``timeSeconds`` counts as 0.0,
+      * a word whose start mark is missing starts where the previous word
+        ended,
+      * an end is never earlier than its start (covers a missing ``end``
+        or next-word mark).
+
+    Args:
+        words: The words, in the order their marks were inserted.
+        timepoints: Dicts with ``markName`` and ``timeSeconds``.
+
+    Returns:
+        One dict per entry of ``words``.
+    """
+    mark_map = {
+        tp['markName']: float(tp.get('timeSeconds', 0.0))
+        for tp in timepoints
+        if 'markName' in tp
+    }
+    total_dur = mark_map.get('end', 0.0)
+
+    result = []
+    prev_end = 0.0
+    for i, w in enumerate(words):
+        start = mark_map.get(f'w{i}', prev_end)
+        # Clamp so a missing mark cannot produce a negative-length word.
+        end = max(mark_map.get(f'w{i+1}', total_dur), start)
+        result.append({'word': w, 'start': round(start, 3), 'end': round(end, 3)})
+        prev_end = end
+    return result
+
+
+# ─── Edge TTS + Whisper ───────────────────────────────────────
+
+def _tts_edge(text: str, output_path: Path, cfg: dict) -> list[dict]:
+    """
+    Synthesize ``text`` with Edge TTS, then extract word timings via Whisper.
+
+    Edge TTS saves an MP3 next to ``output_path``; it is converted to WAV
+    and removed, and the WAV is passed to ``_whisper_timestamps``.
+
+    Args:
+        text: Text to speak.
+        output_path: Destination WAV path.
+        cfg: Config dict; ``voice`` and ``rate`` are read from
+            ``cfg['tts']['edge_tts']`` with built-in defaults.
+
+    Returns:
+        [{word, start, end}, ...] — may be empty when no timings could be
+        extracted.
+    """
+    import edge_tts
+
+    edge_cfg = cfg.get('tts', {}).get('edge_tts', {})
+    voice = edge_cfg.get('voice', 'ko-KR-SunHiNeural')
+    rate = edge_cfg.get('rate', '+10%')
+
+    mp3_tmp = output_path.with_suffix('.mp3')
+
+    async def _generate() -> None:
+        communicate = edge_tts.Communicate(text, voice, rate=rate)
+        await communicate.save(str(mp3_tmp))
+
+    def _run_on_private_loop() -> None:
+        # A dedicated loop works in any thread and leaves the thread's
+        # current-loop state alone, unlike asyncio.get_event_loop(), which
+        # fails in worker threads that have no loop.
+        loop = asyncio.new_event_loop()
+        try:
+            loop.run_until_complete(_generate())
+        finally:
+            loop.close()
+
+    try:
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            # No loop is running in this thread: drive the coroutine here.
+            _run_on_private_loop()
+        else:
+            # Called from inside a running loop, where run_until_complete
+            # would raise; use a short-lived worker thread instead.
+            from concurrent.futures import ThreadPoolExecutor
+            with ThreadPoolExecutor(max_workers=1) as pool:
+                pool.submit(_run_on_private_loop).result()
+
+        # mp3 -> wav
+        _mp3_to_wav(mp3_tmp, output_path)
+    finally:
+        # Remove the temporary MP3 whether or not synthesis/conversion succeeded.
+        mp3_tmp.unlink(missing_ok=True)
+
+    # Word timestamps come from Whisper, run on the generated WAV.
+    return _whisper_timestamps(output_path)
+
+
+def _whisper_timestamps(wav_path: Path) -> list[dict]:
+    """Extract word-level timestamps from a WAV file with openai-whisper.
+
+    Loads the ``tiny`` model and transcribes with ``language='ko'`` and word
+    timestamps enabled. If whisper cannot be imported, anything raises, or
+    no words come back, the result of ``_uniform_timestamps`` is returned.
+
+    Returns:
+        [{word, start, end}, ...] with times rounded to milliseconds.
+    """
+    try:
+        import whisper  # type: ignore
+
+        transcription = whisper.load_model('tiny').transcribe(
+            str(wav_path), word_timestamps=True, language='ko'
+        )
+        extracted = [
+            {
+                'word': item['word'].strip(),
+                'start': round(item['start'], 3),
+                'end': round(item['end'], 3),
+            }
+            for segment in transcription.get('segments', [])
+            for item in segment.get('words', [])
+        ]
+        if extracted:
+            return extracted
+    except Exception as e:
+        logger.warning(f'Whisper 타임스탬프 실패: {e} — 균등 분할 사용')
+
+    return _uniform_timestamps(wav_path)
+
+
+def _uniform_timestamps(wav_path: Path) -> list[dict]:
+    """Fallback when Whisper provides no word timings; always returns ``[]``.
+
+    The spoken text cannot be recovered from the audio alone, so no
+    per-word entries can be built here. Per the original note, the caption
+    renderer is expected to do the uniform split itself when it receives an
+    empty list (not visible in this file — verify against caption_renderer).
+
+    Raises:
+        wave.Error / OSError / EOFError: if ``wav_path`` is not a readable
+            WAV file.
+    """
+    # Opening the file is kept purely as a readability check, so a broken
+    # WAV still surfaces as an error instead of an empty result.
+    with wave.open(str(wav_path), 'rb'):
+        pass
+
+    return []
+
+
+# ─── 메인 엔트리포인트 ────────────────────────────────────────
+
+def generate_tts(
+    script: dict,
+    output_dir: Path,
+    timestamp: str,
+    cfg: Optional[dict] = None,
+) -> tuple[Path, list[dict]]:
+    """
+    Turn a script dict into a WAV file plus word-level timestamps.
+
+    Engines listed in ``cfg['tts']['engine_priority']`` are tried in order
+    until one succeeds. Trailing silence is then appended to the WAV and
+    the timestamps are saved as JSON next to it.
+
+    Args:
+        script:     {hook, body, closer, ...}
+        output_dir: Output directory (e.g. data/shorts/tts/); created if
+                    missing.
+        timestamp:  File name prefix (e.g. "20260328_120000").
+        cfg:        shorts_config.json dict (loaded automatically if None).
+
+    Returns:
+        (wav_path, timestamps)  — timestamps: [{word, start, end}, ...]
+
+    Raises:
+        RuntimeError: if no engine produced a WAV file; the last engine
+            error, if any, is chained as the cause.
+    """
+    if cfg is None:
+        cfg = _load_config()
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    wav_path = output_dir / f'{timestamp}.wav'
+    ts_path = output_dir / f'{timestamp}_timestamps.json'
+
+    text = _concat_script(script)
+    tts_cfg = cfg.get('tts', {})
+    pause_ms = tts_cfg.get('inter_sentence_pause_ms', 300)
+    priority = tts_cfg.get('engine_priority', ['elevenlabs', 'google_cloud', 'edge_tts'])
+
+    engine_map = {
+        'elevenlabs':   _tts_elevenlabs,
+        'google_cloud': _tts_google_cloud,
+        'edge_tts':     _tts_edge,
+    }
+
+    timestamps: list[dict] = []
+    last_error: Optional[Exception] = None
+    # Tracked explicitly: a WAV left over from an earlier run with the same
+    # timestamp must not be mistaken for fresh output.
+    succeeded = False
+
+    for engine_name in priority:
+        fn = engine_map.get(engine_name)
+        if fn is None:
+            logger.warning('Unknown TTS engine in engine_priority, skipping: %s', engine_name)
+            continue
+        try:
+            logger.info(f'TTS 엔진 시도: {engine_name}')
+            timestamps = fn(text, wav_path, cfg)
+        except Exception as e:
+            logger.warning(f'TTS 엔진 실패 ({engine_name}): {e}')
+            last_error = e
+            # Drop any partial output before trying the next engine.
+            wav_path.unlink(missing_ok=True)
+            continue
+        logger.info(f'TTS 완료 ({engine_name}): {wav_path.name}')
+        succeeded = True
+        break
+
+    if not succeeded or not wav_path.exists():
+        raise RuntimeError(f'모든 TTS 엔진 실패. 마지막 오류: {last_error}') from last_error
+
+    # Append trailing silence. Despite the config key name, the pause is
+    # added once at the end of the file. Best-effort: a failure here keeps
+    # the audio as generated.
+    try:
+        _add_pause(wav_path, pause_ms)
+    except Exception as e:
+        logger.warning(f'무음 추가 실패: {e}')
+
+    # Persist the timestamps alongside the WAV.
+    ts_path.write_text(json.dumps(timestamps, ensure_ascii=False, indent=2), encoding='utf-8')
+    logger.info(f'타임스탬프 저장: {ts_path.name} ({len(timestamps)}단어)')
+
+    return wav_path, timestamps
+
+
+def load_timestamps(ts_path: Path) -> list[dict]:
+    """Read a saved timestamps JSON file (UTF-8) and return its parsed content."""
+    with ts_path.open(encoding='utf-8') as fh:
+        return json.load(fh)