Files
Hydra-Engine/scripts/dr_watchdog.py
2026-03-30 13:19:11 +09:00

73 lines
2.2 KiB
Python

#!/usr/bin/env python3
"""
DR L2 watchdog, intended for deployment on AWS Lambda.
Runs a health check every minute. After 3 consecutive failures it calls the emergency liquidation API.
"""
import json
import logging
import os

import boto3
import requests
# Endpoints on the monitored host. Each can be overridden through the
# environment; the built-in defaults contain a YOUR_MINI_PC_IP placeholder.
HEALTH_URL = os.getenv("HYDRA_HEALTH_URL", "http://YOUR_MINI_PC_IP:8000/health")
KILL_URL = os.getenv("HYDRA_KILL_URL", "http://YOUR_MINI_PC_IP:8000/killswitch")

# Credentials read from the environment; each is the empty string when unset.
HYDRA_API_KEY = os.getenv("HYDRA_API_KEY", "")
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")

# Stored failure count at which the kill switch is invoked.
MAX_FAILURES = 3
# Value of the "id" key of the DynamoDB item that holds the failure counter.
FAILURE_KEY = "hydra_watchdog_failures"
def get_failure_count() -> int:
    """Return the failure counter stored in DynamoDB.

    Looks up the item whose ``id`` equals ``FAILURE_KEY`` in the
    ``hydra_watchdog`` table and converts its ``count`` attribute to ``int``.
    Yields 0 when the item, or its ``count`` attribute, is absent.
    """
    table = boto3.resource("dynamodb").Table("hydra_watchdog")
    item = table.get_item(Key={"id": FAILURE_KEY}).get("Item", {})
    return int(item.get("count", 0))
def set_failure_count(count: int) -> None:
    """Store *count* as the failure counter in DynamoDB.

    Writes an item with ``id`` equal to ``FAILURE_KEY`` to the
    ``hydra_watchdog`` table via ``put_item``.
    """
    record = {"id": FAILURE_KEY, "count": count}
    boto3.resource("dynamodb").Table("hydra_watchdog").put_item(Item=record)
def reset_failure_count() -> None:
    """Set the stored failure counter back to zero."""
    set_failure_count(count=0)
def emergency_close_all() -> dict:
    """Call the kill-switch endpoint and return what it answered.

    Sends a POST to ``KILL_URL`` with ``reason=dr_l2_watchdog`` as a query
    parameter and ``HYDRA_API_KEY`` in the ``X-HYDRA-KEY`` header, using a
    30-second timeout.

    Returns:
        The decoded JSON body of the response. If the body is not valid JSON
        (for example an empty body or an HTML error page), a fallback dict
        with the HTTP status code and the first 500 characters of the raw
        body is returned instead.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...).
    """
    resp = requests.post(
        KILL_URL,
        params={"reason": "dr_l2_watchdog"},
        headers={"X-HYDRA-KEY": HYDRA_API_KEY},
        timeout=30,
    )
    try:
        return resp.json()
    except ValueError:
        # The request has already been delivered at this point. Raising on an
        # undecodable body would only prevent the caller from reporting the
        # outcome, so hand back the raw facts instead. The body is truncated
        # so it stays usable inside a short alert message.
        return {"status_code": resp.status_code, "body": resp.text[:500]}
def send_telegram_alert(message: str) -> None:
    """Post *message* to the configured Telegram chat.

    Does nothing when either ``TELEGRAM_BOT_TOKEN`` or ``TELEGRAM_CHAT_ID``
    is empty. The HTTP response is not inspected.
    """
    if TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID:
        payload = {"chat_id": TELEGRAM_CHAT_ID, "text": message}
        endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
        requests.post(endpoint, json=payload, timeout=10)
def _send_alert_best_effort(message: str) -> None:
    """Send a Telegram alert, logging instead of raising if delivery fails."""
    try:
        send_telegram_alert(message)
    except Exception:
        logging.getLogger(__name__).exception("Failed to deliver Telegram alert")


def lambda_handler(event, context) -> dict:
    """Probe the health endpoint and trigger the kill switch on repeated failure.

    One invocation performs one probe (the module docstring states it is meant
    to run once a minute). A 200 response resets the stored failure counter.
    Any other outcome increments it, and once it reaches ``MAX_FAILURES`` the
    kill switch is called, a Telegram alert is sent and the counter is reset.

    Args:
        event: Lambda event payload; not used.
        context: Lambda context object; not used.

    Returns:
        ``{"status": "ok", "failures": 0}`` when the probe succeeds, otherwise
        ``{"status": "failure", "failures": <count after this probe>}``.

    Raises:
        Exception: Whatever ``emergency_close_all`` raised, re-raised after an
            alert has been attempted. The counter is left untouched in that
            case, so the next invocation tries the kill switch again.
            Errors from the counter storage calls also propagate.
    """
    log = logging.getLogger(__name__)
    failure_count = get_failure_count()

    # Guard only the probe itself. Counter-storage errors must not be mistaken
    # for an unhealthy host, so they are deliberately left outside this try.
    try:
        healthy = requests.get(HEALTH_URL, timeout=5).status_code == 200
    except Exception:
        # Any failure to obtain a response counts as a failed probe.
        log.warning("Health check request failed", exc_info=True)
        healthy = False

    if healthy:
        reset_failure_count()
        return {"status": "ok", "failures": 0}

    failure_count += 1
    set_failure_count(failure_count)

    if failure_count >= MAX_FAILURES:
        try:
            result = emergency_close_all()
        except Exception as exc:
            # The kill switch may be unreachable too. Operators still need to
            # hear about it, so alert first and then surface the error.
            log.exception("Kill switch call failed")
            _send_alert_best_effort(
                f"⚠️ DR L2 발동: 미니PC {MAX_FAILURES}회 무응답.\n비상 청산 API 호출 실패: {exc!r}"
            )
            raise
        # A failed alert must not block the reset below; otherwise the kill
        # switch would fire again on the next invocation.
        _send_alert_best_effort(
            f"⚠️ DR L2 발동: 미니PC {MAX_FAILURES}회 무응답.\n전 포지션 청산: {result}"
        )
        reset_failure_count()

    return {"status": "failure", "failures": failure_count}