73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
AWS Lambda 배포용 DR L2 워치독.
|
|
1분마다 헬스체크. 3회 연속 실패 시 비상 청산 API 호출.
|
|
"""
|
|
import os
|
|
import json
|
|
import boto3
|
|
import requests
|
|
|
|
HEALTH_URL = os.environ.get("HYDRA_HEALTH_URL", "http://YOUR_MINI_PC_IP:8000/health")
|
|
KILL_URL = os.environ.get("HYDRA_KILL_URL", "http://YOUR_MINI_PC_IP:8000/killswitch")
|
|
HYDRA_API_KEY = os.environ.get("HYDRA_API_KEY", "")
|
|
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN", "")
|
|
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
|
|
MAX_FAILURES = 3
|
|
FAILURE_KEY = "hydra_watchdog_failures"
|
|
|
|
|
|
def get_failure_count() -> int:
|
|
dynamo = boto3.resource("dynamodb")
|
|
table = dynamo.Table("hydra_watchdog")
|
|
resp = table.get_item(Key={"id": FAILURE_KEY})
|
|
return int(resp.get("Item", {}).get("count", 0))
|
|
|
|
|
|
def set_failure_count(count: int) -> None:
|
|
dynamo = boto3.resource("dynamodb")
|
|
table = dynamo.Table("hydra_watchdog")
|
|
table.put_item(Item={"id": FAILURE_KEY, "count": count})
|
|
|
|
|
|
def reset_failure_count() -> None:
|
|
set_failure_count(0)
|
|
|
|
|
|
def emergency_close_all() -> dict:
|
|
resp = requests.post(
|
|
KILL_URL,
|
|
params={"reason": "dr_l2_watchdog"},
|
|
headers={"X-HYDRA-KEY": HYDRA_API_KEY},
|
|
timeout=30,
|
|
)
|
|
return resp.json()
|
|
|
|
|
|
def send_telegram_alert(message: str) -> None:
|
|
if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
|
|
return
|
|
url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
|
|
requests.post(url, json={"chat_id": TELEGRAM_CHAT_ID, "text": message}, timeout=10)
|
|
|
|
|
|
def lambda_handler(event, context) -> dict:
|
|
failure_count = get_failure_count()
|
|
try:
|
|
r = requests.get(HEALTH_URL, timeout=5)
|
|
if r.status_code == 200:
|
|
reset_failure_count()
|
|
return {"status": "ok", "failures": 0}
|
|
except Exception:
|
|
pass
|
|
|
|
failure_count += 1
|
|
set_failure_count(failure_count)
|
|
|
|
if failure_count >= MAX_FAILURES:
|
|
result = emergency_close_all()
|
|
send_telegram_alert(f"⚠️ DR L2 발동: 미니PC {MAX_FAILURES}회 무응답.\n전 포지션 청산: {result}")
|
|
reset_failure_count()
|
|
|
|
return {"status": "failure", "failures": failure_count}
|