Initial public release
This commit is contained in:
@@ -0,0 +1,72 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
AWS Lambda 배포용 DR L2 워치독.
|
||||
1분마다 헬스체크. 3회 연속 실패 시 비상 청산 API 호출.
|
||||
"""
|
||||
import os
|
||||
import json
|
||||
import boto3
|
||||
import requests
|
||||
|
||||
# Endpoints of the monitored host; overridable through the Lambda environment.
HEALTH_URL = os.getenv("HYDRA_HEALTH_URL", "http://YOUR_MINI_PC_IP:8000/health")
KILL_URL = os.getenv("HYDRA_KILL_URL", "http://YOUR_MINI_PC_IP:8000/killswitch")

# Sent as the X-HYDRA-KEY header on the kill-switch request.
HYDRA_API_KEY = os.getenv("HYDRA_API_KEY", "")

# Telegram credentials; alerts are skipped when either value is empty.
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")

# Failure count at which the kill switch is invoked.
MAX_FAILURES = 3

# Primary-key value of the counter item in the "hydra_watchdog" DynamoDB table.
FAILURE_KEY = "hydra_watchdog_failures"
|
||||
|
||||
|
||||
def get_failure_count() -> int:
    """Read the persisted failure counter from DynamoDB.

    Looks up the item whose ``id`` is ``FAILURE_KEY`` in the
    ``hydra_watchdog`` table.

    Returns:
        The stored ``count`` converted to ``int``, or 0 when the item or its
        ``count`` attribute is absent.
    """
    response = boto3.resource("dynamodb").Table("hydra_watchdog").get_item(
        Key={"id": FAILURE_KEY}
    )
    stored_item = response.get("Item", {})
    return int(stored_item.get("count", 0))
|
||||
|
||||
|
||||
def set_failure_count(count: int) -> None:
    """Persist the failure counter to DynamoDB.

    Writes (overwriting any existing item) the record keyed by
    ``FAILURE_KEY`` in the ``hydra_watchdog`` table.

    Args:
        count: Value to store in the item's ``count`` attribute.
    """
    record = {"id": FAILURE_KEY, "count": count}
    boto3.resource("dynamodb").Table("hydra_watchdog").put_item(Item=record)
|
||||
|
||||
|
||||
def reset_failure_count() -> None:
    """Store a failure count of zero via ``set_failure_count``."""
    set_failure_count(count=0)
|
||||
|
||||
|
||||
def emergency_close_all() -> dict:
    """POST to the kill-switch endpoint and return its decoded JSON body.

    The request goes to ``KILL_URL`` with the query parameter
    ``reason=dr_l2_watchdog`` and the ``X-HYDRA-KEY`` header set to
    ``HYDRA_API_KEY``; it times out after 30 seconds.

    Returns:
        Whatever ``Response.json()`` yields for the reply. The HTTP status
        code is not inspected.
    """
    query = {"reason": "dr_l2_watchdog"}
    auth_headers = {"X-HYDRA-KEY": HYDRA_API_KEY}
    response = requests.post(KILL_URL, params=query, headers=auth_headers, timeout=30)
    return response.json()
|
||||
|
||||
|
||||
def send_telegram_alert(message: str) -> None:
    """Send ``message`` to the configured Telegram chat.

    Does nothing when ``TELEGRAM_BOT_TOKEN`` or ``TELEGRAM_CHAT_ID`` is empty.
    Otherwise POSTs to the bot's ``sendMessage`` endpoint with a 10-second
    timeout; the response is not examined.

    Args:
        message: Text of the alert.
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_CHAT_ID):
        return
    payload = {"chat_id": TELEGRAM_CHAT_ID, "text": message}
    requests.post(
        f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage",
        json=payload,
        timeout=10,
    )
|
||||
|
||||
|
||||
def lambda_handler(event, context) -> dict:
    """Run one watchdog tick: probe the health endpoint, escalate on repeated failure.

    A GET to ``HEALTH_URL`` (5-second timeout) that returns HTTP 200 resets the
    persisted failure counter. Any other status, or any exception raised by
    the request, increments it. Once the counter reaches ``MAX_FAILURES`` the
    kill switch is invoked, a Telegram alert is sent and the counter is reset.

    Args:
        event: Lambda invocation event; not used.
        context: Lambda context object; not used.

    Returns:
        ``{"status": "ok", "failures": 0}`` when the health check succeeds,
        otherwise ``{"status": "failure", "failures": <new count>}``.

    Raises:
        Exception: Whatever ``emergency_close_all`` raised, re-raised after an
            alert has been attempted. The counter is left untouched in that
            case, so the next invocation tries the kill switch again.
            Errors from the counter storage helpers also propagate.
    """
    import logging  # function-scope import: no file-level import change needed

    log = logging.getLogger(__name__)

    def notify(message: str) -> None:
        # Alerting is best-effort: a Telegram outage must not abort the
        # handler, otherwise the counter reset below would be skipped and the
        # kill switch would fire again on the next tick.
        try:
            send_telegram_alert(message)
        except Exception:
            log.exception("Telegram alert failed")

    failure_count = get_failure_count()

    # Only the probe itself is guarded. A storage error while resetting the
    # counter must not be mistaken for an unhealthy target.
    try:
        status = requests.get(HEALTH_URL, timeout=5).status_code
    except Exception as exc:
        status = None
        log.warning("Health check request failed: %r", exc)

    if status == 200:
        reset_failure_count()
        return {"status": "ok", "failures": 0}

    failure_count += 1
    set_failure_count(failure_count)

    if failure_count >= MAX_FAILURES:
        try:
            result = emergency_close_all()
        except Exception as exc:
            # The kill switch may be unreachable for the same reason the
            # health check is; make sure a human still hears about it.
            notify(
                f"🚨 DR L2 발동 실패: 미니PC {failure_count}회 연속 무응답, "
                f"비상 청산 API 호출 실패: {exc!r}\n다음 실행에서 재시도합니다."
            )
            raise
        notify(f"⚠️ DR L2 발동: 미니PC {MAX_FAILURES}회 무응답.\n전 포지션 청산: {result}")
        reset_failure_count()

    return {"status": "failure", "failures": failure_count}
|
||||
Reference in New Issue
Block a user