feat: harden installer with rich health check diagnostics

Two-stage health verification (health + readiness), 30s timeout,
parse_health_json() helper with jq/python3/node fallbacks, smart
port-conflict handling with version/provider mismatch detection,
and enhanced completion summary showing version, AI auth, and uptime.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-12 21:58:15 -05:00
parent 05e904e613
commit 1d76f93304
+244 -14
View File
@@ -273,6 +273,128 @@ ensure_jq_or_fallback() {
return 1 return 1
} }
###############################################################################
# Parse /api/health JSON response — extract worker metadata into globals
# Uses jq → python3 → node fallback chain (matching installer conventions)
# Sets: WORKER_VERSION, WORKER_AI_PROVIDER, WORKER_AI_AUTH_METHOD,
#       WORKER_INITIALIZED, WORKER_REPORTED_PID, WORKER_UPTIME
# Booleans are normalized to lowercase "true"/"false" across all three
# parsers so consumers can safely compare against the literal "true".
# Arguments: $1 - raw JSON body from /api/health (may be empty)
# Returns:   always 0 (best-effort; globals stay empty on parse failure)
###############################################################################
parse_health_json() {
    local raw_json="$1"

    # Reset all health globals before parsing
    WORKER_VERSION=""
    WORKER_AI_PROVIDER=""
    WORKER_AI_AUTH_METHOD=""
    WORKER_INITIALIZED=""
    WORKER_REPORTED_PID=""
    WORKER_UPTIME=""

    if [[ -z "$raw_json" ]]; then
        return 0
    fi

    # Try jq first (fastest, most reliable).
    # NOTE: jq's `// empty` treats JSON false the same as null/missing, so a
    # literal `"initialized": false` yields "" here — acceptable, since
    # consumers only ever test for the exact string "true".
    if command -v jq &>/dev/null; then
        WORKER_VERSION="$(printf '%s' "$raw_json" | jq -r '.version // empty' 2>/dev/null)" || true
        WORKER_AI_PROVIDER="$(printf '%s' "$raw_json" | jq -r '.ai.provider // empty' 2>/dev/null)" || true
        WORKER_AI_AUTH_METHOD="$(printf '%s' "$raw_json" | jq -r '.ai.authMethod // empty' 2>/dev/null)" || true
        WORKER_INITIALIZED="$(printf '%s' "$raw_json" | jq -r '.initialized // empty' 2>/dev/null)" || true
        WORKER_REPORTED_PID="$(printf '%s' "$raw_json" | jq -r '.pid // empty' 2>/dev/null)" || true
        WORKER_UPTIME="$(printf '%s' "$raw_json" | jq -r '.uptime // empty' 2>/dev/null)" || true
        return 0
    fi

    # Try python3 fallback. The helper normalizes True/False/None to
    # "true"/"false"/"" so output matches the jq and node paths (str(True)
    # would otherwise produce "True", breaking the == "true" checks).
    local parsed
    if command -v python3 &>/dev/null; then
        parsed="$(INSTALLER_HEALTH_JSON="$raw_json" python3 -c "
import json, os, sys

def norm(v):
    if v is True:
        return 'true'
    if v is False:
        return 'false'
    if v is None:
        return ''
    return str(v)

try:
    data = json.loads(os.environ['INSTALLER_HEALTH_JSON'])
    ai = data.get('ai') or {}
    fields = [
        norm(data.get('version')),
        norm(ai.get('provider')),
        norm(ai.get('authMethod')),
        norm(data.get('initialized')),
        norm(data.get('pid')),
        norm(data.get('uptime')),
    ]
    sys.stdout.write('\n'.join(fields))
except Exception:
    pass
" 2>/dev/null)" || true
        if [[ -n "$parsed" ]]; then
            # mapfile preserves empty lines; the previous
            # 'IFS=\$'\n' read -a' approach collapsed consecutive newlines,
            # silently shifting later fields into the wrong positions
            # whenever an earlier field was empty.
            local -a health_fields=()
            mapfile -t health_fields <<< "$parsed"
            WORKER_VERSION="${health_fields[0]:-}"
            WORKER_AI_PROVIDER="${health_fields[1]:-}"
            WORKER_AI_AUTH_METHOD="${health_fields[2]:-}"
            WORKER_INITIALIZED="${health_fields[3]:-}"
            WORKER_REPORTED_PID="${health_fields[4]:-}"
            WORKER_UPTIME="${health_fields[5]:-}"
        fi
        return 0
    fi

    # Fallback to node (expected to be present — it's an installer
    # dependency — but guard anyway so a broken PATH degrades gracefully).
    if ! command -v node &>/dev/null; then
        return 0
    fi
    parsed="$(INSTALLER_HEALTH_JSON="$raw_json" node -e "
try {
  const data = JSON.parse(process.env.INSTALLER_HEALTH_JSON);
  const ai = data.ai || {};
  const fields = [
    data.version ?? '',
    ai.provider ?? '',
    ai.authMethod ?? '',
    data.initialized != null ? String(data.initialized) : '',
    data.pid != null ? String(data.pid) : '',
    data.uptime != null ? String(data.uptime) : '',
  ];
  process.stdout.write(fields.join('\n'));
} catch (e) {
  // Emit nothing on parse failure; the shell side leaves globals empty.
}
" 2>/dev/null)" || true
    if [[ -n "$parsed" ]]; then
        # Same mapfile rationale as the python3 path: empty fields must
        # keep their positional slots.
        local -a health_fields=()
        mapfile -t health_fields <<< "$parsed"
        WORKER_VERSION="${health_fields[0]:-}"
        WORKER_AI_PROVIDER="${health_fields[1]:-}"
        WORKER_AI_AUTH_METHOD="${health_fields[2]:-}"
        WORKER_INITIALIZED="${health_fields[3]:-}"
        WORKER_REPORTED_PID="${health_fields[4]:-}"
        WORKER_UPTIME="${health_fields[5]:-}"
    fi
}
###############################################################################
# Format uptime from milliseconds to human-readable (e.g., "2m 15s", "1h 23m")
# Arguments: $1 - uptime in milliseconds (non-negative integer)
# Outputs:   formatted string on stdout
###############################################################################
format_uptime_ms() {
    local total_ms="$1"
    local total_secs=$(( total_ms / 1000 ))
    local hours=$(( total_secs / 3600 ))
    local mins=$(( total_secs % 3600 / 60 ))

    if (( hours > 0 )); then
        printf '%sh %sm\n' "$hours" "$mins"
    elif (( total_secs >= 60 )); then
        printf '%sm %ss\n' "$mins" "$(( total_secs % 60 ))"
    else
        printf '%ss\n' "$total_secs"
    fi
}
############################################################################### ###############################################################################
# Banner # Banner
############################################################################### ###############################################################################
@@ -1030,6 +1152,12 @@ find_claude_mem_install_dir() {
############################################################################### ###############################################################################
WORKER_PID="" WORKER_PID=""
# Worker health metadata — populated by parse_health_json() from the
# /api/health response body. Empty string means "not reported / unknown".
WORKER_VERSION=""
WORKER_AI_PROVIDER=""
WORKER_AI_AUTH_METHOD=""
WORKER_INITIALIZED=""
WORKER_REPORTED_PID=""
WORKER_UPTIME=""
start_worker() { start_worker() {
info "Starting claude-mem worker service..." info "Starting claude-mem worker service..."
@@ -1083,43 +1211,74 @@ start_worker() {
} }
############################################################################### ###############################################################################
# Health verification # Health verification — two-stage: health (alive) then readiness (initialized)
# Polls http://localhost:37777/api/health up to 10 times with 1-second intervals # Stage 1: Poll /api/health for HTTP 200 (worker process is running)
# Stage 2: Poll /api/readiness for HTTP 200 (worker is fully initialized)
# Total budget: 30 attempts (30 seconds) shared across both stages
############################################################################### ###############################################################################
verify_health() { verify_health() {
local max_attempts=10 local max_attempts=30
local attempt=1 local attempt=1
local health_url="http://127.0.0.1:37777/api/health" local health_url="http://127.0.0.1:37777/api/health"
local readiness_url="http://127.0.0.1:37777/api/readiness"
local health_alive=false
info "Verifying worker health..." info "Verifying worker health..."
# ── Stage 1: Wait for /api/health to return HTTP 200 (worker is alive) ──
while (( attempt <= max_attempts )); do while (( attempt <= max_attempts )); do
local response local http_status
response="$(curl -s -o /dev/null -w "%{http_code}" "$health_url" 2>/dev/null)" || true http_status="$(curl -s -o /dev/null -w "%{http_code}" "$health_url" 2>/dev/null)" || true
if [[ "$response" == "200" ]]; then if [[ "$http_status" == "200" ]]; then
# Verify the response body contains status:ok health_alive=true
# Fetch the full health response body and parse metadata
local body local body
body="$(curl -s "$health_url" 2>/dev/null)" || true body="$(curl -s "$health_url" 2>/dev/null)" || true
if echo "$body" | grep -q '"status"[[:space:]]*:[[:space:]]*"ok"'; then parse_health_json "$body"
success "Worker is healthy (port 37777)"
return 0 success "Worker is alive, waiting for initialization..."
fi
break
fi fi
if (( attempt < max_attempts )); then
info "Waiting for worker to start... (attempt ${attempt}/${max_attempts})" info "Waiting for worker to start... (attempt ${attempt}/${max_attempts})"
fi
sleep 1 sleep 1
attempt=$((attempt + 1)) attempt=$((attempt + 1))
done done
# If health never responded, the worker is not running at all
if [[ "$health_alive" != "true" ]]; then
warn "Worker health check timed out after ${max_attempts} attempts" warn "Worker health check timed out after ${max_attempts} attempts"
warn "The worker may still be starting up. Check status with:" warn "The worker may still be starting up. Check status with:"
warn " curl http://127.0.0.1:37777/api/health" warn " curl http://127.0.0.1:37777/api/health"
warn " Or check logs: ~/.claude-mem/logs/" warn " Or check logs: ~/.claude-mem/logs/"
return 1 return 1
fi
# ── Stage 2: Wait for /api/readiness to return HTTP 200 (fully initialized) ──
attempt=$((attempt + 1))
while (( attempt <= max_attempts )); do
local readiness_status
readiness_status="$(curl -s -o /dev/null -w "%{http_code}" "$readiness_url" 2>/dev/null)" || true
if [[ "$readiness_status" == "200" ]]; then
success "Worker is ready!"
return 0
fi
info "Waiting for worker to initialize... (attempt ${attempt}/${max_attempts})"
sleep 1
attempt=$((attempt + 1))
done
# Readiness timed out but health is OK — worker is running, just not fully initialized yet
warn "Worker is running but initialization is still in progress"
warn "This is normal on first run — the worker will finish initializing in the background."
warn "Check readiness with: curl http://127.0.0.1:37777/api/readiness"
return 0
} }
############################################################################### ###############################################################################
@@ -1151,7 +1310,7 @@ setup_observation_feed() {
read_tty -r answer read_tty -r answer
answer="${answer:-n}" answer="${answer:-n}"
if [[ "${answer,,}" != "y" && "${answer,,}" != "yes" ]]; then if [[ "$answer" != [yY] && "$answer" != [yY][eE][sS] ]]; then
echo "" echo ""
info "Skipped observation feed setup." info "Skipped observation feed setup."
info "You can configure it later by re-running this installer or" info "You can configure it later by re-running this installer or"
@@ -1376,17 +1535,40 @@ print_completion_summary() {
echo -e " ${COLOR_GREEN}${COLOR_RESET} Dependencies installed (Bun, uv)" echo -e " ${COLOR_GREEN}${COLOR_RESET} Dependencies installed (Bun, uv)"
echo -e " ${COLOR_GREEN}${COLOR_RESET} OpenClaw gateway detected" echo -e " ${COLOR_GREEN}${COLOR_RESET} OpenClaw gateway detected"
# Show installed version from health data if available
if [[ -n "$WORKER_VERSION" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem v${COLOR_BOLD}${WORKER_VERSION}${COLOR_RESET} installed and running"
else
echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem plugin installed and enabled" echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem plugin installed and enabled"
fi
echo -e " ${COLOR_GREEN}${COLOR_RESET} Memory slot configured" echo -e " ${COLOR_GREEN}${COLOR_RESET} Memory slot configured"
# Show AI provider with auth method from health data if available
if [[ -n "$WORKER_AI_AUTH_METHOD" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${WORKER_AI_PROVIDER} (${WORKER_AI_AUTH_METHOD})${COLOR_RESET}"
else
echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${provider_display}${COLOR_RESET}" echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${provider_display}${COLOR_RESET}"
fi
echo -e " ${COLOR_GREEN}${COLOR_RESET} Settings written to ~/.claude-mem/settings.json" echo -e " ${COLOR_GREEN}${COLOR_RESET} Settings written to ~/.claude-mem/settings.json"
if [[ -n "$WORKER_PID" ]] && kill -0 "$WORKER_PID" 2>/dev/null; then if [[ -n "$WORKER_PID" ]] && kill -0 "$WORKER_PID" 2>/dev/null; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} Worker running on port ${COLOR_BOLD}37777${COLOR_RESET} (PID: ${WORKER_PID})" echo -e " ${COLOR_GREEN}${COLOR_RESET} Worker running on port ${COLOR_BOLD}37777${COLOR_RESET} (PID: ${WORKER_PID})"
elif [[ -n "$WORKER_UPTIME" && "$WORKER_UPTIME" =~ ^[0-9]+$ ]] && (( WORKER_UPTIME > 0 )); then
local uptime_formatted
uptime_formatted="$(format_uptime_ms "$WORKER_UPTIME")"
echo -e " ${COLOR_GREEN}${COLOR_RESET} Worker running on port ${COLOR_BOLD}37777${COLOR_RESET} (PID: ${WORKER_REPORTED_PID}, uptime: ${uptime_formatted})"
else else
echo -e " ${COLOR_YELLOW}${COLOR_RESET} Worker may not be running — check logs at ~/.claude-mem/logs/" echo -e " ${COLOR_YELLOW}${COLOR_RESET} Worker may not be running — check logs at ~/.claude-mem/logs/"
fi fi
# Show initialization warning if worker is alive but not yet initialized
if [[ "$WORKER_INITIALIZED" != "true" ]] && { [[ -n "$WORKER_REPORTED_PID" ]] || { [[ -n "$WORKER_PID" ]] && kill -0 "$WORKER_PID" 2>/dev/null; }; }; then
echo -e " ${COLOR_YELLOW}${COLOR_RESET} Worker is starting but still initializing (this is normal on first run)"
fi
if [[ "$FEED_CONFIGURED" == "true" ]]; then if [[ "$FEED_CONFIGURED" == "true" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} Observation feed: ${COLOR_BOLD}${FEED_CHANNEL}${COLOR_RESET}${FEED_TARGET_ID}" echo -e " ${COLOR_GREEN}${COLOR_RESET} Observation feed: ${COLOR_BOLD}${FEED_CHANNEL}${COLOR_RESET}${FEED_TARGET_ID}"
else else
@@ -1474,7 +1656,55 @@ main() {
warn "Port 37777 is already in use (worker may already be running)" warn "Port 37777 is already in use (worker may already be running)"
info "Checking if the existing service is healthy..." info "Checking if the existing service is healthy..."
if verify_health; then if verify_health; then
# verify_health already called parse_health_json — WORKER_* globals are set.
# Determine the expected version from the installed plugin's package.json.
local expected_version=""
if [[ -n "$CLAUDE_MEM_INSTALL_DIR" ]] || find_claude_mem_install_dir; then
expected_version="$(INSTALLER_PKG="${CLAUDE_MEM_INSTALL_DIR}/package.json" node -e "
try { process.stdout.write(JSON.parse(require('fs').readFileSync(process.env.INSTALLER_PKG, 'utf8')).version || ''); }
catch(e) {}
" 2>/dev/null)" || true
fi
local needs_restart=""
# Check if worker version is outdated compared to installed version
if [[ -n "$WORKER_VERSION" && -n "$expected_version" && "$WORKER_VERSION" != "$expected_version" ]]; then
warn "Existing worker is v${WORKER_VERSION} but installed v${expected_version} — restart recommended"
info " Run: curl -X POST http://127.0.0.1:37777/api/admin/restart"
needs_restart="true"
fi
# Check if AI provider doesn't match current configuration
if [[ -n "$WORKER_AI_PROVIDER" && -n "$AI_PROVIDER" && "$WORKER_AI_PROVIDER" != "$AI_PROVIDER" ]]; then
warn "Worker is using ${WORKER_AI_PROVIDER} but you configured ${AI_PROVIDER} — restart to apply changes"
needs_restart="true"
fi
# If everything is current, show full healthy status
if [[ "$needs_restart" != "true" ]]; then
local uptime_display=""
if [[ -n "$WORKER_UPTIME" && "$WORKER_UPTIME" =~ ^[0-9]+$ && "$WORKER_UPTIME" != "0" ]]; then
uptime_display="$(format_uptime_ms "$WORKER_UPTIME")"
fi
local status_parts=""
if [[ -n "$WORKER_VERSION" ]]; then
status_parts="v${WORKER_VERSION}"
fi
if [[ -n "$WORKER_AI_PROVIDER" ]]; then
status_parts="${status_parts:+${status_parts}, }${WORKER_AI_PROVIDER}"
fi
if [[ -n "$uptime_display" ]]; then
status_parts="${status_parts:+${status_parts}, }uptime: ${uptime_display}"
fi
if [[ -n "$status_parts" ]]; then
success "Existing worker is healthy (${status_parts}) — skipping startup"
else
success "Existing worker is healthy — skipping startup" success "Existing worker is healthy — skipping startup"
fi
fi
else else
warn "Port 37777 is occupied but not responding to health checks" warn "Port 37777 is occupied but not responding to health checks"
warn "Another process may be using this port. Stop it and re-run the installer," warn "Another process may be using this port. Stop it and re-run the installer,"