feat: harden installer with rich health check diagnostics

Two-stage health verification (health + readiness), 30s timeout,
parse_health_json() helper with jq/python3/node fallbacks, smart
port-conflict handling with version/provider mismatch detection,
and enhanced completion summary showing version, AI auth, and uptime.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-12 21:58:15 -05:00
parent 05e904e613
commit 1d76f93304
+253 -23
View File
@@ -273,6 +273,128 @@ ensure_jq_or_fallback() {
return 1
}
###############################################################################
# Parse /api/health JSON response — extract worker metadata into globals
# Uses jq → python3 → node fallback chain (matching installer conventions)
# Sets: WORKER_VERSION, WORKER_AI_PROVIDER, WORKER_AI_AUTH_METHOD,
# WORKER_INITIALIZED, WORKER_REPORTED_PID, WORKER_UPTIME
###############################################################################
parse_health_json() {
local raw_json="$1"
# Reset all health globals before parsing
WORKER_VERSION=""
WORKER_AI_PROVIDER=""
WORKER_AI_AUTH_METHOD=""
WORKER_INITIALIZED=""
WORKER_REPORTED_PID=""
WORKER_UPTIME=""
if [[ -z "$raw_json" ]]; then
return 0
fi
# Try jq first (fastest, most reliable)
if command -v jq &>/dev/null; then
WORKER_VERSION="$(echo "$raw_json" | jq -r '.version // empty' 2>/dev/null)" || true
WORKER_AI_PROVIDER="$(echo "$raw_json" | jq -r '.ai.provider // empty' 2>/dev/null)" || true
WORKER_AI_AUTH_METHOD="$(echo "$raw_json" | jq -r '.ai.authMethod // empty' 2>/dev/null)" || true
WORKER_INITIALIZED="$(echo "$raw_json" | jq -r '.initialized // empty' 2>/dev/null)" || true
WORKER_REPORTED_PID="$(echo "$raw_json" | jq -r '.pid // empty' 2>/dev/null)" || true
WORKER_UPTIME="$(echo "$raw_json" | jq -r '.uptime // empty' 2>/dev/null)" || true
return 0
fi
# Try python3 fallback
if command -v python3 &>/dev/null; then
local parsed
parsed="$(INSTALLER_HEALTH_JSON="$raw_json" python3 -c "
import json, os, sys
try:
data = json.loads(os.environ['INSTALLER_HEALTH_JSON'])
ai = data.get('ai') or {}
fields = [
str(data.get('version', '')),
str(ai.get('provider', '')),
str(ai.get('authMethod', '')),
str(data.get('initialized', '')),
str(data.get('pid', '')),
str(data.get('uptime', '')),
]
sys.stdout.write('\n'.join(fields))
except Exception:
sys.stdout.write('\n\n\n\n\n')
" 2>/dev/null)" || true
if [[ -n "$parsed" ]]; then
local -a health_fields
IFS=$'\n' read -r -d '' -a health_fields <<< "$parsed" || true
WORKER_VERSION="${health_fields[0]:-}"
WORKER_AI_PROVIDER="${health_fields[1]:-}"
WORKER_AI_AUTH_METHOD="${health_fields[2]:-}"
WORKER_INITIALIZED="${health_fields[3]:-}"
WORKER_REPORTED_PID="${health_fields[4]:-}"
WORKER_UPTIME="${health_fields[5]:-}"
# Normalize python's None/empty representations
[[ "$WORKER_VERSION" == "None" ]] && WORKER_VERSION=""
[[ "$WORKER_AI_PROVIDER" == "None" ]] && WORKER_AI_PROVIDER=""
[[ "$WORKER_AI_AUTH_METHOD" == "None" ]] && WORKER_AI_AUTH_METHOD=""
[[ "$WORKER_INITIALIZED" == "None" ]] && WORKER_INITIALIZED=""
[[ "$WORKER_REPORTED_PID" == "None" ]] && WORKER_REPORTED_PID=""
[[ "$WORKER_UPTIME" == "None" ]] && WORKER_UPTIME=""
fi
return 0
fi
# Fallback to node (always available — it's a dependency)
local parsed
parsed="$(INSTALLER_HEALTH_JSON="$raw_json" node -e "
try {
const data = JSON.parse(process.env.INSTALLER_HEALTH_JSON);
const ai = data.ai || {};
const fields = [
data.version ?? '',
ai.provider ?? '',
ai.authMethod ?? '',
data.initialized != null ? String(data.initialized) : '',
data.pid != null ? String(data.pid) : '',
data.uptime != null ? String(data.uptime) : '',
];
process.stdout.write(fields.join('\n'));
} catch (e) {
process.stdout.write('\n\n\n\n\n');
}
" 2>/dev/null)" || true
if [[ -n "$parsed" ]]; then
local -a health_fields
IFS=$'\n' read -r -d '' -a health_fields <<< "$parsed" || true
WORKER_VERSION="${health_fields[0]:-}"
WORKER_AI_PROVIDER="${health_fields[1]:-}"
WORKER_AI_AUTH_METHOD="${health_fields[2]:-}"
WORKER_INITIALIZED="${health_fields[3]:-}"
WORKER_REPORTED_PID="${health_fields[4]:-}"
WORKER_UPTIME="${health_fields[5]:-}"
fi
}
###############################################################################
# Format uptime from milliseconds to human-readable (e.g., "2m 15s", "1h 23m")
###############################################################################
format_uptime_ms() {
local ms="$1"
local secs=$((ms / 1000))
if (( secs >= 3600 )); then
echo "$((secs / 3600))h $((secs % 3600 / 60))m"
elif (( secs >= 60 )); then
echo "$((secs / 60))m $((secs % 60))s"
else
echo "${secs}s"
fi
}
###############################################################################
# Banner
###############################################################################
@@ -1030,6 +1152,12 @@ find_claude_mem_install_dir() {
###############################################################################
WORKER_PID=""
WORKER_VERSION=""
WORKER_AI_PROVIDER=""
WORKER_AI_AUTH_METHOD=""
WORKER_INITIALIZED=""
WORKER_REPORTED_PID=""
WORKER_UPTIME=""
start_worker() {
info "Starting claude-mem worker service..."
@@ -1083,43 +1211,74 @@ start_worker() {
}
###############################################################################
# Health verification
# Polls http://localhost:37777/api/health up to 10 times with 1-second intervals
# Health verification — two-stage: health (alive) then readiness (initialized)
# Stage 1: Poll /api/health for HTTP 200 (worker process is running)
# Stage 2: Poll /api/readiness for HTTP 200 (worker is fully initialized)
# Total budget: 30 attempts (30 seconds) shared across both stages
###############################################################################
verify_health() {
local max_attempts=10
local max_attempts=30
local attempt=1
local health_url="http://127.0.0.1:37777/api/health"
local readiness_url="http://127.0.0.1:37777/api/readiness"
local health_alive=false
info "Verifying worker health..."
# ── Stage 1: Wait for /api/health to return HTTP 200 (worker is alive) ──
while (( attempt <= max_attempts )); do
local response
response="$(curl -s -o /dev/null -w "%{http_code}" "$health_url" 2>/dev/null)" || true
local http_status
http_status="$(curl -s -o /dev/null -w "%{http_code}" "$health_url" 2>/dev/null)" || true
if [[ "$response" == "200" ]]; then
# Verify the response body contains status:ok
if [[ "$http_status" == "200" ]]; then
health_alive=true
# Fetch the full health response body and parse metadata
local body
body="$(curl -s "$health_url" 2>/dev/null)" || true
if echo "$body" | grep -q '"status"[[:space:]]*:[[:space:]]*"ok"'; then
success "Worker is healthy (port 37777)"
return 0
fi
parse_health_json "$body"
success "Worker is alive, waiting for initialization..."
break
fi
if (( attempt < max_attempts )); then
info "Waiting for worker to start... (attempt ${attempt}/${max_attempts})"
fi
info "Waiting for worker to start... (attempt ${attempt}/${max_attempts})"
sleep 1
attempt=$((attempt + 1))
done
warn "Worker health check timed out after ${max_attempts} attempts"
warn "The worker may still be starting up. Check status with:"
warn " curl http://127.0.0.1:37777/api/health"
warn " Or check logs: ~/.claude-mem/logs/"
return 1
# If health never responded, the worker is not running at all
if [[ "$health_alive" != "true" ]]; then
warn "Worker health check timed out after ${max_attempts} attempts"
warn "The worker may still be starting up. Check status with:"
warn " curl http://127.0.0.1:37777/api/health"
warn " Or check logs: ~/.claude-mem/logs/"
return 1
fi
# ── Stage 2: Wait for /api/readiness to return HTTP 200 (fully initialized) ──
attempt=$((attempt + 1))
while (( attempt <= max_attempts )); do
local readiness_status
readiness_status="$(curl -s -o /dev/null -w "%{http_code}" "$readiness_url" 2>/dev/null)" || true
if [[ "$readiness_status" == "200" ]]; then
success "Worker is ready!"
return 0
fi
info "Waiting for worker to initialize... (attempt ${attempt}/${max_attempts})"
sleep 1
attempt=$((attempt + 1))
done
# Readiness timed out but health is OK — worker is running, just not fully initialized yet
warn "Worker is running but initialization is still in progress"
warn "This is normal on first run — the worker will finish initializing in the background."
warn "Check readiness with: curl http://127.0.0.1:37777/api/readiness"
return 0
}
###############################################################################
@@ -1151,7 +1310,7 @@ setup_observation_feed() {
read_tty -r answer
answer="${answer:-n}"
if [[ "${answer,,}" != "y" && "${answer,,}" != "yes" ]]; then
if [[ "$answer" != [yY] && "$answer" != [yY][eE][sS] ]]; then
echo ""
info "Skipped observation feed setup."
info "You can configure it later by re-running this installer or"
@@ -1376,17 +1535,40 @@ print_completion_summary() {
echo -e " ${COLOR_GREEN}${COLOR_RESET} Dependencies installed (Bun, uv)"
echo -e " ${COLOR_GREEN}${COLOR_RESET} OpenClaw gateway detected"
echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem plugin installed and enabled"
# Show installed version from health data if available
if [[ -n "$WORKER_VERSION" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem v${COLOR_BOLD}${WORKER_VERSION}${COLOR_RESET} installed and running"
else
echo -e " ${COLOR_GREEN}${COLOR_RESET} claude-mem plugin installed and enabled"
fi
echo -e " ${COLOR_GREEN}${COLOR_RESET} Memory slot configured"
echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${provider_display}${COLOR_RESET}"
# Show AI provider with auth method from health data if available
if [[ -n "$WORKER_AI_AUTH_METHOD" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${WORKER_AI_PROVIDER} (${WORKER_AI_AUTH_METHOD})${COLOR_RESET}"
else
echo -e " ${COLOR_GREEN}${COLOR_RESET} AI provider: ${COLOR_BOLD}${provider_display}${COLOR_RESET}"
fi
echo -e " ${COLOR_GREEN}${COLOR_RESET} Settings written to ~/.claude-mem/settings.json"
if [[ -n "$WORKER_PID" ]] && kill -0 "$WORKER_PID" 2>/dev/null; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} Worker running on port ${COLOR_BOLD}37777${COLOR_RESET} (PID: ${WORKER_PID})"
elif [[ -n "$WORKER_UPTIME" && "$WORKER_UPTIME" =~ ^[0-9]+$ ]] && (( WORKER_UPTIME > 0 )); then
local uptime_formatted
uptime_formatted="$(format_uptime_ms "$WORKER_UPTIME")"
echo -e " ${COLOR_GREEN}${COLOR_RESET} Worker running on port ${COLOR_BOLD}37777${COLOR_RESET} (PID: ${WORKER_REPORTED_PID}, uptime: ${uptime_formatted})"
else
echo -e " ${COLOR_YELLOW}${COLOR_RESET} Worker may not be running — check logs at ~/.claude-mem/logs/"
fi
# Show initialization warning if worker is alive but not yet initialized
if [[ "$WORKER_INITIALIZED" != "true" ]] && { [[ -n "$WORKER_REPORTED_PID" ]] || { [[ -n "$WORKER_PID" ]] && kill -0 "$WORKER_PID" 2>/dev/null; }; }; then
echo -e " ${COLOR_YELLOW}${COLOR_RESET} Worker is starting but still initializing (this is normal on first run)"
fi
if [[ "$FEED_CONFIGURED" == "true" ]]; then
echo -e " ${COLOR_GREEN}${COLOR_RESET} Observation feed: ${COLOR_BOLD}${FEED_CHANNEL}${COLOR_RESET}${FEED_TARGET_ID}"
else
@@ -1474,7 +1656,55 @@ main() {
warn "Port 37777 is already in use (worker may already be running)"
info "Checking if the existing service is healthy..."
if verify_health; then
success "Existing worker is healthy — skipping startup"
# verify_health already called parse_health_json — WORKER_* globals are set.
# Determine the expected version from the installed plugin's package.json.
local expected_version=""
if [[ -n "$CLAUDE_MEM_INSTALL_DIR" ]] || find_claude_mem_install_dir; then
expected_version="$(INSTALLER_PKG="${CLAUDE_MEM_INSTALL_DIR}/package.json" node -e "
try { process.stdout.write(JSON.parse(require('fs').readFileSync(process.env.INSTALLER_PKG, 'utf8')).version || ''); }
catch(e) {}
" 2>/dev/null)" || true
fi
local needs_restart=""
# Check if worker version is outdated compared to installed version
if [[ -n "$WORKER_VERSION" && -n "$expected_version" && "$WORKER_VERSION" != "$expected_version" ]]; then
warn "Existing worker is v${WORKER_VERSION} but installed v${expected_version} — restart recommended"
info " Run: curl -X POST http://127.0.0.1:37777/api/admin/restart"
needs_restart="true"
fi
# Check if AI provider doesn't match current configuration
if [[ -n "$WORKER_AI_PROVIDER" && -n "$AI_PROVIDER" && "$WORKER_AI_PROVIDER" != "$AI_PROVIDER" ]]; then
warn "Worker is using ${WORKER_AI_PROVIDER} but you configured ${AI_PROVIDER} — restart to apply changes"
needs_restart="true"
fi
# If everything is current, show full healthy status
if [[ "$needs_restart" != "true" ]]; then
local uptime_display=""
if [[ -n "$WORKER_UPTIME" && "$WORKER_UPTIME" =~ ^[0-9]+$ && "$WORKER_UPTIME" != "0" ]]; then
uptime_display="$(format_uptime_ms "$WORKER_UPTIME")"
fi
local status_parts=""
if [[ -n "$WORKER_VERSION" ]]; then
status_parts="v${WORKER_VERSION}"
fi
if [[ -n "$WORKER_AI_PROVIDER" ]]; then
status_parts="${status_parts:+${status_parts}, }${WORKER_AI_PROVIDER}"
fi
if [[ -n "$uptime_display" ]]; then
status_parts="${status_parts:+${status_parts}, }uptime: ${uptime_display}"
fi
if [[ -n "$status_parts" ]]; then
success "Existing worker is healthy (${status_parts}) — skipping startup"
else
success "Existing worker is healthy — skipping startup"
fi
fi
else
warn "Port 37777 is occupied but not responding to health checks"
warn "Another process may be using this port. Stop it and re-run the installer,"