fix: hook resilience and worker lifecycle improvements (#957, #923, #984, #987, #1042)

Reduce timeouts to eliminate 10-30s startup delay when worker is dead
(common on WSL2 after hibernate). Add stale PID detection, graceful
error handling across all handlers, and error classification that
distinguishes worker unavailability from handler bugs.

- HEALTH_CHECK 30s→3s, new POST_SPAWN_WAIT (5s), PORT_IN_USE_WAIT (3s)
- isProcessAlive() with EPERM handling, cleanStalePidFile()
- getPluginVersion() try-catch for shutdown race (#1042)
- isWorkerUnavailableError: transport+5xx+429→exit 0, 4xx→exit 2
- No-op handler for unknown event types (#984)
- Wrap all handler fetch calls in try-catch for graceful degradation
- CLAUDE_MEM_HEALTH_TIMEOUT_MS env var override with validation
This commit is contained in:
Rod Boev
2026-02-10 15:34:35 -05:00
parent 6ac5507e4e
commit 418e38ee46
16 changed files with 791 additions and 348 deletions
+65 -2
View File
@@ -8,6 +8,61 @@ export interface HookCommandOptions {
skipExit?: boolean;
}
/**
* Classify whether an error indicates the worker is unavailable (graceful degradation)
* vs a handler/client bug (blocking error that developers need to see).
*
* Exit 0 (graceful degradation):
* - Transport failures: ECONNREFUSED, ECONNRESET, EPIPE, ETIMEDOUT, fetch failed
* - Timeout errors: timed out, timeout
* - Server errors: HTTP 5xx status codes
*
* Exit 2 (blocking error — handler/client bug):
* - HTTP 4xx status codes (bad request, not found, validation error)
* - Programming errors (TypeError, ReferenceError, SyntaxError)
* - All other unexpected errors
*/
export function isWorkerUnavailableError(error: unknown): boolean {
const message = error instanceof Error ? error.message : String(error);
const lower = message.toLowerCase();
// Transport failures — worker unreachable
const transportPatterns = [
'econnrefused',
'econnreset',
'epipe',
'etimedout',
'enotfound',
'econnaborted',
'enetunreach',
'ehostunreach',
'fetch failed',
'unable to connect',
'socket hang up',
];
if (transportPatterns.some(p => lower.includes(p))) return true;
// Timeout errors — worker didn't respond in time
if (lower.includes('timed out') || lower.includes('timeout')) return true;
// HTTP 5xx server errors — worker has internal problems
if (/failed:\s*5\d{2}/.test(message) || /status[:\s]+5\d{2}/.test(message)) return true;
// HTTP 429 (rate limit) — treat as transient unavailability, not a bug
if (/failed:\s*429/.test(message) || /status[:\s]+429/.test(message)) return true;
// HTTP 4xx client errors — our bug, NOT worker unavailability
if (/failed:\s*4\d{2}/.test(message) || /status[:\s]+4\d{2}/.test(message)) return false;
// Programming errors — code bugs, not worker unavailability
if (error instanceof TypeError || error instanceof ReferenceError || error instanceof SyntaxError) {
return false;
}
// Default: treat unknown errors as blocking (conservative — surface bugs)
return false;
}
export async function hookCommand(platform: string, event: string, options: HookCommandOptions = {}): Promise<number> {
try {
const adapter = getPlatformAdapter(platform);
@@ -26,9 +81,17 @@ export async function hookCommand(platform: string, event: string, options: Hook
}
return exitCode;
} catch (error) {
if (isWorkerUnavailableError(error)) {
// Worker unavailable — degrade gracefully, don't block the user
console.error(`[claude-mem] Worker unavailable, skipping hook: ${error instanceof Error ? error.message : error}`);
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.SUCCESS); // = 0 (graceful)
}
return HOOK_EXIT_CODES.SUCCESS;
}
// Handler/client bug — show as blocking error so developers see it
console.error(`Hook error: ${error}`);
// Use exit code 2 (blocking error) so users see the error message
// Exit code 1 only shows in verbose mode per Claude Code docs
if (!options.skipExit) {
process.exit(HOOK_EXIT_CODES.BLOCKING_ERROR); // = 2
}