Reduce timeouts to eliminate the 10-30s startup delay when the worker is dead (common on WSL2 after hibernate). Add stale PID detection, graceful error handling across all handlers, and error classification that distinguishes worker unavailability from handler bugs.

- HEALTH_CHECK 30s→3s, new POST_SPAWN_WAIT (5s), PORT_IN_USE_WAIT (3s)
- isProcessAlive() with EPERM handling, cleanStalePidFile()
- getPluginVersion() try-catch for shutdown race (#1042)
- isWorkerUnavailableError: transport+5xx+429→exit 0, 4xx→exit 2
- No-op handler for unknown event types (#984)
- Wrap all handler fetch calls in try-catch for graceful degradation
- CLAUDE_MEM_HEALTH_TIMEOUT_MS env var override with validation
This commit is contained in:
@@ -77,7 +77,11 @@ export function removePidFile(): void {
|
||||
}
|
||||
|
||||
/**
|
||||
* Get platform-adjusted timeout (Windows socket cleanup is slower)
|
||||
* Get platform-adjusted timeout for worker-side socket operations (2.0x on Windows).
|
||||
*
|
||||
* Note: Two platform multiplier functions exist intentionally:
|
||||
* - getTimeout() in hook-constants.ts uses 1.5x for hook-side operations (fast path)
|
||||
* - getPlatformTimeout() here uses 2.0x for worker-side socket operations (slower path)
|
||||
*/
|
||||
export function getPlatformTimeout(baseMs: number): number {
|
||||
const WINDOWS_MULTIPLIER = 2.0;
|
||||
@@ -398,6 +402,56 @@ export function spawnDaemon(
|
||||
return child.pid;
|
||||
}
|
||||
|
||||
/**
 * Check if a process with the given PID is alive.
 *
 * Uses the process.kill(pid, 0) idiom: signal 0 doesn't send a signal,
 * it just checks if the process exists and is reachable.
 *
 * EPERM is treated as "alive" because it means the process exists but
 * belongs to a different user/session (common in multi-user setups).
 * PID 0 (Windows WMIC sentinel for unknown PID) is treated as alive.
 *
 * @param pid - Process ID to probe.
 * @returns `true` when `pid` is 0, when the signal-0 probe succeeds, or when
 *   the probe fails with EPERM; `false` for negative or non-integer PIDs and
 *   for any other probe failure (e.g. ESRCH).
 */
export function isProcessAlive(pid: number): boolean {
  // PID 0 is the Windows WMIC sentinel value — process was spawned but PID unknown
  if (pid === 0) return true;

  // Invalid PIDs (negative, fractional, NaN, Infinity) are not alive
  if (!Number.isInteger(pid) || pid < 0) return false;

  try {
    process.kill(pid, 0);
    return true;
  } catch (error: unknown) {
    // Narrow the unknown catch value instead of casting it, so a thrown
    // non-object (null, undefined, string) cannot raise a TypeError here.
    const code =
      typeof error === 'object' && error !== null && 'code' in error
        ? (error as { code?: unknown }).code
        : undefined;

    // EPERM = process exists but different user/session — treat as alive.
    // Anything else (ESRCH = no such process, or an unrecognised failure) is dead.
    return code === 'EPERM';
  }
}
|
||||
|
||||
/**
 * Read the PID file and remove it if the recorded process is dead (stale).
 *
 * This is a cheap operation: one filesystem read + one signal-0 check.
 * Called at the top of ensureWorkerStarted() to clean up after WSL2
 * hibernate, OOM kills, or other ungraceful worker deaths.
 *
 * Does nothing when readPidFile() yields no PID info or when the recorded
 * process is still reported alive by isProcessAlive().
 */
export function cleanStalePidFile(): void {
  const pidInfo = readPidFile();

  // Nothing recorded — nothing to clean up.
  if (!pidInfo) return;

  // The recorded worker still exists; leave its PID file in place.
  if (isProcessAlive(pidInfo.pid)) return;

  logger.info('SYSTEM', 'Removing stale PID file (worker process is dead)', {
    pid: pidInfo.pid,
    port: pidInfo.port,
    startedAt: pidInfo.startedAt
  });
  removePidFile();
}
|
||||
|
||||
/**
|
||||
* Create signal handler factory for graceful shutdown
|
||||
* Returns a handler function that can be passed to process.on('SIGTERM') etc.
|
||||
|
||||
Reference in New Issue
Block a user