fix: hook resilience and worker lifecycle improvements (#957, #923, #984, #987, #1042)

Reduce timeouts to eliminate 10-30s startup delay when worker is dead
(common on WSL2 after hibernate). Add stale PID detection, graceful
error handling across all handlers, and error classification that
distinguishes worker unavailability from handler bugs.

- HEALTH_CHECK 30s→3s, new POST_SPAWN_WAIT (5s), PORT_IN_USE_WAIT (3s)
- isProcessAlive() with EPERM handling, cleanStalePidFile()
- getPluginVersion() try-catch for shutdown race (#1042)
- isWorkerUnavailableError: transport+5xx+429→exit 0, 4xx→exit 2
- No-op handler for unknown event types (#984)
- Wrap all handler fetch calls in try-catch for graceful degradation
- CLAUDE_MEM_HEALTH_TIMEOUT_MS env var override with validation
This commit is contained in:
Rod Boev
2026-02-10 15:34:35 -05:00
parent 6ac5507e4e
commit 418e38ee46
16 changed files with 791 additions and 348 deletions
@@ -8,6 +8,8 @@ import {
removePidFile,
getPlatformTimeout,
parseElapsedTime,
isProcessAlive,
cleanStalePidFile,
type PidInfo
} from '../../src/services/infrastructure/index.js';
@@ -221,4 +223,69 @@ describe('ProcessManager', () => {
expect(result).toBe(666);
});
});
describe('isProcessAlive', () => {
it('should return true for the current process', () => {
expect(isProcessAlive(process.pid)).toBe(true);
});
it('should return false for a non-existent PID', () => {
// Use a very high PID that's extremely unlikely to exist
expect(isProcessAlive(2147483647)).toBe(false);
});
it('should return true for PID 0 (Windows WMIC sentinel)', () => {
expect(isProcessAlive(0)).toBe(true);
});
it('should return false for negative PIDs', () => {
expect(isProcessAlive(-1)).toBe(false);
expect(isProcessAlive(-999)).toBe(false);
});
it('should return false for non-integer PIDs', () => {
expect(isProcessAlive(1.5)).toBe(false);
expect(isProcessAlive(NaN)).toBe(false);
});
});
describe('cleanStalePidFile', () => {
it('should remove PID file when process is dead', () => {
// Write a PID file with a non-existent PID
const staleInfo: PidInfo = {
pid: 2147483647,
port: 37777,
startedAt: '2024-01-01T00:00:00.000Z'
};
writePidFile(staleInfo);
expect(existsSync(PID_FILE)).toBe(true);
cleanStalePidFile();
expect(existsSync(PID_FILE)).toBe(false);
});
it('should keep PID file when process is alive', () => {
// Write a PID file with the current process PID (definitely alive)
const liveInfo: PidInfo = {
pid: process.pid,
port: 37777,
startedAt: new Date().toISOString()
};
writePidFile(liveInfo);
cleanStalePidFile();
// PID file should still exist since process.pid is alive
expect(existsSync(PID_FILE)).toBe(true);
});
it('should do nothing when PID file does not exist', () => {
removePidFile();
expect(existsSync(PID_FILE)).toBe(false);
// Should not throw
expect(() => cleanStalePidFile()).not.toThrow();
});
});
});