fix: address PR review findings for worker lifecycle

- Add PID validation to restart case (matches start case)
- Wrap forceKillProcess() in try/catch for graceful shutdown
- Wrap getChildProcesses() in try/catch for Windows failures
- Add logging to readPidFile(), removePidFile(), httpShutdown()

Fixes critical issues found in PR #458 review.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2025-12-26 21:12:11 -05:00
parent 84a23450c8
commit d616307781
2 changed files with 95 additions and 61 deletions
File diff suppressed because one or more lines are too long
+51 -17
View File
@@ -42,11 +42,18 @@ function readPidFile(): PidInfo | null {
try { try {
if (!existsSync(PID_FILE)) return null; if (!existsSync(PID_FILE)) return null;
return JSON.parse(readFileSync(PID_FILE, 'utf-8')); return JSON.parse(readFileSync(PID_FILE, 'utf-8'));
} catch { return null; } } catch (error) {
logger.warn('SYSTEM', 'Failed to read PID file', { path: PID_FILE, error: (error as Error).message });
return null;
}
} }
function removePidFile(): void { function removePidFile(): void {
try { if (existsSync(PID_FILE)) unlinkSync(PID_FILE); } catch {} try {
if (existsSync(PID_FILE)) unlinkSync(PID_FILE);
} catch (error) {
logger.warn('SYSTEM', 'Failed to remove PID file', { path: PID_FILE, error: (error as Error).message });
}
} }
async function isPortInUse(port: number): Promise<boolean> { async function isPortInUse(port: number): Promise<boolean> {
@@ -76,12 +83,23 @@ async function waitForHealth(port: number, timeoutMs: number = 30000): Promise<b
async function httpShutdown(port: number): Promise<boolean> { async function httpShutdown(port: number): Promise<boolean> {
try { try {
await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, { const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
method: 'POST', method: 'POST',
signal: AbortSignal.timeout(5000) signal: AbortSignal.timeout(5000)
}); });
if (!response.ok) {
logger.warn('SYSTEM', 'Shutdown request returned error', { port, status: response.status });
return false;
}
return true; return true;
} catch { return false; } } catch (error) {
// Connection refused is expected if worker already stopped
const isConnectionRefused = (error as Error).message?.includes('ECONNREFUSED');
if (!isConnectionRefused) {
logger.warn('SYSTEM', 'Shutdown request failed', { port, error: (error as Error).message });
}
return false;
}
} }
async function waitForPortFree(port: number, timeoutMs: number = 10000): Promise<boolean> { async function waitForPortFree(port: number, timeoutMs: number = 10000): Promise<boolean> {
@@ -735,13 +753,18 @@ export class WorkerService {
return []; return [];
} }
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`; try {
const { stdout } = await execAsync(cmd, { timeout: 60000 }); const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`;
return stdout const { stdout } = await execAsync(cmd, { timeout: 60000 });
.trim() return stdout
.split('\n') .trim()
.map(s => parseInt(s.trim(), 10)) .split('\n')
.filter(n => !isNaN(n) && Number.isInteger(n) && n > 0); // SECURITY: Validate each PID .map(s => parseInt(s.trim(), 10))
.filter(n => !isNaN(n) && Number.isInteger(n) && n > 0); // SECURITY: Validate each PID
} catch (error) {
logger.warn('SYSTEM', 'Failed to enumerate child processes', { parentPid, error: (error as Error).message });
return []; // Fail safely - continue shutdown without child process cleanup
}
} }
/** /**
@@ -754,12 +777,17 @@ export class WorkerService {
return; return;
} }
if (process.platform === 'win32') { try {
// /T kills entire process tree, /F forces termination if (process.platform === 'win32') {
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 60000 }); // /T kills entire process tree, /F forces termination
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 60000 });
} else {
process.kill(pid, 'SIGKILL');
}
logger.info('SYSTEM', 'Killed process', { pid }); logger.info('SYSTEM', 'Killed process', { pid });
} else { } catch {
process.kill(pid, 'SIGKILL'); // Process may have already exited - continue shutdown
logger.debug('SYSTEM', 'Process already exited during force kill', { pid });
} }
} }
@@ -889,8 +917,14 @@ async function main() {
windowsHide: true, windowsHide: true,
env: { ...process.env, CLAUDE_MEM_WORKER_PORT: String(port) } env: { ...process.env, CLAUDE_MEM_WORKER_PORT: String(port) }
}); });
if (child.pid === undefined) {
console.error('Failed to spawn worker daemon during restart');
process.exit(1);
}
child.unref(); child.unref();
writePidFile({ pid: child.pid!, port, startedAt: new Date().toISOString() }); writePidFile({ pid: child.pid, port, startedAt: new Date().toISOString() });
const healthy = await waitForHealth(port, 30000); const healthy = await waitForHealth(port, 30000);
if (!healthy) { if (!healthy) {
removePidFile(); removePidFile();