fix(windows): solve zombie port problem with wrapper architecture (#372)
On Windows, Bun doesn't properly release socket handles when the worker process exits, causing "zombie ports" that remain bound even after all processes are dead. This required a system reboot to clear. Solution: Introduce a wrapper process (worker-wrapper.cjs) that: - Spawns the actual worker as a child with IPC channel - On restart/shutdown, uses `taskkill /T /F` to kill the entire process tree - Exits itself, allowing hooks to start fresh The wrapper has no sockets, so Bun's socket cleanup bug doesn't affect it. When the wrapper kills the inner worker tree and exits, the port is properly released and can be immediately reused. Key changes: - New worker-wrapper.ts for Windows process lifecycle management - ProcessManager starts wrapper on Windows, worker directly on Unix - Worker sends IPC messages to wrapper for restart/shutdown - Health endpoint now includes debug info (build ID, managed status, hasIpc) Tested: Restart API now properly releases port and new worker binds to same port. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -43,8 +43,10 @@ export class ProcessManager {
|
||||
// Ensure log directory exists
|
||||
mkdirSync(LOG_DIR, { recursive: true });
|
||||
|
||||
// Get worker script path
|
||||
const workerScript = join(MARKETPLACE_ROOT, 'plugin', 'scripts', 'worker-service.cjs');
|
||||
// On Windows, use the wrapper script to solve zombie port problem
|
||||
// On Unix, use the worker directly
|
||||
const scriptName = process.platform === 'win32' ? 'worker-wrapper.cjs' : 'worker-service.cjs';
|
||||
const workerScript = join(MARKETPLACE_ROOT, 'plugin', 'scripts', scriptName);
|
||||
|
||||
if (!existsSync(workerScript)) {
|
||||
return { success: false, error: `Worker script not found at ${workerScript}` };
|
||||
@@ -86,6 +88,10 @@ export class ProcessManager {
|
||||
// Note: windowsHide: true doesn't work with detached: true (Bun inherits Node.js process spawning semantics)
|
||||
// See: https://github.com/nodejs/node/issues/21825 and PR #315 for detailed testing
|
||||
//
|
||||
// On Windows, we start worker-wrapper.cjs which manages the actual worker-service.cjs.
|
||||
// This solves the zombie port problem: the wrapper has no sockets, so when it kills
|
||||
// and respawns the inner worker, the socket is properly released.
|
||||
//
|
||||
// Security: All paths (bunPath, script, MARKETPLACE_ROOT) are application-controlled system paths,
|
||||
// not user input. If an attacker could modify these paths, they would already have full filesystem
|
||||
// access including direct access to ~/.claude-mem/claude-mem.db. Nevertheless, we properly escape
|
||||
@@ -168,8 +174,21 @@ export class ProcessManager {
|
||||
if (!info) return true;
|
||||
|
||||
try {
|
||||
process.kill(info.pid, 'SIGTERM');
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
if (process.platform === 'win32') {
|
||||
// On Windows, use taskkill /T /F to kill entire process tree
|
||||
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
|
||||
// which is necessary to properly release the socket and avoid zombie ports
|
||||
const { execSync } = await import('child_process');
|
||||
try {
|
||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may already be dead
|
||||
}
|
||||
} else {
|
||||
// On Unix, use signals
|
||||
process.kill(info.pid, 'SIGTERM');
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
}
|
||||
} catch {
|
||||
try {
|
||||
process.kill(info.pid, 'SIGKILL');
|
||||
|
||||
+144
-19
@@ -118,8 +118,17 @@ export class WorkerService {
|
||||
*/
|
||||
private setupRoutes(): void {
|
||||
// Health check endpoint
|
||||
// TEST_BUILD_ID helps verify which build is running during debugging
|
||||
const TEST_BUILD_ID = 'TEST-008-wrapper-ipc';
|
||||
this.app.get('/api/health', (_req, res) => {
|
||||
res.status(200).json({ status: 'ok' });
|
||||
res.status(200).json({
|
||||
status: 'ok',
|
||||
build: TEST_BUILD_ID,
|
||||
managed: process.env.CLAUDE_MEM_MANAGED === 'true',
|
||||
hasIpc: typeof process.send === 'function',
|
||||
platform: process.platform,
|
||||
pid: process.pid,
|
||||
});
|
||||
});
|
||||
|
||||
// Version endpoint - returns the worker's current version
|
||||
@@ -179,18 +188,43 @@ export class WorkerService {
|
||||
// Admin endpoints for process management
|
||||
this.app.post('/api/admin/restart', async (_req, res) => {
|
||||
res.json({ status: 'restarting' });
|
||||
setTimeout(async () => {
|
||||
await this.shutdown();
|
||||
process.exit(0);
|
||||
}, 100);
|
||||
|
||||
// On Windows, if managed by wrapper, send message to parent to handle restart
|
||||
// This solves the Windows zombie port problem where sockets aren't properly released
|
||||
const isWindowsManaged = process.platform === 'win32' &&
|
||||
process.env.CLAUDE_MEM_MANAGED === 'true' &&
|
||||
process.send;
|
||||
|
||||
if (isWindowsManaged) {
|
||||
logger.info('SYSTEM', 'Sending restart request to wrapper');
|
||||
process.send!({ type: 'restart' });
|
||||
} else {
|
||||
// Unix or standalone Windows - handle restart ourselves
|
||||
setTimeout(async () => {
|
||||
await this.shutdown();
|
||||
process.exit(0);
|
||||
}, 100);
|
||||
}
|
||||
});
|
||||
|
||||
this.app.post('/api/admin/shutdown', async (_req, res) => {
|
||||
res.json({ status: 'shutting_down' });
|
||||
setTimeout(async () => {
|
||||
await this.shutdown();
|
||||
process.exit(0);
|
||||
}, 100);
|
||||
|
||||
// On Windows, if managed by wrapper, send message to parent to handle shutdown
|
||||
const isWindowsManaged = process.platform === 'win32' &&
|
||||
process.env.CLAUDE_MEM_MANAGED === 'true' &&
|
||||
process.send;
|
||||
|
||||
if (isWindowsManaged) {
|
||||
logger.info('SYSTEM', 'Sending shutdown request to wrapper');
|
||||
process.send!({ type: 'shutdown' });
|
||||
} else {
|
||||
// Unix or standalone Windows - handle shutdown ourselves
|
||||
setTimeout(async () => {
|
||||
await this.shutdown();
|
||||
process.exit(0);
|
||||
}, 100);
|
||||
}
|
||||
});
|
||||
|
||||
this.viewerRoutes.setupRoutes(this.app);
|
||||
@@ -399,12 +433,32 @@ export class WorkerService {
|
||||
|
||||
/**
|
||||
* Shutdown the worker service
|
||||
*
|
||||
* IMPORTANT: On Windows, we must kill all child processes before exiting
|
||||
* to prevent zombie ports. The socket handle can be inherited by children,
|
||||
* and if not properly closed, the port stays bound after process death.
|
||||
*/
|
||||
async shutdown(): Promise<void> {
|
||||
// Shutdown all active sessions
|
||||
logger.info('SYSTEM', 'Shutdown initiated');
|
||||
|
||||
// STEP 1: Enumerate all child processes BEFORE we start closing things
|
||||
const childPids = await this.getChildProcesses(process.pid);
|
||||
logger.info('SYSTEM', 'Found child processes', { count: childPids.length, pids: childPids });
|
||||
|
||||
// STEP 2: Close HTTP server first
|
||||
if (this.server) {
|
||||
this.server.closeAllConnections();
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
this.server!.close(err => err ? reject(err) : resolve());
|
||||
});
|
||||
this.server = null;
|
||||
logger.info('SYSTEM', 'HTTP server closed');
|
||||
}
|
||||
|
||||
// STEP 3: Shutdown active sessions
|
||||
await this.sessionManager.shutdownAll();
|
||||
|
||||
// Close MCP client connection (terminates MCP server process)
|
||||
// STEP 4: Close MCP client connection (signals child to exit gracefully)
|
||||
if (this.mcpClient) {
|
||||
try {
|
||||
await this.mcpClient.close();
|
||||
@@ -414,19 +468,90 @@ export class WorkerService {
|
||||
}
|
||||
}
|
||||
|
||||
// Close HTTP server
|
||||
if (this.server) {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
this.server!.close(err => err ? reject(err) : resolve());
|
||||
});
|
||||
}
|
||||
|
||||
// Close database connection (includes ChromaSync cleanup)
|
||||
// STEP 5: Close database connection (includes ChromaSync cleanup)
|
||||
await this.dbManager.close();
|
||||
|
||||
// STEP 6: Force kill any remaining child processes (Windows zombie port fix)
|
||||
if (childPids.length > 0) {
|
||||
logger.info('SYSTEM', 'Force killing remaining children');
|
||||
for (const pid of childPids) {
|
||||
await this.forceKillProcess(pid);
|
||||
}
|
||||
// Wait for children to fully exit
|
||||
await this.waitForProcessesExit(childPids, 5000);
|
||||
}
|
||||
|
||||
logger.info('SYSTEM', 'Worker shutdown complete');
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all child process PIDs (Windows-specific)
|
||||
*/
|
||||
private async getChildProcesses(parentPid: number): Promise<number[]> {
|
||||
if (process.platform !== 'win32') {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`;
|
||||
const { stdout } = await execAsync(cmd, { timeout: 5000 });
|
||||
return stdout
|
||||
.trim()
|
||||
.split('\n')
|
||||
.map(s => parseInt(s.trim(), 10))
|
||||
.filter(n => !isNaN(n));
|
||||
} catch (error) {
|
||||
logger.warn('SYSTEM', 'Failed to enumerate child processes', {}, error as Error);
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Force kill a process by PID (Windows: uses taskkill /F /T)
|
||||
*/
|
||||
private async forceKillProcess(pid: number): Promise<void> {
|
||||
try {
|
||||
if (process.platform === 'win32') {
|
||||
// /T kills entire process tree, /F forces termination
|
||||
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 5000 });
|
||||
logger.info('SYSTEM', 'Killed process', { pid });
|
||||
} else {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
}
|
||||
} catch (error) {
|
||||
// Process may already be dead, which is fine
|
||||
logger.debug('SYSTEM', 'Process already dead or kill failed', { pid });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for processes to fully exit
|
||||
*/
|
||||
private async waitForProcessesExit(pids: number[], timeoutMs: number): Promise<void> {
|
||||
const start = Date.now();
|
||||
|
||||
while (Date.now() - start < timeoutMs) {
|
||||
const stillAlive = pids.filter(pid => {
|
||||
try {
|
||||
process.kill(pid, 0); // Signal 0 checks if process exists
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
});
|
||||
|
||||
if (stillAlive.length === 0) {
|
||||
logger.info('SYSTEM', 'All child processes exited');
|
||||
return;
|
||||
}
|
||||
|
||||
logger.debug('SYSTEM', 'Waiting for processes to exit', { stillAlive });
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
}
|
||||
|
||||
logger.warn('SYSTEM', 'Timeout waiting for child processes to exit');
|
||||
}
|
||||
|
||||
/**
|
||||
* Summarize request body for logging
|
||||
* Used to avoid logging sensitive data or large payloads
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
/**
|
||||
* Worker Wrapper - Manages worker process lifecycle
|
||||
*
|
||||
* This wrapper exists to solve the Windows zombie port problem.
|
||||
* The wrapper spawns the actual worker as a child process.
|
||||
* When restart/shutdown is requested, the wrapper kills the child
|
||||
* and respawns it (or exits), ensuring clean socket cleanup.
|
||||
*
|
||||
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
||||
* doesn't affect it.
|
||||
*/
|
||||
|
||||
import { spawn, ChildProcess, execSync } from 'child_process';
|
||||
import path from 'path';
|
||||
|
||||
const isWindows = process.platform === 'win32';
|
||||
|
||||
const SCRIPT_DIR = __dirname;
|
||||
const INNER_SCRIPT = path.join(SCRIPT_DIR, 'worker-service.cjs');
|
||||
|
||||
let inner: ChildProcess | null = null;
|
||||
let isShuttingDown = false;
|
||||
|
||||
function log(msg: string) {
|
||||
const timestamp = new Date().toISOString();
|
||||
console.log(`[${timestamp}] [wrapper] ${msg}`);
|
||||
}
|
||||
|
||||
function spawnInner() {
|
||||
log(`Spawning inner worker: ${INNER_SCRIPT}`);
|
||||
|
||||
inner = spawn(process.execPath, [INNER_SCRIPT], {
|
||||
stdio: ['inherit', 'inherit', 'inherit', 'ipc'],
|
||||
env: { ...process.env, CLAUDE_MEM_MANAGED: 'true' },
|
||||
cwd: path.dirname(INNER_SCRIPT),
|
||||
});
|
||||
|
||||
inner.on('message', async (msg: { type: string }) => {
|
||||
if (msg.type === 'restart' || msg.type === 'shutdown') {
|
||||
// Both restart and shutdown: kill inner and exit wrapper
|
||||
// The hooks will start a fresh wrapper+inner if needed
|
||||
log(`${msg.type} requested by inner`);
|
||||
isShuttingDown = true;
|
||||
await killInner();
|
||||
log('Exiting wrapper');
|
||||
process.exit(0);
|
||||
}
|
||||
});
|
||||
|
||||
inner.on('exit', (code, signal) => {
|
||||
log(`Inner exited with code=${code}, signal=${signal}`);
|
||||
inner = null;
|
||||
|
||||
// If inner crashed unexpectedly (not during shutdown), respawn it
|
||||
if (!isShuttingDown && code !== 0) {
|
||||
log('Inner crashed, respawning in 1 second...');
|
||||
setTimeout(() => spawnInner(), 1000);
|
||||
}
|
||||
});
|
||||
|
||||
inner.on('error', (err) => {
|
||||
log(`Inner error: ${err.message}`);
|
||||
});
|
||||
}
|
||||
|
||||
async function killInner(): Promise<void> {
|
||||
if (!inner || !inner.pid) {
|
||||
log('No inner process to kill');
|
||||
return;
|
||||
}
|
||||
|
||||
const pid = inner.pid;
|
||||
log(`Killing inner process tree (pid=${pid})`);
|
||||
|
||||
if (isWindows) {
|
||||
// On Windows, use taskkill /T /F to kill entire process tree
|
||||
// This ensures all children (MCP server, ChromaSync, etc.) are killed
|
||||
// which is necessary to properly release the socket
|
||||
try {
|
||||
execSync(`taskkill /PID ${pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
log(`taskkill completed for pid=${pid}`);
|
||||
} catch (error) {
|
||||
// Process may already be dead
|
||||
log(`taskkill failed (process may be dead): ${error}`);
|
||||
}
|
||||
} else {
|
||||
// On Unix, SIGTERM then SIGKILL
|
||||
inner.kill('SIGTERM');
|
||||
|
||||
// Wait for exit with timeout
|
||||
const exitPromise = new Promise<void>(resolve => {
|
||||
if (!inner) {
|
||||
resolve();
|
||||
return;
|
||||
}
|
||||
inner.on('exit', () => resolve());
|
||||
});
|
||||
|
||||
const timeoutPromise = new Promise<void>(resolve =>
|
||||
setTimeout(() => resolve(), 5000)
|
||||
);
|
||||
|
||||
await Promise.race([exitPromise, timeoutPromise]);
|
||||
|
||||
// Force kill if still alive
|
||||
if (inner && !inner.killed) {
|
||||
log('Inner did not exit gracefully, force killing');
|
||||
inner.kill('SIGKILL');
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for the process to fully exit
|
||||
await waitForProcessExit(pid, 5000);
|
||||
|
||||
inner = null;
|
||||
log('Inner process terminated');
|
||||
}
|
||||
|
||||
async function waitForProcessExit(pid: number, timeoutMs: number): Promise<void> {
|
||||
const start = Date.now();
|
||||
|
||||
while (Date.now() - start < timeoutMs) {
|
||||
try {
|
||||
process.kill(pid, 0); // Check if process exists
|
||||
await new Promise(r => setTimeout(r, 100));
|
||||
} catch {
|
||||
// Process is dead
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
log(`Timeout waiting for process ${pid} to exit`);
|
||||
}
|
||||
|
||||
// Handle wrapper signals
|
||||
process.on('SIGTERM', async () => {
|
||||
log('Wrapper received SIGTERM');
|
||||
isShuttingDown = true;
|
||||
await killInner();
|
||||
process.exit(0);
|
||||
});
|
||||
|
||||
process.on('SIGINT', async () => {
|
||||
log('Wrapper received SIGINT');
|
||||
isShuttingDown = true;
|
||||
await killInner();
|
||||
process.exit(0);
|
||||
});
|
||||
|
||||
// Start the inner worker
|
||||
log('Wrapper starting');
|
||||
spawnInner();
|
||||
Reference in New Issue
Block a user