fix(windows): solve zombie port problem with wrapper architecture (#372)

On Windows, Bun doesn't properly release socket handles when the worker
process exits, causing "zombie ports" that remain bound even after all
processes are dead. This required a system reboot to clear.

Solution: Introduce a wrapper process (worker-wrapper.cjs) that:
- Spawns the actual worker as a child with IPC channel
- On restart/shutdown, uses `taskkill /T /F` to kill the entire process tree
- Exits itself, allowing hooks to start fresh

The wrapper has no sockets, so Bun's socket cleanup bug doesn't affect it.
When the wrapper kills the inner worker tree and exits, the port is properly
released and can be immediately reused.

Key changes:
- New worker-wrapper.ts for Windows process lifecycle management
- ProcessManager starts wrapper on Windows, worker directly on Unix
- Worker sends IPC messages to wrapper for restart/shutdown
- Health endpoint now includes debug info (build ID, managed status, hasIpc)

Tested: Restart API now properly releases port and new worker binds to same port.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
ToxMox
2025-12-17 14:45:41 -05:00
committed by GitHub
parent 1fec1e8339
commit a5bf653a47
14 changed files with 664 additions and 248 deletions
+23 -4
View File
@@ -43,8 +43,10 @@ export class ProcessManager {
// Ensure log directory exists
mkdirSync(LOG_DIR, { recursive: true });
// Get worker script path
const workerScript = join(MARKETPLACE_ROOT, 'plugin', 'scripts', 'worker-service.cjs');
// On Windows, use the wrapper script to solve zombie port problem
// On Unix, use the worker directly
const scriptName = process.platform === 'win32' ? 'worker-wrapper.cjs' : 'worker-service.cjs';
const workerScript = join(MARKETPLACE_ROOT, 'plugin', 'scripts', scriptName);
if (!existsSync(workerScript)) {
return { success: false, error: `Worker script not found at ${workerScript}` };
@@ -86,6 +88,10 @@ export class ProcessManager {
// Note: windowsHide: true doesn't work with detached: true (Bun inherits Node.js process spawning semantics)
// See: https://github.com/nodejs/node/issues/21825 and PR #315 for detailed testing
//
// On Windows, we start worker-wrapper.cjs which manages the actual worker-service.cjs.
// This solves the zombie port problem: the wrapper has no sockets, so when it kills
// and respawns the inner worker, the socket is properly released.
//
// Security: All paths (bunPath, script, MARKETPLACE_ROOT) are application-controlled system paths,
// not user input. If an attacker could modify these paths, they would already have full filesystem
// access including direct access to ~/.claude-mem/claude-mem.db. Nevertheless, we properly escape
@@ -168,8 +174,21 @@ export class ProcessManager {
if (!info) return true;
try {
process.kill(info.pid, 'SIGTERM');
await this.waitForExit(info.pid, timeout);
if (process.platform === 'win32') {
// On Windows, use taskkill /T /F to kill entire process tree
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
// which is necessary to properly release the socket and avoid zombie ports
const { execSync } = await import('child_process');
try {
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
} catch {
// Process may already be dead
}
} else {
// On Unix, use signals
process.kill(info.pid, 'SIGTERM');
await this.waitForExit(info.pid, timeout);
}
} catch {
try {
process.kill(info.pid, 'SIGKILL');
+144 -19
View File
@@ -118,8 +118,17 @@ export class WorkerService {
*/
private setupRoutes(): void {
// Health check endpoint
// TEST_BUILD_ID helps verify which build is running during debugging
const TEST_BUILD_ID = 'TEST-008-wrapper-ipc';
this.app.get('/api/health', (_req, res) => {
res.status(200).json({ status: 'ok' });
res.status(200).json({
status: 'ok',
build: TEST_BUILD_ID,
managed: process.env.CLAUDE_MEM_MANAGED === 'true',
hasIpc: typeof process.send === 'function',
platform: process.platform,
pid: process.pid,
});
});
// Version endpoint - returns the worker's current version
@@ -179,18 +188,43 @@ export class WorkerService {
// Admin endpoints for process management
this.app.post('/api/admin/restart', async (_req, res) => {
res.json({ status: 'restarting' });
setTimeout(async () => {
await this.shutdown();
process.exit(0);
}, 100);
// On Windows, if managed by wrapper, send message to parent to handle restart
// This solves the Windows zombie port problem where sockets aren't properly released
const isWindowsManaged = process.platform === 'win32' &&
process.env.CLAUDE_MEM_MANAGED === 'true' &&
process.send;
if (isWindowsManaged) {
logger.info('SYSTEM', 'Sending restart request to wrapper');
process.send!({ type: 'restart' });
} else {
// Unix or standalone Windows - handle restart ourselves
setTimeout(async () => {
await this.shutdown();
process.exit(0);
}, 100);
}
});
this.app.post('/api/admin/shutdown', async (_req, res) => {
res.json({ status: 'shutting_down' });
setTimeout(async () => {
await this.shutdown();
process.exit(0);
}, 100);
// On Windows, if managed by wrapper, send message to parent to handle shutdown
const isWindowsManaged = process.platform === 'win32' &&
process.env.CLAUDE_MEM_MANAGED === 'true' &&
process.send;
if (isWindowsManaged) {
logger.info('SYSTEM', 'Sending shutdown request to wrapper');
process.send!({ type: 'shutdown' });
} else {
// Unix or standalone Windows - handle shutdown ourselves
setTimeout(async () => {
await this.shutdown();
process.exit(0);
}, 100);
}
});
this.viewerRoutes.setupRoutes(this.app);
@@ -399,12 +433,32 @@ export class WorkerService {
/**
* Shutdown the worker service
*
* IMPORTANT: On Windows, we must kill all child processes before exiting
* to prevent zombie ports. The socket handle can be inherited by children,
* and if not properly closed, the port stays bound after process death.
*/
async shutdown(): Promise<void> {
// Shutdown all active sessions
logger.info('SYSTEM', 'Shutdown initiated');
// STEP 1: Enumerate all child processes BEFORE we start closing things
const childPids = await this.getChildProcesses(process.pid);
logger.info('SYSTEM', 'Found child processes', { count: childPids.length, pids: childPids });
// STEP 2: Close HTTP server first
if (this.server) {
this.server.closeAllConnections();
await new Promise<void>((resolve, reject) => {
this.server!.close(err => err ? reject(err) : resolve());
});
this.server = null;
logger.info('SYSTEM', 'HTTP server closed');
}
// STEP 3: Shutdown active sessions
await this.sessionManager.shutdownAll();
// Close MCP client connection (terminates MCP server process)
// STEP 4: Close MCP client connection (signals child to exit gracefully)
if (this.mcpClient) {
try {
await this.mcpClient.close();
@@ -414,19 +468,90 @@ export class WorkerService {
}
}
// Close HTTP server
if (this.server) {
await new Promise<void>((resolve, reject) => {
this.server!.close(err => err ? reject(err) : resolve());
});
}
// Close database connection (includes ChromaSync cleanup)
// STEP 5: Close database connection (includes ChromaSync cleanup)
await this.dbManager.close();
// STEP 6: Force kill any remaining child processes (Windows zombie port fix)
if (childPids.length > 0) {
logger.info('SYSTEM', 'Force killing remaining children');
for (const pid of childPids) {
await this.forceKillProcess(pid);
}
// Wait for children to fully exit
await this.waitForProcessesExit(childPids, 5000);
}
logger.info('SYSTEM', 'Worker shutdown complete');
}
/**
* Get all child process PIDs (Windows-specific)
*/
private async getChildProcesses(parentPid: number): Promise<number[]> {
if (process.platform !== 'win32') {
return [];
}
try {
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`;
const { stdout } = await execAsync(cmd, { timeout: 5000 });
return stdout
.trim()
.split('\n')
.map(s => parseInt(s.trim(), 10))
.filter(n => !isNaN(n));
} catch (error) {
logger.warn('SYSTEM', 'Failed to enumerate child processes', {}, error as Error);
return [];
}
}
/**
* Force kill a process by PID (Windows: uses taskkill /F /T)
*/
private async forceKillProcess(pid: number): Promise<void> {
try {
if (process.platform === 'win32') {
// /T kills entire process tree, /F forces termination
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 5000 });
logger.info('SYSTEM', 'Killed process', { pid });
} else {
process.kill(pid, 'SIGKILL');
}
} catch (error) {
// Process may already be dead, which is fine
logger.debug('SYSTEM', 'Process already dead or kill failed', { pid });
}
}
/**
* Wait for processes to fully exit
*/
private async waitForProcessesExit(pids: number[], timeoutMs: number): Promise<void> {
const start = Date.now();
while (Date.now() - start < timeoutMs) {
const stillAlive = pids.filter(pid => {
try {
process.kill(pid, 0); // Signal 0 checks if process exists
return true;
} catch {
return false;
}
});
if (stillAlive.length === 0) {
logger.info('SYSTEM', 'All child processes exited');
return;
}
logger.debug('SYSTEM', 'Waiting for processes to exit', { stillAlive });
await new Promise(r => setTimeout(r, 100));
}
logger.warn('SYSTEM', 'Timeout waiting for child processes to exit');
}
/**
* Summarize request body for logging
* Used to avoid logging sensitive data or large payloads
+152
View File
@@ -0,0 +1,152 @@
/**
* Worker Wrapper - Manages worker process lifecycle
*
* This wrapper exists to solve the Windows zombie port problem.
* The wrapper spawns the actual worker as a child process.
* When restart/shutdown is requested, the wrapper kills the child
* and respawns it (or exits), ensuring clean socket cleanup.
*
* The wrapper itself has no sockets, so Bun's socket cleanup bug
* doesn't affect it.
*/
import { spawn, ChildProcess, execSync } from 'child_process';
import path from 'path';
const isWindows = process.platform === 'win32';
const SCRIPT_DIR = __dirname;
const INNER_SCRIPT = path.join(SCRIPT_DIR, 'worker-service.cjs');
let inner: ChildProcess | null = null;
let isShuttingDown = false;
function log(msg: string) {
const timestamp = new Date().toISOString();
console.log(`[${timestamp}] [wrapper] ${msg}`);
}
function spawnInner() {
log(`Spawning inner worker: ${INNER_SCRIPT}`);
inner = spawn(process.execPath, [INNER_SCRIPT], {
stdio: ['inherit', 'inherit', 'inherit', 'ipc'],
env: { ...process.env, CLAUDE_MEM_MANAGED: 'true' },
cwd: path.dirname(INNER_SCRIPT),
});
inner.on('message', async (msg: { type: string }) => {
if (msg.type === 'restart' || msg.type === 'shutdown') {
// Both restart and shutdown: kill inner and exit wrapper
// The hooks will start a fresh wrapper+inner if needed
log(`${msg.type} requested by inner`);
isShuttingDown = true;
await killInner();
log('Exiting wrapper');
process.exit(0);
}
});
inner.on('exit', (code, signal) => {
log(`Inner exited with code=${code}, signal=${signal}`);
inner = null;
// If inner crashed unexpectedly (not during shutdown), respawn it
if (!isShuttingDown && code !== 0) {
log('Inner crashed, respawning in 1 second...');
setTimeout(() => spawnInner(), 1000);
}
});
inner.on('error', (err) => {
log(`Inner error: ${err.message}`);
});
}
async function killInner(): Promise<void> {
if (!inner || !inner.pid) {
log('No inner process to kill');
return;
}
const pid = inner.pid;
log(`Killing inner process tree (pid=${pid})`);
if (isWindows) {
// On Windows, use taskkill /T /F to kill entire process tree
// This ensures all children (MCP server, ChromaSync, etc.) are killed
// which is necessary to properly release the socket
try {
execSync(`taskkill /PID ${pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
log(`taskkill completed for pid=${pid}`);
} catch (error) {
// Process may already be dead
log(`taskkill failed (process may be dead): ${error}`);
}
} else {
// On Unix, SIGTERM then SIGKILL
inner.kill('SIGTERM');
// Wait for exit with timeout
const exitPromise = new Promise<void>(resolve => {
if (!inner) {
resolve();
return;
}
inner.on('exit', () => resolve());
});
const timeoutPromise = new Promise<void>(resolve =>
setTimeout(() => resolve(), 5000)
);
await Promise.race([exitPromise, timeoutPromise]);
// Force kill if still alive
if (inner && !inner.killed) {
log('Inner did not exit gracefully, force killing');
inner.kill('SIGKILL');
}
}
// Wait for the process to fully exit
await waitForProcessExit(pid, 5000);
inner = null;
log('Inner process terminated');
}
async function waitForProcessExit(pid: number, timeoutMs: number): Promise<void> {
const start = Date.now();
while (Date.now() - start < timeoutMs) {
try {
process.kill(pid, 0); // Check if process exists
await new Promise(r => setTimeout(r, 100));
} catch {
// Process is dead
return;
}
}
log(`Timeout waiting for process ${pid} to exit`);
}
// Handle wrapper signals
process.on('SIGTERM', async () => {
log('Wrapper received SIGTERM');
isShuttingDown = true;
await killInner();
process.exit(0);
});
process.on('SIGINT', async () => {
log('Wrapper received SIGINT');
isShuttingDown = true;
await killInner();
process.exit(0);
});
// Start the inner worker
log('Wrapper starting');
spawnInner();