fix(windows): improve worker stop/restart reliability (#395)
* fix(windows): enable worker logging on Windows Previously, Windows worker startup via PowerShell Start-Process did not redirect stdout/stderr to log files, making debugging startup failures impossible. This adds -RedirectStandardOutput and -RedirectStandardError to capture worker logs to ~/.claude-mem/logs/worker-YYYY-MM-DD.log. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(windows): improve worker stop/restart reliability - Use HTTP shutdown endpoint as primary stop method (worker kills itself) - Only remove PID file after confirming worker is actually dead - Remove auto-respawn from wrapper to prevent PID file mismatches - Wrapper now exits when inner worker crashes (hooks will restart) This hopefully fixes issues where npm run worker:stop would fail silently when the worker was started from hooks, leaving zombie processes. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import { spawn, spawnSync } from 'child_process';
|
||||
import { homedir } from 'os';
|
||||
import { DATA_DIR } from '../../shared/paths.js';
|
||||
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
|
||||
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
|
||||
|
||||
const PID_FILE = join(DATA_DIR, 'worker.pid');
|
||||
const LOG_DIR = join(DATA_DIR, 'logs');
|
||||
@@ -16,6 +17,7 @@ const HEALTH_CHECK_TIMEOUT_MS = 10000;
|
||||
const HEALTH_CHECK_INTERVAL_MS = 200;
|
||||
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
|
||||
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
|
||||
const HTTP_SHUTDOWN_TIMEOUT_MS = 2000;
|
||||
|
||||
interface PidInfo {
|
||||
pid: number;
|
||||
@@ -99,8 +101,9 @@ export class ProcessManager {
|
||||
const escapedBunPath = this.escapePowerShellString(bunPath);
|
||||
const escapedScript = this.escapePowerShellString(script);
|
||||
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
|
||||
const escapedLogFile = this.escapePowerShellString(logFile);
|
||||
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
|
||||
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -PassThru | Select-Object -ExpandProperty Id`;
|
||||
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;
|
||||
|
||||
const result = spawnSync('powershell', ['-Command', psCommand], {
|
||||
stdio: 'pipe',
|
||||
@@ -171,34 +174,65 @@ export class ProcessManager {
|
||||
|
||||
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
|
||||
const info = this.getPidInfo();
|
||||
if (!info) return true;
|
||||
|
||||
try {
|
||||
if (process.platform === 'win32') {
|
||||
// On Windows, use taskkill /T /F to kill entire process tree
|
||||
if (process.platform === 'win32') {
|
||||
// Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
|
||||
// because the worker shuts itself down from the inside (via wrapper IPC)
|
||||
const port = info?.port ?? this.getPortFromSettings();
|
||||
const httpShutdownSucceeded = await this.tryHttpShutdown(port);
|
||||
|
||||
if (httpShutdownSucceeded) {
|
||||
// HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
|
||||
this.removePidFile();
|
||||
return true;
|
||||
}
|
||||
|
||||
// HTTP shutdown failed (worker not responding), fall back to taskkill
|
||||
if (!info) {
|
||||
// No PID file and HTTP failed - nothing more we can do
|
||||
return true;
|
||||
}
|
||||
|
||||
const { execSync } = await import('child_process');
|
||||
try {
|
||||
// Use taskkill /T /F to kill entire process tree
|
||||
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
|
||||
// which is necessary to properly release the socket and avoid zombie ports
|
||||
const { execSync } = await import('child_process');
|
||||
try {
|
||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may already be dead
|
||||
}
|
||||
} else {
|
||||
// On Unix, use signals
|
||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may already be dead
|
||||
}
|
||||
|
||||
// Wait for process to actually exit before removing PID file
|
||||
try {
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
} catch {
|
||||
// Timeout waiting - process may still be alive
|
||||
}
|
||||
|
||||
// Only remove PID file if process is confirmed dead
|
||||
if (!this.isProcessAlive(info.pid)) {
|
||||
this.removePidFile();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// Unix: Use signals (unchanged behavior)
|
||||
if (!info) return true;
|
||||
|
||||
try {
|
||||
process.kill(info.pid, 'SIGTERM');
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
}
|
||||
} catch {
|
||||
try {
|
||||
process.kill(info.pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process already dead
|
||||
try {
|
||||
process.kill(info.pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process already dead
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.removePidFile();
|
||||
return true;
|
||||
this.removePidFile();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
|
||||
@@ -229,6 +263,66 @@ export class ProcessManager {
|
||||
return alive;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get worker port from settings file
|
||||
*/
|
||||
private static getPortFromSettings(): number {
|
||||
try {
|
||||
const settingsPath = join(DATA_DIR, 'settings.json');
|
||||
const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
|
||||
return parseInt(settings.CLAUDE_MEM_WORKER_PORT, 10);
|
||||
} catch {
|
||||
return parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to shut down the worker via HTTP endpoint
|
||||
* Returns true if shutdown succeeded, false if worker not responding
|
||||
*/
|
||||
private static async tryHttpShutdown(port: number): Promise<boolean> {
|
||||
try {
|
||||
// Send shutdown request
|
||||
const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
|
||||
method: 'POST',
|
||||
signal: AbortSignal.timeout(HTTP_SHUTDOWN_TIMEOUT_MS)
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Wait for worker to actually stop responding
|
||||
return await this.waitForWorkerDown(port, PROCESS_STOP_TIMEOUT_MS);
|
||||
} catch {
|
||||
// Worker not responding to HTTP - it may be dead or hung
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for worker to stop responding on the given port
|
||||
*/
|
||||
private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
|
||||
const startTime = Date.now();
|
||||
|
||||
while (Date.now() - startTime < timeout) {
|
||||
try {
|
||||
await fetch(`http://127.0.0.1:${port}/api/health`, {
|
||||
signal: AbortSignal.timeout(500)
|
||||
});
|
||||
// Still responding, wait and retry
|
||||
await new Promise(resolve => setTimeout(resolve, PROCESS_EXIT_CHECK_INTERVAL_MS));
|
||||
} catch {
|
||||
// Worker stopped responding - success
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
// Timeout - worker still responding
|
||||
return false;
|
||||
}
|
||||
|
||||
// Helper methods
|
||||
private static getPidInfo(): PidInfo | null {
|
||||
try {
|
||||
|
||||
@@ -3,11 +3,15 @@
|
||||
*
|
||||
* This wrapper exists to solve the Windows zombie port problem.
|
||||
* The wrapper spawns the actual worker as a child process.
|
||||
* When restart/shutdown is requested, the wrapper kills the child
|
||||
* and respawns it (or exits), ensuring clean socket cleanup.
|
||||
* When shutdown is requested, the wrapper kills the child and exits.
|
||||
* The hooks will start a fresh wrapper+worker if needed.
|
||||
*
|
||||
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
||||
* doesn't affect it.
|
||||
*
|
||||
* NOTE: The wrapper does NOT auto-restart the worker on crash.
|
||||
* This is intentional - the hooks handle startup via ensureWorkerRunning().
|
||||
* Auto-restart would cause PID file mismatches and potential infinite loops.
|
||||
*/
|
||||
|
||||
import { spawn, ChildProcess, execSync } from 'child_process';
|
||||
@@ -51,10 +55,11 @@ function spawnInner() {
|
||||
log(`Inner exited with code=${code}, signal=${signal}`);
|
||||
inner = null;
|
||||
|
||||
// If inner crashed unexpectedly (not during shutdown), respawn it
|
||||
if (!isShuttingDown && code !== 0) {
|
||||
log('Inner crashed, respawning in 1 second...');
|
||||
setTimeout(() => spawnInner(), 1000);
|
||||
// Don't auto-restart - let hooks handle it via ensureWorkerRunning()
|
||||
// Auto-restart causes PID file mismatches and potential infinite loops
|
||||
if (!isShuttingDown) {
|
||||
log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
|
||||
process.exit(code ?? 1);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user