fix(windows): improve worker stop/restart reliability (#395)

* fix(windows): enable worker logging on Windows

Previously, Windows worker startup via PowerShell Start-Process did not
redirect stdout/stderr to log files, making debugging startup failures
impossible. This adds -RedirectStandardOutput and -RedirectStandardError
to capture worker logs to ~/.claude-mem/logs/worker-YYYY-MM-DD.log.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(windows): improve worker stop/restart reliability

- Use HTTP shutdown endpoint as primary stop method (worker kills itself)
- Only remove PID file after confirming worker is actually dead
- Remove auto-respawn from wrapper to prevent PID file mismatches
- Wrapper now exits when inner worker crashes (hooks will restart)

This hopefully fixes issues where npm run worker:stop would fail silently when
the worker was started from hooks, leaving zombie processes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
ToxMox
2025-12-20 16:50:32 -05:00
committed by GitHub
parent 15fe0cfe3c
commit af145cfaef
4 changed files with 133 additions and 34 deletions
+115 -21
View File
@@ -5,6 +5,7 @@ import { spawn, spawnSync } from 'child_process';
import { homedir } from 'os';
import { DATA_DIR } from '../../shared/paths.js';
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
const PID_FILE = join(DATA_DIR, 'worker.pid');
const LOG_DIR = join(DATA_DIR, 'logs');
@@ -16,6 +17,7 @@ const HEALTH_CHECK_TIMEOUT_MS = 10000;
const HEALTH_CHECK_INTERVAL_MS = 200;
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
const HTTP_SHUTDOWN_TIMEOUT_MS = 2000;
interface PidInfo {
pid: number;
@@ -99,8 +101,9 @@ export class ProcessManager {
const escapedBunPath = this.escapePowerShellString(bunPath);
const escapedScript = this.escapePowerShellString(script);
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
const escapedLogFile = this.escapePowerShellString(logFile);
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -PassThru | Select-Object -ExpandProperty Id`;
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;
const result = spawnSync('powershell', ['-Command', psCommand], {
stdio: 'pipe',
@@ -171,34 +174,65 @@ export class ProcessManager {
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
const info = this.getPidInfo();
if (!info) return true;
try {
if (process.platform === 'win32') {
// On Windows, use taskkill /T /F to kill entire process tree
if (process.platform === 'win32') {
// Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
// because the worker shuts itself down from the inside (via wrapper IPC)
const port = info?.port ?? this.getPortFromSettings();
const httpShutdownSucceeded = await this.tryHttpShutdown(port);
if (httpShutdownSucceeded) {
// HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
this.removePidFile();
return true;
}
// HTTP shutdown failed (worker not responding), fall back to taskkill
if (!info) {
// No PID file and HTTP failed - nothing more we can do
return true;
}
const { execSync } = await import('child_process');
try {
// Use taskkill /T /F to kill entire process tree
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
// which is necessary to properly release the socket and avoid zombie ports
const { execSync } = await import('child_process');
try {
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
} catch {
// Process may already be dead
}
} else {
// On Unix, use signals
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
} catch {
// Process may already be dead
}
// Wait for process to actually exit before removing PID file
try {
await this.waitForExit(info.pid, timeout);
} catch {
// Timeout waiting - process may still be alive
}
// Only remove PID file if process is confirmed dead
if (!this.isProcessAlive(info.pid)) {
this.removePidFile();
}
return true;
} else {
// Unix: Use signals (unchanged behavior)
if (!info) return true;
try {
process.kill(info.pid, 'SIGTERM');
await this.waitForExit(info.pid, timeout);
}
} catch {
try {
process.kill(info.pid, 'SIGKILL');
} catch {
// Process already dead
try {
process.kill(info.pid, 'SIGKILL');
} catch {
// Process already dead
}
}
}
this.removePidFile();
return true;
this.removePidFile();
return true;
}
}
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
@@ -229,6 +263,66 @@ export class ProcessManager {
return alive;
}
/**
* Get worker port from settings file
*/
private static getPortFromSettings(): number {
try {
const settingsPath = join(DATA_DIR, 'settings.json');
const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
return parseInt(settings.CLAUDE_MEM_WORKER_PORT, 10);
} catch {
return parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);
}
}
/**
* Try to shut down the worker via HTTP endpoint
* Returns true if shutdown succeeded, false if worker not responding
*/
private static async tryHttpShutdown(port: number): Promise<boolean> {
try {
// Send shutdown request
const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
method: 'POST',
signal: AbortSignal.timeout(HTTP_SHUTDOWN_TIMEOUT_MS)
});
if (!response.ok) {
return false;
}
// Wait for worker to actually stop responding
return await this.waitForWorkerDown(port, PROCESS_STOP_TIMEOUT_MS);
} catch {
// Worker not responding to HTTP - it may be dead or hung
return false;
}
}
/**
* Wait for worker to stop responding on the given port
*/
private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
const startTime = Date.now();
while (Date.now() - startTime < timeout) {
try {
await fetch(`http://127.0.0.1:${port}/api/health`, {
signal: AbortSignal.timeout(500)
});
// Still responding, wait and retry
await new Promise(resolve => setTimeout(resolve, PROCESS_EXIT_CHECK_INTERVAL_MS));
} catch {
// Worker stopped responding - success
return true;
}
}
// Timeout - worker still responding
return false;
}
// Helper methods
private static getPidInfo(): PidInfo | null {
try {
+11 -6
View File
@@ -3,11 +3,15 @@
*
* This wrapper exists to solve the Windows zombie port problem.
* The wrapper spawns the actual worker as a child process.
* When restart/shutdown is requested, the wrapper kills the child
* and respawns it (or exits), ensuring clean socket cleanup.
* When shutdown is requested, the wrapper kills the child and exits.
* The hooks will start a fresh wrapper+worker if needed.
*
* The wrapper itself has no sockets, so Bun's socket cleanup bug
* doesn't affect it.
*
* NOTE: The wrapper does NOT auto-restart the worker on crash.
* This is intentional - the hooks handle startup via ensureWorkerRunning().
* Auto-restart would cause PID file mismatches and potential infinite loops.
*/
import { spawn, ChildProcess, execSync } from 'child_process';
@@ -51,10 +55,11 @@ function spawnInner() {
log(`Inner exited with code=${code}, signal=${signal}`);
inner = null;
// If inner crashed unexpectedly (not during shutdown), respawn it
if (!isShuttingDown && code !== 0) {
log('Inner crashed, respawning in 1 second...');
setTimeout(() => spawnInner(), 1000);
// Don't auto-restart - let hooks handle it via ensureWorkerRunning()
// Auto-restart causes PID file mismatches and potential infinite loops
if (!isShuttingDown) {
log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
process.exit(code ?? 1);
}
});