fix(windows): improve worker stop/restart reliability (#395)

* fix(windows): enable worker logging on Windows

Previously, Windows worker startup via PowerShell Start-Process did not
redirect stdout/stderr to log files, making it impossible to debug startup
failures. This adds -RedirectStandardOutput and -RedirectStandardError so
that worker stdout is captured in ~/.claude-mem/logs/worker-YYYY-MM-DD.log
and stderr in the same path with an `.err` suffix.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(windows): improve worker stop/restart reliability

- Use HTTP shutdown endpoint as primary stop method (worker kills itself)
- Only remove PID file after confirming worker is actually dead
- Remove auto-respawn from wrapper to prevent PID file mismatches
- Wrapper now exits when inner worker crashes (hooks will restart)

This hopefully fixes issues where npm run worker:stop would fail silently when
the worker was started from hooks, leaving zombie processes.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
ToxMox
2025-12-20 16:50:32 -05:00
committed by GitHub
parent 15fe0cfe3c
commit af145cfaef
4 changed files with 133 additions and 34 deletions
File diff suppressed because one or more lines are too long
+1 -1
View File
@@ -1,2 +1,2 @@
#!/usr/bin/env bun
"use strict";var u=Object.create;var w=Object.defineProperty;var I=Object.getOwnPropertyDescriptor;var f=Object.getOwnPropertyNames;var g=Object.getPrototypeOf,k=Object.prototype.hasOwnProperty;var y=(e,i,t,o)=>{if(i&&typeof i=="object"||typeof i=="function")for(let s of f(i))!k.call(e,s)&&s!==t&&w(e,s,{get:()=>i[s],enumerable:!(o=I(i,s))||o.enumerable});return e};var P=(e,i,t)=>(t=e!=null?u(g(e)):{},y(i||!e||!e.__esModule?w(t,"default",{value:e,enumerable:!0}):t,e));var c=require("child_process"),p=P(require("path"),1),h=process.platform==="win32",x=__dirname,l=p.default.join(x,"worker-service.cjs"),n=null,a=!1;function r(e){let i=new Date().toISOString();console.log(`[${i}] [wrapper] ${e}`)}function m(){r(`Spawning inner worker: ${l}`),n=(0,c.spawn)(process.execPath,[l],{stdio:["inherit","inherit","inherit","ipc"],env:{...process.env,CLAUDE_MEM_MANAGED:"true"},cwd:p.default.dirname(l)}),n.on("message",async e=>{(e.type==="restart"||e.type==="shutdown")&&(r(`${e.type} requested by inner`),a=!0,await d(),r("Exiting wrapper"),process.exit(0))}),n.on("exit",(e,i)=>{r(`Inner exited with code=${e}, signal=${i}`),n=null,!a&&e!==0&&(r("Inner crashed, respawning in 1 second..."),setTimeout(()=>m(),1e3))}),n.on("error",e=>{r(`Inner error: ${e.message}`)})}async function d(){if(!n||!n.pid){r("No inner process to kill");return}let e=n.pid;if(r(`Killing inner process tree (pid=${e})`),h)try{(0,c.execSync)(`taskkill /PID ${e} /T /F`,{timeout:1e4,stdio:"ignore"}),r(`taskkill completed for pid=${e}`)}catch(i){r(`taskkill failed (process may be dead): ${i}`)}else{n.kill("SIGTERM");let i=new Promise(o=>{if(!n){o();return}n.on("exit",()=>o())}),t=new Promise(o=>setTimeout(()=>o(),5e3));await Promise.race([i,t]),n&&!n.killed&&(r("Inner did not exit gracefully, force killing"),n.kill("SIGKILL"))}await S(e,5e3),n=null,r("Inner process terminated")}async function S(e,i){let t=Date.now();for(;Date.now()-t<i;)try{process.kill(e,0),await new 
Promise(o=>setTimeout(o,100))}catch{return}r(`Timeout waiting for process ${e} to exit`)}process.on("SIGTERM",async()=>{r("Wrapper received SIGTERM"),a=!0,await d(),process.exit(0)});process.on("SIGINT",async()=>{r("Wrapper received SIGINT"),a=!0,await d(),process.exit(0)});r("Wrapper starting");m();
"use strict";var m=Object.create;var w=Object.defineProperty;var u=Object.getOwnPropertyDescriptor;var I=Object.getOwnPropertyNames;var f=Object.getPrototypeOf,x=Object.prototype.hasOwnProperty;var g=(e,i,n,o)=>{if(i&&typeof i=="object"||typeof i=="function")for(let s of I(i))!x.call(e,s)&&s!==n&&w(e,s,{get:()=>i[s],enumerable:!(o=u(i,s))||o.enumerable});return e};var k=(e,i,n)=>(n=e!=null?m(f(e)):{},g(i||!e||!e.__esModule?w(n,"default",{value:e,enumerable:!0}):n,e));var c=require("child_process"),p=k(require("path"),1),y=process.platform==="win32",P=__dirname,l=p.default.join(P,"worker-service.cjs"),t=null,a=!1;function r(e){let i=new Date().toISOString();console.log(`[${i}] [wrapper] ${e}`)}function h(){r(`Spawning inner worker: ${l}`),t=(0,c.spawn)(process.execPath,[l],{stdio:["inherit","inherit","inherit","ipc"],env:{...process.env,CLAUDE_MEM_MANAGED:"true"},cwd:p.default.dirname(l)}),t.on("message",async e=>{(e.type==="restart"||e.type==="shutdown")&&(r(`${e.type} requested by inner`),a=!0,await d(),r("Exiting wrapper"),process.exit(0))}),t.on("exit",(e,i)=>{r(`Inner exited with code=${e}, signal=${i}`),t=null,a||(r("Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)"),process.exit(e??1))}),t.on("error",e=>{r(`Inner error: ${e.message}`)})}async function d(){if(!t||!t.pid){r("No inner process to kill");return}let e=t.pid;if(r(`Killing inner process tree (pid=${e})`),y)try{(0,c.execSync)(`taskkill /PID ${e} /T /F`,{timeout:1e4,stdio:"ignore"}),r(`taskkill completed for pid=${e}`)}catch(i){r(`taskkill failed (process may be dead): ${i}`)}else{t.kill("SIGTERM");let i=new Promise(o=>{if(!t){o();return}t.on("exit",()=>o())}),n=new Promise(o=>setTimeout(()=>o(),5e3));await Promise.race([i,n]),t&&!t.killed&&(r("Inner did not exit gracefully, force killing"),t.kill("SIGKILL"))}await S(e,5e3),t=null,r("Inner process terminated")}async function S(e,i){let n=Date.now();for(;Date.now()-n<i;)try{process.kill(e,0),await new 
Promise(o=>setTimeout(o,100))}catch{return}r(`Timeout waiting for process ${e} to exit`)}process.on("SIGTERM",async()=>{r("Wrapper received SIGTERM"),a=!0,await d(),process.exit(0)});process.on("SIGINT",async()=>{r("Wrapper received SIGINT"),a=!0,await d(),process.exit(0)});r("Wrapper starting");h();
+115 -21
View File
@@ -5,6 +5,7 @@ import { spawn, spawnSync } from 'child_process';
import { homedir } from 'os';
import { DATA_DIR } from '../../shared/paths.js';
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
const PID_FILE = join(DATA_DIR, 'worker.pid');
const LOG_DIR = join(DATA_DIR, 'logs');
@@ -16,6 +17,7 @@ const HEALTH_CHECK_TIMEOUT_MS = 10000;
const HEALTH_CHECK_INTERVAL_MS = 200;
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
const HTTP_SHUTDOWN_TIMEOUT_MS = 2000;
interface PidInfo {
pid: number;
@@ -99,8 +101,9 @@ export class ProcessManager {
const escapedBunPath = this.escapePowerShellString(bunPath);
const escapedScript = this.escapePowerShellString(script);
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
const escapedLogFile = this.escapePowerShellString(logFile);
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -PassThru | Select-Object -ExpandProperty Id`;
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;
const result = spawnSync('powershell', ['-Command', psCommand], {
stdio: 'pipe',
@@ -171,34 +174,65 @@ export class ProcessManager {
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
const info = this.getPidInfo();
if (!info) return true;
try {
if (process.platform === 'win32') {
// On Windows, use taskkill /T /F to kill entire process tree
if (process.platform === 'win32') {
// Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
// because the worker shuts itself down from the inside (via wrapper IPC)
const port = info?.port ?? this.getPortFromSettings();
const httpShutdownSucceeded = await this.tryHttpShutdown(port);
if (httpShutdownSucceeded) {
// HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
this.removePidFile();
return true;
}
// HTTP shutdown failed (worker not responding), fall back to taskkill
if (!info) {
// No PID file and HTTP failed - nothing more we can do
return true;
}
const { execSync } = await import('child_process');
try {
// Use taskkill /T /F to kill entire process tree
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
// which is necessary to properly release the socket and avoid zombie ports
const { execSync } = await import('child_process');
try {
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
} catch {
// Process may already be dead
}
} else {
// On Unix, use signals
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
} catch {
// Process may already be dead
}
// Wait for process to actually exit before removing PID file
try {
await this.waitForExit(info.pid, timeout);
} catch {
// Timeout waiting - process may still be alive
}
// Only remove PID file if process is confirmed dead
if (!this.isProcessAlive(info.pid)) {
this.removePidFile();
}
return true;
} else {
// Unix: Use signals (unchanged behavior)
if (!info) return true;
try {
process.kill(info.pid, 'SIGTERM');
await this.waitForExit(info.pid, timeout);
}
} catch {
try {
process.kill(info.pid, 'SIGKILL');
} catch {
// Process already dead
try {
process.kill(info.pid, 'SIGKILL');
} catch {
// Process already dead
}
}
}
this.removePidFile();
return true;
this.removePidFile();
return true;
}
}
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
@@ -229,6 +263,66 @@ export class ProcessManager {
return alive;
}
/**
 * Resolve the worker port from the settings file.
 *
 * Reads `settings.json` under DATA_DIR and parses its
 * `CLAUDE_MEM_WORKER_PORT` value as a base-10 integer. If the file cannot be
 * loaded, or the stored value does not parse to a number, the default
 * `CLAUDE_MEM_WORKER_PORT` from SettingsDefaultsManager is used instead.
 *
 * @returns The port number to contact the worker on.
 */
private static getPortFromSettings(): number {
  const defaultPort = (): number =>
    parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);

  try {
    const settingsPath = join(DATA_DIR, 'settings.json');
    const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
    const port = parseInt(settings.CLAUDE_MEM_WORKER_PORT, 10);
    // parseInt yields NaN for a missing or malformed value; a NaN port would
    // only surface later as an invalid URL, so fall back to the default here.
    return Number.isNaN(port) ? defaultPort() : port;
  } catch {
    return defaultPort();
  }
}
/**
 * Ask the worker to shut itself down over HTTP.
 *
 * Sends a POST to `/api/admin/shutdown` on 127.0.0.1 at the given port,
 * aborting the request after HTTP_SHUTDOWN_TIMEOUT_MS. When the request is
 * answered with an OK status, waits (up to PROCESS_STOP_TIMEOUT_MS) for the
 * health endpoint to stop answering.
 *
 * @param port Port the worker is expected to listen on.
 * @returns true when the shutdown was accepted and the worker stopped
 *          responding in time; false for a non-OK reply, a failed or
 *          timed-out request, or a worker that kept responding.
 */
private static async tryHttpShutdown(port: number): Promise<boolean> {
  try {
    const shutdownUrl = `http://127.0.0.1:${port}/api/admin/shutdown`;
    const reply = await fetch(shutdownUrl, {
      method: 'POST',
      signal: AbortSignal.timeout(HTTP_SHUTDOWN_TIMEOUT_MS)
    });

    // Only a request the worker acknowledged is worth waiting on.
    return reply.ok
      ? await this.waitForWorkerDown(port, PROCESS_STOP_TIMEOUT_MS)
      : false;
  } catch {
    // The request threw (no answer, abort, etc.) - report failure to caller
    return false;
  }
}
/**
 * Poll the worker's health endpoint until it stops answering.
 *
 * Each probe is a fetch of `/api/health` on 127.0.0.1 at the given port,
 * aborted after 500 ms. Any probe that throws - including the 500 ms abort -
 * is taken to mean the worker is down. Successful probes are spaced by
 * PROCESS_EXIT_CHECK_INTERVAL_MS.
 *
 * @param port Port the worker is expected to listen on.
 * @param timeout Maximum time to keep polling, in milliseconds.
 * @returns true as soon as a probe fails; false if the worker was still
 *          answering when the timeout elapsed.
 */
private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
  const healthUrl = `http://127.0.0.1:${port}/api/health`;
  const began = Date.now();

  // Resolves to true while the endpoint still answers, false once a probe throws.
  const stillAnswering = async (): Promise<boolean> => {
    try {
      await fetch(healthUrl, { signal: AbortSignal.timeout(500) });
      return true;
    } catch {
      return false;
    }
  };

  while (Date.now() - began < timeout) {
    if (!(await stillAnswering())) {
      return true;
    }
    // Worker answered; pause before the next probe
    await new Promise(wake => setTimeout(wake, PROCESS_EXIT_CHECK_INTERVAL_MS));
  }

  // Ran out of time with the worker still answering
  return false;
}
// Helper methods
private static getPidInfo(): PidInfo | null {
try {
+11 -6
View File
@@ -3,11 +3,15 @@
*
* This wrapper exists to solve the Windows zombie port problem.
* The wrapper spawns the actual worker as a child process.
* When restart/shutdown is requested, the wrapper kills the child
* and respawns it (or exits), ensuring clean socket cleanup.
* When shutdown is requested, the wrapper kills the child and exits.
* The hooks will start a fresh wrapper+worker if needed.
*
* The wrapper itself has no sockets, so Bun's socket cleanup bug
* doesn't affect it.
*
* NOTE: The wrapper does NOT auto-restart the worker on crash.
* This is intentional - the hooks handle startup via ensureWorkerRunning().
* Auto-restart would cause PID file mismatches and potential infinite loops.
*/
import { spawn, ChildProcess, execSync } from 'child_process';
@@ -51,10 +55,11 @@ function spawnInner() {
log(`Inner exited with code=${code}, signal=${signal}`);
inner = null;
// If inner crashed unexpectedly (not during shutdown), respawn it
if (!isShuttingDown && code !== 0) {
log('Inner crashed, respawning in 1 second...');
setTimeout(() => spawnInner(), 1000);
// Don't auto-restart - let hooks handle it via ensureWorkerRunning()
// Auto-restart causes PID file mismatches and potential infinite loops
if (!isShuttingDown) {
log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
process.exit(code ?? 1);
}
});