fix(windows): improve worker stop/restart reliability (#395)
* fix(windows): enable worker logging on Windows Previously, Windows worker startup via PowerShell Start-Process did not redirect stdout/stderr to log files, making debugging startup failures impossible. This adds -RedirectStandardOutput and -RedirectStandardError to capture worker logs to ~/.claude-mem/logs/worker-YYYY-MM-DD.log. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix(windows): improve worker stop/restart reliability - Use HTTP shutdown endpoint as primary stop method (worker kills itself) - Only remove PID file after confirming worker is actually dead - Remove auto-respawn from wrapper to prevent PID file mismatches - Wrapper now exits when inner worker crashes (hooks will restart) This hopefully fixes issues where npm run worker:stop would fail silently when the worker was started from hooks, leaving zombie processes. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,2 +1,2 @@
|
||||
#!/usr/bin/env bun
|
||||
"use strict";
/**
 * Worker wrapper (pre-change build: respawns the inner worker on crash).
 *
 * Spawns `worker-service.cjs` as a child process with an IPC channel, relays
 * restart/shutdown requests coming from the child, and kills the child's
 * process tree before exiting.
 */
const { spawn, execSync } = require("child_process");
const path = require("path");

const IS_WINDOWS = process.platform === "win32";
const INNER_SCRIPT = path.join(__dirname, "worker-service.cjs");

/** Currently running inner worker, or null when none is alive. */
let inner = null;
/** Set once a deliberate shutdown started, so the exit handler does not respawn. */
let isShuttingDown = false;

/**
 * Write a timestamped wrapper log line to stdout.
 * @param {string} message - text to log
 */
function log(message) {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] [wrapper] ${message}`);
}

/**
 * Spawn the inner worker and wire up its message/exit/error handlers.
 */
function spawnInner() {
  log(`Spawning inner worker: ${INNER_SCRIPT}`);
  inner = spawn(process.execPath, [INNER_SCRIPT], {
    stdio: ["inherit", "inherit", "inherit", "ipc"],
    env: { ...process.env, CLAUDE_MEM_MANAGED: "true" },
    cwd: path.dirname(INNER_SCRIPT),
  });

  inner.on("message", async (msg) => {
    if (msg.type === "restart" || msg.type === "shutdown") {
      log(`${msg.type} requested by inner`);
      isShuttingDown = true;
      await killInner();
      log("Exiting wrapper");
      process.exit(0);
    }
  });

  inner.on("exit", (code, signal) => {
    log(`Inner exited with code=${code}, signal=${signal}`);
    inner = null;
    // Respawn only on an abnormal exit that was not requested by us.
    if (!isShuttingDown && code !== 0) {
      log("Inner crashed, respawning in 1 second...");
      setTimeout(() => spawnInner(), 1000);
    }
  });

  inner.on("error", (err) => {
    log(`Inner error: ${err.message}`);
  });
}

/**
 * Terminate the inner worker and wait (up to 5 s) for its pid to disappear.
 *
 * Windows: `taskkill /T /F` on the whole process tree.
 * Elsewhere: SIGTERM, then SIGKILL if the child has not exited after 5 s.
 */
async function killInner() {
  if (!inner || !inner.pid) {
    log("No inner process to kill");
    return;
  }

  const child = inner;
  const pid = child.pid;
  log(`Killing inner process tree (pid=${pid})`);

  if (IS_WINDOWS) {
    try {
      execSync(`taskkill /PID ${pid} /T /F`, { timeout: 1e4, stdio: "ignore" });
      log(`taskkill completed for pid=${pid}`);
    } catch (err) {
      log(`taskkill failed (process may be dead): ${err}`);
    }
  } else {
    child.kill("SIGTERM");
    const exited = new Promise((resolve) => child.once("exit", () => resolve()));
    const graceExpired = new Promise((resolve) => setTimeout(() => resolve(), 5e3));
    await Promise.race([exited, graceExpired]);

    // `child.killed` only says a signal was *sent* (true right after SIGTERM),
    // so it cannot be used to decide whether the process is still running.
    // The exit handler nulls `inner`; exitCode/signalCode stay null until exit.
    const stillRunning =
      inner === child && child.exitCode == null && child.signalCode == null;
    if (stillRunning) {
      log("Inner did not exit gracefully, force killing");
      child.kill("SIGKILL");
    }
  }

  await waitForProcessExit(pid, 5e3);
  inner = null;
  log("Inner process terminated");
}

/**
 * Poll every 100 ms until `process.kill(pid, 0)` throws (pid gone) or the
 * timeout elapses, in which case a timeout line is logged.
 * @param {number} pid - process id to watch
 * @param {number} timeoutMs - maximum time to wait in milliseconds
 */
async function waitForProcessExit(pid, timeoutMs) {
  const startedAt = Date.now();
  while (Date.now() - startedAt < timeoutMs) {
    try {
      process.kill(pid, 0);
      await new Promise((resolve) => setTimeout(resolve, 100));
    } catch {
      return;
    }
  }
  log(`Timeout waiting for process ${pid} to exit`);
}

process.on("SIGTERM", async () => {
  log("Wrapper received SIGTERM");
  isShuttingDown = true;
  await killInner();
  process.exit(0);
});

process.on("SIGINT", async () => {
  log("Wrapper received SIGINT");
  isShuttingDown = true;
  await killInner();
  process.exit(0);
});

log("Wrapper starting");
spawnInner();
|
||||
"use strict";
/**
 * Worker wrapper (post-change build: no auto-respawn).
 *
 * Spawns `worker-service.cjs` as a child process with an IPC channel, relays
 * restart/shutdown requests coming from the child, and kills the child's
 * process tree before exiting. If the child exits on its own, the wrapper
 * exits too with the child's exit code (or 1 when there is none).
 */
const { spawn, execSync } = require("child_process");
const path = require("path");

const IS_WINDOWS = process.platform === "win32";
const INNER_SCRIPT = path.join(__dirname, "worker-service.cjs");

/** Currently running inner worker, or null when none is alive. */
let inner = null;
/** Set once a deliberate shutdown started, so the exit handler stays passive. */
let isShuttingDown = false;

/**
 * Write a timestamped wrapper log line to stdout.
 * @param {string} message - text to log
 */
function log(message) {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] [wrapper] ${message}`);
}

/**
 * Spawn the inner worker and wire up its message/exit/error handlers.
 */
function spawnInner() {
  log(`Spawning inner worker: ${INNER_SCRIPT}`);
  inner = spawn(process.execPath, [INNER_SCRIPT], {
    stdio: ["inherit", "inherit", "inherit", "ipc"],
    env: { ...process.env, CLAUDE_MEM_MANAGED: "true" },
    cwd: path.dirname(INNER_SCRIPT),
  });

  inner.on("message", async (msg) => {
    if (msg.type === "restart" || msg.type === "shutdown") {
      log(`${msg.type} requested by inner`);
      isShuttingDown = true;
      await killInner();
      log("Exiting wrapper");
      process.exit(0);
    }
  });

  inner.on("exit", (code, signal) => {
    log(`Inner exited with code=${code}, signal=${signal}`);
    inner = null;
    // No respawn here: an unexpected exit takes the wrapper down with it.
    if (!isShuttingDown) {
      log("Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)");
      process.exit(code ?? 1);
    }
  });

  inner.on("error", (err) => {
    log(`Inner error: ${err.message}`);
  });
}

/**
 * Terminate the inner worker and wait (up to 5 s) for its pid to disappear.
 *
 * Windows: `taskkill /T /F` on the whole process tree.
 * Elsewhere: SIGTERM, then SIGKILL if the child has not exited after 5 s.
 */
async function killInner() {
  if (!inner || !inner.pid) {
    log("No inner process to kill");
    return;
  }

  const child = inner;
  const pid = child.pid;
  log(`Killing inner process tree (pid=${pid})`);

  if (IS_WINDOWS) {
    try {
      execSync(`taskkill /PID ${pid} /T /F`, { timeout: 1e4, stdio: "ignore" });
      log(`taskkill completed for pid=${pid}`);
    } catch (err) {
      log(`taskkill failed (process may be dead): ${err}`);
    }
  } else {
    child.kill("SIGTERM");
    const exited = new Promise((resolve) => child.once("exit", () => resolve()));
    const graceExpired = new Promise((resolve) => setTimeout(() => resolve(), 5e3));
    await Promise.race([exited, graceExpired]);

    // `child.killed` only says a signal was *sent* (true right after SIGTERM),
    // so it cannot be used to decide whether the process is still running.
    // The exit handler nulls `inner`; exitCode/signalCode stay null until exit.
    const stillRunning =
      inner === child && child.exitCode == null && child.signalCode == null;
    if (stillRunning) {
      log("Inner did not exit gracefully, force killing");
      child.kill("SIGKILL");
    }
  }

  await waitForProcessExit(pid, 5e3);
  inner = null;
  log("Inner process terminated");
}

/**
 * Poll every 100 ms until `process.kill(pid, 0)` throws (pid gone) or the
 * timeout elapses, in which case a timeout line is logged.
 * @param {number} pid - process id to watch
 * @param {number} timeoutMs - maximum time to wait in milliseconds
 */
async function waitForProcessExit(pid, timeoutMs) {
  const startedAt = Date.now();
  while (Date.now() - startedAt < timeoutMs) {
    try {
      process.kill(pid, 0);
      await new Promise((resolve) => setTimeout(resolve, 100));
    } catch {
      return;
    }
  }
  log(`Timeout waiting for process ${pid} to exit`);
}

process.on("SIGTERM", async () => {
  log("Wrapper received SIGTERM");
  isShuttingDown = true;
  await killInner();
  process.exit(0);
});

process.on("SIGINT", async () => {
  log("Wrapper received SIGINT");
  isShuttingDown = true;
  await killInner();
  process.exit(0);
});

log("Wrapper starting");
spawnInner();
|
||||
|
||||
@@ -5,6 +5,7 @@ import { spawn, spawnSync } from 'child_process';
|
||||
import { homedir } from 'os';
|
||||
import { DATA_DIR } from '../../shared/paths.js';
|
||||
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
|
||||
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
|
||||
|
||||
const PID_FILE = join(DATA_DIR, 'worker.pid');
|
||||
const LOG_DIR = join(DATA_DIR, 'logs');
|
||||
@@ -16,6 +17,7 @@ const HEALTH_CHECK_TIMEOUT_MS = 10000;
|
||||
const HEALTH_CHECK_INTERVAL_MS = 200;
|
||||
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
|
||||
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
|
||||
const HTTP_SHUTDOWN_TIMEOUT_MS = 2000;
|
||||
|
||||
interface PidInfo {
|
||||
pid: number;
|
||||
@@ -99,8 +101,9 @@ export class ProcessManager {
|
||||
const escapedBunPath = this.escapePowerShellString(bunPath);
|
||||
const escapedScript = this.escapePowerShellString(script);
|
||||
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
|
||||
const escapedLogFile = this.escapePowerShellString(logFile);
|
||||
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
|
||||
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -PassThru | Select-Object -ExpandProperty Id`;
|
||||
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;
|
||||
|
||||
const result = spawnSync('powershell', ['-Command', psCommand], {
|
||||
stdio: 'pipe',
|
||||
@@ -171,34 +174,65 @@ export class ProcessManager {
|
||||
|
||||
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
|
||||
const info = this.getPidInfo();
|
||||
if (!info) return true;
|
||||
|
||||
try {
|
||||
if (process.platform === 'win32') {
|
||||
// On Windows, use taskkill /T /F to kill entire process tree
|
||||
if (process.platform === 'win32') {
|
||||
// Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
|
||||
// because the worker shuts itself down from the inside (via wrapper IPC)
|
||||
const port = info?.port ?? this.getPortFromSettings();
|
||||
const httpShutdownSucceeded = await this.tryHttpShutdown(port);
|
||||
|
||||
if (httpShutdownSucceeded) {
|
||||
// HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
|
||||
this.removePidFile();
|
||||
return true;
|
||||
}
|
||||
|
||||
// HTTP shutdown failed (worker not responding), fall back to taskkill
|
||||
if (!info) {
|
||||
// No PID file and HTTP failed - nothing more we can do
|
||||
return true;
|
||||
}
|
||||
|
||||
const { execSync } = await import('child_process');
|
||||
try {
|
||||
// Use taskkill /T /F to kill entire process tree
|
||||
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
|
||||
// which is necessary to properly release the socket and avoid zombie ports
|
||||
const { execSync } = await import('child_process');
|
||||
try {
|
||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may already be dead
|
||||
}
|
||||
} else {
|
||||
// On Unix, use signals
|
||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may already be dead
|
||||
}
|
||||
|
||||
// Wait for process to actually exit before removing PID file
|
||||
try {
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
} catch {
|
||||
// Timeout waiting - process may still be alive
|
||||
}
|
||||
|
||||
// Only remove PID file if process is confirmed dead
|
||||
if (!this.isProcessAlive(info.pid)) {
|
||||
this.removePidFile();
|
||||
}
|
||||
return true;
|
||||
} else {
|
||||
// Unix: Use signals (unchanged behavior)
|
||||
if (!info) return true;
|
||||
|
||||
try {
|
||||
process.kill(info.pid, 'SIGTERM');
|
||||
await this.waitForExit(info.pid, timeout);
|
||||
}
|
||||
} catch {
|
||||
try {
|
||||
process.kill(info.pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process already dead
|
||||
try {
|
||||
process.kill(info.pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process already dead
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
this.removePidFile();
|
||||
return true;
|
||||
this.removePidFile();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
|
||||
@@ -229,6 +263,66 @@ export class ProcessManager {
|
||||
return alive;
|
||||
}
|
||||
|
||||
/**
 * Get the worker port from the settings file.
 *
 * Reads `CLAUDE_MEM_WORKER_PORT` from `settings.json` under DATA_DIR. Falls
 * back to the `SettingsDefaultsManager` default when the file cannot be
 * loaded or when the stored value does not parse as a number.
 *
 * @returns the configured port, or the default port as a fallback
 */
private static getPortFromSettings(): number {
  try {
    const settingsPath = join(DATA_DIR, 'settings.json');
    const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
    const port = Number.parseInt(settings.CLAUDE_MEM_WORKER_PORT, 10);
    // parseInt never throws: a missing or garbled value yields NaN, which
    // would otherwise end up in the shutdown URL. Treat it like a load failure.
    if (!Number.isNaN(port)) {
      return port;
    }
  } catch {
    // Settings file missing or unreadable - use the default below
  }
  return Number.parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);
}
|
||||
|
||||
/**
 * Ask the worker to shut itself down through its HTTP admin endpoint.
 *
 * POSTs to `/api/admin/shutdown` on 127.0.0.1 (aborted after
 * HTTP_SHUTDOWN_TIMEOUT_MS). On an OK response, waits up to
 * PROCESS_STOP_TIMEOUT_MS for the worker to stop answering.
 *
 * @param port - port the worker is expected to listen on
 * @returns true when the request was accepted and the worker stopped
 *          responding; false on a non-OK response, on any request error,
 *          or when the worker is still responding after the wait
 */
private static async tryHttpShutdown(port: number): Promise<boolean> {
  const shutdownUrl = `http://127.0.0.1:${port}/api/admin/shutdown`;

  try {
    const { ok } = await fetch(shutdownUrl, {
      method: 'POST',
      signal: AbortSignal.timeout(HTTP_SHUTDOWN_TIMEOUT_MS)
    });

    // Request accepted: the result is whether the worker really goes away
    return ok
      ? await this.waitForWorkerDown(port, PROCESS_STOP_TIMEOUT_MS)
      : false;
  } catch {
    // No usable HTTP answer - worker may be dead or hung
    return false;
  }
}
|
||||
|
||||
/**
 * Poll the worker's health endpoint until it stops responding.
 *
 * Each probe is a GET to `/api/health` on 127.0.0.1, aborted after 500 ms.
 * Any probe failure (including the abort) counts as "worker is down".
 *
 * @param port - port the worker listens on
 * @param timeout - maximum time to keep polling, in milliseconds
 * @returns true as soon as a probe fails; false if the worker is still
 *          answering when the timeout elapses
 */
private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
  const healthUrl = `http://127.0.0.1:${port}/api/health`;
  const deadline = Date.now() + timeout;

  while (Date.now() < deadline) {
    let responded: boolean;
    try {
      await fetch(healthUrl, { signal: AbortSignal.timeout(500) });
      responded = true;
    } catch {
      responded = false;
    }

    if (!responded) {
      // Probe failed - worker no longer answers
      return true;
    }

    // Worker still alive: pause before the next probe
    await new Promise(resolve => setTimeout(resolve, PROCESS_EXIT_CHECK_INTERVAL_MS));
  }

  // Deadline reached while the worker kept answering
  return false;
}
|
||||
|
||||
// Helper methods
|
||||
private static getPidInfo(): PidInfo | null {
|
||||
try {
|
||||
|
||||
@@ -3,11 +3,15 @@
|
||||
*
|
||||
* This wrapper exists to solve the Windows zombie port problem.
|
||||
* The wrapper spawns the actual worker as a child process.
|
||||
* When restart/shutdown is requested, the wrapper kills the child
|
||||
* and respawns it (or exits), ensuring clean socket cleanup.
|
||||
* When shutdown is requested, the wrapper kills the child and exits.
|
||||
* The hooks will start a fresh wrapper+worker if needed.
|
||||
*
|
||||
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
||||
* doesn't affect it.
|
||||
*
|
||||
* NOTE: The wrapper does NOT auto-restart the worker on crash.
|
||||
* This is intentional - the hooks handle startup via ensureWorkerRunning().
|
||||
* Auto-restart would cause PID file mismatches and potential infinite loops.
|
||||
*/
|
||||
|
||||
import { spawn, ChildProcess, execSync } from 'child_process';
|
||||
@@ -51,10 +55,11 @@ function spawnInner() {
|
||||
log(`Inner exited with code=${code}, signal=${signal}`);
|
||||
inner = null;
|
||||
|
||||
// If inner crashed unexpectedly (not during shutdown), respawn it
|
||||
if (!isShuttingDown && code !== 0) {
|
||||
log('Inner crashed, respawning in 1 second...');
|
||||
setTimeout(() => spawnInner(), 1000);
|
||||
// Don't auto-restart - let hooks handle it via ensureWorkerRunning()
|
||||
// Auto-restart causes PID file mismatches and potential infinite loops
|
||||
if (!isShuttingDown) {
|
||||
log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
|
||||
process.exit(code ?? 1);
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
Reference in New Issue
Block a user