fix(windows): improve worker stop/restart reliability (#395)
* fix(windows): enable worker logging on Windows

  Previously, Windows worker startup via PowerShell Start-Process did not redirect stdout/stderr to log files, which made startup failures impossible to debug. This adds -RedirectStandardOutput and -RedirectStandardError so that worker stdout is captured in ~/.claude-mem/logs/worker-YYYY-MM-DD.log and stderr in the same path with an `.err` suffix.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(windows): improve worker stop/restart reliability

  - Use the HTTP shutdown endpoint as the primary stop method (the worker kills itself)
  - Only remove the PID file after confirming the worker is actually dead
  - Remove auto-respawn from the wrapper to prevent PID file mismatches
  - The wrapper now exits when the inner worker crashes (hooks will restart it)

  This is intended to fix cases where `npm run worker:stop` failed silently when the worker had been started from hooks, leaving zombie processes.

  🤖 Generated with [Claude Code](https://claude.com/claude-code)

  Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
File diff suppressed because one or more lines are too long
@@ -1,2 +1,2 @@
|
|||||||
#!/usr/bin/env bun
|
#!/usr/bin/env bun
|
||||||
/*
 * Minified wrapper bundle (previous build, left-hand side of the diff).
 *
 * What the code below does:
 *  - Spawns `worker-service.cjs` (resolved next to this file) with
 *    process.execPath, inherited stdio plus an IPC channel, and
 *    CLAUDE_MEM_MANAGED="true" in the child's environment.
 *  - On an IPC message whose `type` is "restart" or "shutdown", and on
 *    SIGTERM/SIGINT, it kills the inner process and exits with code 0.
 *  - Killing uses `taskkill /PID <pid> /T /F` on win32; elsewhere it sends
 *    SIGTERM, waits up to 5 s for the "exit" event, then may send SIGKILL.
 *    Afterwards it polls `process.kill(pid, 0)` every 100 ms for up to 5 s.
 *  - If the inner process exits with a non-zero (or null) code while no
 *    shutdown is in progress, it is respawned after 1 second.
 *
 * NOTE(review): the SIGKILL escalation is gated on `!n.killed`. Node's
 * child_process docs say `killed` becomes true once kill() has successfully
 * sent a signal, so after the SIGTERM above this branch looks unreachable
 * for a child that is still running — confirm against the runtime in use.
 */
"use strict";var u=Object.create;var w=Object.defineProperty;var I=Object.getOwnPropertyDescriptor;var f=Object.getOwnPropertyNames;var g=Object.getPrototypeOf,k=Object.prototype.hasOwnProperty;var y=(e,i,t,o)=>{if(i&&typeof i=="object"||typeof i=="function")for(let s of f(i))!k.call(e,s)&&s!==t&&w(e,s,{get:()=>i[s],enumerable:!(o=I(i,s))||o.enumerable});return e};var P=(e,i,t)=>(t=e!=null?u(g(e)):{},y(i||!e||!e.__esModule?w(t,"default",{value:e,enumerable:!0}):t,e));var c=require("child_process"),p=P(require("path"),1),h=process.platform==="win32",x=__dirname,l=p.default.join(x,"worker-service.cjs"),n=null,a=!1;function r(e){let i=new Date().toISOString();console.log(`[${i}] [wrapper] ${e}`)}function m(){r(`Spawning inner worker: ${l}`),n=(0,c.spawn)(process.execPath,[l],{stdio:["inherit","inherit","inherit","ipc"],env:{...process.env,CLAUDE_MEM_MANAGED:"true"},cwd:p.default.dirname(l)}),n.on("message",async e=>{(e.type==="restart"||e.type==="shutdown")&&(r(`${e.type} requested by inner`),a=!0,await d(),r("Exiting wrapper"),process.exit(0))}),n.on("exit",(e,i)=>{r(`Inner exited with code=${e}, signal=${i}`),n=null,!a&&e!==0&&(r("Inner crashed, respawning in 1 second..."),setTimeout(()=>m(),1e3))}),n.on("error",e=>{r(`Inner error: ${e.message}`)})}async function d(){if(!n||!n.pid){r("No inner process to kill");return}let e=n.pid;if(r(`Killing inner process tree (pid=${e})`),h)try{(0,c.execSync)(`taskkill /PID ${e} /T /F`,{timeout:1e4,stdio:"ignore"}),r(`taskkill completed for pid=${e}`)}catch(i){r(`taskkill failed (process may be dead): ${i}`)}else{n.kill("SIGTERM");let i=new Promise(o=>{if(!n){o();return}n.on("exit",()=>o())}),t=new Promise(o=>setTimeout(()=>o(),5e3));await Promise.race([i,t]),n&&!n.killed&&(r("Inner did not exit gracefully, force killing"),n.kill("SIGKILL"))}await S(e,5e3),n=null,r("Inner process terminated")}async function S(e,i){let t=Date.now();for(;Date.now()-t<i;)try{process.kill(e,0),await new 
Promise(o=>setTimeout(o,100))}catch{return}r(`Timeout waiting for process ${e} to exit`)}process.on("SIGTERM",async()=>{r("Wrapper received SIGTERM"),a=!0,await d(),process.exit(0)});process.on("SIGINT",async()=>{r("Wrapper received SIGINT"),a=!0,await d(),process.exit(0)});r("Wrapper starting");m();
|
"use strict";
/*
 * Worker wrapper (build output, expanded from its minified form).
 *
 * Spawns `worker-service.cjs` from this directory as a child process with an
 * IPC channel. When the child asks for "restart"/"shutdown" over IPC, or the
 * wrapper receives SIGTERM/SIGINT, the wrapper kills the child and exits 0.
 * If the child exits at any other time the wrapper exits too (it does not
 * respawn), propagating the child's exit code, or 1 when the code is null.
 */
const { spawn, execSync } = require("child_process");
const path = require("path");

const IS_WINDOWS = process.platform === "win32";
const INNER_SCRIPT = path.join(__dirname, "worker-service.cjs");
const TASKKILL_TIMEOUT_MS = 10000;
const GRACEFUL_EXIT_TIMEOUT_MS = 5000;
const EXIT_POLL_INTERVAL_MS = 100;

/** Currently running inner worker, or null when none is alive. */
let inner = null;
/** True once a deliberate shutdown has started; suppresses the "unexpected exit" path. */
let isShuttingDown = false;

/** Write a timestamped, wrapper-tagged line to stdout. */
function log(message) {
  const timestamp = new Date().toISOString();
  console.log(`[${timestamp}] [wrapper] ${message}`);
}

/** True when the child has already terminated (normally or by signal). */
function hasExited(child) {
  return child.exitCode !== null || child.signalCode !== null;
}

/** Start the inner worker and wire up its message/exit/error handlers. */
function spawnInner() {
  log(`Spawning inner worker: ${INNER_SCRIPT}`);
  inner = spawn(process.execPath, [INNER_SCRIPT], {
    stdio: ["inherit", "inherit", "inherit", "ipc"],
    env: { ...process.env, CLAUDE_MEM_MANAGED: "true" },
    cwd: path.dirname(INNER_SCRIPT),
  });

  inner.on("message", async (message) => {
    const type = message?.type;
    if (type !== "restart" && type !== "shutdown") {
      return;
    }
    log(`${type} requested by inner`);
    isShuttingDown = true;
    await killInner();
    log("Exiting wrapper");
    process.exit(0);
  });

  inner.on("exit", (code, signal) => {
    log(`Inner exited with code=${code}, signal=${signal}`);
    inner = null;
    if (!isShuttingDown) {
      log("Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)");
      process.exit(code ?? 1);
    }
  });

  inner.on("error", (error) => {
    log(`Inner error: ${error.message}`);
  });
}

/**
 * Terminate the inner worker and wait (bounded) until its pid is gone.
 *
 * Windows: `taskkill /T /F` on the child's pid (kills the whole tree).
 * Elsewhere: SIGTERM, up to 5 s of grace, then SIGKILL if still running.
 */
async function killInner() {
  const child = inner;
  if (!child || !child.pid) {
    log("No inner process to kill");
    return;
  }

  const pid = child.pid;
  log(`Killing inner process tree (pid=${pid})`);

  if (IS_WINDOWS) {
    try {
      execSync(`taskkill /PID ${pid} /T /F`, { timeout: TASKKILL_TIMEOUT_MS, stdio: "ignore" });
      log(`taskkill completed for pid=${pid}`);
    } catch (error) {
      log(`taskkill failed (process may be dead): ${error}`);
    }
  } else {
    child.kill("SIGTERM");

    let graceTimer;
    const exited = new Promise((resolve) => {
      if (hasExited(child)) {
        resolve();
        return;
      }
      child.once("exit", () => resolve());
    });
    const graceElapsed = new Promise((resolve) => {
      graceTimer = setTimeout(resolve, GRACEFUL_EXIT_TIMEOUT_MS);
    });
    await Promise.race([exited, graceElapsed]);
    clearTimeout(graceTimer);

    // `child.killed` only records that a signal was sent (it is already true
    // after the SIGTERM above), so it cannot be used to decide whether the
    // child is still running. Check the exit status instead.
    if (!hasExited(child)) {
      log("Inner did not exit gracefully, force killing");
      child.kill("SIGKILL");
    }
  }

  await waitForProcessExit(pid, GRACEFUL_EXIT_TIMEOUT_MS);
  inner = null;
  log("Inner process terminated");
}

/**
 * Poll until `process.kill(pid, 0)` throws (treated as "process gone") or
 * `timeoutMs` elapses, in which case a timeout line is logged.
 */
async function waitForProcessExit(pid, timeoutMs) {
  const startedAt = Date.now();
  while (Date.now() - startedAt < timeoutMs) {
    try {
      process.kill(pid, 0);
    } catch {
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, EXIT_POLL_INTERVAL_MS));
  }
  log(`Timeout waiting for process ${pid} to exit`);
}

for (const signalName of ["SIGTERM", "SIGINT"]) {
  process.on(signalName, async () => {
    log(`Wrapper received ${signalName}`);
    isShuttingDown = true;
    await killInner();
    process.exit(0);
  });
}

log("Wrapper starting");
spawnInner();
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ import { spawn, spawnSync } from 'child_process';
|
|||||||
import { homedir } from 'os';
|
import { homedir } from 'os';
|
||||||
import { DATA_DIR } from '../../shared/paths.js';
|
import { DATA_DIR } from '../../shared/paths.js';
|
||||||
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
|
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
|
||||||
|
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
|
||||||
|
|
||||||
const PID_FILE = join(DATA_DIR, 'worker.pid');
|
const PID_FILE = join(DATA_DIR, 'worker.pid');
|
||||||
const LOG_DIR = join(DATA_DIR, 'logs');
|
const LOG_DIR = join(DATA_DIR, 'logs');
|
||||||
@@ -16,6 +17,7 @@ const HEALTH_CHECK_TIMEOUT_MS = 10000;
|
|||||||
const HEALTH_CHECK_INTERVAL_MS = 200;
|
const HEALTH_CHECK_INTERVAL_MS = 200;
|
||||||
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
|
const HEALTH_CHECK_FETCH_TIMEOUT_MS = 1000;
|
||||||
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
|
const PROCESS_EXIT_CHECK_INTERVAL_MS = 100;
|
||||||
|
const HTTP_SHUTDOWN_TIMEOUT_MS = 2000;
|
||||||
|
|
||||||
interface PidInfo {
|
interface PidInfo {
|
||||||
pid: number;
|
pid: number;
|
||||||
@@ -99,8 +101,9 @@ export class ProcessManager {
|
|||||||
const escapedBunPath = this.escapePowerShellString(bunPath);
|
const escapedBunPath = this.escapePowerShellString(bunPath);
|
||||||
const escapedScript = this.escapePowerShellString(script);
|
const escapedScript = this.escapePowerShellString(script);
|
||||||
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
|
const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
|
||||||
|
const escapedLogFile = this.escapePowerShellString(logFile);
|
||||||
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
|
const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
|
||||||
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -PassThru | Select-Object -ExpandProperty Id`;
|
const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;
|
||||||
|
|
||||||
const result = spawnSync('powershell', ['-Command', psCommand], {
|
const result = spawnSync('powershell', ['-Command', psCommand], {
|
||||||
stdio: 'pipe',
|
stdio: 'pipe',
|
||||||
@@ -171,34 +174,65 @@ export class ProcessManager {
|
|||||||
|
|
||||||
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
|
static async stop(timeout: number = PROCESS_STOP_TIMEOUT_MS): Promise<boolean> {
|
||||||
const info = this.getPidInfo();
|
const info = this.getPidInfo();
|
||||||
if (!info) return true;
|
|
||||||
|
|
||||||
try {
|
if (process.platform === 'win32') {
|
||||||
if (process.platform === 'win32') {
|
// Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
|
||||||
// On Windows, use taskkill /T /F to kill entire process tree
|
// because the worker shuts itself down from the inside (via wrapper IPC)
|
||||||
|
const port = info?.port ?? this.getPortFromSettings();
|
||||||
|
const httpShutdownSucceeded = await this.tryHttpShutdown(port);
|
||||||
|
|
||||||
|
if (httpShutdownSucceeded) {
|
||||||
|
// HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
|
||||||
|
this.removePidFile();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTTP shutdown failed (worker not responding), fall back to taskkill
|
||||||
|
if (!info) {
|
||||||
|
// No PID file and HTTP failed - nothing more we can do
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const { execSync } = await import('child_process');
|
||||||
|
try {
|
||||||
|
// Use taskkill /T /F to kill entire process tree
|
||||||
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
|
// This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
|
||||||
// which is necessary to properly release the socket and avoid zombie ports
|
// which is necessary to properly release the socket and avoid zombie ports
|
||||||
const { execSync } = await import('child_process');
|
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
||||||
try {
|
} catch {
|
||||||
execSync(`taskkill /PID ${info.pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
|
// Process may already be dead
|
||||||
} catch {
|
}
|
||||||
// Process may already be dead
|
|
||||||
}
|
// Wait for process to actually exit before removing PID file
|
||||||
} else {
|
try {
|
||||||
// On Unix, use signals
|
await this.waitForExit(info.pid, timeout);
|
||||||
|
} catch {
|
||||||
|
// Timeout waiting - process may still be alive
|
||||||
|
}
|
||||||
|
|
||||||
|
// Only remove PID file if process is confirmed dead
|
||||||
|
if (!this.isProcessAlive(info.pid)) {
|
||||||
|
this.removePidFile();
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
// Unix: Use signals (unchanged behavior)
|
||||||
|
if (!info) return true;
|
||||||
|
|
||||||
|
try {
|
||||||
process.kill(info.pid, 'SIGTERM');
|
process.kill(info.pid, 'SIGTERM');
|
||||||
await this.waitForExit(info.pid, timeout);
|
await this.waitForExit(info.pid, timeout);
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
try {
|
|
||||||
process.kill(info.pid, 'SIGKILL');
|
|
||||||
} catch {
|
} catch {
|
||||||
// Process already dead
|
try {
|
||||||
|
process.kill(info.pid, 'SIGKILL');
|
||||||
|
} catch {
|
||||||
|
// Process already dead
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
this.removePidFile();
|
this.removePidFile();
|
||||||
return true;
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
|
static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
|
||||||
@@ -229,6 +263,66 @@ export class ProcessManager {
|
|||||||
return alive;
|
return alive;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get worker port from settings file
|
||||||
|
*/
|
||||||
|
private static getPortFromSettings(): number {
  // Returns the worker port read from DATA_DIR/settings.json. If the file
  // cannot be loaded, or the stored value is not a usable TCP port, falls
  // back to SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT')
  // (presumably the built-in default — verify against SettingsDefaultsManager).
  const toPort = (raw: unknown): number | null => {
    const parsed = parseInt(String(raw), 10);
    // parseInt yields NaN for missing/garbled values; reject those and
    // out-of-range numbers rather than handing them to the URL builder.
    return Number.isInteger(parsed) && parsed > 0 && parsed <= 65535 ? parsed : null;
  };

  let configured: number | null = null;
  try {
    const settingsPath = join(DATA_DIR, 'settings.json');
    const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
    configured = toPort(settings.CLAUDE_MEM_WORKER_PORT);
  } catch {
    // Settings could not be loaded - use the fallback below.
  }

  return configured ?? parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try to shut down the worker via HTTP endpoint
|
||||||
|
* Returns true if shutdown succeeded, false if worker not responding
|
||||||
|
*/
|
||||||
|
private static async tryHttpShutdown(port: number): Promise<boolean> {
  // POSTs to the worker's admin shutdown endpoint on 127.0.0.1, then waits for
  // the worker to stop answering. Resolves false when the request fails, times
  // out, or is answered with a non-OK status; otherwise resolves with the
  // result of waitForWorkerDown().
  const shutdownUrl = `http://127.0.0.1:${port}/api/admin/shutdown`;

  try {
    const reply = await fetch(shutdownUrl, {
      method: 'POST',
      signal: AbortSignal.timeout(HTTP_SHUTDOWN_TIMEOUT_MS)
    });

    // An accepted request is not enough: only report success once the worker
    // has actually gone quiet on its port.
    return reply.ok
      ? await this.waitForWorkerDown(port, PROCESS_STOP_TIMEOUT_MS)
      : false;
  } catch {
    // No HTTP answer at all - the worker may be dead or hung.
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for worker to stop responding on the given port
|
||||||
|
*/
|
||||||
|
private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
  // Polls the worker's health endpoint until a probe fails (resolves true) or
  // `timeout` milliseconds have passed with the worker still answering
  // (resolves false). Any rejected probe - including the 500 ms abort -
  // counts as "down".
  const healthUrl = `http://127.0.0.1:${port}/api/health`;
  const pause = () =>
    new Promise(resolve => setTimeout(resolve, PROCESS_EXIT_CHECK_INTERVAL_MS));
  const deadline = Date.now() + timeout;

  while (Date.now() < deadline) {
    let answered = true;
    try {
      await fetch(healthUrl, { signal: AbortSignal.timeout(500) });
    } catch {
      answered = false;
    }

    if (!answered) {
      return true;
    }

    // Worker is still up - back off briefly before probing again.
    await pause();
  }

  return false;
}
|
||||||
|
|
||||||
// Helper methods
|
// Helper methods
|
||||||
private static getPidInfo(): PidInfo | null {
|
private static getPidInfo(): PidInfo | null {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -3,11 +3,15 @@
|
|||||||
*
|
*
|
||||||
* This wrapper exists to solve the Windows zombie port problem.
|
* This wrapper exists to solve the Windows zombie port problem.
|
||||||
* The wrapper spawns the actual worker as a child process.
|
* The wrapper spawns the actual worker as a child process.
|
||||||
* When restart/shutdown is requested, the wrapper kills the child
|
* When a restart or shutdown is requested, the wrapper kills the child and exits.
|
||||||
* and respawns it (or exits), ensuring clean socket cleanup.
|
* The hooks will start a fresh wrapper+worker if needed.
|
||||||
*
|
*
|
||||||
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
||||||
* doesn't affect it.
|
* doesn't affect it.
|
||||||
|
*
|
||||||
|
* NOTE: The wrapper does NOT auto-restart the worker on crash.
|
||||||
|
* This is intentional - the hooks handle startup via ensureWorkerRunning().
|
||||||
|
* Auto-restart would cause PID file mismatches and potential infinite loops.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import { spawn, ChildProcess, execSync } from 'child_process';
|
import { spawn, ChildProcess, execSync } from 'child_process';
|
||||||
@@ -51,10 +55,11 @@ function spawnInner() {
|
|||||||
log(`Inner exited with code=${code}, signal=${signal}`);
|
log(`Inner exited with code=${code}, signal=${signal}`);
|
||||||
inner = null;
|
inner = null;
|
||||||
|
|
||||||
// If inner crashed unexpectedly (not during shutdown), respawn it
|
// Don't auto-restart - let hooks handle it via ensureWorkerRunning()
|
||||||
if (!isShuttingDown && code !== 0) {
|
// Auto-restart causes PID file mismatches and potential infinite loops
|
||||||
log('Inner crashed, respawning in 1 second...');
|
if (!isShuttingDown) {
|
||||||
setTimeout(() => spawnInner(), 1000);
|
log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
|
||||||
|
process.exit(code ?? 1);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user