Merge main into feature/openrouter-provider
Resolved conflicts in built files by accepting main's versions. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,433 +0,0 @@
|
||||
import { existsSync, readFileSync, writeFileSync, unlinkSync, mkdirSync } from 'fs';
|
||||
import { createWriteStream } from 'fs';
|
||||
import { join } from 'path';
|
||||
import { spawn, spawnSync } from 'child_process';
|
||||
import { homedir } from 'os';
|
||||
import { DATA_DIR } from '../../shared/paths.js';
|
||||
import { getBunPath, isBunAvailable } from '../../utils/bun-path.js';
|
||||
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
|
||||
|
||||
const PID_FILE = join(DATA_DIR, 'worker.pid');
|
||||
const LOG_DIR = join(DATA_DIR, 'logs');
|
||||
const MARKETPLACE_ROOT = join(homedir(), '.claude', 'plugins', 'marketplaces', 'thedotmack');
|
||||
|
||||
/**
 * Shape of the JSON document stored in the worker PID file.
 */
interface PidInfo {
  /** Operating-system process id recorded when the worker was launched. */
  pid: number;
  /** TCP port the worker was asked to listen on. */
  port: number;
  /** ISO-8601 timestamp taken at launch time (see `new Date().toISOString()` at the write sites). */
  startedAt: string;
  /** Value of `npm_package_version` at launch, or `'unknown'` when it was not set. */
  version: string;
}
|
||||
|
||||
/**
 * Starts, stops and inspects the detached background worker process.
 *
 * The running worker is tracked through a JSON PID file (`PID_FILE`) holding the
 * process id, port, start time and version. Liveness is checked with signal 0 and
 * readiness with HTTP requests against `127.0.0.1:<port>`.
 */
export class ProcessManager {
  /**
   * Start the worker on `port`, or report the already-running instance.
   *
   * @param port TCP port for the worker; must be an integer in 1024..65535.
   * @returns `success: true` with the worker pid once the readiness endpoint answers,
   *          otherwise `success: false` with a human-readable `error`.
   */
  static async start(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
    // Number.isInteger rejects NaN and Infinity as well as fractional ports such as 3000.5.
    if (!Number.isInteger(port) || port < 1024 || port > 65535) {
      return {
        success: false,
        error: `Invalid port ${port}. Must be between 1024 and 65535`
      };
    }

    // Check if already running
    if (await this.isRunning()) {
      const info = this.getPidInfo();
      return { success: true, pid: info?.pid };
    }

    // Ensure log directory exists
    mkdirSync(LOG_DIR, { recursive: true });

    // On Windows, use the wrapper script to solve zombie port problem
    // On Unix, use the worker directly
    const scriptName = process.platform === 'win32' ? 'worker-wrapper.cjs' : 'worker-service.cjs';
    const workerScript = join(MARKETPLACE_ROOT, 'plugin', 'scripts', scriptName);

    if (!existsSync(workerScript)) {
      return { success: false, error: `Worker script not found at ${workerScript}` };
    }

    const logFile = this.getLogFilePath();

    // Use Bun on all platforms with PowerShell workaround for Windows console popups
    return this.startWithBun(workerScript, logFile, port);
  }

  /** Thin wrapper around the imported `isBunAvailable()` helper. */
  private static isBunAvailable(): boolean {
    return isBunAvailable();
  }

  /**
   * Escapes a string for safe use in PowerShell single-quoted strings.
   * In PowerShell single quotes, the only special character is the single quote itself,
   * which must be doubled to escape it.
   */
  private static escapePowerShellString(str: string): string {
    return str.replace(/'/g, "''");
  }

  /**
   * Open `logFile` for appending and resolve once the stream owns a file descriptor.
   * A file stream can only be handed to `spawn` as stdio after its 'open' event.
   */
  private static openLogStream(logFile: string): Promise<ReturnType<typeof createWriteStream>> {
    return new Promise((resolve, reject) => {
      const stream = createWriteStream(logFile, { flags: 'a' });
      stream.once('open', () => resolve(stream));
      stream.once('error', reject);
    });
  }

  /**
   * Launch `script` with Bun as a detached process, record it in the PID file and
   * wait for the readiness endpoint.
   *
   * @param script  Absolute path of the worker (or wrapper) script.
   * @param logFile File that receives the worker's stdout/stderr.
   * @param port    Port passed to the worker through `CLAUDE_MEM_WORKER_PORT`.
   * @returns Same result shape as {@link ProcessManager.start}; never throws.
   */
  private static async startWithBun(script: string, logFile: string, port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
    const bunPath = getBunPath();
    if (!bunPath) {
      return {
        success: false,
        error: 'Bun is required but not found in PATH or common installation paths. Install from https://bun.sh'
      };
    }

    try {
      const isWindows = process.platform === 'win32';

      if (isWindows) {
        // Windows: Use PowerShell Start-Process with -WindowStyle Hidden
        // This properly hides the console window (affects both Bun and Node.js)
        // Note: windowsHide: true doesn't work with detached: true (Bun inherits Node.js process spawning semantics)
        // See: https://github.com/nodejs/node/issues/21825 and PR #315 for detailed testing
        //
        // On Windows, we start worker-wrapper.cjs which manages the actual worker-service.cjs.
        // This solves the zombie port problem: the wrapper has no sockets, so when it kills
        // and respawns the inner worker, the socket is properly released.
        //
        // Security: All paths (bunPath, script, MARKETPLACE_ROOT) are application-controlled system paths,
        // not user input. If an attacker could modify these paths, they would already have full filesystem
        // access including direct access to ~/.claude-mem/claude-mem.db. Nevertheless, we properly escape
        // all values for PowerShell to follow security best practices.
        const escapedBunPath = this.escapePowerShellString(bunPath);
        const escapedScript = this.escapePowerShellString(script);
        const escapedWorkDir = this.escapePowerShellString(MARKETPLACE_ROOT);
        const escapedLogFile = this.escapePowerShellString(logFile);
        const envVars = `$env:CLAUDE_MEM_WORKER_PORT='${port}'`;
        const psCommand = `${envVars}; Start-Process -FilePath '${escapedBunPath}' -ArgumentList '${escapedScript}' -WorkingDirectory '${escapedWorkDir}' -WindowStyle Hidden -RedirectStandardOutput '${escapedLogFile}' -RedirectStandardError '${escapedLogFile}.err' -PassThru | Select-Object -ExpandProperty Id`;

        const result = spawnSync('powershell', ['-Command', psCommand], {
          stdio: 'pipe',
          timeout: 10000,
          windowsHide: true
        });

        if (result.error || result.status !== 0) {
          // result.error carries spawn-level failures (powershell missing, timeout) that leave stderr empty.
          const detail = result.stderr?.toString().trim() || result.error?.message || 'unknown error';
          return {
            success: false,
            error: `PowerShell spawn failed: ${detail}`
          };
        }

        const pid = parseInt(result.stdout.toString().trim(), 10);
        if (isNaN(pid)) {
          return { success: false, error: 'Failed to get PID from PowerShell' };
        }

        // Write PID file
        this.writePidFile({
          pid,
          port,
          startedAt: new Date().toISOString(),
          version: process.env.npm_package_version || 'unknown'
        });

        // Wait for health
        return this.waitForHealth(pid, port);
      }

      // Unix: spawn detached and hand the log file to the child directly.
      // Piping through this process would leave the worker writing into a closed pipe
      // as soon as the launcher exits, so the child gets its own descriptor instead.
      const logStream = await this.openLogStream(logFile);
      let child;
      try {
        child = spawn(bunPath, [script], {
          detached: true,
          stdio: ['ignore', logStream, logStream],
          env: { ...process.env, CLAUDE_MEM_WORKER_PORT: String(port) },
          cwd: MARKETPLACE_ROOT
        });
      } finally {
        // The descriptor has been duplicated into the child; release our copy.
        logStream.end();
      }

      // Without a listener an asynchronous spawn failure would be thrown as an
      // uncaught exception; the failure is reported through the pid/readiness checks below.
      child.on('error', () => { /* reported via missing pid or failed readiness */ });

      child.unref();

      if (!child.pid) {
        return { success: false, error: 'Failed to get PID from spawned process' };
      }

      // Write PID file
      this.writePidFile({
        pid: child.pid,
        port,
        startedAt: new Date().toISOString(),
        version: process.env.npm_package_version || 'unknown'
      });

      // Wait for health
      return this.waitForHealth(child.pid, port);
    } catch (error) {
      return {
        success: false,
        error: error instanceof Error ? error.message : String(error)
      };
    }
  }

  /**
   * Stop the worker.
   *
   * On Windows an HTTP shutdown request is tried first and `taskkill /T /F` is the
   * fallback; on other platforms SIGTERM is sent and SIGKILL follows if the process
   * does not exit within `timeout`.
   *
   * @param timeout Milliseconds to wait for the process to exit.
   * @returns Always `true`; every step is best-effort.
   */
  static async stop(timeout: number = 5000): Promise<boolean> {
    const info = this.getPidInfo();

    if (process.platform === 'win32') {
      // Windows: Try graceful HTTP shutdown first - this works regardless of PID file state
      // because the worker shuts itself down from the inside (via wrapper IPC)
      const port = info?.port ?? this.getPortFromSettings();
      const httpShutdownSucceeded = await this.tryHttpShutdown(port);

      if (httpShutdownSucceeded) {
        // HTTP shutdown succeeded - worker confirmed down, safe to remove PID file
        this.removePidFile();
        return true;
      }

      // HTTP shutdown failed (worker not responding), fall back to taskkill
      if (!info) {
        // No PID file and HTTP failed - nothing more we can do
        return true;
      }

      // Use taskkill /T /F to kill entire process tree
      // This ensures the wrapper AND all its children (inner worker, MCP, ChromaSync) are killed
      // which is necessary to properly release the socket and avoid zombie ports.
      // spawnSync does not throw on a non-zero exit, which covers "process already dead".
      spawnSync('taskkill', ['/PID', String(info.pid), '/T', '/F'], {
        timeout: 10000,
        stdio: 'ignore',
        windowsHide: true
      });

      // Wait for process to actually exit before removing PID file
      try {
        await this.waitForExit(info.pid, timeout);
      } catch {
        // Timeout waiting - process may still be alive
      }

      // Only remove PID file if process is confirmed dead
      if (!this.isProcessAlive(info.pid)) {
        this.removePidFile();
      }
      return true;
    }

    // Unix: Use signals
    if (!info) return true;

    try {
      process.kill(info.pid, 'SIGTERM');
      await this.waitForExit(info.pid, timeout);
    } catch {
      // SIGTERM failed or the process outlived the timeout: escalate.
      try {
        process.kill(info.pid, 'SIGKILL');
      } catch {
        // Process already dead
      }
    }

    this.removePidFile();
    return true;
  }

  /** Stop the worker (default timeout) and start it again on `port`. */
  static async restart(port: number): Promise<{ success: boolean; pid?: number; error?: string }> {
    await this.stop();
    return this.start(port);
  }

  /**
   * Report whether the process named in the PID file is alive.
   * `pid`, `port` and `uptime` are only populated when it is.
   */
  static async status(): Promise<{ running: boolean; pid?: number; port?: number; uptime?: string }> {
    const info = this.getPidInfo();
    if (!info) return { running: false };

    const running = this.isProcessAlive(info.pid);
    return {
      running,
      pid: running ? info.pid : undefined,
      port: running ? info.port : undefined,
      uptime: running ? this.formatUptime(info.startedAt) : undefined
    };
  }

  /**
   * True when the PID file names a live process. A PID file pointing at a dead
   * process is deleted as a side effect.
   */
  static async isRunning(): Promise<boolean> {
    const info = this.getPidInfo();
    if (!info) return false;
    const alive = this.isProcessAlive(info.pid);
    if (!alive) {
      this.removePidFile(); // Clean up stale PID file
    }
    return alive;
  }

  /**
   * Get worker port from settings file, falling back to the built-in default
   * when the file cannot be loaded.
   */
  private static getPortFromSettings(): number {
    try {
      const settingsPath = join(DATA_DIR, 'settings.json');
      const settings = SettingsDefaultsManager.loadFromFile(settingsPath);
      return parseInt(settings.CLAUDE_MEM_WORKER_PORT, 10);
    } catch {
      return parseInt(SettingsDefaultsManager.get('CLAUDE_MEM_WORKER_PORT'), 10);
    }
  }

  /**
   * Try to shut down the worker via HTTP endpoint
   * Returns true if shutdown succeeded, false if worker not responding
   */
  private static async tryHttpShutdown(port: number): Promise<boolean> {
    try {
      // Send shutdown request
      const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
        method: 'POST',
        signal: AbortSignal.timeout(2000)
      });

      if (!response.ok) {
        return false;
      }

      // Wait for worker to actually stop responding
      return await this.waitForWorkerDown(port, 5000);
    } catch {
      // Worker not responding to HTTP - it may be dead or hung
      return false;
    }
  }

  /**
   * Wait for worker to stop responding on the given port.
   * Any failed health request (including a 500ms timeout) counts as "down".
   */
  private static async waitForWorkerDown(port: number, timeout: number): Promise<boolean> {
    const startTime = Date.now();

    while (Date.now() - startTime < timeout) {
      try {
        await fetch(`http://127.0.0.1:${port}/api/health`, {
          signal: AbortSignal.timeout(500)
        });
        // Still responding, wait and retry
        await new Promise(resolve => setTimeout(resolve, 100));
      } catch {
        // Worker stopped responding - success
        return true;
      }
    }

    // Timeout - worker still responding
    return false;
  }

  // Helper methods

  /**
   * Read and validate the PID file.
   *
   * @returns The parsed record, or `null` when the file is missing, unreadable,
   *          not valid JSON, or lacks a usable `pid`/`port`.
   */
  private static getPidInfo(): PidInfo | null {
    // NOTE(review): this file has no `logger` import, so the previous `logger.warn`
    // calls raised a ReferenceError instead of returning null. console.warn is used
    // until the project logger is imported here.
    try {
      if (!existsSync(PID_FILE)) return null;
      const content = readFileSync(PID_FILE, 'utf-8');
      const parsed: unknown = JSON.parse(content);
      const candidate = (parsed ?? {}) as Partial<PidInfo>;
      // The pid is later passed to process.kill; zero or a negative value would be
      // taken by kill(2) as a process group, so only positive integers are accepted.
      const pidIsValid = Number.isInteger(candidate.pid) && (candidate.pid as number) > 0;
      if (!pidIsValid || typeof candidate.port !== 'number') {
        console.warn('[PROCESS] Malformed PID file: missing or invalid pid/port fields', { parsed });
        return null;
      }
      return candidate as PidInfo;
    } catch (error) {
      console.warn('[PROCESS] Failed to read PID file', {
        error: error instanceof Error ? error.message : String(error),
        path: PID_FILE
      });
      return null;
    }
  }

  /** Persist `info` as pretty-printed JSON, creating `DATA_DIR` if needed. */
  private static writePidFile(info: PidInfo): void {
    mkdirSync(DATA_DIR, { recursive: true });
    writeFileSync(PID_FILE, JSON.stringify(info, null, 2));
  }

  /** Delete the PID file if present; failures are deliberately ignored. */
  private static removePidFile(): void {
    try {
      if (existsSync(PID_FILE)) {
        unlinkSync(PID_FILE);
      }
    } catch {
      // Ignore errors
    }
  }

  /** Probe `pid` with signal 0; any error from `process.kill` is treated as "not alive". */
  private static isProcessAlive(pid: number): boolean {
    try {
      process.kill(pid, 0);
      return true;
    } catch {
      return false;
    }
  }

  /** Troubleshooting text appended to Windows start-up failures. */
  private static windowsTroubleshooting(port: number): string {
    return `\n\nTroubleshooting:\n1. Check Task Manager for zombie 'bun.exe' or 'node.exe' processes\n2. Verify port ${port} is not in use: netstat -ano | findstr ${port}\n3. Check worker logs in ~/.claude-mem/logs/\n4. See GitHub issues: #363, #367, #371, #373\n5. Docs: https://docs.claude-mem.ai/troubleshooting/windows-issues`;
  }

  /**
   * Poll `/api/readiness` every 200ms until it answers OK, the process dies, or the
   * timeout elapses.
   *
   * @param pid       Process to watch; polling stops as soon as it is no longer alive.
   * @param port      Port the worker listens on.
   * @param timeoutMs Base timeout; doubled on Windows.
   */
  private static async waitForHealth(pid: number, port: number, timeoutMs: number = 10000): Promise<{ success: boolean; pid?: number; error?: string }> {
    const startTime = Date.now();
    const isWindows = process.platform === 'win32';
    // Increase timeout on Windows to account for slower process startup
    const adjustedTimeout = isWindows ? timeoutMs * 2 : timeoutMs;

    while (Date.now() - startTime < adjustedTimeout) {
      // Check if process is still alive
      if (!this.isProcessAlive(pid)) {
        const errorMsg = isWindows
          ? `Process died during startup${this.windowsTroubleshooting(port)}`
          : 'Process died during startup';
        return { success: false, error: errorMsg };
      }

      // Try readiness check (changed from /health to /api/readiness)
      try {
        const response = await fetch(`http://127.0.0.1:${port}/api/readiness`, {
          signal: AbortSignal.timeout(1000)
        });
        if (response.ok) {
          return { success: true, pid };
        }
      } catch {
        // Not ready yet, continue polling
      }

      await new Promise(resolve => setTimeout(resolve, 200));
    }

    const timeoutMsg = isWindows
      ? `Worker failed to start on Windows (readiness check timed out after ${adjustedTimeout}ms)${this.windowsTroubleshooting(port)}`
      : `Readiness check timed out after ${adjustedTimeout}ms`;

    return { success: false, error: timeoutMsg };
  }

  /**
   * Resolve once `pid` is no longer alive, polling every 100ms.
   * @throws Error when the process is still alive after `timeout` milliseconds.
   */
  private static async waitForExit(pid: number, timeout: number): Promise<void> {
    const startTime = Date.now();

    while (Date.now() - startTime < timeout) {
      if (!this.isProcessAlive(pid)) {
        return;
      }
      await new Promise(resolve => setTimeout(resolve, 100));
    }

    throw new Error('Process did not exit within timeout');
  }

  /** Path of today's log file: `<LOG_DIR>/worker-YYYY-MM-DD.log` (UTC date). */
  private static getLogFilePath(): string {
    const date = new Date().toISOString().slice(0, 10);
    return join(LOG_DIR, `worker-${date}.log`);
  }

  /**
   * Render the time elapsed since `startedAt` using the two largest units,
   * e.g. `2d 3h`, `4h 12m`, `1m 5s`, `42s`.
   *
   * @returns `'unknown'` when `startedAt` cannot be parsed as a date.
   */
  private static formatUptime(startedAt: string): string {
    const startTime = new Date(startedAt).getTime();
    // An unparseable timestamp would otherwise surface as "NaNs".
    if (Number.isNaN(startTime)) return 'unknown';

    // Clamp so a start time in the future (clock adjustment) reads as 0s.
    const diffMs = Math.max(0, Date.now() - startTime);

    const seconds = Math.floor(diffMs / 1000);
    const minutes = Math.floor(seconds / 60);
    const hours = Math.floor(minutes / 60);
    const days = Math.floor(hours / 24);

    if (days > 0) return `${days}d ${hours % 24}h`;
    if (hours > 0) return `${hours}h ${minutes % 60}m`;
    if (minutes > 0) return `${minutes}m ${seconds % 60}s`;
    return `${seconds}s`;
  }
}
|
||||
+256
-44
@@ -14,11 +14,103 @@ import { Client } from '@modelcontextprotocol/sdk/client/index.js';
|
||||
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
|
||||
import { getWorkerPort, getWorkerHost } from '../shared/worker-utils.js';
|
||||
import { logger } from '../utils/logger.js';
|
||||
import { exec, execSync } from 'child_process';
|
||||
import { exec, execSync, spawn } from 'child_process';
|
||||
import { homedir } from 'os';
|
||||
import { existsSync, writeFileSync, readFileSync, unlinkSync, mkdirSync } from 'fs';
|
||||
import { promisify } from 'util';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
// PID file management for self-spawn pattern
|
||||
const DATA_DIR = path.join(homedir(), '.claude-mem');
|
||||
const PID_FILE = path.join(DATA_DIR, 'worker.pid');
|
||||
const HOOK_RESPONSE = '{"continue": true, "suppressOutput": true}';
|
||||
|
||||
/**
 * Record persisted in the worker PID file by the self-spawn commands.
 */
interface PidInfo {
  /** Process id of the spawned daemon. */
  pid: number;
  /** Port the daemon was started with. */
  port: number;
  /** ISO-8601 timestamp captured when the daemon was spawned. */
  startedAt: string;
}
|
||||
|
||||
// PID file utility functions
|
||||
/**
 * Store `info` in the PID file as two-space-indented JSON.
 * The data directory is created first when it does not exist yet.
 */
function writePidFile(info: PidInfo): void {
  mkdirSync(DATA_DIR, { recursive: true });
  const serialized = JSON.stringify(info, null, 2);
  writeFileSync(PID_FILE, serialized);
}
|
||||
|
||||
/**
 * Load the PID file.
 *
 * @returns The stored record, or `null` when the file is absent, cannot be read or
 *          parsed, or does not contain numeric `pid` and `port` fields. Problems
 *          other than a missing file are logged as warnings.
 */
function readPidFile(): PidInfo | null {
  try {
    if (!existsSync(PID_FILE)) return null;

    // JSON.parse output is untrusted: the file may be truncated or hand-edited.
    const parsed: unknown = JSON.parse(readFileSync(PID_FILE, 'utf-8'));
    const candidate = (parsed ?? {}) as Partial<PidInfo>;
    if (typeof candidate.pid !== 'number' || typeof candidate.port !== 'number') {
      logger.warn('SYSTEM', 'Malformed PID file: missing or invalid pid/port fields', { path: PID_FILE });
      return null;
    }
    return candidate as PidInfo;
  } catch (error) {
    logger.warn('SYSTEM', 'Failed to read PID file', {
      path: PID_FILE,
      error: error instanceof Error ? error.message : String(error)
    });
    return null;
  }
}
|
||||
|
||||
/**
 * Delete the PID file when it exists.
 * A failure is logged as a warning and otherwise ignored.
 */
function removePidFile(): void {
  try {
    const present = existsSync(PID_FILE);
    if (present) {
      unlinkSync(PID_FILE);
    }
  } catch (error) {
    const reason = (error as Error).message;
    logger.warn('SYSTEM', 'Failed to remove PID file', { path: PID_FILE, error: reason });
  }
}
|
||||
|
||||
/**
 * Check whether a worker answers on `port`.
 *
 * Despite the name this is an HTTP probe, not a socket check: it resolves to
 * `true` only when `GET /api/health` on 127.0.0.1 returns an OK status within
 * two seconds. Any request failure resolves to `false`.
 */
async function isPortInUse(port: number): Promise<boolean> {
  const healthUrl = `http://127.0.0.1:${port}/api/health`;
  try {
    const { ok } = await fetch(healthUrl, { signal: AbortSignal.timeout(2000) });
    return ok;
  } catch {
    return false;
  }
}
|
||||
|
||||
/**
 * Poll the worker's readiness endpoint until it reports OK.
 *
 * @param port      Port of the worker on 127.0.0.1.
 * @param timeoutMs Overall budget in milliseconds (default 30000). Each request has
 *                  its own two-second timeout and attempts are spaced 500ms apart.
 * @returns `true` as soon as `/api/readiness` answers with an OK status, `false`
 *          once the budget is exhausted.
 */
async function waitForHealth(port: number, timeoutMs: number = 30000): Promise<boolean> {
  const readinessUrl = `http://127.0.0.1:${port}/api/readiness`;
  const pause = () => new Promise(done => setTimeout(done, 500));
  const begun = Date.now();

  for (;;) {
    if (!(Date.now() - begun < timeoutMs)) {
      return false;
    }
    try {
      const reply = await fetch(readinessUrl, { signal: AbortSignal.timeout(2000) });
      if (reply.ok) {
        return true;
      }
    } catch {
      // Not ready yet
    }
    await pause();
  }
}
|
||||
|
||||
/**
 * Ask the worker on `port` to shut down through its admin endpoint.
 *
 * @returns `true` when the POST returned an OK status; `false` on a non-OK status
 *          or when the request failed. Failures are logged as warnings, except for
 *          errors whose message contains `ECONNREFUSED`.
 */
async function httpShutdown(port: number): Promise<boolean> {
  const shutdownUrl = `http://127.0.0.1:${port}/api/admin/shutdown`;

  let response: Response;
  try {
    response = await fetch(shutdownUrl, {
      method: 'POST',
      signal: AbortSignal.timeout(5000)
    });
  } catch (error) {
    // Connection refused is expected if worker already stopped
    const refused = (error as Error).message?.includes('ECONNREFUSED');
    if (!refused) {
      logger.warn('SYSTEM', 'Shutdown request failed', { port, error: (error as Error).message });
    }
    return false;
  }

  if (response.ok) {
    return true;
  }
  logger.warn('SYSTEM', 'Shutdown request returned error', { port, status: response.status });
  return false;
}
|
||||
|
||||
/**
 * Wait until the health probe for `port` stops succeeding.
 *
 * @param port      Port to probe via `isPortInUse`.
 * @param timeoutMs Overall budget in milliseconds (default 10000); probes are
 *                  spaced 500ms apart.
 * @returns `true` once a probe reports the port unused, `false` on timeout.
 */
async function waitForPortFree(port: number, timeoutMs: number = 10000): Promise<boolean> {
  const begun = Date.now();

  for (;;) {
    if (!(Date.now() - begun < timeoutMs)) {
      return false;
    }
    const busy = await isPortInUse(port);
    if (!busy) {
      return true;
    }
    await new Promise(done => setTimeout(done, 500));
  }
}
|
||||
|
||||
// Import composed service layer
|
||||
import { DatabaseManager } from './worker/DatabaseManager.js';
|
||||
import { SessionManager } from './worker/SessionManager.js';
|
||||
@@ -270,7 +362,7 @@ export class WorkerService {
|
||||
this.app.get('/api/context/inject', async (req, res, next) => {
|
||||
try {
|
||||
// Wait for initialization to complete (with timeout)
|
||||
const timeoutMs = 30000; // 30 second timeout
|
||||
const timeoutMs = 300000; // 5 minute timeout for slow systems
|
||||
const timeoutPromise = new Promise((_, reject) =>
|
||||
setTimeout(() => reject(new Error('Initialization timeout')), timeoutMs)
|
||||
);
|
||||
@@ -330,7 +422,7 @@ export class WorkerService {
|
||||
if (isWindows) {
|
||||
// Windows: Use PowerShell Get-CimInstance to find chroma-mcp processes
|
||||
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.Name -like '*python*' -and $_.CommandLine -like '*chroma-mcp*' } | Select-Object -ExpandProperty ProcessId"`;
|
||||
const { stdout } = await execAsync(cmd, { timeout: 5000 });
|
||||
const { stdout } = await execAsync(cmd, { timeout: 60000 });
|
||||
|
||||
if (!stdout.trim()) {
|
||||
logger.debug('SYSTEM', 'No orphaned chroma-mcp processes found (Windows)');
|
||||
@@ -385,10 +477,20 @@ export class WorkerService {
|
||||
logger.warn('SYSTEM', 'Skipping invalid PID', { pid });
|
||||
continue;
|
||||
}
|
||||
execSync(`taskkill /PID ${pid} /T /F`, { timeout: 5000, stdio: 'ignore' });
|
||||
try {
|
||||
execSync(`taskkill /PID ${pid} /T /F`, { timeout: 60000, stdio: 'ignore' });
|
||||
} catch {
|
||||
// Process may have already exited - continue cleanup
|
||||
}
|
||||
}
|
||||
} else {
|
||||
await execAsync(`kill ${pids.join(' ')}`);
|
||||
for (const pid of pids) {
|
||||
try {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process already exited - that's fine
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pids.length });
|
||||
@@ -467,11 +569,11 @@ export class WorkerService {
|
||||
env: process.env
|
||||
});
|
||||
|
||||
// Add timeout guard to prevent hanging on MCP connection (15 seconds)
|
||||
const MCP_INIT_TIMEOUT_MS = 15000;
|
||||
// Add timeout guard to prevent hanging on MCP connection (5 minutes for slow systems)
|
||||
const MCP_INIT_TIMEOUT_MS = 300000;
|
||||
const mcpConnectionPromise = this.mcpClient.connect(transport);
|
||||
const timeoutPromise = new Promise<never>((_, reject) =>
|
||||
setTimeout(() => reject(new Error('MCP connection timeout after 15s')), MCP_INIT_TIMEOUT_MS)
|
||||
setTimeout(() => reject(new Error('MCP connection timeout after 5 minutes')), MCP_INIT_TIMEOUT_MS)
|
||||
);
|
||||
|
||||
await Promise.race([mcpConnectionPromise, timeoutPromise]);
|
||||
@@ -655,13 +757,18 @@ export class WorkerService {
|
||||
return [];
|
||||
}
|
||||
|
||||
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`;
|
||||
const { stdout } = await execAsync(cmd, { timeout: 5000 });
|
||||
return stdout
|
||||
.trim()
|
||||
.split('\n')
|
||||
.map(s => parseInt(s.trim(), 10))
|
||||
.filter(n => !isNaN(n) && Number.isInteger(n) && n > 0); // SECURITY: Validate each PID
|
||||
try {
|
||||
const cmd = `powershell -Command "Get-CimInstance Win32_Process | Where-Object { $_.ParentProcessId -eq ${parentPid} } | Select-Object -ExpandProperty ProcessId"`;
|
||||
const { stdout } = await execAsync(cmd, { timeout: 60000 });
|
||||
return stdout
|
||||
.trim()
|
||||
.split('\n')
|
||||
.map(s => parseInt(s.trim(), 10))
|
||||
.filter(n => !isNaN(n) && Number.isInteger(n) && n > 0); // SECURITY: Validate each PID
|
||||
} catch (error) {
|
||||
logger.warn('SYSTEM', 'Failed to enumerate child processes', { parentPid, error: (error as Error).message });
|
||||
return []; // Fail safely - continue shutdown without child process cleanup
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -674,12 +781,17 @@ export class WorkerService {
|
||||
return;
|
||||
}
|
||||
|
||||
if (process.platform === 'win32') {
|
||||
// /T kills entire process tree, /F forces termination
|
||||
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 5000 });
|
||||
try {
|
||||
if (process.platform === 'win32') {
|
||||
// /T kills entire process tree, /F forces termination
|
||||
await execAsync(`taskkill /PID ${pid} /T /F`, { timeout: 60000 });
|
||||
} else {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
}
|
||||
logger.info('SYSTEM', 'Killed process', { pid });
|
||||
} else {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
} catch {
|
||||
// Process may have already exited - continue shutdown
|
||||
logger.debug('SYSTEM', 'Process already exited during force kill', { pid });
|
||||
}
|
||||
}
|
||||
|
||||
@@ -691,8 +803,12 @@ export class WorkerService {
|
||||
|
||||
while (Date.now() - start < timeoutMs) {
|
||||
const stillAlive = pids.filter(pid => {
|
||||
process.kill(pid, 0); // Signal 0 checks if process exists - throws if dead
|
||||
return true;
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
});
|
||||
|
||||
if (stillAlive.length === 0) {
|
||||
@@ -741,31 +857,127 @@ export class WorkerService {
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Main Entry Point
|
||||
// CLI Entry Point
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* Start the worker service (if running as main module)
|
||||
* Note: Using require.main check for CJS compatibility (build outputs CJS)
|
||||
*/
|
||||
if (require.main === module || !module.parent) {
|
||||
const worker = new WorkerService();
|
||||
async function main() {
|
||||
const command = process.argv[2];
|
||||
const port = getWorkerPort();
|
||||
|
||||
// Graceful shutdown
|
||||
process.on('SIGTERM', async () => {
|
||||
logger.info('SYSTEM', 'Received SIGTERM, shutting down gracefully');
|
||||
await worker.shutdown();
|
||||
process.exit(0);
|
||||
});
|
||||
switch (command) {
|
||||
case 'start': {
|
||||
// Already running?
|
||||
if (await isPortInUse(port)) {
|
||||
console.log(HOOK_RESPONSE);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
process.on('SIGINT', async () => {
|
||||
logger.info('SYSTEM', 'Received SIGINT, shutting down gracefully');
|
||||
await worker.shutdown();
|
||||
process.exit(0);
|
||||
});
|
||||
// Spawn self as daemon
|
||||
const child = spawn(process.execPath, [__filename, '--daemon'], {
|
||||
detached: true,
|
||||
stdio: 'ignore',
|
||||
windowsHide: true,
|
||||
env: { ...process.env, CLAUDE_MEM_WORKER_PORT: String(port) }
|
||||
});
|
||||
|
||||
worker.start().catch((error) => {
|
||||
logger.failure('SYSTEM', 'Worker failed to start', {}, error as Error);
|
||||
process.exit(1);
|
||||
});
|
||||
if (child.pid === undefined) {
|
||||
console.error('Failed to spawn worker daemon');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
child.unref();
|
||||
|
||||
// Write PID file
|
||||
writePidFile({ pid: child.pid, port, startedAt: new Date().toISOString() });
|
||||
|
||||
// Wait for health
|
||||
const healthy = await waitForHealth(port, 30000);
|
||||
if (!healthy) {
|
||||
removePidFile();
|
||||
console.error('Worker failed to start');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(HOOK_RESPONSE);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
case 'stop': {
|
||||
await httpShutdown(port);
|
||||
await waitForPortFree(port, 10000);
|
||||
removePidFile();
|
||||
console.log(HOOK_RESPONSE);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
case 'restart': {
|
||||
await httpShutdown(port);
|
||||
await waitForPortFree(port, 10000);
|
||||
removePidFile();
|
||||
// Fall through to start a new instance
|
||||
const child = spawn(process.execPath, [__filename, '--daemon'], {
|
||||
detached: true,
|
||||
stdio: 'ignore',
|
||||
windowsHide: true,
|
||||
env: { ...process.env, CLAUDE_MEM_WORKER_PORT: String(port) }
|
||||
});
|
||||
|
||||
if (child.pid === undefined) {
|
||||
console.error('Failed to spawn worker daemon during restart');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
child.unref();
|
||||
writePidFile({ pid: child.pid, port, startedAt: new Date().toISOString() });
|
||||
const healthy = await waitForHealth(port, 30000);
|
||||
if (!healthy) {
|
||||
removePidFile();
|
||||
console.error('Worker failed to restart');
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(HOOK_RESPONSE);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
case 'status': {
|
||||
const running = await isPortInUse(port);
|
||||
const pidInfo = readPidFile();
|
||||
if (running && pidInfo) {
|
||||
console.log(`Worker running (PID: ${pidInfo.pid}, Port: ${pidInfo.port})`);
|
||||
} else {
|
||||
console.log('Worker not running');
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
case '--daemon':
|
||||
default: {
|
||||
// Run server directly
|
||||
const worker = new WorkerService();
|
||||
|
||||
process.on('SIGTERM', async () => {
|
||||
logger.info('SYSTEM', 'Received SIGTERM');
|
||||
await worker.shutdown();
|
||||
removePidFile();
|
||||
process.exit(0);
|
||||
});
|
||||
|
||||
process.on('SIGINT', async () => {
|
||||
logger.info('SYSTEM', 'Received SIGINT');
|
||||
await worker.shutdown();
|
||||
removePidFile();
|
||||
process.exit(0);
|
||||
});
|
||||
|
||||
worker.start().catch((error) => {
|
||||
logger.failure('SYSTEM', 'Worker failed to start', {}, error as Error);
|
||||
removePidFile();
|
||||
process.exit(1);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (require.main === module || !module.parent) {
|
||||
main();
|
||||
}
|
||||
|
||||
@@ -1,157 +0,0 @@
|
||||
/**
|
||||
* Worker Wrapper - Manages worker process lifecycle
|
||||
*
|
||||
* This wrapper exists to solve the Windows zombie port problem.
|
||||
* The wrapper spawns the actual worker as a child process.
|
||||
* When shutdown is requested, the wrapper kills the child and exits.
|
||||
* The hooks will start a fresh wrapper+worker if needed.
|
||||
*
|
||||
* The wrapper itself has no sockets, so Bun's socket cleanup bug
|
||||
* doesn't affect it.
|
||||
*
|
||||
* NOTE: The wrapper does NOT auto-restart the worker on crash.
|
||||
* This is intentional - the hooks handle startup via ensureWorkerRunning().
|
||||
* Auto-restart would cause PID file mismatches and potential infinite loops.
|
||||
*/
|
||||
|
||||
import { spawn, ChildProcess, execSync } from 'child_process';
|
||||
import path from 'path';
|
||||
|
||||
const isWindows = process.platform === 'win32';
|
||||
|
||||
const SCRIPT_DIR = __dirname;
|
||||
const INNER_SCRIPT = path.join(SCRIPT_DIR, 'worker-service.cjs');
|
||||
|
||||
let inner: ChildProcess | null = null;
|
||||
let isShuttingDown = false;
|
||||
|
||||
/**
 * Write one line to stdout in the form `[<ISO timestamp>] [wrapper] <msg>`.
 */
function log(msg: string) {
  console.log(`[${new Date().toISOString()}] [wrapper] ${msg}`);
}
|
||||
|
||||
/**
 * Launch the inner worker script as a child of this wrapper and wire up its events.
 *
 * The child shares the wrapper's stdio and gets an IPC channel. A `restart` or
 * `shutdown` IPC message makes the wrapper kill the child and exit with code 0.
 * If the child exits on its own, the wrapper exits too with the child's exit code
 * (or 1 when there is none).
 */
function spawnInner() {
  log(`Spawning inner worker: ${INNER_SCRIPT}`);

  const child = spawn(process.execPath, [INNER_SCRIPT], {
    stdio: ['inherit', 'inherit', 'inherit', 'ipc'],
    env: { ...process.env, CLAUDE_MEM_MANAGED: 'true' },
    cwd: path.dirname(INNER_SCRIPT),
  });
  inner = child;

  child.on('message', async (msg: { type: string }) => {
    const stopRequested = msg.type === 'restart' || msg.type === 'shutdown';
    if (!stopRequested) {
      return;
    }
    // Both restart and shutdown: kill inner and exit wrapper
    // The hooks will start a fresh wrapper+inner if needed
    log(`${msg.type} requested by inner`);
    isShuttingDown = true;
    await killInner();
    log('Exiting wrapper');
    process.exit(0);
  });

  child.on('exit', (code, signal) => {
    log(`Inner exited with code=${code}, signal=${signal}`);
    inner = null;

    if (isShuttingDown) {
      return;
    }
    // Don't auto-restart - let hooks handle it via ensureWorkerRunning()
    // Auto-restart causes PID file mismatches and potential infinite loops
    log('Inner exited unexpectedly, wrapper exiting (hooks will restart if needed)');
    process.exit(code ?? 1);
  });

  child.on('error', (err) => {
    log(`Inner error: ${err.message}`);
  });
}
|
||||
|
||||
/**
 * Terminate the inner worker and wait for it to disappear.
 *
 * Windows: `taskkill /T /F` on the whole process tree.
 * Elsewhere: SIGTERM, then SIGKILL if the child has not exited within five seconds.
 * In both cases the function then waits up to five more seconds for the pid to
 * vanish and clears the module-level `inner` reference.
 */
async function killInner(): Promise<void> {
  const target = inner;
  if (!target || !target.pid) {
    log('No inner process to kill');
    return;
  }

  const pid = target.pid;
  log(`Killing inner process tree (pid=${pid})`);

  if (isWindows) {
    // On Windows, use taskkill /T /F to kill entire process tree
    // This ensures all children (MCP server, ChromaSync, etc.) are killed
    // which is necessary to properly release the socket
    try {
      execSync(`taskkill /PID ${pid} /T /F`, { timeout: 10000, stdio: 'ignore' });
      log(`taskkill completed for pid=${pid}`);
    } catch (error) {
      // Process may already be dead
      log(`taskkill failed (process may be dead): ${error}`);
    }
  } else {
    // `killed` only records that a signal was sent, so it cannot tell whether the
    // child is still running. exitCode/signalCode stay null until it has exited.
    const hasExited = () => target.exitCode !== null || target.signalCode !== null;

    if (!hasExited()) {
      // Subscribe before signalling so the exit cannot be missed.
      const exited = new Promise<void>(resolve => {
        target.once('exit', () => resolve());
      });
      target.kill('SIGTERM');

      let timer: ReturnType<typeof setTimeout> | undefined;
      const timedOut = new Promise<void>(resolve => {
        timer = setTimeout(resolve, 5000);
      });

      await Promise.race([exited, timedOut]);
      if (timer !== undefined) {
        clearTimeout(timer);
      }
    }

    // Force kill if still alive
    if (!hasExited()) {
      log('Inner did not exit gracefully, force killing');
      target.kill('SIGKILL');
    }
  }

  // Wait for the process to fully exit
  await waitForProcessExit(pid, 5000);

  inner = null;
  log('Inner process terminated');
}
|
||||
|
||||
/**
 * Poll every 100ms until signal 0 can no longer be delivered to `pid`.
 *
 * @param pid       Process id to watch.
 * @param timeoutMs Maximum time to wait. On expiry a message is logged and the
 *                  function returns normally; it never rejects for a timeout.
 */
async function waitForProcessExit(pid: number, timeoutMs: number): Promise<void> {
  const begun = Date.now();

  for (;;) {
    if (!(Date.now() - begun < timeoutMs)) {
      break;
    }
    try {
      // Signal 0 performs the existence check without delivering a signal.
      process.kill(pid, 0);
    } catch {
      // Process is dead
      return;
    }
    await new Promise(done => setTimeout(done, 100));
  }

  log(`Timeout waiting for process ${pid} to exit`);
}
|
||||
|
||||
// Handle wrapper signals: on SIGTERM or SIGINT, mark the shutdown as intentional,
// take the inner worker down, then exit with code 0.
for (const signalName of ['SIGTERM', 'SIGINT'] as const) {
  process.on(signalName, async () => {
    log(`Wrapper received ${signalName}`);
    isShuttingDown = true;
    await killInner();
    process.exit(0);
  });
}

// Start the inner worker
log('Wrapper starting');
spawnInner();
|
||||
@@ -28,9 +28,9 @@ function isValidBranchName(branchName: string): boolean {
|
||||
return validBranchRegex.test(branchName) && !branchName.includes('..');
|
||||
}
|
||||
|
||||
// Timeout constants
|
||||
const GIT_COMMAND_TIMEOUT_MS = 30_000;
|
||||
const NPM_INSTALL_TIMEOUT_MS = 120_000;
|
||||
// Timeout constants (increased for slow systems)
|
||||
const GIT_COMMAND_TIMEOUT_MS = 300_000;
|
||||
const NPM_INSTALL_TIMEOUT_MS = 600_000;
|
||||
const DEFAULT_SHELL_TIMEOUT_MS = 60_000;
|
||||
|
||||
export interface BranchInfo {
|
||||
|
||||
Reference in New Issue
Block a user