feat: add embedded Process Supervisor for unified process lifecycle (#1370)
* feat: add embedded Process Supervisor for unified process lifecycle management Consolidates scattered process management (ProcessManager, GracefulShutdown, HealthMonitor, ProcessRegistry) into a unified src/supervisor/ module. New: ProcessRegistry with JSON persistence, env sanitizer (strips CLAUDECODE_* vars), graceful shutdown cascade (SIGTERM → 5s wait → SIGKILL with tree-kill on Windows), PID file liveness validation, and singleton Supervisor API. Fixes #1352 (worker inherits CLAUDECODE env causing nested sessions) Fixes #1356 (zombie TCP socket after Windows reboot) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add session-scoped process reaping to supervisor Adds reapSession(sessionId) to ProcessRegistry for killing session-tagged processes on session end. SessionManager.deleteSession() now triggers reaping. Tightens orphan reaper interval from 60s to 30s. Fixes #1351 (MCP server processes leak on session end) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add Unix domain socket support for worker communication Introduces socket-manager.ts for UDS-based worker communication, eliminating port 37777 collisions between concurrent sessions. Worker listens on ~/.claude-mem/sockets/worker.sock by default with TCP fallback. All hook handlers, MCP server, health checks, and admin commands updated to use socket-aware workerHttpRequest(). Backwards compatible — settings can force TCP mode via CLAUDE_MEM_WORKER_TRANSPORT=tcp. Fixes #1346 (port 37777 collision across concurrent sessions) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: remove in-process worker fallback from hook command Removes the fallback path where hook scripts started WorkerService in-process, making the worker a grandchild of Claude Code (killed by sandbox). Hooks now always delegate to ensureWorkerStarted() which spawns a fully detached daemon. 
Fixes #1249 (grandchild process killed by sandbox) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add health checker and /api/admin/doctor endpoint Adds 30-second periodic health sweep that prunes dead processes from the supervisor registry and cleans stale socket files. Adds /api/admin/doctor endpoint exposing supervisor state, process liveness, and environment health. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: add comprehensive supervisor test suite 64 tests covering all supervisor modules: process registry (18 tests), env sanitizer (8), shutdown cascade (10), socket manager (15), health checker (5), and supervisor API (6). Includes persistence, isolation, edge cases, and cross-module integration scenarios. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: revert Unix domain socket transport, restore TCP on port 37777 The socket-manager introduced UDS as default transport, but this broke the HTTP server's TCP accessibility (viewer UI, curl, external monitoring). Since there's only ever one worker process handling all sessions, the port collision rationale for UDS doesn't apply. Reverts to TCP-only, removing ~900 lines of unnecessary complexity. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * chore: remove dead code found in pre-landing review Remove unused `acceptingSpawns` field from Supervisor class (written but never read — assertCanSpawn uses stopPromise instead) and unused `buildWorkerUrl` import from context handler. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * updated gitignore * fix: address PR review feedback - downgrade HTTP logging, clean up gitignore, harden supervisor - Downgrade request/response HTTP logging from info to debug to reduce noise - Remove unused getWorkerPort imports, use buildWorkerUrl helper - Export ENV_PREFIXES/ENV_EXACT_MATCHES from env-sanitizer, reuse in Server.ts - Fix isPidAlive(0) returning true (should be false) - Add shutdownInitiated flag to prevent signal handler race condition - Make validateWorkerPidFile testable with pidFilePath option - Remove unused dataDir from ShutdownCascadeOptions - Upgrade reapSession log from debug to warn - Rename zombiePidFiles to deadProcessPids (returns actual PIDs) - Clean up gitignore: remove duplicate datasets/, stale ~*/ and http*/ patterns - Fix tests to use temp directories instead of relying on real PID file Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,12 +10,7 @@
|
||||
|
||||
import http from 'http';
|
||||
import { logger } from '../../utils/logger.js';
|
||||
import {
|
||||
getChildProcesses,
|
||||
forceKillProcess,
|
||||
waitForProcessesExit,
|
||||
removePidFile
|
||||
} from './ProcessManager.js';
|
||||
import { stopSupervisor } from '../../supervisor/index.js';
|
||||
|
||||
export interface ShutdownableService {
|
||||
shutdownAll(): Promise<void>;
|
||||
@@ -57,49 +52,35 @@ export interface GracefulShutdownConfig {
|
||||
export async function performGracefulShutdown(config: GracefulShutdownConfig): Promise<void> {
|
||||
logger.info('SYSTEM', 'Shutdown initiated');
|
||||
|
||||
// Clean up PID file on shutdown
|
||||
removePidFile();
|
||||
|
||||
// STEP 1: Enumerate all child processes BEFORE we start closing things
|
||||
const childPids = await getChildProcesses(process.pid);
|
||||
logger.info('SYSTEM', 'Found child processes', { count: childPids.length, pids: childPids });
|
||||
|
||||
// STEP 2: Close HTTP server first
|
||||
// STEP 1: Close HTTP server first
|
||||
if (config.server) {
|
||||
await closeHttpServer(config.server);
|
||||
logger.info('SYSTEM', 'HTTP server closed');
|
||||
}
|
||||
|
||||
// STEP 3: Shutdown active sessions
|
||||
// STEP 2: Shutdown active sessions
|
||||
await config.sessionManager.shutdownAll();
|
||||
|
||||
// STEP 4: Close MCP client connection (signals child to exit gracefully)
|
||||
// STEP 3: Close MCP client connection (signals child to exit gracefully)
|
||||
if (config.mcpClient) {
|
||||
await config.mcpClient.close();
|
||||
logger.info('SYSTEM', 'MCP client closed');
|
||||
}
|
||||
|
||||
// STEP 5: Stop Chroma MCP connection
|
||||
// STEP 4: Stop Chroma MCP connection
|
||||
if (config.chromaMcpManager) {
|
||||
logger.info('SHUTDOWN', 'Stopping Chroma MCP connection...');
|
||||
await config.chromaMcpManager.stop();
|
||||
logger.info('SHUTDOWN', 'Chroma MCP connection stopped');
|
||||
}
|
||||
|
||||
// STEP 6: Close database connection (includes ChromaSync cleanup)
|
||||
// STEP 5: Close database connection (includes ChromaSync cleanup)
|
||||
if (config.dbManager) {
|
||||
await config.dbManager.close();
|
||||
}
|
||||
|
||||
// STEP 7: Force kill any remaining child processes (Windows zombie port fix)
|
||||
if (childPids.length > 0) {
|
||||
logger.info('SYSTEM', 'Force killing remaining children');
|
||||
for (const pid of childPids) {
|
||||
await forceKillProcess(pid);
|
||||
}
|
||||
// Wait for children to fully exit
|
||||
await waitForProcessesExit(childPids, 5000);
|
||||
}
|
||||
// STEP 6: Supervisor handles tracked child termination, PID cleanup, and stale sockets.
|
||||
await stopSupervisor();
|
||||
|
||||
logger.info('SYSTEM', 'Worker shutdown complete');
|
||||
}
|
||||
|
||||
@@ -14,6 +14,26 @@ import { readFileSync } from 'fs';
|
||||
import { logger } from '../../utils/logger.js';
|
||||
import { MARKETPLACE_ROOT } from '../../shared/paths.js';
|
||||
|
||||
/**
 * Issue one HTTP request to the worker over loopback TCP (127.0.0.1).
 *
 * @param port Worker port to connect to.
 * @param endpointPath Path appended verbatim after the port (e.g. `/api/version`).
 * @param method HTTP method; defaults to `GET`.
 * @returns The response's `ok` flag, its status code, and the body text
 *          (an empty string when the body could not be read).
 * @throws Whatever `fetch` rejects with on a transport failure — that error is
 *         intentionally not caught here so callers can decide how to react.
 */
async function httpRequestToWorker(
  port: number,
  endpointPath: string,
  method: string = 'GET'
): Promise<{ ok: boolean; statusCode: number; body: string }> {
  const reply = await fetch(`http://127.0.0.1:${port}${endpointPath}`, { method });

  // Reading the body is best-effort: some responses (e.g. test mocks) expose no
  // usable text(), and health/readiness callers only look at `.ok` anyway.
  const readBodyOrEmpty = async (): Promise<string> => {
    try {
      return await reply.text();
    } catch {
      return '';
    }
  };

  const body = await readBodyOrEmpty();
  return { ok: reply.ok, statusCode: reply.status, body };
}
|
||||
|
||||
/**
|
||||
* Check if a port is in use by querying the health endpoint
|
||||
*/
|
||||
@@ -29,7 +49,7 @@ export async function isPortInUse(port: number): Promise<boolean> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Poll a localhost endpoint until it returns 200 OK or timeout.
|
||||
* Poll a worker endpoint until it returns 200 OK or timeout.
|
||||
* Shared implementation for liveness and readiness checks.
|
||||
*/
|
||||
async function pollEndpointUntilOk(
|
||||
@@ -41,12 +61,11 @@ async function pollEndpointUntilOk(
|
||||
const start = Date.now();
|
||||
while (Date.now() - start < timeoutMs) {
|
||||
try {
|
||||
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
|
||||
const response = await fetch(`http://127.0.0.1:${port}${endpointPath}`);
|
||||
if (response.ok) return true;
|
||||
const result = await httpRequestToWorker(port, endpointPath);
|
||||
if (result.ok) return true;
|
||||
} catch (error) {
|
||||
// [ANTI-PATTERN IGNORED]: Retry loop - expected failures during startup, will retry
|
||||
logger.debug('SYSTEM', retryLogMessage, { port }, error as Error);
|
||||
logger.debug('SYSTEM', retryLogMessage, {}, error as Error);
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
}
|
||||
@@ -87,28 +106,24 @@ export async function waitForPortFree(port: number, timeoutMs: number = 10000):
|
||||
|
||||
/**
|
||||
* Send HTTP shutdown request to a running worker
|
||||
* @param port Worker port
|
||||
* @returns true if shutdown request was acknowledged, false otherwise
|
||||
*/
|
||||
export async function httpShutdown(port: number): Promise<boolean> {
|
||||
try {
|
||||
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
|
||||
const response = await fetch(`http://127.0.0.1:${port}/api/admin/shutdown`, {
|
||||
method: 'POST'
|
||||
});
|
||||
if (!response.ok) {
|
||||
logger.warn('SYSTEM', 'Shutdown request returned error', { port, status: response.status });
|
||||
const result = await httpRequestToWorker(port, '/api/admin/shutdown', 'POST');
|
||||
if (!result.ok) {
|
||||
logger.warn('SYSTEM', 'Shutdown request returned error', { status: result.statusCode });
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
} catch (error) {
|
||||
// Connection refused is expected if worker already stopped
|
||||
if (error instanceof Error && error.message?.includes('ECONNREFUSED')) {
|
||||
logger.debug('SYSTEM', 'Worker already stopped', { port }, error);
|
||||
logger.debug('SYSTEM', 'Worker already stopped', {}, error);
|
||||
return false;
|
||||
}
|
||||
// Unexpected error - log full details
|
||||
logger.error('SYSTEM', 'Shutdown request failed unexpectedly', { port }, error as Error);
|
||||
logger.error('SYSTEM', 'Shutdown request failed unexpectedly', {}, error as Error);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -135,17 +150,17 @@ export function getInstalledPluginVersion(): string {
|
||||
|
||||
/**
|
||||
* Get the running worker's version via API
|
||||
* This is the "actual" version currently running
|
||||
* This is the "actual" version currently running.
|
||||
*/
|
||||
export async function getRunningWorkerVersion(port: number): Promise<string | null> {
|
||||
try {
|
||||
const response = await fetch(`http://127.0.0.1:${port}/api/version`);
|
||||
if (!response.ok) return null;
|
||||
const data = await response.json() as { version: string };
|
||||
const result = await httpRequestToWorker(port, '/api/version');
|
||||
if (!result.ok) return null;
|
||||
const data = JSON.parse(result.body) as { version: string };
|
||||
return data.version;
|
||||
} catch {
|
||||
// Expected: worker not running or version endpoint unavailable
|
||||
logger.debug('SYSTEM', 'Could not fetch worker version', { port });
|
||||
logger.debug('SYSTEM', 'Could not fetch worker version', {});
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,8 @@ import { exec, execSync, spawn } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { logger } from '../../utils/logger.js';
|
||||
import { HOOK_TIMEOUTS } from '../../shared/hook-constants.js';
|
||||
import { sanitizeEnv } from '../../supervisor/env-sanitizer.js';
|
||||
import { getSupervisor, validateWorkerPidFile, type ValidateWorkerPidStatus } from '../../supervisor/index.js';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
@@ -625,11 +627,13 @@ export function spawnDaemon(
|
||||
extraEnv: Record<string, string> = {}
|
||||
): number | undefined {
|
||||
const isWindows = process.platform === 'win32';
|
||||
const env = {
|
||||
getSupervisor().assertCanSpawn('worker daemon');
|
||||
|
||||
const env = sanitizeEnv({
|
||||
...process.env,
|
||||
CLAUDE_MEM_WORKER_PORT: String(port),
|
||||
...extraEnv
|
||||
};
|
||||
});
|
||||
|
||||
if (isWindows) {
|
||||
// Use PowerShell Start-Process to spawn a hidden, independent process
|
||||
@@ -764,18 +768,8 @@ export function touchPidFile(): void {
|
||||
* Called at the top of ensureWorkerStarted() to clean up after WSL2
|
||||
* hibernate, OOM kills, or other ungraceful worker deaths.
|
||||
*/
|
||||
export function cleanStalePidFile(): void {
|
||||
const pidInfo = readPidFile();
|
||||
if (!pidInfo) return;
|
||||
|
||||
if (!isProcessAlive(pidInfo.pid)) {
|
||||
logger.info('SYSTEM', 'Removing stale PID file (worker process is dead)', {
|
||||
pid: pidInfo.pid,
|
||||
port: pidInfo.port,
|
||||
startedAt: pidInfo.startedAt
|
||||
});
|
||||
removePidFile();
|
||||
}
|
||||
export function cleanStalePidFile(): ValidateWorkerPidStatus {
|
||||
return validateWorkerPidFile({ logAlive: false });
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user