feat: add embedded Process Supervisor for unified process lifecycle (#1370)
* feat: add embedded Process Supervisor for unified process lifecycle management Consolidates scattered process management (ProcessManager, GracefulShutdown, HealthMonitor, ProcessRegistry) into a unified src/supervisor/ module. New: ProcessRegistry with JSON persistence, env sanitizer (strips CLAUDECODE_* vars), graceful shutdown cascade (SIGTERM → 5s wait → SIGKILL with tree-kill on Windows), PID file liveness validation, and singleton Supervisor API. Fixes #1352 (worker inherits CLAUDECODE env causing nested sessions) Fixes #1356 (zombie TCP socket after Windows reboot) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add session-scoped process reaping to supervisor Adds reapSession(sessionId) to ProcessRegistry for killing session-tagged processes on session end. SessionManager.deleteSession() now triggers reaping. Tightens orphan reaper interval from 60s to 30s. Fixes #1351 (MCP server processes leak on session end) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add Unix domain socket support for worker communication Introduces socket-manager.ts for UDS-based worker communication, eliminating port 37777 collisions between concurrent sessions. Worker listens on ~/.claude-mem/sockets/worker.sock by default with TCP fallback. All hook handlers, MCP server, health checks, and admin commands updated to use socket-aware workerHttpRequest(). Backwards compatible — settings can force TCP mode via CLAUDE_MEM_WORKER_TRANSPORT=tcp. Fixes #1346 (port 37777 collision across concurrent sessions) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: remove in-process worker fallback from hook command Removes the fallback path where hook scripts started WorkerService in-process, making the worker a grandchild of Claude Code (killed by sandbox). Hooks now always delegate to ensureWorkerStarted() which spawns a fully detached daemon. 
Fixes #1249 (grandchild process killed by sandbox) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * feat: add health checker and /api/admin/doctor endpoint Adds 30-second periodic health sweep that prunes dead processes from the supervisor registry and cleans stale socket files. Adds /api/admin/doctor endpoint exposing supervisor state, process liveness, and environment health. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * test: add comprehensive supervisor test suite 64 tests covering all supervisor modules: process registry (18 tests), env sanitizer (8), shutdown cascade (10), socket manager (15), health checker (5), and supervisor API (6). Includes persistence, isolation, edge cases, and cross-module integration scenarios. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * fix: revert Unix domain socket transport, restore TCP on port 37777 The socket-manager introduced UDS as default transport, but this broke the HTTP server's TCP accessibility (viewer UI, curl, external monitoring). Since there's only ever one worker process handling all sessions, the port collision rationale for UDS doesn't apply. Reverts to TCP-only, removing ~900 lines of unnecessary complexity. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * chore: remove dead code found in pre-landing review Remove unused `acceptingSpawns` field from Supervisor class (written but never read — assertCanSpawn uses stopPromise instead) and unused `buildWorkerUrl` import from context handler. 
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> * updated gitignore * fix: address PR review feedback - downgrade HTTP logging, clean up gitignore, harden supervisor - Downgrade request/response HTTP logging from info to debug to reduce noise - Remove unused getWorkerPort imports, use buildWorkerUrl helper - Export ENV_PREFIXES/ENV_EXACT_MATCHES from env-sanitizer, reuse in Server.ts - Fix isPidAlive(0) returning true (should be false) - Add shutdownInitiated flag to prevent signal handler race condition - Make validateWorkerPidFile testable with pidFilePath option - Remove unused dataDir from ShutdownCascadeOptions - Upgrade reapSession log from debug to warn - Rename zombiePidFiles to deadProcessPids (returns actual PIDs) - Clean up gitignore: remove duplicate datasets/, stale ~*/ and http*/ patterns - Fix tests to use temp directories instead of relying on real PID file Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,8 @@
|
||||
import { spawn, exec, ChildProcess } from 'child_process';
|
||||
import { promisify } from 'util';
|
||||
import { logger } from '../../utils/logger.js';
|
||||
import { sanitizeEnv } from '../../supervisor/env-sanitizer.js';
|
||||
import { getSupervisor } from '../../supervisor/index.js';
|
||||
|
||||
const execAsync = promisify(exec);
|
||||
|
||||
@@ -29,14 +31,36 @@ interface TrackedProcess {
|
||||
process: ChildProcess;
|
||||
}
|
||||
|
||||
// PID Registry - tracks spawned Claude subprocesses
|
||||
const processRegistry = new Map<number, TrackedProcess>();
|
||||
/**
 * Build TrackedProcess views from the supervisor registry.
 *
 * Only records whose type is 'sdk' are considered, and a record is omitted
 * when the registry returns no runtime process handle for its id.
 *
 * @returns One TrackedProcess per 'sdk' record that has a runtime handle.
 */
function getTrackedProcesses(): TrackedProcess[] {
  const tracked: TrackedProcess[] = [];

  for (const entry of getSupervisor().getRegistry().getAll()) {
    if (entry.type !== 'sdk') {
      continue;
    }

    const child = getSupervisor().getRegistry().getRuntimeProcess(entry.id);
    if (!child) {
      // No runtime handle for this record: leave it out of the result.
      continue;
    }

    tracked.push({
      pid: entry.pid,
      // The record's sessionId is coerced to a number and startedAt is parsed
      // to an epoch timestamp to fit the TrackedProcess shape.
      sessionDbId: Number(entry.sessionId),
      spawnedAt: Date.parse(entry.startedAt),
      process: child
    });
  }

  return tracked;
}
|
||||
|
||||
/**
|
||||
* Register a spawned process in the registry
|
||||
*/
|
||||
export function registerProcess(pid: number, sessionDbId: number, process: ChildProcess): void {
|
||||
processRegistry.set(pid, { pid, sessionDbId, spawnedAt: Date.now(), process });
|
||||
getSupervisor().registerProcess(`sdk:${sessionDbId}:${pid}`, {
|
||||
pid,
|
||||
type: 'sdk',
|
||||
sessionId: sessionDbId,
|
||||
startedAt: new Date().toISOString()
|
||||
}, process);
|
||||
logger.info('PROCESS', `Registered PID ${pid} for session ${sessionDbId}`, { pid, sessionDbId });
|
||||
}
|
||||
|
||||
@@ -44,7 +68,11 @@ export function registerProcess(pid: number, sessionDbId: number, process: Child
|
||||
* Unregister a process from the registry and notify pool waiters
|
||||
*/
|
||||
export function unregisterProcess(pid: number): void {
|
||||
processRegistry.delete(pid);
|
||||
for (const record of getSupervisor().getRegistry().getByPid(pid)) {
|
||||
if (record.type === 'sdk') {
|
||||
getSupervisor().unregisterProcess(record.id);
|
||||
}
|
||||
}
|
||||
logger.debug('PROCESS', `Unregistered PID ${pid}`, { pid });
|
||||
// Notify waiters that a pool slot may be available
|
||||
notifySlotAvailable();
|
||||
@@ -55,10 +83,7 @@ export function unregisterProcess(pid: number): void {
|
||||
* Warns if multiple processes found (indicates race condition)
|
||||
*/
|
||||
export function getProcessBySession(sessionDbId: number): TrackedProcess | undefined {
|
||||
const matches: TrackedProcess[] = [];
|
||||
for (const [, info] of processRegistry) {
|
||||
if (info.sessionDbId === sessionDbId) matches.push(info);
|
||||
}
|
||||
const matches = getTrackedProcesses().filter(info => info.sessionDbId === sessionDbId);
|
||||
if (matches.length > 1) {
|
||||
logger.warn('PROCESS', `Multiple processes found for session ${sessionDbId}`, {
|
||||
count: matches.length,
|
||||
@@ -72,7 +97,7 @@ export function getProcessBySession(sessionDbId: number): TrackedProcess | undef
|
||||
* Get count of active processes in the registry
|
||||
*/
|
||||
export function getActiveCount(): number {
  // Single source of truth: count the supervisor registry's records typed
  // 'sdk'. The earlier `return processRegistry.size;` statement made this
  // supervisor-backed count unreachable, so it is dropped here.
  return getSupervisor().getRegistry().getAll().filter(record => record.type === 'sdk').length;
}
|
||||
|
||||
// Waiters for pool slots - resolved when a process exits and frees a slot
|
||||
@@ -95,13 +120,14 @@ const TOTAL_PROCESS_HARD_CAP = 10;
|
||||
|
||||
export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_000): Promise<void> {
|
||||
// Hard cap: refuse to spawn if too many processes exist regardless of pool accounting
|
||||
if (processRegistry.size >= TOTAL_PROCESS_HARD_CAP) {
|
||||
throw new Error(`Hard cap exceeded: ${processRegistry.size} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
|
||||
const activeCount = getActiveCount();
|
||||
if (activeCount >= TOTAL_PROCESS_HARD_CAP) {
|
||||
throw new Error(`Hard cap exceeded: ${activeCount} processes in registry (cap=${TOTAL_PROCESS_HARD_CAP}). Refusing to spawn more.`);
|
||||
}
|
||||
|
||||
if (processRegistry.size < maxConcurrent) return;
|
||||
if (activeCount < maxConcurrent) return;
|
||||
|
||||
logger.info('PROCESS', `Pool limit reached (${processRegistry.size}/${maxConcurrent}), waiting for slot...`);
|
||||
logger.info('PROCESS', `Pool limit reached (${activeCount}/${maxConcurrent}), waiting for slot...`);
|
||||
|
||||
return new Promise<void>((resolve, reject) => {
|
||||
const timeout = setTimeout(() => {
|
||||
@@ -112,7 +138,7 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
||||
|
||||
const onSlot = () => {
|
||||
clearTimeout(timeout);
|
||||
if (processRegistry.size < maxConcurrent) {
|
||||
if (getActiveCount() < maxConcurrent) {
|
||||
resolve();
|
||||
} else {
|
||||
// Still full, re-queue
|
||||
@@ -129,7 +155,7 @@ export async function waitForSlot(maxConcurrent: number, timeoutMs: number = 60_
|
||||
*/
|
||||
export function getActiveProcesses(): Array<{ pid: number; sessionDbId: number; ageMs: number }> {
|
||||
const now = Date.now();
|
||||
return Array.from(processRegistry.values()).map(info => ({
|
||||
return getTrackedProcesses().map(info => ({
|
||||
pid: info.pid,
|
||||
sessionDbId: info.sessionDbId,
|
||||
ageMs: now - info.spawnedAt
|
||||
@@ -308,17 +334,26 @@ export async function reapOrphanedProcesses(activeSessionIds: Set<number>): Prom
|
||||
let killed = 0;
|
||||
|
||||
// Registry-based: kill processes for dead sessions
|
||||
for (const [pid, info] of processRegistry) {
|
||||
if (activeSessionIds.has(info.sessionDbId)) continue; // Active = safe
|
||||
for (const record of getSupervisor().getRegistry().getAll().filter(entry => entry.type === 'sdk')) {
|
||||
const pid = record.pid;
|
||||
const sessionDbId = Number(record.sessionId);
|
||||
const processRef = getSupervisor().getRegistry().getRuntimeProcess(record.id);
|
||||
|
||||
logger.warn('PROCESS', `Killing orphan PID ${pid} (session ${info.sessionDbId} gone)`, { pid, sessionDbId: info.sessionDbId });
|
||||
if (activeSessionIds.has(sessionDbId)) continue; // Active = safe
|
||||
|
||||
logger.warn('PROCESS', `Killing orphan PID ${pid} (session ${sessionDbId} gone)`, { pid, sessionDbId });
|
||||
try {
|
||||
info.process.kill('SIGKILL');
|
||||
if (processRef) {
|
||||
processRef.kill('SIGKILL');
|
||||
} else {
|
||||
process.kill(pid, 'SIGKILL');
|
||||
}
|
||||
killed++;
|
||||
} catch {
|
||||
// Already dead
|
||||
}
|
||||
unregisterProcess(pid);
|
||||
getSupervisor().unregisterProcess(record.id);
|
||||
notifySlotAvailable();
|
||||
}
|
||||
|
||||
// System-level: find ppid=1 orphans
|
||||
@@ -347,20 +382,23 @@ export function createPidCapturingSpawn(sessionDbId: number) {
|
||||
env?: NodeJS.ProcessEnv;
|
||||
signal?: AbortSignal;
|
||||
}) => {
|
||||
getSupervisor().assertCanSpawn('claude sdk');
|
||||
|
||||
// On Windows, use cmd.exe wrapper for .cmd files to properly handle paths with spaces
|
||||
const useCmdWrapper = process.platform === 'win32' && spawnOptions.command.endsWith('.cmd');
|
||||
const env = sanitizeEnv(spawnOptions.env ?? process.env);
|
||||
|
||||
const child = useCmdWrapper
|
||||
? spawn('cmd.exe', ['/d', '/c', spawnOptions.command, ...spawnOptions.args], {
|
||||
cwd: spawnOptions.cwd,
|
||||
env: spawnOptions.env,
|
||||
env,
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
signal: spawnOptions.signal,
|
||||
windowsHide: true
|
||||
})
|
||||
: spawn(spawnOptions.command, spawnOptions.args, {
|
||||
cwd: spawnOptions.cwd,
|
||||
env: spawnOptions.env,
|
||||
env,
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
signal: spawnOptions.signal, // CRITICAL: Pass signal for AbortController integration
|
||||
windowsHide: true
|
||||
@@ -407,7 +445,7 @@ export function createPidCapturingSpawn(sessionDbId: number) {
|
||||
* Start the orphan reaper interval
|
||||
* Returns cleanup function to stop the interval
|
||||
*/
|
||||
export function startOrphanReaper(getActiveSessionIds: () => Set<number>, intervalMs: number = 60 * 1000): () => void {
|
||||
export function startOrphanReaper(getActiveSessionIds: () => Set<number>, intervalMs: number = 30 * 1000): () => void {
|
||||
const interval = setInterval(async () => {
|
||||
try {
|
||||
const activeIds = getActiveSessionIds();
|
||||
|
||||
Reference in New Issue
Block a user