Files
claude-mem/src/services/infrastructure/GracefulShutdown.ts
T
Rod Boev a3f9e7f638 fix: prevent chroma-mcp spawn storm with 5-layer defense (641 processes → max 2)
During SIGHUP testing with 6+ active sessions, ChromaSync.ensureConnection()
had no mutex — concurrent fire-and-forget syncObservation() calls each spawned
a chroma-mcp subprocess via StdioClientTransport, creating 641 orphans in ~5min.
Error-driven reconnection formed a positive feedback loop amplifying the storm.

Defense layers:
- Layer 0: Connection mutex via promise memoization (prevents concurrent spawns)
- Layer 1: Pre-spawn process count guard using execFileSync('ps') (kills excess)
- Layer 2: Hardened close() with try-finally + Unix pkill in GracefulShutdown
- Layer 3: Count-based orphan reaper in ProcessManager (not age-based)
- Layer 4: Circuit breaker stops retries after 3 consecutive failures for 60s

Closes #1063, closes #695
Relates to #1010, #707
2026-02-11 07:19:28 -05:00

132 lines
3.9 KiB
TypeScript

/**
* GracefulShutdown - Cleanup utilities for graceful exit
*
* Extracted from worker-service.ts to provide centralized shutdown coordination.
* Handles:
* - HTTP server closure (with Windows-specific delays)
* - Session manager shutdown coordination
* - Child process cleanup (Windows zombie port fix, Unix chroma-mcp pkill)
*/
import http from 'http';
import { execFileSync } from 'child_process';
import { logger } from '../../utils/logger.js';
import {
getChildProcesses,
forceKillProcess,
waitForProcessesExit,
removePidFile
} from './ProcessManager.js';
/**
 * A service that can shut down everything it manages in one call.
 *
 * Used as the type of `GracefulShutdownConfig.sessionManager`;
 * `performGracefulShutdown` awaits `shutdownAll()` before it closes the
 * MCP client and the database.
 */
export interface ShutdownableService {
/** Asynchronously shuts down all managed work; the returned promise is awaited by the caller. */
shutdownAll(): Promise<void>;
}
/**
 * A client connection that can be closed asynchronously.
 *
 * Used as the type of the optional `GracefulShutdownConfig.mcpClient`.
 */
export interface CloseableClient {
/** Closes the client; the returned promise is awaited during shutdown. */
close(): Promise<void>;
}
/**
 * A database handle that can be closed asynchronously.
 *
 * Used as the type of the optional `GracefulShutdownConfig.dbManager`.
 */
export interface CloseableDatabase {
/** Closes the database; the returned promise is awaited during shutdown. */
close(): Promise<void>;
}
/**
* Configuration for graceful shutdown
*
* Bundles the resources that `performGracefulShutdown` closes, in the order
* server -> sessionManager -> mcpClient -> dbManager.
*/
export interface GracefulShutdownConfig {
/** HTTP server to close first; `null` means there is no server and the step is skipped. */
server: http.Server | null;
/** Always required; its `shutdownAll()` is awaited after the HTTP server is closed. */
sessionManager: ShutdownableService;
/** Optional MCP client; closed after the sessions when provided. */
mcpClient?: CloseableClient;
/** Optional database manager; closed after the MCP client when provided. */
dbManager?: CloseableDatabase;
}
/**
 * Perform graceful shutdown of all services.
 *
 * Sequence:
 *  1. Remove the PID file and snapshot the child PIDs of this process.
 *  2. Close the HTTP server, the sessions, the MCP client and the database,
 *     in that order (absent optional resources are skipped).
 *  3. Clean up child processes: on non-Windows platforms `pkill` any
 *     chroma-mcp child of this process, then force-kill every PID captured
 *     in step 1 and wait for them to exit.
 *
 * IMPORTANT: On Windows, we must kill all child processes before exiting
 * to prevent zombie ports. The socket handle can be inherited by children,
 * and if not properly closed, the port stays bound after process death.
 *
 * Because of that, step 3 runs in a `finally` block: a rejection from any
 * close call in step 2 no longer skips child-process cleanup. The rejection
 * still propagates to the caller once cleanup has run. (If cleanup itself
 * throws, that error is the one the caller sees.)
 *
 * @param config - Resources to shut down.
 * @returns Resolves when every step has completed successfully.
 * @throws Whatever a close step or the child-process cleanup rejects with.
 */
export async function performGracefulShutdown(config: GracefulShutdownConfig): Promise<void> {
  logger.info('SYSTEM', 'Shutdown initiated');

  // Clean up PID file on shutdown
  removePidFile();

  // STEP 1: Enumerate all child processes BEFORE we start closing things
  const childPids = await getChildProcesses(process.pid);
  logger.info('SYSTEM', 'Found child processes', { count: childPids.length, pids: childPids });

  try {
    // STEP 2: Close HTTP server first
    if (config.server) {
      await closeHttpServer(config.server);
      logger.info('SYSTEM', 'HTTP server closed');
    }

    // STEP 3: Shutdown active sessions
    await config.sessionManager.shutdownAll();

    // STEP 4: Close MCP client connection (signals child to exit gracefully)
    if (config.mcpClient) {
      await config.mcpClient.close();
      logger.info('SYSTEM', 'MCP client closed');
    }

    // STEP 5: Close database connection (includes ChromaSync cleanup)
    if (config.dbManager) {
      await config.dbManager.close();
    }
  } finally {
    // Everything below must run even when a step above rejected, otherwise
    // child processes are left behind as orphans.

    // STEP 5.5: Kill any chroma-mcp children that survived transport.close() (Unix only)
    // On Unix, getChildProcesses() returns [] (Windows-only), so chroma-mcp
    // subprocesses spawned via StdioClientTransport may escape STEP 5 cleanup
    if (process.platform !== 'win32') {
      try {
        execFileSync('pkill', ['-P', String(process.pid), '-f', 'chroma-mcp'], {
          timeout: 3000,
          stdio: 'ignore'
        });
        logger.info('SYSTEM', 'Killed chroma-mcp child processes');
      } catch {
        // pkill returns exit code 1 if no processes matched — that's fine.
        // Best-effort: any other pkill failure is ignored here as well.
      }
    }

    // STEP 6: Force kill any remaining child processes (Windows zombie port fix)
    if (childPids.length > 0) {
      logger.info('SYSTEM', 'Force killing remaining children');
      for (const pid of childPids) {
        await forceKillProcess(pid);
      }
      // Wait for children to fully exit
      await waitForProcessesExit(childPids, 5000);
    }
  }

  logger.info('SYSTEM', 'Worker shutdown complete');
}
/**
 * Close HTTP server with Windows-specific delays.
 * Windows needs extra time to release sockets properly.
 *
 * Steps: drop all open connections, close the server, and on Windows pause
 * 500 ms both before and after the close.
 *
 * A server that is not running (never started listening, or already closed)
 * makes `server.close()` report `ERR_SERVER_NOT_RUNNING`. There is nothing
 * left to release in that case, so it is treated as a successful close
 * instead of aborting the rest of the shutdown.
 *
 * @param server - The HTTP server to close.
 * @returns Resolves once the server is closed (or was not running).
 * @throws Any other error reported by `server.close()`.
 */
async function closeHttpServer(server: http.Server): Promise<void> {
  const isWindows = process.platform === 'win32';

  // Close all active connections
  server.closeAllConnections();

  // Give Windows time to close connections before closing server (prevents zombie ports)
  if (isWindows) {
    await new Promise(r => setTimeout(r, 500));
  }

  // Close the server
  await new Promise<void>((resolve, reject) => {
    server.close(err => {
      const code = (err as (Error & { code?: string }) | undefined)?.code;
      if (!err || code === 'ERR_SERVER_NOT_RUNNING') {
        // Closed cleanly, or there was no running server to close.
        resolve();
      } else {
        reject(err);
      }
    });
  });

  // Extra delay on Windows to ensure port is fully released
  if (isWindows) {
    await new Promise(r => setTimeout(r, 500));
    logger.info('SYSTEM', 'Waited for Windows port cleanup');
  }
}