a3f9e7f638
During SIGHUP testing with 6+ active sessions, ChromaSync.ensureConnection()
had no mutex — concurrent fire-and-forget syncObservation() calls each spawned
a chroma-mcp subprocess via StdioClientTransport, creating 641 orphans in ~5min.
Error-driven reconnection formed a positive feedback loop amplifying the storm.
Defense layers:
- Layer 0: Connection mutex via promise memoization (prevents concurrent spawns)
- Layer 1: Pre-spawn process count guard using execFileSync('ps') (kills excess)
- Layer 2: Hardened close() with try-finally + Unix pkill in GracefulShutdown
- Layer 3: Count-based orphan reaper in ProcessManager (not age-based)
- Layer 4: Circuit breaker stops retries after 3 consecutive failures for 60s
Closes #1063, closes #695
Relates to #1010, #707
132 lines
3.9 KiB
TypeScript
132 lines
3.9 KiB
TypeScript
/**
 * GracefulShutdown - Cleanup utilities for graceful exit
 *
 * Extracted from worker-service.ts to provide centralized shutdown coordination.
 * Handles:
 * - HTTP server closure (with Windows-specific delays)
 * - Session manager shutdown coordination
 * - Child process cleanup (Windows zombie port fix)
 */
|
|
|
|
import http from 'http';
|
|
import { execFileSync } from 'child_process';
|
|
import { logger } from '../../utils/logger.js';
|
|
import {
|
|
getChildProcesses,
|
|
forceKillProcess,
|
|
waitForProcessesExit,
|
|
removePidFile
|
|
} from './ProcessManager.js';
|
|
|
|
/**
 * A service whose managed work can be stopped as a single unit.
 *
 * performGracefulShutdown() awaits `shutdownAll()` on the configured
 * session manager before closing the MCP client and the database.
 */
export interface ShutdownableService {
  /** Stops everything the service manages; the returned promise is awaited by the shutdown routine. */
  shutdownAll(): Promise<void>;
}
|
|
|
|
/**
 * A client connection that can be closed asynchronously.
 *
 * Used for the optional MCP client handed to performGracefulShutdown().
 */
export interface CloseableClient {
  /** Closes the connection; the returned promise is awaited by the shutdown routine. */
  close(): Promise<void>;
}
|
|
|
|
/**
 * A database handle that can be closed asynchronously.
 *
 * Used for the optional database manager handed to performGracefulShutdown().
 */
export interface CloseableDatabase {
  /** Closes the database; the returned promise is awaited by the shutdown routine. */
  close(): Promise<void>;
}
|
|
|
|
/**
 * Configuration for graceful shutdown: the set of services that
 * performGracefulShutdown() tears down.
 */
export interface GracefulShutdownConfig {
  /** HTTP server to close; `null` skips the HTTP server step. */
  server: http.Server | null;
  /** Session manager; its `shutdownAll()` is always awaited. */
  sessionManager: ShutdownableService;
  /** Optional MCP client; closed only when provided. */
  mcpClient?: CloseableClient;
  /** Optional database manager; closed only when provided. */
  dbManager?: CloseableDatabase;
}
|
|
|
|
/**
 * Perform graceful shutdown of all services
 *
 * Sequence:
 *  1. Remove the PID file and snapshot the current child PIDs.
 *  2. Close the HTTP server, shut down sessions, close the MCP client and
 *     the database (the "graceful" phase).
 *  3. Terminate leftover child processes (the "termination" phase).
 *
 * IMPORTANT: On Windows, we must kill all child processes before exiting
 * to prevent zombie ports. The socket handle can be inherited by children,
 * and if not properly closed, the port stays bound after process death.
 *
 * The termination phase runs in a `finally` block: if any step of the
 * graceful phase rejects, child processes are still killed and the error
 * then propagates to the caller. Without this, one failed close() would
 * skip process cleanup entirely and leave children orphaned.
 *
 * @param config - Services to shut down. `server`, `mcpClient` and
 *   `dbManager` are skipped when null/undefined.
 * @returns Resolves after all steps completed successfully.
 * @throws Whatever a shutdown step throws, after the termination phase ran.
 */
export async function performGracefulShutdown(config: GracefulShutdownConfig): Promise<void> {
  logger.info('SYSTEM', 'Shutdown initiated');

  // Clean up PID file on shutdown
  removePidFile();

  // STEP 1: Enumerate all child processes BEFORE we start closing things
  const childPids = await getChildProcesses(process.pid);
  logger.info('SYSTEM', 'Found child processes', { count: childPids.length, pids: childPids });

  try {
    // STEP 2: Close HTTP server first
    if (config.server) {
      await closeHttpServer(config.server);
      logger.info('SYSTEM', 'HTTP server closed');
    }

    // STEP 3: Shutdown active sessions
    await config.sessionManager.shutdownAll();

    // STEP 4: Close MCP client connection (signals child to exit gracefully)
    if (config.mcpClient) {
      await config.mcpClient.close();
      logger.info('SYSTEM', 'MCP client closed');
    }

    // STEP 5: Close database connection (includes ChromaSync cleanup)
    if (config.dbManager) {
      await config.dbManager.close();
    }
  } finally {
    // STEP 5.5: Kill any chroma-mcp children that survived transport.close() (Unix only)
    // On Unix, getChildProcesses() returns [] (Windows-only), so chroma-mcp
    // subprocesses spawned via StdioClientTransport may escape STEP 5 cleanup
    if (process.platform !== 'win32') {
      try {
        // -P restricts the match to direct children of this process;
        // -f matches 'chroma-mcp' against the full command line.
        execFileSync('pkill', ['-P', String(process.pid), '-f', 'chroma-mcp'], {
          timeout: 3000,
          stdio: 'ignore'
        });
        logger.info('SYSTEM', 'Killed chroma-mcp child processes');
      } catch {
        // pkill returns exit code 1 if no processes matched — that's fine.
        // Any other failure (timeout, pkill unavailable) is also ignored:
        // this step is best-effort.
      }
    }

    // STEP 6: Force kill any remaining child processes (Windows zombie port fix)
    if (childPids.length > 0) {
      logger.info('SYSTEM', 'Force killing remaining children');
      for (const pid of childPids) {
        await forceKillProcess(pid);
      }
      // Wait for children to fully exit
      await waitForProcessesExit(childPids, 5000);
    }
  }

  logger.info('SYSTEM', 'Worker shutdown complete');
}
|
|
|
|
/**
 * Close HTTP server with Windows-specific delays
 * Windows needs extra time to release sockets properly
 *
 * A server that is not running (never listened, or already closed) is
 * treated as successfully closed: Node reports that case to the close()
 * callback as an error with code 'ERR_SERVER_NOT_RUNNING', and rejecting
 * on it would abort the rest of the shutdown sequence for no reason.
 *
 * @param server - The HTTP server to close.
 * @returns Resolves once the server is closed (or was not running).
 * @throws Any other error reported by `server.close()`.
 */
async function closeHttpServer(server: http.Server): Promise<void> {
  const isWindows = process.platform === 'win32';
  const windowsSocketDelayMs = 500;

  // Close all active connections
  server.closeAllConnections();

  // Give Windows time to close connections before closing server (prevents zombie ports)
  if (isWindows) {
    await new Promise<void>(resolve => setTimeout(resolve, windowsSocketDelayMs));
  }

  // Close the server
  await new Promise<void>((resolve, reject) => {
    server.close(err => {
      const code = err ? (err as Error & { code?: string }).code : undefined;
      if (err && code !== 'ERR_SERVER_NOT_RUNNING') {
        reject(err);
        return;
      }
      // No error, or the server was already stopped: nothing left to close.
      resolve();
    });
  });

  // Extra delay on Windows to ensure port is fully released
  if (isWindows) {
    await new Promise<void>(resolve => setTimeout(resolve, windowsSocketDelayMs));
    logger.info('SYSTEM', 'Waited for Windows port cleanup');
  }
}
|