fix: prevent chroma-mcp spawn storm with 5-layer defense (641 processes → max 2)
ChromaSync.ensureConnection() had no mutex, so during SIGHUP testing with 6+
active sessions each concurrent fire-and-forget syncObservation() call spawned
its own chroma-mcp subprocess via StdioClientTransport, creating 641 orphans
in ~5min. Error-driven reconnection then formed a positive feedback loop that
amplified the storm.
Defense layers:
- Layer 0: Connection mutex via promise memoization (prevents concurrent spawns)
- Layer 1: Pre-spawn process count guard using execFileSync('ps') (kills excess)
- Layer 2: Hardened close() with try-finally + Unix pkill in GracefulShutdown
- Layer 3: Count-based orphan reaper in ProcessManager (not age-based)
- Layer 4: Circuit breaker pauses retries for 60s after 3 consecutive failures
Closes #1063, closes #695
Relates to #1010, #707
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
import http from 'http';
|
||||
import { execFileSync } from 'child_process';
|
||||
import { logger } from '../../utils/logger.js';
|
||||
import {
|
||||
getChildProcesses,
|
||||
@@ -76,6 +77,21 @@ export async function performGracefulShutdown(config: GracefulShutdownConfig): P
|
||||
await config.dbManager.close();
|
||||
}
|
||||
|
||||
// STEP 5.5: Kill any chroma-mcp children that survived transport.close() (Unix only)
|
||||
// On Unix, getChildProcesses() returns [] (Windows-only), so chroma-mcp
|
||||
// subprocesses spawned via StdioClientTransport may escape STEP 5 cleanup
|
||||
if (process.platform !== 'win32') {
|
||||
try {
|
||||
execFileSync('pkill', ['-P', String(process.pid), '-f', 'chroma-mcp'], {
|
||||
timeout: 3000,
|
||||
stdio: 'ignore'
|
||||
});
|
||||
logger.info('SYSTEM', 'Killed chroma-mcp child processes');
|
||||
} catch {
|
||||
// pkill returns exit code 1 if no processes matched — that's fine
|
||||
}
|
||||
}
|
||||
|
||||
// STEP 6: Force kill any remaining child processes (Windows zombie port fix)
|
||||
if (childPids.length > 0) {
|
||||
logger.info('SYSTEM', 'Force killing remaining children');
|
||||
|
||||
Reference in New Issue
Block a user