fix: prevent chroma-mcp spawn storm with 5-layer defense (641 processes → max 2)

During SIGHUP testing with 6+ active sessions, ChromaSync.ensureConnection()
had no mutex — concurrent fire-and-forget syncObservation() calls each spawned
a chroma-mcp subprocess via StdioClientTransport, creating 641 orphans in ~5min.
Error-driven reconnection formed a positive feedback loop amplifying the storm.

Defense layers:
- Layer 0: Connection mutex via promise memoization (prevents concurrent spawns)
- Layer 1: Pre-spawn process count guard using execFileSync('ps') (kills excess)
- Layer 2: Hardened close() with try-finally + Unix pkill in GracefulShutdown
- Layer 3: Count-based orphan reaper in ProcessManager (not age-based)
- Layer 4: Circuit breaker stops retries after 3 consecutive failures for 60s

Closes #1063, closes #695
Relates to #1010, #707
This commit is contained in:
Rod Boev
2026-02-11 05:53:10 -05:00
parent 79b3a61ac8
commit a3f9e7f638
7 changed files with 505 additions and 21 deletions
@@ -335,6 +335,75 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length });
}
/**
* Clean up excess chroma-mcp processes by count (not age).
*
* Unlike cleanupOrphanedProcesses() which uses ORPHAN_MAX_AGE_MINUTES = 30,
* this function kills by count — essential for catching spawn storms where
* all processes are young. Keeps the newest processes (by elapsed time)
* and kills the rest.
*
* Returns the number of processes killed.
*/
export async function cleanupExcessChromaProcesses(maxAllowed: number = 2): Promise<number> {
// Windows: Chroma is disabled entirely, no cleanup needed
if (process.platform === 'win32') return 0;
try {
const { stdout } = await execAsync(
'ps -eo pid,etime,command | grep -E "chroma-mcp" | grep -v grep || true'
);
if (!stdout.trim()) return 0;
const processes: Array<{ pid: number; ageMinutes: number }> = [];
for (const line of stdout.trim().split('\n')) {
if (!line.trim()) continue;
const match = line.trim().match(/^(\d+)\s+(\S+)\s+(.*)$/);
if (!match) continue;
const pid = parseInt(match[1], 10);
const etime = match[2];
if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) continue;
const ageMinutes = parseElapsedTime(etime);
processes.push({ pid, ageMinutes });
}
if (processes.length <= maxAllowed) return 0;
// Sort: newest first (lowest age), keep maxAllowed, kill rest
processes.sort((a, b) => a.ageMinutes - b.ageMinutes);
const toKill = processes.slice(maxAllowed);
let killed = 0;
for (const { pid } of toKill) {
try {
process.kill(pid, 'SIGTERM');
killed++;
logger.info('SYSTEM', 'Killed excess chroma-mcp process', { pid });
} catch {
// Process may already be dead
}
}
if (killed > 0) {
logger.warn('SYSTEM', 'Cleaned up excess chroma-mcp processes by count', {
found: processes.length,
killed,
maxAllowed
});
}
return killed;
} catch (error) {
logger.debug('SYSTEM', 'Failed to enumerate chroma-mcp processes', {}, error as Error);
return 0;
}
}
/**
* Spawn a detached daemon process
* Returns the child PID or undefined if spawn failed