From a3f9e7f638d951d76b6989ac96534a335e9b258f Mon Sep 17 00:00:00 2001 From: Rod Boev Date: Wed, 11 Feb 2026 05:53:10 -0500 Subject: [PATCH] =?UTF-8?q?fix:=20prevent=20chroma-mcp=20spawn=20storm=20w?= =?UTF-8?q?ith=205-layer=20defense=20(641=20processes=20=E2=86=92=20max=20?= =?UTF-8?q?2)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During SIGHUP testing with 6+ active sessions, ChromaSync.ensureConnection() had no mutex — concurrent fire-and-forget syncObservation() calls each spawned a chroma-mcp subprocess via StdioClientTransport, creating 641 orphans in ~5min. Error-driven reconnection formed a positive feedback loop amplifying the storm. Defense layers: - Layer 0: Connection mutex via promise memoization (prevents concurrent spawns) - Layer 1: Pre-spawn process count guard using execFileSync('ps') (kills excess) - Layer 2: Hardened close() with try-finally + Unix pkill in GracefulShutdown - Layer 3: Count-based orphan reaper in ProcessManager (not age-based) - Layer 4: Circuit breaker stops retries after 3 consecutive failures for 60s Closes #1063, closes #695 Relates to #1010, #707 --- .../infrastructure/GracefulShutdown.ts | 16 ++ src/services/infrastructure/ProcessManager.ts | 69 ++++++ src/services/sync/ChromaSync.ts | 168 ++++++++++++-- src/services/worker-service.ts | 2 + src/services/worker/ProcessRegistry.ts | 6 +- tests/infrastructure/process-manager.test.ts | 60 +++++ tests/integration/chroma-vector-sync.test.ts | 205 ++++++++++++++++++ 7 files changed, 505 insertions(+), 21 deletions(-) diff --git a/src/services/infrastructure/GracefulShutdown.ts b/src/services/infrastructure/GracefulShutdown.ts index 35226d14..d05940f7 100644 --- a/src/services/infrastructure/GracefulShutdown.ts +++ b/src/services/infrastructure/GracefulShutdown.ts @@ -9,6 +9,7 @@ */ import http from 'http'; +import { execFileSync } from 'child_process'; import { logger } from '../../utils/logger.js'; import { getChildProcesses, @@ -76,6 +77,21 @@ export async function performGracefulShutdown(config: GracefulShutdownConfig): P await config.dbManager.close(); } + // STEP 5.5: Kill any chroma-mcp children that survived transport.close() (Unix only) + // On Unix, getChildProcesses() returns [] (Windows-only), so chroma-mcp + // subprocesses spawned via StdioClientTransport may escape STEP 5 cleanup + if (process.platform !== 'win32') { + try { + execFileSync('pkill', ['-P', String(process.pid), '-f', 'chroma-mcp'], { + timeout: 3000, + stdio: 'ignore' + }); + logger.info('SYSTEM', 'Killed chroma-mcp child processes'); + } catch { + // pkill returns exit code 1 if no processes matched — that's fine + } + } + // STEP 6: Force kill any remaining child processes (Windows zombie port fix) if (childPids.length > 0) { logger.info('SYSTEM', 'Force killing remaining children'); diff --git a/src/services/infrastructure/ProcessManager.ts b/src/services/infrastructure/ProcessManager.ts index 9d128afd..4b47b1f9 100644 --- a/src/services/infrastructure/ProcessManager.ts +++ b/src/services/infrastructure/ProcessManager.ts @@ -335,6 +335,75 @@ export async function cleanupOrphanedProcesses(): Promise { logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length }); } +/** + * Clean up excess chroma-mcp processes by count (not age). + * + * Unlike cleanupOrphanedProcesses() which uses ORPHAN_MAX_AGE_MINUTES = 30, + * this function kills by count — essential for catching spawn storms where + * all processes are young. Keeps the newest processes (by elapsed time) + * and kills the rest. + * + * Returns the number of processes killed. + */ +export async function cleanupExcessChromaProcesses(maxAllowed: number = 2): Promise { + // Windows: Chroma is disabled entirely, no cleanup needed + if (process.platform === 'win32') return 0; + + try { + const { stdout } = await execAsync( + 'ps -eo pid,etime,command | grep -E "chroma-mcp" | grep -v grep || true' + ); + + if (!stdout.trim()) return 0; + + const processes: Array<{ pid: number; ageMinutes: number }> = []; + + for (const line of stdout.trim().split('\n')) { + if (!line.trim()) continue; + const match = line.trim().match(/^(\d+)\s+(\S+)\s+(.*)$/); + if (!match) continue; + + const pid = parseInt(match[1], 10); + const etime = match[2]; + + if (!Number.isInteger(pid) || pid <= 0 || pid === process.pid) continue; + + const ageMinutes = parseElapsedTime(etime); + processes.push({ pid, ageMinutes }); + } + + if (processes.length <= maxAllowed) return 0; + + // Sort: newest first (lowest age), keep maxAllowed, kill rest + processes.sort((a, b) => a.ageMinutes - b.ageMinutes); + const toKill = processes.slice(maxAllowed); + + let killed = 0; + for (const { pid } of toKill) { + try { + process.kill(pid, 'SIGTERM'); + killed++; + logger.info('SYSTEM', 'Killed excess chroma-mcp process', { pid }); + } catch { + // Process may already be dead + } + } + + if (killed > 0) { + logger.warn('SYSTEM', 'Cleaned up excess chroma-mcp processes by count', { + found: processes.length, + killed, + maxAllowed + }); + } + + return killed; + } catch (error) { + logger.debug('SYSTEM', 'Failed to enumerate chroma-mcp processes', {}, error as Error); + return 0; + } +} + /** * Spawn a detached daemon process * Returns the child PID or undefined if spawn failed diff --git a/src/services/sync/ChromaSync.ts b/src/services/sync/ChromaSync.ts index 95bb1018..31f5f40f 100644 --- a/src/services/sync/ChromaSync.ts +++ b/src/services/sync/ChromaSync.ts @@ -18,12 +18,15 @@ import { USER_SETTINGS_PATH } from '../../shared/paths.js'; import path from 'path'; import os from 'os'; import fs from 'fs'; -import { execSync } from 'child_process'; +import { execSync, execFileSync } from 'child_process'; // Version injected at build time by esbuild define declare const __DEFAULT_PACKAGE_VERSION__: string; const packageVersion = typeof __DEFAULT_PACKAGE_VERSION__ !== 'undefined' ? __DEFAULT_PACKAGE_VERSION__ : '0.0.0-dev'; +// Maximum allowed chroma-mcp processes before pre-spawn guard kills excess +const MAX_CHROMA_PROCESSES = 2; // 1 active + 1 starting + interface ChromaDocument { id: string; document: string; @@ -90,6 +93,16 @@ export class ChromaSync { // MCP SDK's StdioClientTransport uses shell:false and no detached flag, so console is inherited. private readonly disabled: boolean = false; + // Layer 0: Connection mutex — coalesces concurrent callers onto single spawn + private connectionPromise: Promise | null = null; + + // Layer 4: Circuit breaker — stops retry storms after repeated failures + private consecutiveFailures: number = 0; + private lastFailureTime: number = 0; + private static readonly MAX_FAILURES = 3; + private static readonly BACKOFF_BASE_MS = 2000; + private static readonly CIRCUIT_OPEN_MS = 60000; // 1 minute cooldown + constructor(project: string) { this.project = project; this.collectionName = `cm__${project}`; @@ -178,14 +191,107 @@ export class ChromaSync { } /** - * Ensure MCP client is connected to Chroma server - * Throws error if connection fails + * Ensure MCP client is connected to Chroma server (mutex wrapper). + * Coalesces concurrent callers onto a single connection attempt. + * This prevents N concurrent calls from each spawning a chroma-mcp subprocess. */ private async ensureConnection(): Promise { - if (this.connected && this.client) { - return; + if (this.connected && this.client) return; + + // Layer 0: Coalesce concurrent callers onto a single connection attempt + if (this.connectionPromise) { + return this.connectionPromise; } + this.connectionPromise = this._doConnect(); + try { + await this.connectionPromise; + } finally { + this.connectionPromise = null; + } + } + + /** + * Layer 4: Circuit breaker — refuse to spawn after repeated failures. + * After MAX_FAILURES consecutive connection failures, stops all spawn + * attempts for CIRCUIT_OPEN_MS to prevent process accumulation storms. + */ + private checkCircuitBreaker(): void { + if (this.consecutiveFailures >= ChromaSync.MAX_FAILURES) { + const elapsed = Date.now() - this.lastFailureTime; + if (elapsed < ChromaSync.CIRCUIT_OPEN_MS) { + throw new Error( + `Chroma circuit breaker open: ${this.consecutiveFailures} consecutive failures. ` + + `Retry in ${Math.ceil((ChromaSync.CIRCUIT_OPEN_MS - elapsed) / 1000)}s` + ); + } + // Cooldown expired, allow retry + logger.info('CHROMA_SYNC', 'Circuit breaker cooldown expired, allowing retry', { + consecutiveFailures: this.consecutiveFailures, + cooldownMs: ChromaSync.CIRCUIT_OPEN_MS + }); + } + } + + /** + * Layer 1: Pre-spawn process count guard. + * Kills excess chroma-mcp processes before spawning a new one. + * Uses execFileSync (no shell) to list processes, filters in JavaScript. + */ + private killExcessChromaProcesses(): void { + if (process.platform === 'win32') return; // Windows has Chroma disabled entirely + + try { + // Use execFileSync to avoid shell injection — filter in JavaScript + const output = execFileSync('ps', ['-eo', 'pid,command'], { + encoding: 'utf8', + timeout: 5000, + stdio: ['pipe', 'pipe', 'pipe'] + }); + + // Filter for chroma-mcp processes in JavaScript (no shell grep needed) + const pids = output.split('\n') + .filter(l => l.includes('chroma-mcp')) + .map(l => parseInt(l.trim().split(/\s+/)[0], 10)) + .filter(pid => pid > 0 && pid !== process.pid) + .sort((a, b) => b - a); // Descending: newest (highest PID) first + + if (pids.length < MAX_CHROMA_PROCESSES) return; + + // Keep newest MAX_CHROMA_PROCESSES - 1 (making room for the one we're about to spawn) + const toKill = pids.slice(MAX_CHROMA_PROCESSES - 1); + for (const pid of toKill) { + try { + process.kill(pid, 'SIGTERM'); + } catch { + // Process may already be dead + } + } + + if (toKill.length > 0) { + logger.warn('CHROMA_SYNC', 'Killed excess chroma-mcp processes before spawning', { + found: pids.length, + killed: toKill.length, + maxAllowed: MAX_CHROMA_PROCESSES + }); + } + } catch { + // ps may fail — don't block connection + } + } + + /** + * Internal connection logic — called only via ensureConnection() mutex. + * Implements circuit breaker (Layer 4), pre-spawn guard (Layer 1), + * and actual connection setup. + */ + private async _doConnect(): Promise { + // Layer 4: Circuit breaker check — refuse if too many recent failures + this.checkCircuitBreaker(); + + // Layer 1: Kill excess processes before spawning a new one + this.killExcessChromaProcesses(); + logger.info('CHROMA_SYNC', 'Connecting to Chroma MCP server...', { project: this.project }); try { @@ -238,9 +344,20 @@ export class ChromaSync { await this.client.connect(this.transport); this.connected = true; + // Layer 4: Reset circuit breaker on success + this.consecutiveFailures = 0; + logger.info('CHROMA_SYNC', 'Connected to Chroma MCP server', { project: this.project }); } catch (error) { - logger.error('CHROMA_SYNC', 'Failed to connect to Chroma MCP server', { project: this.project }, error as Error); + // Layer 4: Track failure for circuit breaker + this.consecutiveFailures++; + this.lastFailureTime = Date.now(); + + logger.error('CHROMA_SYNC', 'Failed to connect to Chroma MCP server', { + project: this.project, + consecutiveFailures: this.consecutiveFailures, + circuitBreakerThreshold: ChromaSync.MAX_FAILURES + }, error as Error); throw new Error(`Chroma connection failed: ${error instanceof Error ? error.message : String(error)}`); } } @@ -291,6 +408,7 @@ export class ChromaSync { this.connected = false; this.client = null; this.transport = null; + this.connectionPromise = null; logger.error('CHROMA_SYNC', 'Connection lost during collection check', { collection: this.collectionName }, error as Error); throw new Error(`Chroma connection lost: ${errorMessage}`); @@ -960,6 +1078,7 @@ export class ChromaSync { this.connected = false; this.client = null; this.transport = null; + this.connectionPromise = null; logger.error('CHROMA_SYNC', 'Connection lost during query', { project: this.project, query }, error as Error); throw new Error(`Chroma query failed - connection lost: ${errorMessage}`); @@ -1017,28 +1136,37 @@ export class ChromaSync { } /** - * Close the Chroma client connection and cleanup subprocess + * Close the Chroma client connection and cleanup subprocess. + * Uses try-finally to guarantee state reset even if close() throws. + * Individual close calls use .catch() to prevent one failure from + * blocking the other (e.g., client.close() failing shouldn't prevent + * transport.close() from killing the subprocess). */ async close(): Promise { if (!this.connected && !this.client && !this.transport) { return; } - // Close client first - if (this.client) { - await this.client.close(); - } - - // Explicitly close transport to kill subprocess - if (this.transport) { - await this.transport.close(); + try { + // Close client first, then transport — catch individual errors + if (this.client) { + await this.client.close().catch((err: Error) => { + logger.debug('CHROMA_SYNC', 'Client close error (may already be disconnected)', {}, err); + }); + } + if (this.transport) { + await this.transport.close().catch((err: Error) => { + logger.debug('CHROMA_SYNC', 'Transport close error (may already be dead)', {}, err); + }); + } + } finally { + // Always reset state, even if close throws + this.connected = false; + this.client = null; + this.transport = null; + this.connectionPromise = null; } logger.info('CHROMA_SYNC', 'Chroma client and subprocess closed', { project: this.project }); - - // Always reset state - this.connected = false; - this.client = null; - this.transport = null; } } diff --git a/src/services/worker-service.ts b/src/services/worker-service.ts index f7948a4d..2c7816ec 100644 --- a/src/services/worker-service.ts +++ b/src/services/worker-service.ts @@ -66,6 +66,7 @@ import { removePidFile, getPlatformTimeout, cleanupOrphanedProcesses, + cleanupExcessChromaProcesses, spawnDaemon, createSignalHandler } from './infrastructure/ProcessManager.js'; @@ -316,6 +317,7 @@ export class WorkerService { private async initializeBackground(): Promise { try { await cleanupOrphanedProcesses(); + await cleanupExcessChromaProcesses(); // Load mode configuration const { ModeManager } = await import('./domain/ModeManager.js'); diff --git a/src/services/worker/ProcessRegistry.ts b/src/services/worker/ProcessRegistry.ts index 0341d6be..6d29fb66 100644 --- a/src/services/worker/ProcessRegistry.ts +++ b/src/services/worker/ProcessRegistry.ts @@ -19,6 +19,7 @@ import { spawn, exec, ChildProcess } from 'child_process'; import { promisify } from 'util'; import { logger } from '../../utils/logger.js'; +import { cleanupExcessChromaProcesses } from '../infrastructure/ProcessManager.js'; const execAsync = promisify(exec); @@ -212,7 +213,7 @@ async function killSystemOrphans(): Promise { try { const { stdout } = await execAsync( - 'ps -eo pid,ppid,args 2>/dev/null | grep -E "claude.*haiku|claude.*output-format" | grep -v grep' + 'ps -eo pid,ppid,args 2>/dev/null | grep -E "claude.*haiku|claude.*output-format|chroma-mcp" | grep -v grep' ); let killed = 0; @@ -262,6 +263,9 @@ export async function reapOrphanedProcesses(activeSessionIds: Set): Prom // Daemon children: find idle SDK processes that didn't terminate killed += await killIdleDaemonChildren(); + // Count-based: kill excess chroma-mcp processes regardless of age + killed += await cleanupExcessChromaProcesses(); + return killed; } diff --git a/tests/infrastructure/process-manager.test.ts b/tests/infrastructure/process-manager.test.ts index 16267fba..1ce12f37 100644 --- a/tests/infrastructure/process-manager.test.ts +++ b/tests/infrastructure/process-manager.test.ts @@ -8,6 +8,7 @@ import { removePidFile, getPlatformTimeout, parseElapsedTime, + cleanupExcessChromaProcesses, type PidInfo } from '../../src/services/infrastructure/index.js'; @@ -221,4 +222,63 @@ describe('ProcessManager', () => { expect(result).toBe(666); }); }); + + describe('cleanupExcessChromaProcesses (Issue #1063)', () => { + /** + * Tests for count-based chroma-mcp process cleanup. + * Unlike the age-based cleanupOrphanedProcesses() which has a 30-minute + * threshold, this function kills by count — essential for catching spawn + * storms where all processes are young. + */ + + it('should be exported and callable', () => { + expect(typeof cleanupExcessChromaProcesses).toBe('function'); + }); + + it('should return 0 on Windows (Chroma disabled)', async () => { + const originalPlatform = process.platform; + Object.defineProperty(process, 'platform', { + value: 'win32', + writable: true, + configurable: true + }); + + try { + const killed = await cleanupExcessChromaProcesses(); + expect(killed).toBe(0); + } finally { + Object.defineProperty(process, 'platform', { + value: originalPlatform, + writable: true, + configurable: true + }); + } + }); + + it('should accept custom maxAllowed parameter', async () => { + // Should not throw with any valid maxAllowed value + const killed = await cleanupExcessChromaProcesses(5); + expect(killed).toBeGreaterThanOrEqual(0); + }); + + it('should return a number (killed count)', async () => { + const killed = await cleanupExcessChromaProcesses(); + expect(typeof killed).toBe('number'); + expect(killed).toBeGreaterThanOrEqual(0); + }); + + it('should exist in ProcessManager source with count-based logic', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/infrastructure/ProcessManager.ts', import.meta.url) + ).text(); + + // Verify count-based logic exists (not age-based) + expect(sourceFile).toContain('cleanupExcessChromaProcesses'); + expect(sourceFile).toContain('chroma-mcp'); + + // Should sort by age and keep newest + expect(sourceFile).toContain('.sort('); + expect(sourceFile).toContain('.slice(maxAllowed)'); + }); + }); }); diff --git a/tests/integration/chroma-vector-sync.test.ts b/tests/integration/chroma-vector-sync.test.ts index bb8c19d0..eb3a6479 100644 --- a/tests/integration/chroma-vector-sync.test.ts +++ b/tests/integration/chroma-vector-sync.test.ts @@ -392,4 +392,209 @@ describe('ChromaSync Vector Sync Integration', () => { expect(sourceFile).toContain('this.transport = null'); }); }); + + describe('Spawn storm prevention (Issue #1063)', () => { + /** + * Regression tests for chroma-mcp spawn storm: + * 641 processes spawned in ~5 minutes from 6 concurrent sessions. + * + * Root cause: ensureConnection() had no mutex. Concurrent callers + * each spawned a chroma-mcp subprocess via StdioClientTransport. + * + * Fix: 5 defense layers — connection mutex, pre-spawn count guard, + * hardened close(), count-based orphan reaper, circuit breaker. + */ + + describe('Layer 0: Connection mutex', () => { + it('should have connectionPromise field for mutex', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + // connectionPromise should be null initially + expect(syncAny.connectionPromise).toBeNull(); + }); + + it('should coalesce concurrent ensureConnection calls via source code', async () => { + // Static analysis: verify mutex pattern exists in source + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // Verify mutex pattern: check for connectionPromise, return it + expect(sourceFile).toContain('if (this.connectionPromise)'); + expect(sourceFile).toContain('return this.connectionPromise'); + expect(sourceFile).toContain('this.connectionPromise = this._doConnect()'); + }); + + it('should clear connectionPromise in finally block', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // The finally block ensures connectionPromise is cleared even on error + expect(sourceFile).toContain('finally {'); + expect(sourceFile).toContain('this.connectionPromise = null'); + }); + + it('should clear connectionPromise in error recovery paths', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // Count occurrences of connectionPromise = null (should be in multiple places: + // finally block, ensureCollection error, queryChroma error, close()) + const matches = sourceFile.match(/this\.connectionPromise = null/g) || []; + expect(matches.length).toBeGreaterThanOrEqual(4); + }); + }); + + describe('Layer 1: Pre-spawn process count guard', () => { + it('should have killExcessChromaProcesses method', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + expect(typeof syncAny.killExcessChromaProcesses).toBe('function'); + }); + + it('should use execFileSync not execSync for safety', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // killExcessChromaProcesses should use execFileSync (no shell injection) + // Extract just the method body + const methodStart = sourceFile.indexOf('killExcessChromaProcesses'); + const methodBody = sourceFile.slice(methodStart, methodStart + 500); + expect(methodBody).toContain('execFileSync'); + }); + + it('should define MAX_CHROMA_PROCESSES constant', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + expect(sourceFile).toContain('MAX_CHROMA_PROCESSES'); + }); + }); + + describe('Layer 2: Hardened close()', () => { + it('should use try-finally to guarantee state reset', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // Find the close() method body (needs larger slice to capture finally block) + const closeStart = sourceFile.indexOf('async close():'); + const closeBody = sourceFile.slice(closeStart, closeStart + 1000); + + // Verify try-finally pattern + expect(closeBody).toContain('try {'); + expect(closeBody).toContain('} finally {'); + }); + + it('should catch individual close errors with .catch()', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + const closeStart = sourceFile.indexOf('async close():'); + const closeBody = sourceFile.slice(closeStart, closeStart + 600); + + // Both client and transport close should have .catch() + expect(closeBody).toContain('this.client.close().catch('); + expect(closeBody).toContain('this.transport.close().catch('); + }); + + it('should reset connectionPromise in close()', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + // Simulate partially initialized state — must set connected=true + // so close() doesn't early-return before reaching the finally block + syncAny.connectionPromise = Promise.resolve(); + syncAny.connected = true; + + await sync.close(); + + // connectionPromise must be reset + expect(syncAny.connectionPromise).toBeNull(); + }); + }); + + describe('Layer 4: Circuit breaker', () => { + it('should have circuit breaker fields', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + expect(syncAny.consecutiveFailures).toBe(0); + expect(syncAny.lastFailureTime).toBe(0); + }); + + it('should have circuit breaker constants', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const syncClass = ChromaSync as any; + + expect(syncClass.MAX_FAILURES).toBe(3); + expect(syncClass.CIRCUIT_OPEN_MS).toBe(60000); + }); + + it('should throw when circuit breaker is open', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + // Simulate 3 consecutive failures just now + syncAny.consecutiveFailures = 3; + syncAny.lastFailureTime = Date.now(); + + // checkCircuitBreaker should throw + expect(() => syncAny.checkCircuitBreaker()).toThrow('circuit breaker open'); + }); + + it('should allow retry after cooldown expires', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + // Simulate 3 failures that happened 2 minutes ago (past cooldown) + syncAny.consecutiveFailures = 3; + syncAny.lastFailureTime = Date.now() - 120000; + + // checkCircuitBreaker should NOT throw — cooldown expired + expect(() => syncAny.checkCircuitBreaker()).not.toThrow(); + }); + + it('should not throw when under failure threshold', async () => { + const { ChromaSync } = await import('../../src/services/sync/ChromaSync.js'); + const sync = new ChromaSync(testProject); + const syncAny = sync as any; + + // 2 failures (under MAX_FAILURES = 3) + syncAny.consecutiveFailures = 2; + syncAny.lastFailureTime = Date.now(); + + expect(() => syncAny.checkCircuitBreaker()).not.toThrow(); + }); + + it('should track failures in _doConnect error path via source code', async () => { + const sourceFile = await Bun.file( + new URL('../../src/services/sync/ChromaSync.ts', import.meta.url) + ).text(); + + // Verify _doConnect increments consecutiveFailures on error + const doConnectStart = sourceFile.indexOf('private async _doConnect'); + const doConnectBody = sourceFile.slice(doConnectStart, doConnectStart + 3000); + + expect(doConnectBody).toContain('this.consecutiveFailures++'); + expect(doConnectBody).toContain('this.lastFailureTime = Date.now()'); + + // And resets on success + expect(doConnectBody).toContain('this.consecutiveFailures = 0'); + }); + }); + }); });