fix: prevent duplicate worker daemons and zombie processes (#1178)
* fix: prevent duplicate worker daemons and zombie processes Three root causes of chroma-mcp timeouts: 1. HTTP shutdown (POST /api/admin/shutdown) closed resources but never called process.exit(). Zombie workers stayed alive, background tasks reconnected to chroma-mcp, spawning duplicate subprocesses that all contended for the same persistent data directory. 2. No guard against concurrent daemon startup. When hooks fired simultaneously, multiple daemons started before either wrote a PID file. The loser got EADDRINUSE but stayed alive because signal handlers registered in the constructor prevented exit. 3. Corrupt 147GB HNSW index file caused all chroma queries to timeout (MCP error -32001). Data fix: deleted corrupt collection, backfill rebuilds from SQLite. Code fixes: - Add PID-based guard in daemon startup: exit if PID file process alive - Add port-based guard in daemon startup: exit if port already bound (runs before WorkerService constructor registers keepalive handlers) - Add process.exit(0) after HTTP shutdown/restart completes Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: aggressive startup cleanup and one-time chroma wipe for upgrade Kill orphaned worker-service.cjs and chroma-mcp processes immediately at startup (no age gate) while keeping 30-min threshold for mcp-server. Wipe corrupt chroma data once on upgrade from pre-v10.3 versions — backfill rebuilds from SQLite automatically. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: wrap shutdown handlers in try/finally to guarantee process.exit If onShutdown() or onRestart() threw, process.exit(0) was never reached, leaving the daemon alive as a zombie. Also removed redundant require('fs') calls in process-manager tests where ESM imports already existed. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
import { describe, it, expect, beforeEach, afterEach } from 'bun:test';
|
||||
import { existsSync, readFileSync } from 'fs';
|
||||
import { existsSync, readFileSync, mkdirSync, writeFileSync, rmSync } from 'fs';
|
||||
import { homedir } from 'os';
|
||||
import { tmpdir } from 'os';
|
||||
import path from 'path';
|
||||
import {
|
||||
writePidFile,
|
||||
@@ -12,6 +13,7 @@ import {
|
||||
cleanStalePidFile,
|
||||
spawnDaemon,
|
||||
resolveWorkerRuntimePath,
|
||||
runOneTimeChromaMigration,
|
||||
type PidInfo
|
||||
} from '../../src/services/infrastructure/index.js';
|
||||
|
||||
@@ -32,7 +34,6 @@ describe('ProcessManager', () => {
|
||||
afterEach(() => {
|
||||
// Restore original PID file or remove test one
|
||||
if (originalPidContent !== null) {
|
||||
const { writeFileSync } = require('fs');
|
||||
writeFileSync(PID_FILE, originalPidContent);
|
||||
originalPidContent = null;
|
||||
} else {
|
||||
@@ -105,7 +106,6 @@ describe('ProcessManager', () => {
|
||||
});
|
||||
|
||||
it('should return null for corrupted JSON', () => {
|
||||
const { writeFileSync } = require('fs');
|
||||
writeFileSync(PID_FILE, 'not valid json {{{');
|
||||
|
||||
const result = readPidFile();
|
||||
@@ -415,4 +415,53 @@ describe('ProcessManager', () => {
|
||||
// This is a logic verification test — actual signal delivery is tested manually
|
||||
});
|
||||
});
|
||||
|
||||
describe('runOneTimeChromaMigration', () => {
|
||||
let testDataDir: string;
|
||||
|
||||
beforeEach(() => {
|
||||
testDataDir = path.join(tmpdir(), `claude-mem-test-${Date.now()}-${Math.random().toString(36).slice(2)}`);
|
||||
mkdirSync(testDataDir, { recursive: true });
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
rmSync(testDataDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
it('should wipe chroma directory and write marker file', () => {
|
||||
// Create a fake chroma directory with data
|
||||
const chromaDir = path.join(testDataDir, 'chroma');
|
||||
mkdirSync(chromaDir, { recursive: true });
|
||||
writeFileSync(path.join(chromaDir, 'test-data.bin'), 'fake chroma data');
|
||||
|
||||
runOneTimeChromaMigration(testDataDir);
|
||||
|
||||
// Chroma dir should be gone
|
||||
expect(existsSync(chromaDir)).toBe(false);
|
||||
// Marker file should exist
|
||||
expect(existsSync(path.join(testDataDir, '.chroma-cleaned-v10.3'))).toBe(true);
|
||||
});
|
||||
|
||||
it('should skip when marker file already exists (idempotent)', () => {
|
||||
// Write marker file first
|
||||
writeFileSync(path.join(testDataDir, '.chroma-cleaned-v10.3'), 'already done');
|
||||
|
||||
// Create a chroma directory that should NOT be wiped
|
||||
const chromaDir = path.join(testDataDir, 'chroma');
|
||||
mkdirSync(chromaDir, { recursive: true });
|
||||
writeFileSync(path.join(chromaDir, 'important.bin'), 'should survive');
|
||||
|
||||
runOneTimeChromaMigration(testDataDir);
|
||||
|
||||
// Chroma dir should still exist (migration was skipped)
|
||||
expect(existsSync(chromaDir)).toBe(true);
|
||||
expect(existsSync(path.join(chromaDir, 'important.bin'))).toBe(true);
|
||||
});
|
||||
|
||||
it('should handle missing chroma directory gracefully', () => {
|
||||
// No chroma dir exists — should just write marker without error
|
||||
expect(() => runOneTimeChromaMigration(testDataDir)).not.toThrow();
|
||||
expect(existsSync(path.join(testDataDir, '.chroma-cleaned-v10.3'))).toBe(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user