MAESTRO: Expand startup orphan cleanup to target mcp-server and worker-service processes

The startup cleanupOrphanedProcesses() only targeted chroma-mcp, leaving
orphaned mcp-server.cjs and worker-service.cjs processes undetected after
daemon crashes. Expanded to target all claude-mem process types with
30-minute age filtering and current PID exclusion. Closes PR #687 (which
had a spawnDaemon regression removing Windows WMIC support).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-05 22:06:46 -05:00
parent edfc1a403f
commit d333c7dc08
5 changed files with 243 additions and 140 deletions
+2 -1
View File
@@ -23,4 +23,5 @@ These PRs address orphaned/zombie processes. This is a recurring theme — v9.0.
- [x] Review PR #713 (`fix: prevent SDK subprocess accumulation` by @cjpeterein). Files: 17 files (very large PR). Steps: (1) `gh pr checkout 713` (2) This is a large PR touching many files — check how much overlaps with v9.0.8 ProcessRegistry work (3) Large PRs on process management are risky. If substantial overlap exists with shipped code, close with explanation. (4) If unique value remains, request the author to rebase and reduce scope.
> **Result: CLOSED** — 873 additions across 18 files with extensive overlap with shipped infrastructure. ProcessRegistry (v9.0.8) provides PID tracking and 3-layer orphan reaper, SessionQueueProcessor (v9.0.13) provides 3-minute idle timeout, and ProcessManager.ts already handles startup orphan cleanup. The only genuinely new addition was a try/finally wrapper around the SDK query loop, which is redundant given deleteSession() → abort() → ensureProcessExit() → SIGKILL escalation already handles subprocess cleanup. The large surface area (18 files) makes this high-risk for marginal defensive benefit.
- [ ] Review PR #687 (`fix: expand orphaned process cleanup to include mcp-server and worker-service` by @MrSaneApps). File: `src/services/infrastructure/ProcessManager.ts`. Steps: (1) `gh pr checkout 687` (2) Single-file change expanding what processes the reaper targets — low risk (3) Verify it correctly identifies mcp-server and worker-service processes (4) Run `npm run build` (5) If clean: `gh pr merge 687 --rebase --delete-branch`
- [x] Review PR #687 (`fix: expand orphaned process cleanup to include mcp-server and worker-service` by @MrSaneApps). File: `src/services/infrastructure/ProcessManager.ts`. Steps: (1) `gh pr checkout 687` (2) Single-file change expanding what processes the reaper targets — low risk (3) Verify it correctly identifies mcp-server and worker-service processes (4) Run `npm run build` (5) If clean: `gh pr merge 687 --rebase --delete-branch`
> **Result: CLOSED — Core concept implemented on main.** The PR correctly identified a real gap: startup cleanup only targeted `chroma-mcp` while orphaned `mcp-server.cjs` and `worker-service.cjs` processes went undetected after daemon crashes. However, the PR also destructively rewrote `spawnDaemon()`, removing the Windows WMIC spawn path (needed for console-popup-free daemon spawning) in favor of a simple `spawn()` with `windowsHide: true` — a regression. The `claude-mem` pattern was also overly broad. **Implemented directly on main:** expanded `cleanupOrphanedProcesses()` to target `mcp-server.cjs`, `worker-service.cjs`, and `chroma-mcp` with 30-minute age filtering, current PID exclusion, and proper `parseElapsedTime()` helper. Build passes, tests added and passing.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+110 -34
View File
@@ -22,6 +22,17 @@ const execAsync = promisify(exec);
const DATA_DIR = path.join(homedir(), '.claude-mem');
const PID_FILE = path.join(DATA_DIR, 'worker.pid');
// Orphaned process cleanup patterns and thresholds
// These are claude-mem processes that can accumulate if not properly terminated
const ORPHAN_PROCESS_PATTERNS = [
'mcp-server.cjs', // Main MCP server process
'worker-service.cjs', // Background worker daemon
'chroma-mcp' // ChromaDB MCP subprocess
];
// Only kill processes older than this to avoid killing the current session
const ORPHAN_MAX_AGE_MINUTES = 30;
export interface PidInfo {
pid: number;
port: number;
@@ -162,55 +173,119 @@ export async function waitForProcessesExit(pids: number[], timeoutMs: number): P
}
/**
* Clean up orphaned chroma-mcp processes from previous worker sessions
* Prevents process accumulation and memory leaks
* Parse process elapsed time from ps etime format: [[DD-]HH:]MM:SS
* Returns age in minutes, or -1 if parsing fails
*/
export function parseElapsedTime(etime: string): number {
if (!etime || etime.trim() === '') return -1;
const cleaned = etime.trim();
let totalMinutes = 0;
// DD-HH:MM:SS format
const dayMatch = cleaned.match(/^(\d+)-(\d+):(\d+):(\d+)$/);
if (dayMatch) {
totalMinutes = parseInt(dayMatch[1], 10) * 24 * 60 +
parseInt(dayMatch[2], 10) * 60 +
parseInt(dayMatch[3], 10);
return totalMinutes;
}
// HH:MM:SS format
const hourMatch = cleaned.match(/^(\d+):(\d+):(\d+)$/);
if (hourMatch) {
totalMinutes = parseInt(hourMatch[1], 10) * 60 + parseInt(hourMatch[2], 10);
return totalMinutes;
}
// MM:SS format
const minMatch = cleaned.match(/^(\d+):(\d+)$/);
if (minMatch) {
return parseInt(minMatch[1], 10);
}
return -1;
}
/**
* Clean up orphaned claude-mem processes from previous worker sessions
*
* Targets mcp-server.cjs, worker-service.cjs, and chroma-mcp processes
* that survived a previous daemon crash. Only kills processes older than
* ORPHAN_MAX_AGE_MINUTES to avoid killing the current session.
*
* The periodic ProcessRegistry reaper handles in-session orphans;
* this function handles cross-session orphans at startup.
*/
export async function cleanupOrphanedProcesses(): Promise<void> {
const isWindows = process.platform === 'win32';
const pids: number[] = [];
const currentPid = process.pid;
const pidsToKill: number[] = [];
try {
if (isWindows) {
// Windows: Use PowerShell Get-CimInstance instead of WMIC (deprecated in Windows 11)
const cmd = `powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process | Where-Object { \\$_.Name -like '*python*' -and \\$_.CommandLine -like '*chroma-mcp*' } | Select-Object -ExpandProperty ProcessId"`;
// Windows: Use PowerShell Get-CimInstance with JSON output for age filtering
const patternConditions = ORPHAN_PROCESS_PATTERNS
.map(p => `\\$_.CommandLine -like '*${p}*'`)
.join(' -or ');
const cmd = `powershell -NoProfile -NonInteractive -Command "Get-CimInstance Win32_Process | Where-Object { (${patternConditions}) -and \\$_.ProcessId -ne ${currentPid} } | Select-Object ProcessId, CreationDate | ConvertTo-Json"`;
const { stdout } = await execAsync(cmd, { timeout: HOOK_TIMEOUTS.POWERSHELL_COMMAND });
if (!stdout.trim()) {
logger.debug('SYSTEM', 'No orphaned chroma-mcp processes found (Windows)');
if (!stdout.trim() || stdout.trim() === 'null') {
logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Windows)');
return;
}
// PowerShell outputs just numbers (one per line), simpler than WMIC's "ProcessId=1234" format
const lines = stdout
.split('\n')
.map(line => line.trim())
.filter(line => line.length > 0 && /^\d+$/.test(line));
const processes = JSON.parse(stdout);
const processList = Array.isArray(processes) ? processes : [processes];
const now = Date.now();
for (const line of lines) {
const pid = parseInt(line, 10);
// SECURITY: Validate PID is positive integer before adding to list
if (!isNaN(pid) && Number.isInteger(pid) && pid > 0) {
pids.push(pid);
for (const proc of processList) {
const pid = proc.ProcessId;
// SECURITY: Validate PID is positive integer and not current process
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
// Parse Windows WMI date format: /Date(1234567890123)/
const creationMatch = proc.CreationDate?.match(/\/Date\((\d+)\)\//);
if (creationMatch) {
const creationTime = parseInt(creationMatch[1], 10);
const ageMinutes = (now - creationTime) / (1000 * 60);
if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
pidsToKill.push(pid);
logger.debug('SYSTEM', 'Found orphaned process', { pid, ageMinutes: Math.round(ageMinutes) });
}
}
}
} else {
// Unix: Use ps aux | grep
const { stdout } = await execAsync('ps aux | grep "chroma-mcp" | grep -v grep || true');
// Unix: Use ps with elapsed time for age-based filtering
const patternRegex = ORPHAN_PROCESS_PATTERNS.join('|');
const { stdout } = await execAsync(
`ps -eo pid,etime,command | grep -E "${patternRegex}" | grep -v grep || true`
);
if (!stdout.trim()) {
logger.debug('SYSTEM', 'No orphaned chroma-mcp processes found (Unix)');
logger.debug('SYSTEM', 'No orphaned claude-mem processes found (Unix)');
return;
}
const lines = stdout.trim().split('\n');
for (const line of lines) {
const parts = line.trim().split(/\s+/);
if (parts.length > 1) {
const pid = parseInt(parts[1], 10);
// SECURITY: Validate PID is positive integer before adding to list
if (!isNaN(pid) && Number.isInteger(pid) && pid > 0) {
pids.push(pid);
}
// Parse: " 1234 01:23:45 /path/to/process"
const match = line.trim().match(/^(\d+)\s+(\S+)\s+(.*)$/);
if (!match) continue;
const pid = parseInt(match[1], 10);
const etime = match[2];
// SECURITY: Validate PID is positive integer and not current process
if (!Number.isInteger(pid) || pid <= 0 || pid === currentPid) continue;
const ageMinutes = parseElapsedTime(etime);
if (ageMinutes >= ORPHAN_MAX_AGE_MINUTES) {
pidsToKill.push(pid);
logger.debug('SYSTEM', 'Found orphaned process', { pid, ageMinutes, command: match[3].substring(0, 80) });
}
}
}
@@ -220,19 +295,20 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
return;
}
if (pids.length === 0) {
if (pidsToKill.length === 0) {
return;
}
logger.info('SYSTEM', 'Cleaning up orphaned chroma-mcp processes', {
logger.info('SYSTEM', 'Cleaning up orphaned claude-mem processes', {
platform: isWindows ? 'Windows' : 'Unix',
count: pids.length,
pids
count: pidsToKill.length,
pids: pidsToKill,
maxAgeMinutes: ORPHAN_MAX_AGE_MINUTES
});
// Kill all found processes
if (isWindows) {
for (const pid of pids) {
for (const pid of pidsToKill) {
// SECURITY: Double-check PID validation before using in taskkill command
if (!Number.isInteger(pid) || pid <= 0) {
logger.warn('SYSTEM', 'Skipping invalid PID', { pid });
@@ -246,7 +322,7 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
}
}
} else {
for (const pid of pids) {
for (const pid of pidsToKill) {
try {
process.kill(pid, 'SIGKILL');
} catch (error) {
@@ -256,7 +332,7 @@ export async function cleanupOrphanedProcesses(): Promise<void> {
}
}
logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pids.length });
logger.info('SYSTEM', 'Orphaned processes cleaned up', { count: pidsToKill.length });
}
/**
@@ -7,6 +7,7 @@ import {
readPidFile,
removePidFile,
getPlatformTimeout,
parseElapsedTime,
type PidInfo
} from '../../src/services/infrastructure/index.js';
@@ -134,6 +135,32 @@ describe('ProcessManager', () => {
});
});
describe('parseElapsedTime', () => {
it('should parse MM:SS format', () => {
expect(parseElapsedTime('05:30')).toBe(5);
expect(parseElapsedTime('00:45')).toBe(0);
expect(parseElapsedTime('59:59')).toBe(59);
});
it('should parse HH:MM:SS format', () => {
expect(parseElapsedTime('01:30:00')).toBe(90);
expect(parseElapsedTime('02:15:30')).toBe(135);
expect(parseElapsedTime('00:05:00')).toBe(5);
});
it('should parse DD-HH:MM:SS format', () => {
expect(parseElapsedTime('1-00:00:00')).toBe(1440); // 1 day
expect(parseElapsedTime('2-12:30:00')).toBe(3630); // 2 days + 12.5 hours
expect(parseElapsedTime('0-01:00:00')).toBe(60); // 1 hour
});
it('should return -1 for empty or invalid input', () => {
expect(parseElapsedTime('')).toBe(-1);
expect(parseElapsedTime(' ')).toBe(-1);
expect(parseElapsedTime('invalid')).toBe(-1);
});
});
describe('getPlatformTimeout', () => {
const originalPlatform = process.platform;