fix: prevent zombie process accumulation via PID registry and signal propagation (Issue #737) (#806)
* Fix zombie process accumulation (Issue #737) Problem: Claude haiku subprocesses spawned by the SDK weren't terminating properly, causing zombie process accumulation (user reported 155 processes consuming 51GB RAM). Root causes: 1. SDK's SpawnedProcess interface hides subprocess PIDs 2. deleteSession() didn't verify subprocess exit 3. abort() was fire-and-forget with no confirmation 4. No mechanism to track or clean up orphaned processes Solution: - Add ProcessRegistry module to track spawned Claude subprocesses - Use SDK's spawnClaudeCodeProcess option to capture PIDs via custom spawn - Pass signal parameter to enable AbortController integration - Wait for subprocess exit in deleteSession() with 5s timeout - Escalate to SIGKILL if graceful exit fails - Add orphan reaper running every 5 minutes as safety net Files changed: - src/services/worker/ProcessRegistry.ts (new): PID registry and reaper - src/services/worker/SDKAgent.ts: Use custom spawn to capture PIDs - src/services/worker/SessionManager.ts: Verify subprocess exit on delete - src/services/worker-service.ts: Start/stop orphan reaper Fixes #737 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * fix: address code review feedback - Replace busy-wait polling with event-based proc.once('exit') - Detect and warn about multiple processes per session (race condition) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: bigphoot <bigphoot@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,7 @@ import { logger } from '../../utils/logger.js';
|
||||
import type { ActiveSession, PendingMessage, PendingMessageWithId, ObservationData } from '../worker-types.js';
|
||||
import { PendingMessageStore } from '../sqlite/PendingMessageStore.js';
|
||||
import { SessionQueueProcessor } from '../queue/SessionQueueProcessor.js';
|
||||
import { getProcessBySession, ensureProcessExit } from './ProcessRegistry.js';
|
||||
|
||||
export class SessionManager {
|
||||
private dbManager: DatabaseManager;
|
||||
@@ -256,6 +257,7 @@ export class SessionManager {
|
||||
|
||||
/**
|
||||
* Delete a session (abort SDK agent and cleanup)
|
||||
* Verifies subprocess exit to prevent zombie process accumulation (Issue #737)
|
||||
*/
|
||||
async deleteSession(sessionDbId: number): Promise<void> {
|
||||
const session = this.sessions.get(sessionDbId);
|
||||
@@ -265,17 +267,27 @@ export class SessionManager {
|
||||
|
||||
const sessionDuration = Date.now() - session.startTime;
|
||||
|
||||
// Abort the SDK agent
|
||||
// 1. Abort the SDK agent
|
||||
session.abortController.abort();
|
||||
|
||||
// Wait for generator to finish
|
||||
// 2. Wait for generator to finish
|
||||
if (session.generatorPromise) {
|
||||
await session.generatorPromise.catch(error => {
|
||||
await session.generatorPromise.catch(() => {
|
||||
logger.debug('SYSTEM', 'Generator already failed, cleaning up', { sessionId: session.sessionDbId });
|
||||
});
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
// 3. Verify subprocess exit with 5s timeout (Issue #737 fix)
|
||||
const tracked = getProcessBySession(sessionDbId);
|
||||
if (tracked && !tracked.process.killed && tracked.process.exitCode === null) {
|
||||
logger.debug('SESSION', `Waiting for subprocess PID ${tracked.pid} to exit`, {
|
||||
sessionId: sessionDbId,
|
||||
pid: tracked.pid
|
||||
});
|
||||
await ensureProcessExit(tracked, 5000);
|
||||
}
|
||||
|
||||
// 4. Cleanup
|
||||
this.sessions.delete(sessionDbId);
|
||||
this.sessionQueues.delete(sessionDbId);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user