fix: prevent zombie process accumulation by verifying subprocess exit (#1168) (#1175)

Two changes fix the observer process resource leak:

1. Add ensureProcessExit to generator finally blocks in SessionRoutes and
   worker-service, matching the pattern already working in SDKAgent.

2. Add stale session reaper (every 2m) that removes sessions with no active
   generator and no pending work after 15m idle. This unblocks the orphan
   reaper which previously skipped processes for "active" sessions.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-18 16:33:23 -05:00
committed by GitHub
parent 2180d31ee6
commit 5d79bb7a7a
5 changed files with 172 additions and 105 deletions
+29 -2
View File
@@ -116,7 +116,7 @@ import { LogsRoutes } from './worker/http/routes/LogsRoutes.js';
import { MemoryRoutes } from './worker/http/routes/MemoryRoutes.js';
// Process management for zombie cleanup (Issue #737)
import { startOrphanReaper, reapOrphanedProcesses } from './worker/ProcessRegistry.js';
import { startOrphanReaper, reapOrphanedProcesses, getProcessBySession, ensureProcessExit } from './worker/ProcessRegistry.js';
/**
* Build JSON status output for hook framework communication.
@@ -176,6 +176,9 @@ export class WorkerService {
// Orphan reaper cleanup function (Issue #737)
private stopOrphanReaper: (() => void) | null = null;
// Stale session reaper interval (Issue #1168)
private staleSessionReaperInterval: ReturnType<typeof setInterval> | null = null;
// AI interaction tracking for health endpoint
private lastAiInteraction: {
timestamp: number;
@@ -465,6 +468,18 @@ export class WorkerService {
});
logger.info('SYSTEM', 'Started orphan reaper (runs every 5 minutes)');
// Reap stale sessions to unblock orphan process cleanup (Issue #1168)
this.staleSessionReaperInterval = setInterval(async () => {
try {
const reaped = await this.sessionManager.reapStaleSessions();
if (reaped > 0) {
logger.info('SYSTEM', `Reaped ${reaped} stale sessions`);
}
} catch (e) {
logger.error('SYSTEM', 'Stale session reaper error', { error: e instanceof Error ? e.message : String(e) });
}
}, 2 * 60 * 1000);
// Auto-recover orphaned queues (fire-and-forget with error logging)
this.processPendingQueues(50).then(result => {
if (result.sessionsStarted > 0) {
@@ -593,7 +608,13 @@ export class WorkerService {
};
throw error;
})
.finally(() => {
.finally(async () => {
// CRITICAL: Verify subprocess exit to prevent zombie accumulation (Issue #1168)
const trackedProcess = getProcessBySession(session.sessionDbId);
if (trackedProcess && !trackedProcess.process.killed && trackedProcess.process.exitCode === null) {
await ensureProcessExit(trackedProcess, 5000);
}
session.generatorPromise = null;
// Record successful AI interaction if no error occurred
@@ -823,6 +844,12 @@ export class WorkerService {
this.stopOrphanReaper = null;
}
// Stop stale session reaper (Issue #1168)
if (this.staleSessionReaperInterval) {
clearInterval(this.staleSessionReaperInterval);
this.staleSessionReaperInterval = null;
}
await performGracefulShutdown({
server: this.server.getHttpServer(),
sessionManager: this.sessionManager,
+33
View File
@@ -341,6 +341,39 @@ export class SessionManager {
}
}
private static readonly MAX_SESSION_IDLE_MS = 15 * 60 * 1000; // 15 minutes
/**
* Reap sessions with no active generator and no pending work that have been idle too long.
* This unblocks the orphan reaper which skips processes for "active" sessions. (Issue #1168)
*/
async reapStaleSessions(): Promise<number> {
const now = Date.now();
const staleSessionIds: number[] = [];
for (const [sessionDbId, session] of this.sessions) {
// Skip sessions with active generators
if (session.generatorPromise) continue;
// Skip sessions with pending work
const pendingCount = this.getPendingStore().getPendingCount(sessionDbId);
if (pendingCount > 0) continue;
// No generator + no pending work + old enough = stale
const sessionAge = now - session.startTime;
if (sessionAge > SessionManager.MAX_SESSION_IDLE_MS) {
staleSessionIds.push(sessionDbId);
}
}
for (const sessionDbId of staleSessionIds) {
logger.warn('SESSION', `Reaping stale session ${sessionDbId} (no activity for >${Math.round(SessionManager.MAX_SESSION_IDLE_MS / 60000)}m)`, { sessionDbId });
await this.deleteSession(sessionDbId);
}
return staleSessionIds.length;
}
/**
* Shutdown all active sessions
*/
@@ -21,6 +21,7 @@ import { SessionCompletionHandler } from '../../session/SessionCompletionHandler
import { PrivacyCheckValidator } from '../../validation/PrivacyCheckValidator.js';
import { SettingsDefaultsManager } from '../../../../shared/SettingsDefaultsManager.js';
import { USER_SETTINGS_PATH } from '../../../../shared/paths.js';
import { getProcessBySession, ensureProcessExit } from '../../ProcessRegistry.js';
export class SessionRoutes extends BaseRouteHandler {
private completionHandler: SessionCompletionHandler;
@@ -184,7 +185,13 @@ export class SessionRoutes extends BaseRouteHandler {
}, dbError as Error);
}
})
.finally(() => {
.finally(async () => {
// CRITICAL: Verify subprocess exit to prevent zombie accumulation (Issue #1168)
const tracked = getProcessBySession(session.sessionDbId);
if (tracked && !tracked.process.killed && tracked.process.exitCode === null) {
await ensureProcessExit(tracked, 5000);
}
const sessionDbId = session.sessionDbId;
this.spawnInProgress.delete(sessionDbId);
const wasAborted = session.abortController.signal.aborted;