* fix: reap stuck generators in reapStaleSessions (fixes #1652) Sessions whose SDK subprocess hung would stay in the active sessions map forever because `reapStaleSessions()` unconditionally skipped any session with a non-null `generatorPromise`. The generator was blocked on `for await (const msg of queryResult)` inside SDKAgent and could never unblock itself — the idle-timeout only fires when the generator is in `waitForMessage()`, and the orphan reaper skips processes whose session is still in the map. Add `MAX_GENERATOR_IDLE_MS` (5 min). When `reapStaleSessions()` sees a session whose `generatorPromise` is set but `lastGeneratorActivity` has not advanced in over 5 minutes, it now: 1. SIGKILLs the tracked subprocess to unblock the stuck `for await` 2. Calls `session.abortController.abort()` so the generator loop exits 3. Calls `deleteSession()` which waits up to 30 s for the generator to finish, then cleans up supervisor-tracked children Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: freeze time in stale-generator test and import constants from production source - Export MAX_GENERATOR_IDLE_MS, MAX_SESSION_IDLE_MS, StaleGeneratorCandidate, StaleGeneratorProcess, and detectStaleGenerator from SessionManager.ts so tests no longer duplicate production constants or detection logic. - Use setSystemTime() from bun:test to freeze Date.now() in the "exactly at threshold" test, eliminating the flaky double-Date.now() race. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -17,6 +17,64 @@ import { SessionQueueProcessor } from '../queue/SessionQueueProcessor.js';
|
||||
import { getProcessBySession, ensureProcessExit } from './ProcessRegistry.js';
|
||||
import { getSupervisor } from '../../supervisor/index.js';
|
||||
|
||||
/** Idle threshold before a stuck generator (zombie subprocess) is force-killed. */
|
||||
export const MAX_GENERATOR_IDLE_MS = 5 * 60 * 1000; // 5 minutes
|
||||
|
||||
/** Idle threshold before a no-generator session with no pending work is reaped. */
|
||||
export const MAX_SESSION_IDLE_MS = 15 * 60 * 1000; // 15 minutes
|
||||
|
||||
/**
|
||||
* Minimal process interface used by detectStaleGenerator — compatible with
|
||||
* both the real Bun.Subprocess / ChildProcess shapes and test mocks.
|
||||
*/
|
||||
export interface StaleGeneratorProcess {
|
||||
exitCode: number | null;
|
||||
kill(signal?: string): boolean | void;
|
||||
}
|
||||
|
||||
/**
|
||||
* Minimal session fields required to evaluate stale-generator status.
|
||||
* This is a subset of ActiveSession, allowing unit tests to pass plain objects.
|
||||
*/
|
||||
export interface StaleGeneratorCandidate {
|
||||
generatorPromise: Promise<void> | null;
|
||||
lastGeneratorActivity: number;
|
||||
abortController: AbortController;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect whether a session's generator is stuck (zombie subprocess) and, if so,
|
||||
* SIGKILL the subprocess and abort the controller.
|
||||
*
|
||||
* Extracted from reapStaleSessions() so tests can import and exercise the exact
|
||||
* same logic rather than duplicating it locally. (Issue #1652)
|
||||
*
|
||||
* @param session - session to inspect
|
||||
* @param proc - tracked subprocess (may be undefined if not in ProcessRegistry)
|
||||
* @param now - current timestamp (defaults to Date.now(); pass explicit value in tests)
|
||||
* @returns true if the session was marked stale, false otherwise
|
||||
*/
|
||||
export function detectStaleGenerator(
|
||||
session: StaleGeneratorCandidate,
|
||||
proc: StaleGeneratorProcess | undefined,
|
||||
now = Date.now()
|
||||
): boolean {
|
||||
if (!session.generatorPromise) return false;
|
||||
|
||||
const generatorIdleMs = now - session.lastGeneratorActivity;
|
||||
if (generatorIdleMs <= MAX_GENERATOR_IDLE_MS) return false;
|
||||
|
||||
// Kill subprocess to unblock stuck for-await
|
||||
if (proc && proc.exitCode === null) {
|
||||
try {
|
||||
proc.kill('SIGKILL');
|
||||
} catch {}
|
||||
}
|
||||
// Signal the SDK agent loop to exit
|
||||
session.abortController.abort();
|
||||
return true;
|
||||
}
|
||||
|
||||
export class SessionManager {
|
||||
private dbManager: DatabaseManager;
|
||||
private sessions: Map<number, ActiveSession> = new Map();
|
||||
@@ -364,10 +422,12 @@ export class SessionManager {
|
||||
}
|
||||
}
|
||||
|
||||
private static readonly MAX_SESSION_IDLE_MS = 15 * 60 * 1000; // 15 minutes
|
||||
|
||||
/**
|
||||
* Reap sessions with no active generator and no pending work that have been idle too long.
|
||||
* Also reaps sessions whose generator has been stuck (no lastGeneratorActivity update) for
|
||||
* longer than MAX_GENERATOR_IDLE_MS — these are zombie subprocesses that will never exit
|
||||
* on their own because the orphan reaper skips sessions in the active sessions map. (Issue #1652)
|
||||
*
|
||||
* This unblocks the orphan reaper which skips processes for "active" sessions. (Issue #1168)
|
||||
*/
|
||||
async reapStaleSessions(): Promise<number> {
|
||||
@@ -375,8 +435,31 @@ export class SessionManager {
|
||||
const staleSessionIds: number[] = [];
|
||||
|
||||
for (const [sessionDbId, session] of this.sessions) {
|
||||
// Skip sessions with active generators
|
||||
if (session.generatorPromise) continue;
|
||||
// Sessions with active generators — check for stuck/zombie generators (Issue #1652)
|
||||
if (session.generatorPromise) {
|
||||
const generatorIdleMs = now - session.lastGeneratorActivity;
|
||||
if (generatorIdleMs > MAX_GENERATOR_IDLE_MS) {
|
||||
logger.warn('SESSION', `Stale generator detected for session ${sessionDbId} (no activity for ${Math.round(generatorIdleMs / 60000)}m) — force-killing subprocess`, {
|
||||
sessionDbId,
|
||||
generatorIdleMs
|
||||
});
|
||||
// Force-kill the subprocess to unblock the stuck for-await in SDKAgent.
|
||||
// Without this the generator is blocked on `for await (const msg of queryResult)`
|
||||
// and will never exit even after abort() is called.
|
||||
const trackedProcess = getProcessBySession(sessionDbId);
|
||||
if (trackedProcess && trackedProcess.process.exitCode === null) {
|
||||
try {
|
||||
trackedProcess.process.kill('SIGKILL');
|
||||
} catch (err) {
|
||||
logger.warn('SESSION', 'Failed to SIGKILL subprocess for stale generator', { sessionDbId }, err as Error);
|
||||
}
|
||||
}
|
||||
// Signal the SDK agent loop to exit after the subprocess dies
|
||||
session.abortController.abort();
|
||||
staleSessionIds.push(sessionDbId);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip sessions with pending work
|
||||
const pendingCount = this.getPendingStore().getPendingCount(sessionDbId);
|
||||
@@ -384,13 +467,13 @@ export class SessionManager {
|
||||
|
||||
// No generator + no pending work + old enough = stale
|
||||
const sessionAge = now - session.startTime;
|
||||
if (sessionAge > SessionManager.MAX_SESSION_IDLE_MS) {
|
||||
if (sessionAge > MAX_SESSION_IDLE_MS) {
|
||||
logger.warn('SESSION', `Reaping idle session ${sessionDbId} (no activity for >${Math.round(MAX_SESSION_IDLE_MS / 60000)}m)`, { sessionDbId });
|
||||
staleSessionIds.push(sessionDbId);
|
||||
}
|
||||
}
|
||||
|
||||
for (const sessionDbId of staleSessionIds) {
|
||||
logger.warn('SESSION', `Reaping stale session ${sessionDbId} (no activity for >${Math.round(SessionManager.MAX_SESSION_IDLE_MS / 60000)}m)`, { sessionDbId });
|
||||
await this.deleteSession(sessionDbId);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user