fix: reap stuck generators in reapStaleSessions (fixes #1652) (#1698)

* fix: reap stuck generators in reapStaleSessions (fixes #1652)

Sessions whose SDK subprocess hung would stay in the active sessions
map forever because `reapStaleSessions()` unconditionally skipped any
session with a non-null `generatorPromise`.  The generator was blocked
on `for await (const msg of queryResult)` inside SDKAgent and could
never unblock itself — the idle-timeout only fires when the generator
is in `waitForMessage()`, and the orphan reaper skips processes whose
session is still in the map.

Add `MAX_GENERATOR_IDLE_MS` (5 min).  When `reapStaleSessions()` sees
a session whose `generatorPromise` is set but `lastGeneratorActivity`
has not advanced in over 5 minutes, it now:
1. SIGKILLs the tracked subprocess to unblock the stuck `for await`
2. Calls `session.abortController.abort()` so the generator loop exits
3. Calls `deleteSession()` which waits up to 30 s for the generator to
   finish, then cleans up supervisor-tracked children

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: freeze time in stale-generator test and import constants from production source

- Export MAX_GENERATOR_IDLE_MS, MAX_SESSION_IDLE_MS, StaleGeneratorCandidate,
  StaleGeneratorProcess, and detectStaleGenerator from SessionManager.ts so
  tests no longer duplicate production constants or detection logic.
- Use setSystemTime() from bun:test to freeze Date.now() in the
  "exactly at threshold" test, eliminating the flaky double-Date.now() race.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Ben Younes
2026-04-15 09:58:35 +02:00
committed by GitHub
parent b411d91885
commit 05232ff091
2 changed files with 380 additions and 6 deletions
+89 -6
View File
@@ -17,6 +17,64 @@ import { SessionQueueProcessor } from '../queue/SessionQueueProcessor.js';
import { getProcessBySession, ensureProcessExit } from './ProcessRegistry.js';
import { getSupervisor } from '../../supervisor/index.js';
/**
 * Longest a running generator may go without recording activity before it is
 * considered stuck (zombie subprocess) and force-killed.
 */
export const MAX_GENERATOR_IDLE_MS = 300_000; // 5 min, in milliseconds
/**
 * Longest a session with no generator and no pending work may live before it
 * is reaped.
 */
export const MAX_SESSION_IDLE_MS = 900_000; // 15 min, in milliseconds
/**
* Minimal process interface used by detectStaleGenerator — compatible with
* both the real Bun.Subprocess / ChildProcess shapes and test mocks.
*
* Only the two members that detectStaleGenerator actually touches are declared,
* so any object providing them can stand in for a real subprocess.
*/
export interface StaleGeneratorProcess {
/**
* Exit status of the process. detectStaleGenerator treats `null` as
* "not yet exited" and only sends a kill signal in that case.
*/
exitCode: number | null;
/**
* Sends a signal to the process. detectStaleGenerator calls this with
* 'SIGKILL' and ignores the return value; a thrown error is swallowed there.
*/
kill(signal?: string): boolean | void;
}
/**
* Minimal session fields required to evaluate stale-generator status.
* This is a subset of ActiveSession, allowing unit tests to pass plain objects.
*/
export interface StaleGeneratorCandidate {
/**
* Promise of the session's running generator, or `null` when none is running.
* detectStaleGenerator returns false immediately when this is falsy.
*/
generatorPromise: Promise<void> | null;
/**
* Timestamp of the generator's most recent activity. It is subtracted from
* `now` (which defaults to Date.now()), so it is presumably epoch
* milliseconds — confirm against the code that updates it.
*/
lastGeneratorActivity: number;
/** Controller that detectStaleGenerator aborts once the session is judged stale. */
abortController: AbortController;
}
/**
 * Detect whether a session's generator is stuck (zombie subprocess) and, if so,
 * SIGKILL the subprocess and abort the session's controller.
 *
 * Mirrors the stale-generator check performed inline by reapStaleSessions() so
 * tests can exercise the detection logic in isolation instead of duplicating
 * it locally. (Issue #1652)
 *
 * A session is stale only when it has a generator AND
 * `now - session.lastGeneratorActivity` is strictly greater than `maxIdleMs`.
 * A non-finite idle duration (for example a NaN timestamp) is never treated as
 * stale, matching the `>` comparison used by reapStaleSessions().
 *
 * @param session - session to inspect
 * @param proc - tracked subprocess (may be undefined if not in ProcessRegistry)
 * @param now - current timestamp (defaults to Date.now(); pass explicit value in tests)
 * @param maxIdleMs - idle threshold in milliseconds (defaults to MAX_GENERATOR_IDLE_MS)
 * @returns true if the session was marked stale (subprocess kill attempted and
 *          controller aborted), false otherwise
 */
export function detectStaleGenerator(
  session: StaleGeneratorCandidate,
  proc: StaleGeneratorProcess | undefined,
  now = Date.now(),
  maxIdleMs = MAX_GENERATOR_IDLE_MS
): boolean {
  // No generator running: nothing can be stuck.
  if (!session.generatorPromise) return false;

  const generatorIdleMs = now - session.lastGeneratorActivity;
  // Written as a negated `>` (not `<=`) so a NaN idle duration lands on the
  // "not stale" side instead of falling through and killing a live session.
  if (!(generatorIdleMs > maxIdleMs)) return false;

  // Kill the subprocess to unblock the stuck for-await; skip it if it has
  // already exited.
  if (proc && proc.exitCode === null) {
    try {
      proc.kill('SIGKILL');
    } catch {
      // Best effort: a failed kill (e.g. the process vanished in the meantime)
      // must not prevent the abort below.
    }
  }

  // Signal the SDK agent loop to exit.
  session.abortController.abort();
  return true;
}
export class SessionManager {
private dbManager: DatabaseManager;
private sessions: Map<number, ActiveSession> = new Map();
@@ -364,10 +422,12 @@ export class SessionManager {
}
}
private static readonly MAX_SESSION_IDLE_MS = 15 * 60 * 1000; // 15 minutes
/**
* Reap sessions with no active generator and no pending work that have been idle too long.
* Also reaps sessions whose generator has been stuck (no lastGeneratorActivity update) for
* longer than MAX_GENERATOR_IDLE_MS — these are zombie subprocesses that will never exit
* on their own because the orphan reaper skips sessions in the active sessions map. (Issue #1652)
*
* This unblocks the orphan reaper which skips processes for "active" sessions. (Issue #1168)
*/
async reapStaleSessions(): Promise<number> {
@@ -375,8 +435,31 @@ export class SessionManager {
const staleSessionIds: number[] = [];
for (const [sessionDbId, session] of this.sessions) {
// Skip sessions with active generators
if (session.generatorPromise) continue;
// Sessions with active generators — check for stuck/zombie generators (Issue #1652)
if (session.generatorPromise) {
const generatorIdleMs = now - session.lastGeneratorActivity;
if (generatorIdleMs > MAX_GENERATOR_IDLE_MS) {
logger.warn('SESSION', `Stale generator detected for session ${sessionDbId} (no activity for ${Math.round(generatorIdleMs / 60000)}m) — force-killing subprocess`, {
sessionDbId,
generatorIdleMs
});
// Force-kill the subprocess to unblock the stuck for-await in SDKAgent.
// Without this the generator is blocked on `for await (const msg of queryResult)`
// and will never exit even after abort() is called.
const trackedProcess = getProcessBySession(sessionDbId);
if (trackedProcess && trackedProcess.process.exitCode === null) {
try {
trackedProcess.process.kill('SIGKILL');
} catch (err) {
logger.warn('SESSION', 'Failed to SIGKILL subprocess for stale generator', { sessionDbId }, err as Error);
}
}
// Signal the SDK agent loop to exit after the subprocess dies
session.abortController.abort();
staleSessionIds.push(sessionDbId);
}
continue;
}
// Skip sessions with pending work
const pendingCount = this.getPendingStore().getPendingCount(sessionDbId);
@@ -384,13 +467,13 @@ export class SessionManager {
// No generator + no pending work + old enough = stale
const sessionAge = now - session.startTime;
if (sessionAge > SessionManager.MAX_SESSION_IDLE_MS) {
if (sessionAge > MAX_SESSION_IDLE_MS) {
logger.warn('SESSION', `Reaping idle session ${sessionDbId} (no activity for >${Math.round(MAX_SESSION_IDLE_MS / 60000)}m)`, { sessionDbId });
staleSessionIds.push(sessionDbId);
}
}
for (const sessionDbId of staleSessionIds) {
logger.warn('SESSION', `Reaping stale session ${sessionDbId} (no activity for >${Math.round(SessionManager.MAX_SESSION_IDLE_MS / 60000)}m)`, { sessionDbId });
await this.deleteSession(sessionDbId);
}