fix: session lifecycle guards to prevent runaway API spend (#1590) (#1693)

* fix: add session lifecycle guards to prevent runaway API spend (#1590)

Three root causes allowed 30+ subprocesses to accumulate over 36 hours:
1. SIGTERM-killed processes (code 143) triggered crash recovery and
   immediately respawned — now detected and treated as intentional
   termination (aborts controller so wasAborted=true in .finally).
2. No wall-clock limit: sessions ran for 13+ hours continuously
   spending tokens — now refuses new generators after 4 hours and
   drains the pending queue to prevent further spawning.
3. Duplicate --resume processes for the same session UUID — now
   killed and unregistered before a new spawn is registered.

Generated by Claude Code
Vibe coded by ousamabenyounes

Co-Authored-By: Claude <noreply@anthropic.com>

* fix: use normalized errorMsg in logger.error payload and annotate SIGTERM override

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* fix: use persisted session start time (started_at_epoch) for wall-clock guard and bind abortController locally to prevent stale abort

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

* chore: re-trigger CodeRabbit review after rate limit reset

* fix: defer process unregistration until exit and align boundary test with strict > (#1693)

- ProcessRegistry: don't unregister PID immediately after SIGTERM — let the
  existing 'exit' handler clean up when the process actually exits, preventing
  tracking loss for still-live processes.
- Test: align wall-clock boundary test with production's strict `>` operator
  (exactly 4h is NOT terminated, only >4h is).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude <noreply@anthropic.com>
This commit is contained in:
Ben Younes
2026-04-15 09:58:23 +02:00
committed by GitHub
parent 983be42998
commit f97c50bfb9
3 changed files with 326 additions and 3 deletions
+24
View File
@@ -382,6 +382,30 @@ export function createPidCapturingSpawn(sessionDbId: number) {
env?: NodeJS.ProcessEnv;
signal?: AbortSignal;
}) => {
// Kill any existing process for this session before spawning a new one.
// Multiple processes sharing the same --resume UUID waste API credits and
// can conflict with each other (Issue #1590).
const existing = getProcessBySession(sessionDbId);
if (existing && existing.process.exitCode === null) {
logger.warn('PROCESS', `Killing duplicate process PID ${existing.pid} before spawning new one for session ${sessionDbId}`, {
existingPid: existing.pid,
sessionDbId
});
let exited = false;
try {
existing.process.kill('SIGTERM');
exited = existing.process.exitCode !== null;
} catch {
// Already dead — safe to unregister immediately
exited = true;
}
if (exited) {
unregisterProcess(existing.pid);
}
// If still alive, the existing 'exit' handler will unregister it once the process actually exits.
}
getSupervisor().assertCanSpawn('claude sdk');
// On Windows, use cmd.exe wrapper for .cmd files to properly handle paths with spaces
@@ -94,11 +94,37 @@ export class SessionRoutes extends BaseRouteHandler {
* The next generator will use the new provider with shared conversationHistory.
*/
private static readonly STALE_GENERATOR_THRESHOLD_MS = 30_000; // 30 seconds (#1099)
private static readonly MAX_SESSION_WALL_CLOCK_MS = 4 * 60 * 60 * 1000; // 4 hours (#1590)
private ensureGeneratorRunning(sessionDbId: number, source: string): void {
const session = this.sessionManager.getSession(sessionDbId);
if (!session) return;
// Wall-clock age guard: refuse to start new generators for sessions that have
// been alive too long to prevent runaway API costs (Issue #1590).
// Use the persisted started_at_epoch from the DB so the guard survives worker
// restarts (session.startTime is reset to Date.now() on every re-activation).
const dbSessionRecord = this.dbManager.getSessionStore().db
.prepare('SELECT started_at_epoch FROM sdk_sessions WHERE id = ? LIMIT 1')
.get(sessionDbId) as { started_at_epoch: number } | undefined;
const sessionOriginMs = dbSessionRecord?.started_at_epoch ?? session.startTime;
const sessionAgeMs = Date.now() - sessionOriginMs;
if (sessionAgeMs > SessionRoutes.MAX_SESSION_WALL_CLOCK_MS) {
logger.warn('SESSION', 'Session exceeded wall-clock age limit — aborting to prevent runaway spend', {
sessionId: sessionDbId,
ageHours: Math.round(sessionAgeMs / 3_600_000 * 10) / 10,
limitHours: SessionRoutes.MAX_SESSION_WALL_CLOCK_MS / 3_600_000,
source
});
if (!session.abortController.signal.aborted) {
session.abortController.abort();
}
const pendingStore = this.sessionManager.getPendingMessageStore();
pendingStore.markAllSessionMessagesAbandoned(sessionDbId);
this.sessionManager.removeSessionImmediate(sessionDbId);
return;
}
// GUARD: Prevent duplicate spawns
if (this.spawnInProgress.get(sessionDbId)) {
logger.debug('SESSION', 'Spawn already in progress, skipping', { sessionDbId, source });
@@ -187,15 +213,37 @@ export class SessionRoutes extends BaseRouteHandler {
session.currentProvider = provider;
session.lastGeneratorActivity = Date.now();
// Capture the AbortController that belongs to THIS generator run.
// session.abortController may be replaced (e.g. by stale-recovery) before the
// .catch / .finally handlers run, so binding it here prevents a stale rejection
// from cancelling a brand-new controller (race condition guard).
const myController = session.abortController;
session.generatorPromise = agent.startSession(session, this.workerService)
.catch(error => {
// Only log non-abort errors
if (session.abortController.signal.aborted) return;
if (myController.signal.aborted) return;
const errorMsg = error instanceof Error ? error.message : String(error);
// Treat SIGTERM (exit code 143) as intentional termination, not a crash.
// When a subprocess is killed externally, abort the controller to prevent
// crash recovery from immediately respawning the process (Issue #1590).
// APPROVED OVERRIDE
if (errorMsg.includes('code 143') || errorMsg.includes('signal SIGTERM')) {
logger.warn('SESSION', 'Generator killed by external signal — aborting session to prevent respawn', {
sessionId: session.sessionDbId,
provider,
error: errorMsg
});
myController.abort();
return;
}
logger.error('SESSION', `Generator failed`, {
sessionId: session.sessionDbId,
provider: provider,
error: error.message
error: errorMsg
}, error);
// Mark all processing messages as failed so they can be retried or abandoned