fix: session lifecycle guards to prevent runaway API spend (#1590) (#1693)

* fix: add session lifecycle guards to prevent runaway API spend (#1590) Three root causes allowed 30+ subprocess accumulation over 36 hours: 1. SIGTERM-killed processes (code 143) triggered crash recovery and immediately respawned — now detected and treated as intentional termination (aborts controller so wasAborted=true in .finally). 2. No wall-clock limit: sessions ran for 13+ hours continuously spending tokens — now refuses new generators after 4 hours and drains the pending queue to prevent further spawning. 3. Duplicate --resume processes for the same session UUID — now killed and unregistered before a new spawn is registered. Generated by Claude Code Vibe coded by ousamabenyounes Co-Authored-By: Claude <noreply@anthropic.com> * fix: use normalized errorMsg in logger.error payload and annotate SIGTERM override Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * fix: use persisted createdAt for wall-clock guard and bind abortController locally to prevent stale abort Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> * chore: re-trigger CodeRabbit review after rate limit reset * fix: defer process unregistration until exit and align boundary test with strict > (#1693) - ProcessRegistry: don't unregister PID immediately after SIGTERM — let the existing 'exit' handler clean up when the process actually exits, preventing tracking loss for still-live processes. - Test: align wall-clock boundary test with production's strict `>` operator (exactly 4h is NOT terminated, only >4h is). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude <noreply@anthropic.com>
2026-04-15 09:58:23 +02:00
parent 983be42998
commit f97c50bfb9
3 changed files with 326 additions and 3 deletions
@@ -94,11 +94,37 @@ export class SessionRoutes extends BaseRouteHandler {
   * The next generator will use the new provider with shared conversationHistory.
   */
  private static readonly STALE_GENERATOR_THRESHOLD_MS = 30_000; // 30 seconds (#1099)
+  private static readonly MAX_SESSION_WALL_CLOCK_MS = 4 * 60 * 60 * 1000; // 4 hours (#1590)

  private ensureGeneratorRunning(sessionDbId: number, source: string): void {
    const session = this.sessionManager.getSession(sessionDbId);
    if (!session) return;

+    // Wall-clock age guard: refuse to start new generators for sessions that have
+    // been alive too long to prevent runaway API costs (Issue #1590).
+    // Use the persisted started_at_epoch from the DB so the guard survives worker
+    // restarts (session.startTime is reset to Date.now() on every re-activation).
+    const dbSessionRecord = this.dbManager.getSessionStore().db
+      .prepare('SELECT started_at_epoch FROM sdk_sessions WHERE id = ? LIMIT 1')
+      .get(sessionDbId) as { started_at_epoch: number } | undefined;
+    const sessionOriginMs = dbSessionRecord?.started_at_epoch ?? session.startTime;
+    const sessionAgeMs = Date.now() - sessionOriginMs;
+    if (sessionAgeMs > SessionRoutes.MAX_SESSION_WALL_CLOCK_MS) {
+      logger.warn('SESSION', 'Session exceeded wall-clock age limit — aborting to prevent runaway spend', {
+        sessionId: sessionDbId,
+        ageHours: Math.round(sessionAgeMs / 3_600_000 * 10) / 10,
+        limitHours: SessionRoutes.MAX_SESSION_WALL_CLOCK_MS / 3_600_000,
+        source
+      });
+      if (!session.abortController.signal.aborted) {
+        session.abortController.abort();
+      }
+      const pendingStore = this.sessionManager.getPendingMessageStore();
+      pendingStore.markAllSessionMessagesAbandoned(sessionDbId);
+      this.sessionManager.removeSessionImmediate(sessionDbId);
+      return;
+    }
+
    // GUARD: Prevent duplicate spawns
    if (this.spawnInProgress.get(sessionDbId)) {
      logger.debug('SESSION', 'Spawn already in progress, skipping', { sessionDbId, source });
@@ -187,15 +213,37 @@ export class SessionRoutes extends BaseRouteHandler {
    session.currentProvider = provider;
    session.lastGeneratorActivity = Date.now();

+    // Capture the AbortController that belongs to THIS generator run.
+    // session.abortController may be replaced (e.g. by stale-recovery) before the
+    // .catch / .finally handlers run, so binding it here prevents a stale rejection
+    // from cancelling a brand-new controller (race condition guard).
+    const myController = session.abortController;
+
    session.generatorPromise = agent.startSession(session, this.workerService)
      .catch(error => {
        // Only log non-abort errors
-        if (session.abortController.signal.aborted) return;
-        
+        if (myController.signal.aborted) return;
+
+        const errorMsg = error instanceof Error ? error.message : String(error);
+
+        // Treat SIGTERM (exit code 143) as intentional termination, not a crash.
+        // When a subprocess is killed externally, abort the controller to prevent
+        // crash recovery from immediately respawning the process (Issue #1590).
+        // APPROVED OVERRIDE
+        if (errorMsg.includes('code 143') || errorMsg.includes('signal SIGTERM')) {
+          logger.warn('SESSION', 'Generator killed by external signal — aborting session to prevent respawn', {
+            sessionId: session.sessionDbId,
+            provider,
+            error: errorMsg
+          });
+          myController.abort();
+          return;
+        }
+
        logger.error('SESSION', `Generator failed`, {
          sessionId: session.sessionDbId,
          provider: provider,
-          error: error.message
+          error: errorMsg
        }, error);

        // Mark all processing messages as failed so they can be retried or abandoned