MAESTRO: Merge PR #693 - prevent infinite restart loop that causes runaway API costs

Add restart limit (max 3 consecutive restarts) with exponential backoff
to prevent infinite generator restart loops. Also add defensive
memorySessionId checks in GeminiAgent and OpenRouterAgent before
expensive LLM calls to fail fast when session ID hasn't been captured.

Based on PR #693 by @ajbmachon (applied to current main).

Co-Authored-By: Andre Machon <ajbmachon2@gmail.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-06 01:43:12 -05:00
parent 1ae6c5708c
commit 6382d6f9c7
6 changed files with 112 additions and 62 deletions
File diff suppressed because one or more lines are too long
+1
View File
@@ -33,6 +33,7 @@ export interface ActiveSession {
earliestPendingTimestamp: number | null; // Original timestamp of earliest pending message (for accurate observation timestamps)
conversationHistory: ConversationMessage[]; // Shared conversation history for provider switching
currentProvider: 'claude' | 'gemini' | 'openrouter' | null; // Track which provider is currently running
consecutiveRestarts: number; // Track consecutive restart attempts to prevent infinite loops
}
export interface PendingMessage {
+11
View File
@@ -192,6 +192,12 @@ export class GeminiAgent {
session.lastPromptNumber = message.prompt_number;
}
// CRITICAL: Check memorySessionId BEFORE making expensive LLM call
// This prevents wasting tokens when we won't be able to store the result anyway
if (!session.memorySessionId) {
throw new Error('Cannot process observations: memorySessionId not yet captured. This session may need to be reinitialized.');
}
// Build observation prompt
const obsPrompt = buildObservationPrompt({
id: 0,
@@ -230,6 +236,11 @@ export class GeminiAgent {
);
} else if (message.type === 'summarize') {
// CRITICAL: Check memorySessionId BEFORE making expensive LLM call
if (!session.memorySessionId) {
throw new Error('Cannot process summary: memorySessionId not yet captured. This session may need to be reinitialized.');
}
// Build summary prompt
const summaryPrompt = buildSummaryPrompt({
id: session.sessionDbId,
+11
View File
@@ -150,6 +150,12 @@ export class OpenRouterAgent {
session.lastPromptNumber = message.prompt_number;
}
// CRITICAL: Check memorySessionId BEFORE making expensive LLM call
// This prevents wasting tokens when we won't be able to store the result anyway
if (!session.memorySessionId) {
throw new Error('Cannot process observations: memorySessionId not yet captured. This session may need to be reinitialized.');
}
// Build observation prompt
const obsPrompt = buildObservationPrompt({
id: 0,
@@ -188,6 +194,11 @@ export class OpenRouterAgent {
);
} else if (message.type === 'summarize') {
// CRITICAL: Check memorySessionId BEFORE making expensive LLM call
if (!session.memorySessionId) {
throw new Error('Cannot process summary: memorySessionId not yet captured. This session may need to be reinitialized.');
}
// Build summary prompt
const summaryPrompt = buildSummaryPrompt({
id: session.sessionDbId,
+2 -1
View File
@@ -153,7 +153,8 @@ export class SessionManager {
cumulativeOutputTokens: 0,
earliestPendingTimestamp: null,
conversationHistory: [], // Initialize empty - will be populated by agents
currentProvider: null // Will be set when generator starts
currentProvider: null, // Will be set when generator starts
consecutiveRestarts: 0 // Track consecutive restart attempts to prevent infinite loops
};
logger.debug('SESSION', 'Creating new session object (memorySessionId cleared to prevent stale resume)', {
@@ -175,16 +175,37 @@ export class SessionRoutes extends BaseRouteHandler {
session.currentProvider = null;
this.workerService.broadcastProcessingStatus();
// Crash recovery: If not aborted and still has work, restart
// Crash recovery: If not aborted and still has work, restart (with limit)
if (!wasAborted) {
try {
const pendingStore = this.sessionManager.getPendingMessageStore();
const pendingCount = pendingStore.getPendingCount(sessionDbId);
// CRITICAL: Limit consecutive restarts to prevent infinite loops
// This prevents runaway API costs when there's a persistent error (e.g., memorySessionId not captured)
const MAX_CONSECUTIVE_RESTARTS = 3;
if (pendingCount > 0) {
session.consecutiveRestarts = (session.consecutiveRestarts || 0) + 1;
if (session.consecutiveRestarts > MAX_CONSECUTIVE_RESTARTS) {
logger.error('SESSION', `CRITICAL: Generator restart limit exceeded - stopping to prevent runaway costs`, {
sessionId: sessionDbId,
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
maxRestarts: MAX_CONSECUTIVE_RESTARTS,
action: 'Generator will NOT restart. Check logs for root cause. Messages remain in pending state.'
});
// Don't restart - abort to prevent further API calls
session.abortController.abort();
return;
}
logger.info('SESSION', `Restarting generator after crash/exit with pending work`, {
sessionId: sessionDbId,
pendingCount
pendingCount,
consecutiveRestarts: session.consecutiveRestarts,
maxRestarts: MAX_CONSECUTIVE_RESTARTS
});
// Abort OLD controller before replacing to prevent child process leaks
@@ -192,16 +213,21 @@ export class SessionRoutes extends BaseRouteHandler {
session.abortController = new AbortController();
oldController.abort();
// Small delay before restart
// Exponential backoff: 1s, 2s, 4s for subsequent restarts
const backoffMs = Math.min(1000 * Math.pow(2, session.consecutiveRestarts - 1), 8000);
// Delay before restart with exponential backoff
setTimeout(() => {
const stillExists = this.sessionManager.getSession(sessionDbId);
if (stillExists && !stillExists.generatorPromise) {
this.startGeneratorWithProvider(stillExists, this.getSelectedProvider(), 'crash-recovery');
}
}, 1000);
}, backoffMs);
} else {
// No pending work - abort to kill the child process
session.abortController.abort();
// Reset restart counter on successful completion
session.consecutiveRestarts = 0;
logger.debug('SESSION', 'Aborted controller after natural completion', {
sessionId: sessionDbId
});