fix(gemini): add conversation history truncation to prevent O(N²) token cost growth

GeminiAgent sends the full conversation history with every API call,
causing quadratic token growth per session. A 100-observation session
sends ~30M cumulative input tokens. This ports the proven truncateHistory()
sliding window from OpenRouterAgent to GeminiAgent.

- Add CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES (default: 20) and
  CLAUDE_MEM_GEMINI_MAX_TOKENS (default: 100000) settings
- Add truncateHistory() to GeminiAgent using shared estimateTokens()
- Always preserve at least the newest message to avoid empty API requests
- Add settings validation in SettingsRoutes (1-100 messages, 1K-1M tokens)
- Add regression tests for truncation and oversized single-prompt edge case

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
vnz
2026-03-17 08:00:29 +01:00
parent e2a230286d
commit df1fb8bb89
8 changed files with 503 additions and 338 deletions
+61 -5
View File
@@ -18,6 +18,8 @@ import { logger } from '../../utils/logger.js';
import { buildInitPrompt, buildObservationPrompt, buildSummaryPrompt, buildContinuationPrompt } from '../../sdk/prompts.js';
import { SettingsDefaultsManager } from '../../shared/SettingsDefaultsManager.js';
import { getCredential } from '../../shared/EnvManager.js';
import { USER_SETTINGS_PATH } from '../../shared/paths.js';
import { estimateTokens } from '../../shared/timeline-formatting.js';
import type { ActiveSession, ConversationMessage } from '../worker-types.js';
import { ModeManager } from '../domain/ModeManager.js';
import {
@@ -56,6 +58,10 @@ const GEMINI_RPM_LIMITS: Record<GeminiModel, number> = {
// Track last request time for rate limiting
let lastRequestTime = 0;
// Context window limits (prevents O(N²) token cost growth)
const DEFAULT_MAX_CONTEXT_MESSAGES = 20; // Maximum messages to keep in conversation history
const DEFAULT_MAX_ESTIMATED_TOKENS = 100000; // ~100k tokens max context (safety limit)
/**
* Enforce RPM rate limit for Gemini free tier.
* Waits the required time between requests based on model's RPM limit + 100ms safety buffer.
@@ -342,6 +348,54 @@ export class GeminiAgent {
}
}
/**
* Truncate conversation history to prevent runaway context costs.
* Keeps most recent messages within both message count and token budget.
* Returns a new array — never mutates the original history.
*/
/**
 * Truncate conversation history to prevent runaway context costs.
 *
 * Applies a sliding window over the most recent messages, bounded by both a
 * message-count limit and an estimated-token budget. Both limits are read
 * from user settings, falling back to the module defaults when unset/invalid.
 *
 * Always keeps at least the newest message — an empty contents array would
 * cause a hard Gemini API error, which is worse than one oversized request.
 *
 * @param history Full conversation history. Never mutated.
 * @returns A new array of retained messages, oldest first.
 */
private truncateHistory(history: ConversationMessage[]): ConversationMessage[] {
  const settings = SettingsDefaultsManager.loadFromFile(USER_SETTINGS_PATH);
  // Radix 10 explicitly, matching the SettingsRoutes validation for these
  // keys. NaN (unparseable) falls back to the default via `||`.
  const MAX_CONTEXT_MESSAGES = parseInt(settings.CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES, 10) || DEFAULT_MAX_CONTEXT_MESSAGES;
  const MAX_ESTIMATED_TOKENS = parseInt(settings.CLAUDE_MEM_GEMINI_MAX_TOKENS, 10) || DEFAULT_MAX_ESTIMATED_TOKENS;

  // Fast path: within the message limit AND the token budget — keep as-is.
  if (history.length <= MAX_CONTEXT_MESSAGES) {
    const totalTokens = history.reduce((sum, m) => sum + estimateTokens(m.content), 0);
    if (totalTokens <= MAX_ESTIMATED_TOKENS) {
      return history;
    }
  }

  // Sliding window: walk backwards (newest first), accumulating messages
  // until either limit would be exceeded. Collect newest-first and reverse
  // once at the end (O(n)) instead of unshift() per message (O(n²)).
  const keptNewestFirst: ConversationMessage[] = [];
  let tokenCount = 0;
  for (let i = history.length - 1; i >= 0; i--) {
    const msg = history[i];
    const msgTokens = estimateTokens(msg.content);
    // Always include at least the newest message — an empty contents array
    // would cause a hard Gemini API error, which is worse than an oversized request.
    const wouldExceedLimits =
      keptNewestFirst.length >= MAX_CONTEXT_MESSAGES ||
      tokenCount + msgTokens > MAX_ESTIMATED_TOKENS;
    if (keptNewestFirst.length > 0 && wouldExceedLimits) {
      logger.warn('SDK', 'Context window truncated to prevent runaway costs', {
        originalMessages: history.length,
        keptMessages: keptNewestFirst.length,
        droppedMessages: i + 1,
        estimatedTokens: tokenCount,
        tokenLimit: MAX_ESTIMATED_TOKENS
      });
      break;
    }
    keptNewestFirst.push(msg);
    tokenCount += msgTokens;
  }
  // Restore chronological order (oldest first) expected by the Gemini API.
  return keptNewestFirst.reverse();
}
/**
* Convert shared ConversationMessage array to Gemini's contents format
* Maps 'assistant' role to 'model' for Gemini API compatibility
@@ -354,8 +408,8 @@ export class GeminiAgent {
}
/**
* Query Gemini via REST API with full conversation history (multi-turn)
* Sends the entire conversation context for coherent responses
* Query Gemini via REST API with truncated conversation history (multi-turn)
* Truncates history to prevent O(N²) token cost growth, then sends for coherent responses
*/
private async queryGeminiMultiTurn(
history: ConversationMessage[],
@@ -363,11 +417,13 @@ export class GeminiAgent {
model: GeminiModel,
rateLimitingEnabled: boolean
): Promise<{ content: string; tokensUsed?: number }> {
const contents = this.conversationToGeminiContents(history);
const totalChars = history.reduce((sum, m) => sum + m.content.length, 0);
const truncatedHistory = this.truncateHistory(history);
const contents = this.conversationToGeminiContents(truncatedHistory);
const totalChars = truncatedHistory.reduce((sum, m) => sum + m.content.length, 0);
logger.debug('SDK', `Querying Gemini multi-turn (${model})`, {
turns: history.length,
turns: truncatedHistory.length,
totalTurns: history.length,
totalChars
});
@@ -94,6 +94,8 @@ export class SettingsRoutes extends BaseRouteHandler {
'CLAUDE_MEM_GEMINI_API_KEY',
'CLAUDE_MEM_GEMINI_MODEL',
'CLAUDE_MEM_GEMINI_RATE_LIMITING_ENABLED',
'CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES',
'CLAUDE_MEM_GEMINI_MAX_TOKENS',
// OpenRouter Configuration
'CLAUDE_MEM_OPENROUTER_API_KEY',
'CLAUDE_MEM_OPENROUTER_MODEL',
@@ -248,6 +250,22 @@ export class SettingsRoutes extends BaseRouteHandler {
}
}
// Validate CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES
if (settings.CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES) {
const count = parseInt(settings.CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES, 10);
if (isNaN(count) || count < 1 || count > 100) {
return { valid: false, error: 'CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES must be between 1 and 100' };
}
}
// Validate CLAUDE_MEM_GEMINI_MAX_TOKENS
if (settings.CLAUDE_MEM_GEMINI_MAX_TOKENS) {
const tokens = parseInt(settings.CLAUDE_MEM_GEMINI_MAX_TOKENS, 10);
if (isNaN(tokens) || tokens < 1000 || tokens > 1000000) {
return { valid: false, error: 'CLAUDE_MEM_GEMINI_MAX_TOKENS must be between 1000 and 1000000' };
}
}
// Validate CLAUDE_MEM_CONTEXT_OBSERVATIONS
if (settings.CLAUDE_MEM_CONTEXT_OBSERVATIONS) {
const obsCount = parseInt(settings.CLAUDE_MEM_CONTEXT_OBSERVATIONS, 10);
+4
View File
@@ -23,6 +23,8 @@ export interface SettingsDefaults {
CLAUDE_MEM_GEMINI_API_KEY: string;
CLAUDE_MEM_GEMINI_MODEL: string; // 'gemini-2.5-flash-lite' | 'gemini-2.5-flash' | 'gemini-3-flash-preview'
CLAUDE_MEM_GEMINI_RATE_LIMITING_ENABLED: string; // 'true' | 'false' - enable rate limiting for free tier
CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES: string; // Max messages in Gemini context window (prevents O(N²) cost growth)
CLAUDE_MEM_GEMINI_MAX_TOKENS: string; // Max estimated tokens for Gemini context (~100k safety limit)
CLAUDE_MEM_OPENROUTER_API_KEY: string;
CLAUDE_MEM_OPENROUTER_MODEL: string;
CLAUDE_MEM_OPENROUTER_SITE_URL: string;
@@ -82,6 +84,8 @@ export class SettingsDefaultsManager {
CLAUDE_MEM_GEMINI_API_KEY: '', // Empty by default, can be set via UI or env
CLAUDE_MEM_GEMINI_MODEL: 'gemini-2.5-flash-lite', // Default Gemini model (highest free tier RPM)
CLAUDE_MEM_GEMINI_RATE_LIMITING_ENABLED: 'true', // Rate limiting ON by default for free tier users
CLAUDE_MEM_GEMINI_MAX_CONTEXT_MESSAGES: '20', // Max messages in Gemini context window
CLAUDE_MEM_GEMINI_MAX_TOKENS: '100000', // Max estimated tokens (~100k safety limit)
CLAUDE_MEM_OPENROUTER_API_KEY: '', // Empty by default, can be set via UI or env
CLAUDE_MEM_OPENROUTER_MODEL: 'xiaomi/mimo-v2-flash:free', // Default OpenRouter model (free tier)
CLAUDE_MEM_OPENROUTER_SITE_URL: '', // Optional: for OpenRouter analytics