e398212983
When the LLM is overwhelmed by large context it can emit bare <observation/> blocks (or ones containing only <type>). These are stored as rows where title, narrative, facts and concepts are all null/empty, appearing as meaningless "Untitled" entries in the context window. Add a guard in parseObservations() that skips any observation where every content field is null/empty before pushing it to the result array. Generated by Claude Code. Vibe coded by ousamabenyounes. Co-Authored-By: Claude <noreply@anthropic.com>
220 lines
7.0 KiB
TypeScript
220 lines
7.0 KiB
TypeScript
/**
|
|
* XML Parser Module
|
|
* Parses observation and summary XML blocks from SDK responses
|
|
*/
|
|
|
|
import { logger } from '../utils/logger.js';
|
|
import { ModeManager } from '../services/domain/ModeManager.js';
|
|
|
|
/**
 * One `<observation>` block after parsing.
 *
 * Scalar fields are `null` when the corresponding tag is missing or holds only
 * whitespace; list fields are empty arrays when the wrapper tag is missing.
 */
export interface ParsedObservation {
  /** Observation type id, resolved against the active mode's observation types. */
  type: string;
  /** Content of `<title>`, trimmed. */
  title: string | null;
  /** Content of `<subtitle>`, trimmed. */
  subtitle: string | null;
  /** Non-empty `<fact>` entries found inside `<facts>`. */
  facts: string[];
  /** Content of `<narrative>`, trimmed. */
  narrative: string | null;
  /** Non-empty `<concept>` entries inside `<concepts>`, with the type id filtered out. */
  concepts: string[];
  /** Non-empty `<file>` entries inside `<files_read>`. */
  files_read: string[];
  /** Non-empty `<file>` entries inside `<files_modified>`. */
  files_modified: string[];
}
|
|
|
|
/**
 * A `<summary>` block after parsing.
 *
 * Every field is `null` when its tag is missing or holds only whitespace.
 */
export interface ParsedSummary {
  /** Content of `<request>`. */
  request: string | null;
  /** Content of `<investigated>`. */
  investigated: string | null;
  /** Content of `<learned>`. */
  learned: string | null;
  /** Content of `<completed>`. */
  completed: string | null;
  /** Content of `<next_steps>`. */
  next_steps: string | null;
  /** Content of `<notes>`; treated as optional by the parser. */
  notes: string | null;
}
|
|
|
|
/**
 * Parse observation XML blocks from SDK response.
 *
 * Every `<observation>...</observation>` block in `text` is parsed. "Ghost"
 * blocks, whose title, narrative, facts and concepts are all empty, are
 * logged and dropped instead of being returned.
 *
 * @param text - Raw response text that may contain observation blocks.
 * @param correlationId - Optional id attached to every log entry emitted here.
 * @returns The parsed, non-empty observations in document order (possibly none).
 */
export function parseObservations(text: string, correlationId?: string): ParsedObservation[] {
  // Match <observation>...</observation> blocks (non-greedy)
  const observationRegex = /<observation>([\s\S]*?)<\/observation>/g;

  const blocks: string[] = [];
  let match: RegExpExecArray | null;
  while ((match = observationRegex.exec(text)) !== null) {
    blocks.push(match[1]);
  }

  // Nothing to parse: return before consulting the active mode, so a response
  // without observation blocks never touches ModeManager.
  if (blocks.length === 0) {
    return [];
  }

  // The active mode's valid types do not change while parsing, so resolve them once.
  const mode = ModeManager.getInstance().getActiveMode();
  const validTypes = mode.observation_types.map(t => t.id);
  const fallbackType = validTypes[0]; // First type in mode's list is the fallback

  const observations: ParsedObservation[] = [];

  for (const obsContent of blocks) {
    // Extract all fields
    const type = extractField(obsContent, 'type');
    const title = extractField(obsContent, 'title');
    const subtitle = extractField(obsContent, 'subtitle');
    const narrative = extractField(obsContent, 'narrative');
    const facts = extractArrayElements(obsContent, 'facts', 'fact');
    const concepts = extractArrayElements(obsContent, 'concepts', 'concept');
    const files_read = extractArrayElements(obsContent, 'files_read', 'file');
    const files_modified = extractArrayElements(obsContent, 'files_modified', 'file');

    // All fields except type are nullable in schema.
    // If type is missing or invalid, use first type from mode as fallback.
    // (extractField already returns a trimmed value, so no further trimming is needed.)
    let finalType = fallbackType;
    if (type) {
      if (validTypes.includes(type)) {
        finalType = type;
      } else {
        logger.error('PARSER', `Invalid observation type: ${type}, using "${fallbackType}"`, { correlationId });
      }
    } else {
      logger.error('PARSER', `Observation missing type field, using "${fallbackType}"`, { correlationId });
    }

    // Filter out type from concepts array (types and concepts are separate dimensions)
    const cleanedConcepts = concepts.filter(c => c !== finalType);

    if (cleanedConcepts.length !== concepts.length) {
      logger.error('PARSER', 'Removed observation type from concepts array', {
        correlationId,
        type: finalType,
        originalConcepts: concepts,
        cleanedConcepts
      });
    }

    // Skip ghost observations — records where every content field is null/empty.
    // These accumulate when the LLM emits a bare <observation/> (or one with only <type>)
    // due to context overflow. They carry no information and pollute the context window.
    // (subtitle and file lists are intentionally excluded from this guard: an observation
    // with only a subtitle is still too thin to be useful on its own.)
    // This check must come after concept cleaning: a block whose only concept is its
    // own type is also a ghost.
    if (!title && !narrative && facts.length === 0 && cleanedConcepts.length === 0) {
      logger.warn('PARSER', 'Skipping empty observation (all content fields null)', {
        correlationId,
        type: finalType
      });
      continue;
    }

    // Beyond the guard above, every field is optional - save whatever we have.
    observations.push({
      type: finalType,
      title,
      subtitle,
      facts,
      narrative,
      concepts: cleanedConcepts,
      files_read,
      files_modified
    });
  }

  return observations;
}
|
|
|
|
/**
 * Parse summary XML block from SDK response.
 *
 * A `<skip_summary reason="..."/>` marker anywhere in `text` takes precedence:
 * it is logged and `null` is returned even if a `<summary>` block is also present.
 * Otherwise the first `<summary>...</summary>` block is parsed.
 *
 * @param text - Raw response text.
 * @param sessionId - Optional session id attached to the skip log entry.
 * @returns The parsed summary (fields may individually be `null`), or `null`
 *          when the summary was skipped or no `<summary>` block exists.
 */
export function parseSummary(text: string, sessionId?: number): ParsedSummary | null {
  // Check for skip_summary first
  const skipMatch = /<skip_summary\s+reason="([^"]+)"\s*\/>/.exec(text);
  if (skipMatch) {
    logger.info('PARSER', 'Summary skipped', {
      sessionId,
      reason: skipMatch[1]
    });
    return null;
  }

  // Match <summary>...</summary> block (non-greedy)
  const summaryMatch = /<summary>([\s\S]*?)<\/summary>/.exec(text);
  if (!summaryMatch) {
    return null;
  }

  const summaryContent = summaryMatch[1];

  // NOTE FROM THEDOTMACK: 100% of the time we must SAVE the summary, even if fields are missing. 10/24/2025
  // NEVER DO THIS NONSENSE AGAIN.
  // (An earlier version returned null when request, investigated, learned, completed
  // or next_steps was missing. Do not reintroduce that validation.)
  return {
    request: extractField(summaryContent, 'request'),
    investigated: extractField(summaryContent, 'investigated'),
    learned: extractField(summaryContent, 'learned'),
    completed: extractField(summaryContent, 'completed'),
    next_steps: extractField(summaryContent, 'next_steps'),
    notes: extractField(summaryContent, 'notes') // Optional
  };
}
|
|
|
|
/**
 * Extract a simple field value from XML content.
 *
 * Only the first `<fieldName>...</fieldName>` pair is considered. The match is
 * non-greedy and spans newlines, so other tags and code snippets inside the
 * field are returned verbatim (Issue #798); the value ends at the first
 * closing `</fieldName>` tag.
 *
 * @param content - XML fragment to search.
 * @param fieldName - Tag name, inserted into the pattern as-is.
 * @returns The trimmed field content, or `null` for missing or
 *          empty/whitespace-only fields.
 */
function extractField(content: string, fieldName: string): string | null {
  // [\s\S]*? matches any character including newlines, non-greedily
  const regex = new RegExp(`<${fieldName}>([\\s\\S]*?)</${fieldName}>`);
  const match = regex.exec(content);
  if (!match) {
    return null;
  }

  const trimmed = match[1].trim();
  return trimmed === '' ? null : trimmed;
}
|
|
|
|
/**
 * Extract array of elements from XML content.
 *
 * Finds the first `<arrayName>...</arrayName>` block and collects every
 * `<elementName>...</elementName>` inside it. Matching is non-greedy and spans
 * newlines, so elements may contain other tags and code snippets (Issue #798).
 *
 * @param content - XML fragment to search.
 * @param arrayName - Wrapper tag name, inserted into the pattern as-is.
 * @param elementName - Element tag name, inserted into the pattern as-is.
 * @returns Trimmed element contents in document order; elements that are
 *          empty after trimming are dropped. Empty when the wrapper is missing.
 */
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
  const elements: string[] = [];

  // Match the array block using [\s\S]*? for nested content
  const arrayMatch = new RegExp(`<${arrayName}>([\\s\\S]*?)</${arrayName}>`).exec(content);
  if (!arrayMatch) {
    return elements;
  }

  const arrayContent = arrayMatch[1];

  // Extract individual elements using [\s\S]*? for nested content
  const elementRegex = new RegExp(`<${elementName}>([\\s\\S]*?)</${elementName}>`, 'g');
  let elementMatch: RegExpExecArray | null;
  while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
    const trimmed = elementMatch[1].trim();
    if (trimmed) {
      elements.push(trimmed);
    }
  }

  return elements;
}
|