/**
 * XML Parser Module
 * Parses observation and summary XML blocks from SDK responses
 */

import { logger } from '../utils/logger.js';
import { ModeManager } from '../services/domain/ModeManager.js';

/**
 * One `<observation>` block extracted from an SDK response.
 * Every field except `type` may be absent in the source XML.
 */
export interface ParsedObservation {
  /** Observation type id, validated against the active mode's observation types. */
  type: string;
  /** Trimmed `<title>` content, or null when missing/blank. */
  title: string | null;
  /** Trimmed `<subtitle>` content, or null when missing/blank. */
  subtitle: string | null;
  /** Non-empty `<fact>` entries found inside `<facts>`. */
  facts: string[];
  /** Trimmed `<narrative>` content, or null when missing/blank. */
  narrative: string | null;
  /** Non-empty `<concept>` entries inside `<concepts>`, with the observation type removed. */
  concepts: string[];
  /** Non-empty `<file>` entries found inside `<files_read>`. */
  files_read: string[];
  /** Non-empty `<file>` entries found inside `<files_modified>`. */
  files_modified: string[];
}

/**
 * The `<summary>` block extracted from an SDK response.
 * Each field is the trimmed content of the same-named tag, or null when missing/blank.
 */
export interface ParsedSummary {
  request: string | null;
  investigated: string | null;
  learned: string | null;
  completed: string | null;
  next_steps: string | null;
  notes: string | null;
}

/**
 * Parse observation XML blocks from SDK response
 * Returns all observations found in the response
 *
 * @param text - Raw response text that may contain zero or more `<observation>` blocks.
 * @param correlationId - Optional id attached to every log entry emitted while parsing.
 * @returns Parsed observations in document order; empty ("ghost") observations are skipped.
 */
export function parseObservations(text: string, correlationId?: string): ParsedObservation[] {
  const observations: ParsedObservation[] = [];

  // Match <observation>...</observation> blocks (non-greedy).
  // The opening tag must be part of the pattern: without it the capture group
  // starts at the previous match end and swallows any text preceding the block.
  const observationRegex = /<observation>([\s\S]*?)<\/observation>/g;

  let match: RegExpExecArray | null;
  while ((match = observationRegex.exec(text)) !== null) {
    const obsContent = match[1];

    // Extract all fields
    const type = extractField(obsContent, 'type');
    const title = extractField(obsContent, 'title');
    const subtitle = extractField(obsContent, 'subtitle');
    const narrative = extractField(obsContent, 'narrative');
    const facts = extractArrayElements(obsContent, 'facts', 'fact');
    const concepts = extractArrayElements(obsContent, 'concepts', 'concept');
    const files_read = extractArrayElements(obsContent, 'files_read', 'file');
    const files_modified = extractArrayElements(obsContent, 'files_modified', 'file');

    // All fields except type are nullable in schema.
    // If type is missing or invalid, use first type from mode as fallback.

    // Determine final type using active mode's valid types.
    // Looked up per observation (not hoisted) so responses without any
    // observation never touch the ModeManager.
    const mode = ModeManager.getInstance().getActiveMode();
    const validTypes = mode.observation_types.map(t => t.id);
    const fallbackType = validTypes[0]; // First type in mode's list is the fallback

    let finalType = fallbackType;
    if (type) {
      // extractField already trims its result, so `type` can be compared as-is.
      if (validTypes.includes(type)) {
        finalType = type;
      } else {
        logger.error('PARSER', `Invalid observation type: ${type}, using "${fallbackType}"`, { correlationId });
      }
    } else {
      logger.error('PARSER', `Observation missing type field, using "${fallbackType}"`, { correlationId });
    }

    // All other fields are optional - save whatever we have

    // Filter out type from concepts array (types and concepts are separate dimensions)
    const cleanedConcepts = concepts.filter(c => c !== finalType);
    if (cleanedConcepts.length !== concepts.length) {
      logger.error('PARSER', 'Removed observation type from concepts array', {
        correlationId,
        type: finalType,
        originalConcepts: concepts,
        cleanedConcepts
      });
    }

    // Skip ghost observations — records where every content field is null/empty.
    // These accumulate when the LLM emits a bare <observation> block (or one
    // containing only a <type>) due to context overflow. They carry no
    // information and pollute the context window.
    // (subtitle and file lists are intentionally excluded from this guard: an observation
    // with only a subtitle is still too thin to be useful on its own.)
    if (!title && !narrative && facts.length === 0 && cleanedConcepts.length === 0) {
      logger.warn('PARSER', 'Skipping empty observation (all content fields null)', {
        correlationId,
        type: finalType
      });
      continue;
    }

    observations.push({
      type: finalType,
      title,
      subtitle,
      facts,
      narrative,
      concepts: cleanedConcepts,
      files_read,
      files_modified
    });
  }

  return observations;
}

/**
 * Parse summary XML block from SDK response
 * Returns null if no valid summary found or if summary was skipped
 *
 * @param text - Raw response text that may contain a `<summary>` block or a skip marker.
 * @param sessionId - Optional session id attached to the "Summary skipped" log entry.
 * @returns The parsed summary (fields may individually be null), or null when the
 *          summary was skipped or no `<summary>` block is present.
 */
export function parseSummary(text: string, sessionId?: number): ParsedSummary | null {
  // Check for skip_summary first.
  // NOTE(review): the marker is assumed to be a self-closing
  // <skip_summary reason="..."/> tag; group 1 captures the reason that is
  // logged below. Confirm the exact syntax against the prompt that instructs
  // the model to emit it.
  const skipRegex = /<skip_summary\s+reason="([^"]+)"\s*\/>/;
  const skipMatch = skipRegex.exec(text);

  if (skipMatch) {
    logger.info('PARSER', 'Summary skipped', {
      sessionId,
      reason: skipMatch[1]
    });
    return null;
  }

  // Match <summary>...</summary> block (non-greedy); only the first block is used.
  const summaryRegex = /<summary>([\s\S]*?)<\/summary>/;
  const summaryMatch = summaryRegex.exec(text);

  if (!summaryMatch) {
    return null;
  }

  const summaryContent = summaryMatch[1];

  // Extract fields
  const request = extractField(summaryContent, 'request');
  const investigated = extractField(summaryContent, 'investigated');
  const learned = extractField(summaryContent, 'learned');
  const completed = extractField(summaryContent, 'completed');
  const next_steps = extractField(summaryContent, 'next_steps');
  const notes = extractField(summaryContent, 'notes'); // Optional

  // NOTE FROM THEDOTMACK: 100% of the time we must SAVE the summary, even if fields are missing. 10/24/2025
  // NEVER DO THIS NONSENSE AGAIN.

  // Validate required fields are present (notes is optional)
  // if (!request || !investigated || !learned || !completed || !next_steps) {
  //   logger.warn('PARSER', 'Summary missing required fields', {
  //     sessionId,
  //     hasRequest: !!request,
  //     hasInvestigated: !!investigated,
  //     hasLearned: !!learned,
  //     hasCompleted: !!completed,
  //     hasNextSteps: !!next_steps
  //   });
  //   return null;
  // }

  return {
    request,
    investigated,
    learned,
    completed,
    next_steps,
    notes
  };
}

/**
 * Build a regex matching `<tagName>...</tagName>` with the inner content in group 1.
 *
 * The inner group is `[\s\S]*?`: it matches any character including newlines,
 * lazily, so the match ends at the FIRST closing tag. The closing tag is
 * essential — without it the lazy group always matches the empty string.
 *
 * @param tagName - Tag name; callers pass fixed literals, so it is not regex-escaped.
 * @param flags - Optional RegExp flags (e.g. 'g' to iterate over repeated elements).
 */
function buildTagRegex(tagName: string, flags?: string): RegExp {
  return new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, flags);
}

/**
 * Extract a simple field value from XML content
 * Returns null for missing or empty/whitespace-only fields
 *
 * Uses non-greedy match to handle nested tags and code snippets (Issue #798)
 *
 * @param content - XML fragment to search.
 * @param fieldName - Tag name of the field; only its first occurrence is read.
 * @returns The trimmed inner text, or null when the tag is absent or blank.
 */
function extractField(content: string, fieldName: string): string | null {
  // Differently-named tags and code snippets nested inside the field are
  // captured verbatim as part of the value.
  const match = buildTagRegex(fieldName).exec(content);
  if (!match) return null;

  const trimmed = match[1].trim();
  return trimmed === '' ? null : trimmed;
}

/**
 * Extract array of elements from XML content
 * Handles nested tags and code snippets (Issue #798)
 *
 * @param content - XML fragment to search.
 * @param arrayName - Tag name of the wrapping element (e.g. 'facts'); first occurrence only.
 * @param elementName - Tag name of each item inside the wrapper (e.g. 'fact').
 * @returns Trimmed, non-empty item values in document order; empty when the wrapper is absent.
 */
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
  const elements: string[] = [];

  // Locate the wrapping array block first so that same-named elements
  // elsewhere in `content` (e.g. <file> in a different list) are not picked up.
  const arrayMatch = buildTagRegex(arrayName).exec(content);
  if (!arrayMatch) {
    return elements;
  }

  const arrayContent = arrayMatch[1];

  // Walk every element inside the wrapper; blank entries are dropped.
  const elementRegex = buildTagRegex(elementName, 'g');
  let elementMatch: RegExpExecArray | null;
  while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
    const trimmed = elementMatch[1].trim();
    if (trimmed) {
      elements.push(trimmed);
    }
  }

  return elements;
}