/**
* XML Parser Module
* Parses observation and summary XML blocks from SDK responses
*/
import { logger } from '../utils/logger.js';
import { ModeManager } from '../services/domain/ModeManager.js';
/**
 * One observation extracted from an SDK response by `parseObservations`.
 */
export interface ParsedObservation {
  /**
   * Observation type id. `parseObservations` stores either a type listed by
   * the active mode or, when the parsed type is missing/invalid, the first
   * type in the mode's list.
   */
  type: string;
  /** Text of the `title` field, or null when missing or blank. */
  title: string | null;
  /** Text of the `subtitle` field, or null when missing or blank. */
  subtitle: string | null;
  /** Non-empty `fact` entries found inside the `facts` block. */
  facts: string[];
  /** Text of the `narrative` field, or null when missing or blank. */
  narrative: string | null;
  /** Non-empty `concept` entries from the `concepts` block, with the observation type filtered out. */
  concepts: string[];
  /** Non-empty `file` entries found inside the `files_read` block. */
  files_read: string[];
  /** Non-empty `file` entries found inside the `files_modified` block. */
  files_modified: string[];
}
/**
 * Summary extracted from an SDK response by `parseSummary`.
 * Every field is the trimmed text of the same-named element, or null when
 * that element is missing or blank.
 */
export interface ParsedSummary {
  /** Text of the `request` field. */
  request: string | null;
  /** Text of the `investigated` field. */
  investigated: string | null;
  /** Text of the `learned` field. */
  learned: string | null;
  /** Text of the `completed` field. */
  completed: string | null;
  /** Text of the `next_steps` field. */
  next_steps: string | null;
  /** Text of the `notes` field (treated as optional by the parser). */
  notes: string | null;
}
/**
 * Parse observation XML blocks from an SDK response.
 *
 * Each `<observation>...</observation>` block is scanned for its scalar fields
 * (type, title, subtitle, narrative) and list fields (facts, concepts,
 * files_read, files_modified). The type is validated against the active
 * mode's observation types; a missing or unknown type is replaced by the
 * first type in the mode's list and logged. Blocks with no title, narrative,
 * facts or concepts are skipped.
 *
 * @param text - Raw response text to scan.
 * @param correlationId - Optional identifier attached to every log entry emitted while parsing.
 * @returns The observations found, in the order they appear; an empty array when there are none.
 */
export function parseObservations(text: string, correlationId?: string): ParsedObservation[] {
  const observations: ParsedObservation[] = [];

  // Match <observation>...</observation> blocks (non-greedy).
  // The opening tag has to be part of the pattern: without it the capture
  // would begin wherever the previous match ended and swallow unrelated text
  // (including stray field tags) that precedes the block.
  const observationRegex = /<observation>([\s\S]*?)<\/observation>/g;

  let match: RegExpExecArray | null;
  while ((match = observationRegex.exec(text)) !== null) {
    const obsContent = match[1];

    // Extract all fields
    const type = extractField(obsContent, 'type');
    const title = extractField(obsContent, 'title');
    const subtitle = extractField(obsContent, 'subtitle');
    const narrative = extractField(obsContent, 'narrative');
    const facts = extractArrayElements(obsContent, 'facts', 'fact');
    const concepts = extractArrayElements(obsContent, 'concepts', 'concept');
    const files_read = extractArrayElements(obsContent, 'files_read', 'file');
    const files_modified = extractArrayElements(obsContent, 'files_modified', 'file');

    // All fields except type are nullable in schema.
    // If type is missing or invalid, use first type from mode as fallback.
    // The mode is looked up only once a block has been found, so text without
    // observations never touches ModeManager.
    const mode = ModeManager.getInstance().getActiveMode();
    const validTypes = mode.observation_types.map(t => t.id);
    const fallbackType = validTypes[0]; // First type in mode's list is the fallback

    let finalType = fallbackType;
    if (type) {
      const trimmedType = type.trim();
      if (validTypes.includes(trimmedType)) {
        finalType = trimmedType;
      } else {
        logger.error('PARSER', `Invalid observation type: ${type}, using "${fallbackType}"`, { correlationId });
      }
    } else {
      logger.error('PARSER', `Observation missing type field, using "${fallbackType}"`, { correlationId });
    }

    // All other fields are optional - save whatever we have.
    // Filter out type from concepts array (types and concepts are separate dimensions)
    const cleanedConcepts = concepts.filter(c => c !== finalType);
    if (cleanedConcepts.length !== concepts.length) {
      logger.error('PARSER', 'Removed observation type from concepts array', {
        correlationId,
        type: finalType,
        originalConcepts: concepts,
        cleanedConcepts
      });
    }

    // Skip ghost observations — records where every content field is null/empty.
    // These accumulate when the LLM emits a bare <observation/> (or one with only <type>)
    // due to context overflow. They carry no information and pollute the context window.
    // (subtitle and file lists are intentionally excluded from this guard: an observation
    // with only a subtitle is still too thin to be useful on its own.)
    if (!title && !narrative && facts.length === 0 && cleanedConcepts.length === 0) {
      logger.warn('PARSER', 'Skipping empty observation (all content fields null)', {
        correlationId,
        type: finalType
      });
      continue;
    }

    observations.push({
      type: finalType,
      title,
      subtitle,
      facts,
      narrative,
      concepts: cleanedConcepts,
      files_read,
      files_modified
    });
  }

  return observations;
}
/**
 * Parse the summary XML block from an SDK response.
 *
 * @param text - Raw response text to scan.
 * @param sessionId - Optional session identifier included in the log entry when a summary is skipped.
 * @returns The parsed summary (fields that are missing or blank are null), or
 *          null when the response carries a skip marker or has no
 *          `<summary>...</summary>` block.
 */
export function parseSummary(text: string, sessionId?: number): ParsedSummary | null {
  // Check for skip_summary first.
  // NOTE(review): the original pattern was lost (an empty `//` literal is a
  // line comment in JavaScript, which made this statement a syntax error).
  // Reconstructed as a self-closing skip_summary tag whose `reason` attribute
  // is capture group 1, as required by the `skipMatch[1]` read below —
  // confirm the exact tag shape against the prompt that asks the model for it.
  const skipRegex = /<skip_summary\s+reason="([^"]*)"\s*\/>/;
  const skipMatch = skipRegex.exec(text);
  if (skipMatch) {
    logger.info('PARSER', 'Summary skipped', {
      sessionId,
      reason: skipMatch[1]
    });
    return null;
  }

  // Match <summary>...</summary> block (non-greedy). The opening tag must be
  // in the pattern so the capture does not start at the beginning of the text.
  const summaryRegex = /<summary>([\s\S]*?)<\/summary>/;
  const summaryMatch = summaryRegex.exec(text);
  if (!summaryMatch) {
    return null;
  }
  const summaryContent = summaryMatch[1];

  // Extract fields
  const request = extractField(summaryContent, 'request');
  const investigated = extractField(summaryContent, 'investigated');
  const learned = extractField(summaryContent, 'learned');
  const completed = extractField(summaryContent, 'completed');
  const next_steps = extractField(summaryContent, 'next_steps');
  const notes = extractField(summaryContent, 'notes'); // Optional

  // NOTE FROM THEDOTMACK: 100% of the time we must SAVE the summary, even if fields are missing. 10/24/2025
  // NEVER DO THIS NONSENSE AGAIN.
  // Validate required fields are present (notes is optional)
  // if (!request || !investigated || !learned || !completed || !next_steps) {
  //   logger.warn('PARSER', 'Summary missing required fields', {
  //     sessionId,
  //     hasRequest: !!request,
  //     hasInvestigated: !!investigated,
  //     hasLearned: !!learned,
  //     hasCompleted: !!completed,
  //     hasNextSteps: !!next_steps
  //   });
  //   return null;
  // }

  return {
    request,
    investigated,
    learned,
    completed,
    next_steps,
    notes
  };
}
/**
 * Extract a simple field value from XML content.
 *
 * Uses a non-greedy match so that nested tags and code snippets inside the
 * field are kept as part of its value (Issue #798). Only the first
 * occurrence of the field is read.
 *
 * @param content - XML fragment to search.
 * @param fieldName - Tag name of the field, interpolated into the pattern as-is
 *                    (callers pass fixed tag names).
 * @returns The trimmed text between the opening and closing tag, or null when
 *          the field is missing or empty/whitespace-only.
 */
function extractField(content: string, fieldName: string): string | null {
  // [\s\S]*? matches any character including newlines, non-greedily, which
  // lets nested XML tags appear inside the field. The closing tag must be
  // spelled with its leading "</"; otherwise the capture would end with a
  // dangling "</".
  const regex = new RegExp(`<${fieldName}>([\\s\\S]*?)</${fieldName}>`);
  const match = regex.exec(content);
  if (!match) return null;

  const trimmed = match[1].trim();
  return trimmed === '' ? null : trimmed;
}
/**
 * Extract an array of element values from XML content.
 *
 * Finds the first `<arrayName>...</arrayName>` block and collects the trimmed
 * text of every `<elementName>...</elementName>` inside it. Non-greedy
 * matching keeps nested tags and code snippets inside an element intact
 * (Issue #798).
 *
 * @param content - XML fragment to search.
 * @param arrayName - Tag name of the wrapping block (e.g. `facts`).
 * @param elementName - Tag name of each entry inside the block (e.g. `fact`).
 * @returns The non-empty element values in order of appearance; an empty
 *          array when the block is absent or holds no non-blank elements.
 */
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
  const elements: string[] = [];

  // Match the array block using [\s\S]*? for nested content. The closing tag
  // needs its leading "</" so the capture stops before the tag itself.
  const arrayRegex = new RegExp(`<${arrayName}>([\\s\\S]*?)</${arrayName}>`);
  const arrayMatch = arrayRegex.exec(content);
  if (!arrayMatch) {
    return elements;
  }
  const arrayContent = arrayMatch[1];

  // Extract individual elements using [\s\S]*? for nested content
  const elementRegex = new RegExp(`<${elementName}>([\\s\\S]*?)</${elementName}>`, 'g');
  let elementMatch: RegExpExecArray | null;
  while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
    const trimmed = elementMatch[1].trim();
    // Blank entries carry no information and are dropped.
    if (trimmed) {
      elements.push(trimmed);
    }
  }

  return elements;
}