fix: handle nested XML tags in parser extractField and extractArrayElements

The parser's regex patterns used `[^<]*` and `[^<]+`, which cannot match when
the field content contains any `<` character (such as nested tags or code snippets).

Example failure case:
```xml
<investigated>
  <item>Checked parser.ts</item>
</investigated>
```

The `[^<]*` pattern stops at the first `<` of `<item>`, causing extractField()
to return null even though valid content exists.

## Changes

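- `extractField()`: Changed the capture group from `[^<]*` to `[\s\S]*?` (non-greedy, matches any character)
- `extractArrayElements()`: Changed the element pattern from `[^<]+` to `[\s\S]*?`; the array pattern changed from `.*?` with the `s` flag to the equivalent `[\s\S]*?`
- `extractArrayElements()`: Elements that are empty after trimming are now skipped instead of being added as empty strings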

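The `[\s\S]*?` pattern matches any character, including newlines, non-greedily,
so the capture runs up to the first matching closing tag and nested tags with a
different name (such as `<item>` inside `<investigated>`) are kept in the captured content.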

Fixes #798

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Glucksberg
2026-01-28 02:48:40 +00:00
committed by Alex Newman
parent 44bfb74932
commit 256ebcead0
+14 -6
View File
@@ -162,9 +162,13 @@ export function parseSummary(text: string, sessionId?: number): ParsedSummary |
/**
* Extract a simple field value from XML content
* Returns null for missing or empty/whitespace-only fields
*
* Uses non-greedy match to handle nested tags and code snippets (Issue #798)
*/
function extractField(content: string, fieldName: string): string | null {
const regex = new RegExp(`<${fieldName}>([^<]*)</${fieldName}>`);
// Use [\s\S]*? to match any character including newlines, non-greedily
// This handles nested XML tags like <item>...</item> inside the field
const regex = new RegExp(`<${fieldName}>([\\s\\S]*?)</${fieldName}>`);
const match = regex.exec(content);
if (!match) return null;
@@ -174,12 +178,13 @@ function extractField(content: string, fieldName: string): string | null {
/**
* Extract array of elements from XML content
* Handles nested tags and code snippets (Issue #798)
*/
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
const elements: string[] = [];
// Match the array block
const arrayRegex = new RegExp(`<${arrayName}>(.*?)</${arrayName}>`, 's');
// Match the array block using [\s\S]*? for nested content
const arrayRegex = new RegExp(`<${arrayName}>([\\s\\S]*?)</${arrayName}>`);
const arrayMatch = arrayRegex.exec(content);
if (!arrayMatch) {
@@ -188,11 +193,14 @@ function extractArrayElements(content: string, arrayName: string, elementName: s
const arrayContent = arrayMatch[1];
// Extract individual elements
const elementRegex = new RegExp(`<${elementName}>([^<]+)</${elementName}>`, 'g');
// Extract individual elements using [\s\S]*? for nested content
const elementRegex = new RegExp(`<${elementName}>([\\s\\S]*?)</${elementName}>`, 'g');
let elementMatch;
while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
elements.push(elementMatch[1].trim());
const trimmed = elementMatch[1].trim();
if (trimmed) {
elements.push(trimmed);
}
}
return elements;