fix: handle nested XML tags in parser extractField and extractArrayElements
The parser's regex patterns used `[^<]*` and `[^<]+`, which stop matching as soon as the content contains any `<` character (e.g. nested tags or code snippets).

Example failure case:

```xml
<investigated>
  <item>Checked parser.ts</item>
</investigated>
```

The `[^<]*` pattern stops at the first `<` of `<item>`, so the closing `</investigated>` tag can never be reached and extractField() returns null even though valid content exists.

## Changes

- `extractField()`: Changed from `[^<]*` to `[\s\S]*?` (non-greedy match of any character)
- `extractArrayElements()`: Changed the element pattern from `[^<]+` to `[\s\S]*?`, and the array pattern from `.*?` with the `s` flag to the equivalent `[\s\S]*?`; elements that are empty after trimming are now skipped

The `[\s\S]*?` pattern matches any character including newlines, non-greedily, allowing nested XML tags to be captured correctly.

Fixes #798

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
+14
-6
@@ -162,9 +162,13 @@ export function parseSummary(text: string, sessionId?: number): ParsedSummary |
|
|||||||
/**
|
/**
|
||||||
* Extract a simple field value from XML content
|
* Extract a simple field value from XML content
|
||||||
* Returns null for missing or empty/whitespace-only fields
|
* Returns null for missing or empty/whitespace-only fields
|
||||||
|
*
|
||||||
|
* Uses non-greedy match to handle nested tags and code snippets (Issue #798)
|
||||||
*/
|
*/
|
||||||
function extractField(content: string, fieldName: string): string | null {
|
function extractField(content: string, fieldName: string): string | null {
|
||||||
const regex = new RegExp(`<${fieldName}>([^<]*)</${fieldName}>`);
|
// Use [\s\S]*? to match any character including newlines, non-greedily
|
||||||
|
// This handles nested XML tags like <item>...</item> inside the field
|
||||||
|
const regex = new RegExp(`<${fieldName}>([\\s\\S]*?)</${fieldName}>`);
|
||||||
const match = regex.exec(content);
|
const match = regex.exec(content);
|
||||||
if (!match) return null;
|
if (!match) return null;
|
||||||
|
|
||||||
@@ -174,12 +178,13 @@ function extractField(content: string, fieldName: string): string | null {
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Extract array of elements from XML content
|
* Extract array of elements from XML content
|
||||||
|
* Handles nested tags and code snippets (Issue #798)
|
||||||
*/
|
*/
|
||||||
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
|
function extractArrayElements(content: string, arrayName: string, elementName: string): string[] {
|
||||||
const elements: string[] = [];
|
const elements: string[] = [];
|
||||||
|
|
||||||
// Match the array block
|
// Match the array block using [\s\S]*? for nested content
|
||||||
const arrayRegex = new RegExp(`<${arrayName}>(.*?)</${arrayName}>`, 's');
|
const arrayRegex = new RegExp(`<${arrayName}>([\\s\\S]*?)</${arrayName}>`);
|
||||||
const arrayMatch = arrayRegex.exec(content);
|
const arrayMatch = arrayRegex.exec(content);
|
||||||
|
|
||||||
if (!arrayMatch) {
|
if (!arrayMatch) {
|
||||||
@@ -188,11 +193,14 @@ function extractArrayElements(content: string, arrayName: string, elementName: s
|
|||||||
|
|
||||||
const arrayContent = arrayMatch[1];
|
const arrayContent = arrayMatch[1];
|
||||||
|
|
||||||
// Extract individual elements
|
// Extract individual elements using [\s\S]*? for nested content
|
||||||
const elementRegex = new RegExp(`<${elementName}>([^<]+)</${elementName}>`, 'g');
|
const elementRegex = new RegExp(`<${elementName}>([\\s\\S]*?)</${elementName}>`, 'g');
|
||||||
let elementMatch;
|
let elementMatch;
|
||||||
while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
|
while ((elementMatch = elementRegex.exec(arrayContent)) !== null) {
|
||||||
elements.push(elementMatch[1].trim());
|
const trimmed = elementMatch[1].trim();
|
||||||
|
if (trimmed) {
|
||||||
|
elements.push(trimmed);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return elements;
|
return elements;
|
||||||
|
|||||||
Reference in New Issue
Block a user