feat: Add XML extraction and import scripts for observations and summaries

This commit is contained in:
Alex Newman
2025-10-18 23:34:53 -04:00
parent bc41e367c0
commit 6d68fd44ca
6 changed files with 860 additions and 0 deletions
+82
View File
@@ -0,0 +1,82 @@
# XML Extraction Scripts
Scripts to extract XML observations and summaries from Claude Code transcript files.
## Scripts
### `filter-actual-xml.py`
**Recommended for import**
Extracts only actual XML from assistant responses, filtering out:
- Template/example XML (with placeholders like `[...]` or `**field**:`)
- XML from tool_use blocks
- XML from user messages
**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml`
**Usage:**
```bash
python3 scripts/extraction/filter-actual-xml.py
```
### `extract-all-xml.py`
**For debugging/analysis**
Extracts ALL XML blocks from transcripts without filtering.
**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml`
**Usage:**
```bash
python3 scripts/extraction/extract-all-xml.py
```
## Workflow
1. **Extract XML from transcripts:**
```bash
cd ~/Scripts/claude-mem
python3 scripts/extraction/filter-actual-xml.py
```
2. **Import to database:**
```bash
npm run import:xml
```
3. **Clean up duplicates (if needed):**
```bash
npm run cleanup:duplicates
```
## Source Data
Scripts read from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl`
These are Claude Code session transcripts stored in JSONL (JSON Lines) format. Both scripts process only the 62 most recently modified `.jsonl` files in that directory.
## Output Format
```xml
<?xml version="1.0" encoding="UTF-8"?>
<transcript_extracts>
<!-- Block 1 | 2025-10-19 03:03:23 UTC -->
<observation>
<type>discovery</type>
<title>Example observation</title>
...
</observation>
<!-- Block 2 | 2025-10-19 03:03:45 UTC -->
<summary>
<request>What was accomplished</request>
...
</summary>
</transcript_extracts>
```
Each XML block is preceded by a comment containing:
- Block number
- Original timestamp from transcript
+128
View File
@@ -0,0 +1,128 @@
#!/usr/bin/env python3
import json
import re
from datetime import datetime
import os
import subprocess
def extract_xml_blocks(text):
    """Return every known-tag XML fragment found in *text*.

    Each pattern is a non-greedy ``<tag>...</tag>`` match evaluated with
    ``re.DOTALL``, so a fragment may span several lines. Results are grouped
    by pattern, in the order the patterns are listed below, not by position
    in *text*. A nested element is therefore returned once inside its parent
    fragment and once more on its own.
    """
    tag_regexes = (
        r'<observation>.*?</observation>',
        r'<session_summary>.*?</session_summary>',
        r'<request>.*?</request>',
        r'<summary>.*?</summary>',
        r'<facts>.*?</facts>',
        r'<fact>.*?</fact>',
        r'<concepts>.*?</concepts>',
        r'<concept>.*?</concept>',
        r'<files>.*?</files>',
        r'<file>.*?</file>',
        r'<files_read>.*?</files_read>',
        r'<files_edited>.*?</files_edited>',
        r'<files_modified>.*?</files_modified>',
        r'<narrative>.*?</narrative>',
        r'<learned>.*?</learned>',
        r'<investigated>.*?</investigated>',
        r'<completed>.*?</completed>',
        r'<next_steps>.*?</next_steps>',
        r'<notes>.*?</notes>',
        r'<title>.*?</title>',
        r'<subtitle>.*?</subtitle>',
        r'<text>.*?</text>',
        r'<type>.*?</type>',
    )
    return [
        fragment
        for regex in tag_regexes
        for fragment in re.findall(regex, text, re.DOTALL)
    ]
def process_transcript_file(filepath):
    """Extract XML fragments, with their timestamps, from one JSONL transcript.

    Every line of *filepath* is parsed as JSON. For records whose
    ``message.content`` is a list, the ``text`` of ``text`` items and the
    stringified ``input`` of ``tool_use`` items are scanned with
    ``extract_xml_blocks``.

    Lines that are not valid JSON, records that are not JSON objects, and
    records whose ``message`` is missing or not an object are skipped instead
    of aborting the whole run.

    Args:
        filepath: Path of the ``.jsonl`` transcript to read.

    Returns:
        A list of ``{'timestamp': ..., 'xml': ...}`` dicts in file order.
        ``timestamp`` is the record's ``timestamp`` value, or ``'unknown'``
        when the record has none.
    """
    results = []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Keep the try body minimal: only the decode can raise here.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A line may decode to null, a list or a scalar; those have no
            # message to inspect.
            if not isinstance(data, dict):
                continue

            timestamp = data.get('timestamp', 'unknown')

            message = data.get('message')
            if not isinstance(message, dict):
                continue

            # NOTE: string-valued content is not scanned, only list content.
            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            for item in content:
                if not isinstance(item, dict):
                    continue

                text = ''
                item_type = item.get('type')
                if item_type == 'text':
                    text = item.get('text', '')
                elif item_type == 'tool_use':
                    # Also check tool_use input fields. str() of a dict is its
                    # Python repr, so newlines inside values appear escaped.
                    tool_input = item.get('input', {})
                    if isinstance(tool_input, dict):
                        text = str(tool_input)

                # A non-string text would make the regex search raise.
                if not text or not isinstance(text, str):
                    continue

                for block in extract_xml_blocks(text):
                    results.append({
                        'timestamp': timestamp,
                        'xml': block
                    })
    return results
# Directory holding this project's Claude Code session transcripts.
TRANSCRIPT_DIR = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
# Destination for the unfiltered extraction.
OUTPUT_FILE = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
# Only this many of the most recently modified transcripts are scanned.
MAX_TRANSCRIPTS = 62


def list_recent_transcripts(directory, limit):
    """Return up to *limit* ``.jsonl`` file names in *directory*, newest first.

    Ordering is by modification time, like ``ls -t``, but without spawning a
    subprocess or changing the working directory. Hidden files are skipped,
    as ``ls`` would skip them.
    """
    names = [
        name for name in os.listdir(directory)
        if name.endswith('.jsonl') and not name.startswith('.')
    ]
    names.sort(
        key=lambda name: os.path.getmtime(os.path.join(directory, name)),
        reverse=True,
    )
    return names[:limit]


def format_timestamp(timestamp):
    """Render an ISO-8601 timestamp as ``YYYY-MM-DD HH:MM:SS UTC``.

    Missing values and the ``'unknown'`` placeholder give ``'unknown'``.
    Values that cannot be parsed are returned unchanged. Timestamps carrying
    a UTC offset are converted to UTC so the ``UTC`` label is accurate.
    """
    if not timestamp or timestamp == 'unknown':
        return 'unknown'
    try:
        # fromisoformat() on older Pythons rejects a trailing 'Z'.
        dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return timestamp
    if dt.tzinfo is not None:
        from datetime import timezone
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')


def main():
    """Scan the newest transcripts and write every extracted block to OUTPUT_FILE."""
    all_results = []
    for filename in list_recent_transcripts(TRANSCRIPT_DIR, MAX_TRANSCRIPTS):
        filepath = os.path.join(TRANSCRIPT_DIR, filename)
        print(f"Processing {filename}...")
        results = process_transcript_file(filepath)
        all_results.extend(results)
        print(f" Found {len(results)} XML blocks")

    # Write results with timestamps
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<transcript_extracts>\n\n')
        for i, item in enumerate(all_results, 1):
            formatted_time = format_timestamp(item['timestamp'])
            f.write(f'<!-- Block {i} | {formatted_time} -->\n')
            f.write(item['xml'])
            f.write('\n\n')
        f.write('</transcript_extracts>\n')

    print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {OUTPUT_FILE}")


if __name__ == '__main__':
    main()
+168
View File
@@ -0,0 +1,168 @@
#!/usr/bin/env python3
import json
import re
from datetime import datetime
import os
def extract_xml_blocks(text):
    """Collect all ``<tag>...</tag>`` fragments for the known tags in *text*.

    Patterns are tried one after another in the order listed. Each uses a
    non-greedy body and ``re.DOTALL`` so multi-line fragments are captured.
    The returned list is ordered by pattern first, then by position within
    *text*. Child elements show up both inside their parent fragment and as
    separate entries.
    """
    tag_regexes = (
        r'<observation>.*?</observation>',
        r'<session_summary>.*?</session_summary>',
        r'<request>.*?</request>',
        r'<summary>.*?</summary>',
        r'<facts>.*?</facts>',
        r'<fact>.*?</fact>',
        r'<concepts>.*?</concepts>',
        r'<concept>.*?</concept>',
        r'<files>.*?</files>',
        r'<file>.*?</file>',
        r'<files_read>.*?</files_read>',
        r'<files_edited>.*?</files_edited>',
        r'<files_modified>.*?</files_modified>',
        r'<narrative>.*?</narrative>',
        r'<learned>.*?</learned>',
        r'<investigated>.*?</investigated>',
        r'<completed>.*?</completed>',
        r'<next_steps>.*?</next_steps>',
        r'<notes>.*?</notes>',
        r'<title>.*?</title>',
        r'<subtitle>.*?</subtitle>',
        r'<text>.*?</text>',
        r'<type>.*?</type>',
        r'<tool_used>.*?</tool_used>',
        r'<tool_name>.*?</tool_name>',
        r'<tool_input>.*?</tool_input>',
        r'<tool_output>.*?</tool_output>',
        r'<tool_time>.*?</tool_time>',
    )
    found = []
    for regex in tag_regexes:
        # group(0) is the whole match, which is what findall() yields for
        # patterns without capture groups.
        found += [hit.group(0) for hit in re.finditer(regex, text, re.DOTALL)]
    return found
def is_example_xml(xml_block):
    """Report whether *xml_block* looks like template/example XML.

    Returns True as soon as any marker regex below is found anywhere in the
    block, and False when none of them match. The markers are searched
    without flags, so ``.`` does not cross line breaks.
    """
    template_markers = (
        r'\[.*?\]',  # Square brackets with placeholders
        r'\*\*\w+\*\*:',  # Bold markdown like **title**:
        r'\.\.\..*?\.\.\.',  # Ellipsis indicating placeholder
        r'feature\|bugfix\|refactor',  # Multiple options separated by |
        r'change \| discovery \| decision',  # Example types
        r'\{.*?\}',  # Curly braces (template variables)
        r'Concise, self-contained statement',  # Literal example text
        r'Short title capturing',
        r'One sentence explanation',
        r'What was the user trying',
        r'What code/systems did you explore',
        r'What did you learn',
        r'What was done',
        r'What should happen next',
        r'file1\.ts',  # Example filenames
        r'file2\.ts',
        r'file3\.ts',
        r'Any additional context',
    )
    return any(
        re.search(marker, xml_block) is not None
        for marker in template_markers
    )
def process_transcript_file(filepath):
    """Extract non-template XML from the assistant messages of one transcript.

    Every line of *filepath* is parsed as JSON. Only records whose
    ``message.role`` is ``'assistant'`` are considered, and within them only
    ``text`` items of a list-valued ``content``. Fragments found by
    ``extract_xml_blocks`` are kept unless ``is_example_xml`` flags them.

    Lines that are not valid JSON, records that are not JSON objects, and
    records whose ``message`` is missing or not an object are skipped instead
    of aborting the whole run.

    Args:
        filepath: Path of the ``.jsonl`` transcript to read.

    Returns:
        A list of ``{'timestamp': ..., 'xml': ...}`` dicts in file order.
        ``timestamp`` is the record's ``timestamp`` value, or ``'unknown'``
        when the record has none.
    """
    results = []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Keep the try body minimal: only the decode can raise here.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # A line may decode to null, a list or a scalar; those have no
            # message to inspect.
            if not isinstance(data, dict):
                continue

            timestamp = data.get('timestamp', 'unknown')

            # Only process assistant messages
            message = data.get('message')
            if not isinstance(message, dict):
                continue
            if message.get('role') != 'assistant':
                continue

            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            for item in content:
                # Text items only: tool_use items are deliberately ignored.
                if not isinstance(item, dict) or item.get('type') != 'text':
                    continue

                text = item.get('text', '')
                # A non-string text would make the regex search raise.
                if not isinstance(text, str):
                    continue

                for block in extract_xml_blocks(text):
                    # Filter out example/template XML
                    if not is_example_xml(block):
                        results.append({
                            'timestamp': timestamp,
                            'xml': block
                        })
    return results
# Select the transcript files to process (most recently modified first)
import subprocess
# Directory holding this project's Claude Code session transcripts.
TRANSCRIPT_DIR = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
# Destination for the filtered extraction.
OUTPUT_FILE = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
# Only this many of the most recently modified transcripts are scanned.
MAX_TRANSCRIPTS = 62


def list_recent_transcripts(directory, limit):
    """Return up to *limit* ``.jsonl`` file names in *directory*, newest first.

    Ordering is by modification time, like ``ls -t``, but without spawning a
    subprocess or changing the working directory. Hidden files are skipped,
    as ``ls`` would skip them.
    """
    names = [
        name for name in os.listdir(directory)
        if name.endswith('.jsonl') and not name.startswith('.')
    ]
    names.sort(
        key=lambda name: os.path.getmtime(os.path.join(directory, name)),
        reverse=True,
    )
    return names[:limit]


def format_timestamp(timestamp):
    """Render an ISO-8601 timestamp as ``YYYY-MM-DD HH:MM:SS UTC``.

    Missing values and the ``'unknown'`` placeholder give ``'unknown'``.
    Values that cannot be parsed are returned unchanged. Timestamps carrying
    a UTC offset are converted to UTC so the ``UTC`` label is accurate.
    """
    if not timestamp or timestamp == 'unknown':
        return 'unknown'
    try:
        # fromisoformat() on older Pythons rejects a trailing 'Z'.
        dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return timestamp
    if dt.tzinfo is not None:
        from datetime import timezone
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')


def main():
    """Scan the newest transcripts and write the filtered blocks to OUTPUT_FILE."""
    all_results = []
    for filename in list_recent_transcripts(TRANSCRIPT_DIR, MAX_TRANSCRIPTS):
        filepath = os.path.join(TRANSCRIPT_DIR, filename)
        print(f"Processing {filename}...")
        results = process_transcript_file(filepath)
        all_results.extend(results)
        print(f" Found {len(results)} actual XML blocks")

    # Write results with timestamps
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<!-- Actual XML blocks from assistant responses only -->\n')
        f.write('<!-- Excludes: tool_use inputs, user prompts, and example/template XML -->\n')
        f.write('<transcript_extracts>\n\n')
        for i, item in enumerate(all_results, 1):
            formatted_time = format_timestamp(item['timestamp'])
            f.write(f'<!-- Block {i} | {formatted_time} -->\n')
            f.write(item['xml'])
            f.write('\n\n')
        f.write('</transcript_extracts>\n')

    print(f"\n{'='*80}")
    print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {OUTPUT_FILE}")
    print(f"{'='*80}")


if __name__ == '__main__':
    main()