feat: Add XML extraction and import scripts for observations and summaries
This commit is contained in:
@@ -0,0 +1,82 @@
|
||||
# XML Extraction Scripts
|
||||
|
||||
Scripts to extract XML observations and summaries from Claude Code transcript files.
|
||||
|
||||
## Scripts
|
||||
|
||||
### `filter-actual-xml.py`
|
||||
**Recommended for import**
|
||||
|
||||
Extracts only actual XML from assistant responses, filtering out:
|
||||
- Template/example XML (with placeholders like `[...]` or `**field**:`)
|
||||
- XML from tool_use blocks
|
||||
- XML from user messages
|
||||
|
||||
**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml`
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
python3 scripts/extraction/filter-actual-xml.py
|
||||
```
|
||||
|
||||
### `extract-all-xml.py`
|
||||
**For debugging/analysis**
|
||||
|
||||
Extracts ALL XML blocks from transcripts without filtering.
|
||||
|
||||
**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml`
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
python3 scripts/extraction/extract-all-xml.py
|
||||
```
|
||||
|
||||
## Workflow
|
||||
|
||||
1. **Extract XML from transcripts:**
|
||||
```bash
|
||||
cd ~/Scripts/claude-mem
|
||||
python3 scripts/extraction/filter-actual-xml.py
|
||||
```
|
||||
|
||||
2. **Import to database:**
|
||||
```bash
|
||||
npm run import:xml
|
||||
```
|
||||
|
||||
3. **Clean up duplicates (if needed):**
|
||||
```bash
|
||||
npm run cleanup:duplicates
|
||||
```
|
||||
|
||||
## Source Data
|
||||
|
||||
Scripts read from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl`
|
||||
|
||||
These are Claude Code session transcripts stored in JSONL (JSON Lines) format.
|
||||
|
||||
## Output Format
|
||||
|
||||
```xml
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<transcript_extracts>
|
||||
|
||||
<!-- Block 1 | 2025-10-19 03:03:23 UTC -->
|
||||
<observation>
|
||||
<type>discovery</type>
|
||||
<title>Example observation</title>
|
||||
...
|
||||
</observation>
|
||||
|
||||
<!-- Block 2 | 2025-10-19 03:03:45 UTC -->
|
||||
<summary>
|
||||
<request>What was accomplished</request>
|
||||
...
|
||||
</summary>
|
||||
|
||||
</transcript_extracts>
|
||||
```
|
||||
|
||||
Each XML block includes a comment with:
|
||||
- Block number
|
||||
- Original timestamp from transcript
|
||||
Executable
+128
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
def extract_xml_blocks(text):
|
||||
"""Extract complete XML blocks from text"""
|
||||
xml_patterns = [
|
||||
r'<observation>.*?</observation>',
|
||||
r'<session_summary>.*?</session_summary>',
|
||||
r'<request>.*?</request>',
|
||||
r'<summary>.*?</summary>',
|
||||
r'<facts>.*?</facts>',
|
||||
r'<fact>.*?</fact>',
|
||||
r'<concepts>.*?</concepts>',
|
||||
r'<concept>.*?</concept>',
|
||||
r'<files>.*?</files>',
|
||||
r'<file>.*?</file>',
|
||||
r'<files_read>.*?</files_read>',
|
||||
r'<files_edited>.*?</files_edited>',
|
||||
r'<files_modified>.*?</files_modified>',
|
||||
r'<narrative>.*?</narrative>',
|
||||
r'<learned>.*?</learned>',
|
||||
r'<investigated>.*?</investigated>',
|
||||
r'<completed>.*?</completed>',
|
||||
r'<next_steps>.*?</next_steps>',
|
||||
r'<notes>.*?</notes>',
|
||||
r'<title>.*?</title>',
|
||||
r'<subtitle>.*?</subtitle>',
|
||||
r'<text>.*?</text>',
|
||||
r'<type>.*?</type>',
|
||||
]
|
||||
|
||||
blocks = []
|
||||
for pattern in xml_patterns:
|
||||
matches = re.findall(pattern, text, re.DOTALL)
|
||||
blocks.extend(matches)
|
||||
|
||||
return blocks
|
||||
|
||||
def process_transcript_file(filepath):
|
||||
"""Process a single transcript file and extract XML with timestamps"""
|
||||
results = []
|
||||
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
for line in f:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
# Get timestamp
|
||||
timestamp = data.get('timestamp', 'unknown')
|
||||
|
||||
# Extract text content from message
|
||||
message = data.get('message', {})
|
||||
content = message.get('content', [])
|
||||
|
||||
if isinstance(content, list):
|
||||
for item in content:
|
||||
if isinstance(item, dict):
|
||||
text = ''
|
||||
if item.get('type') == 'text':
|
||||
text = item.get('text', '')
|
||||
elif item.get('type') == 'tool_use':
|
||||
# Also check tool_use input fields
|
||||
tool_input = item.get('input', {})
|
||||
if isinstance(tool_input, dict):
|
||||
text = str(tool_input)
|
||||
|
||||
if text:
|
||||
# Extract XML blocks
|
||||
xml_blocks = extract_xml_blocks(text)
|
||||
|
||||
for block in xml_blocks:
|
||||
results.append({
|
||||
'timestamp': timestamp,
|
||||
'xml': block
|
||||
})
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
return results
|
||||
|
||||
# Get list of transcript files
|
||||
transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
|
||||
os.chdir(transcript_dir)
|
||||
|
||||
# Get all transcript files sorted by modification time
|
||||
result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
|
||||
files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
|
||||
|
||||
all_results = []
|
||||
for filename in files:
|
||||
filepath = os.path.join(transcript_dir, filename)
|
||||
print(f"Processing {filename}...")
|
||||
results = process_transcript_file(filepath)
|
||||
all_results.extend(results)
|
||||
print(f" Found {len(results)} XML blocks")
|
||||
|
||||
# Write results with timestamps
|
||||
output_file = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
|
||||
f.write('<transcript_extracts>\n\n')
|
||||
|
||||
for i, item in enumerate(all_results, 1):
|
||||
timestamp = item['timestamp']
|
||||
xml = item['xml']
|
||||
|
||||
# Format timestamp nicely if it's ISO format
|
||||
if timestamp != 'unknown' and timestamp:
|
||||
try:
|
||||
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
|
||||
formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
|
||||
except:
|
||||
formatted_time = timestamp
|
||||
else:
|
||||
formatted_time = 'unknown'
|
||||
|
||||
f.write(f'<!-- Block {i} | {formatted_time} -->\n')
|
||||
f.write(xml)
|
||||
f.write('\n\n')
|
||||
|
||||
f.write('</transcript_extracts>\n')
|
||||
|
||||
print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {output_file}")
|
||||
Executable
+168
@@ -0,0 +1,168 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
import os
|
||||
|
||||
def extract_xml_blocks(text):
|
||||
"""Extract complete XML blocks from text"""
|
||||
xml_patterns = [
|
||||
r'<observation>.*?</observation>',
|
||||
r'<session_summary>.*?</session_summary>',
|
||||
r'<request>.*?</request>',
|
||||
r'<summary>.*?</summary>',
|
||||
r'<facts>.*?</facts>',
|
||||
r'<fact>.*?</fact>',
|
||||
r'<concepts>.*?</concepts>',
|
||||
r'<concept>.*?</concept>',
|
||||
r'<files>.*?</files>',
|
||||
r'<file>.*?</file>',
|
||||
r'<files_read>.*?</files_read>',
|
||||
r'<files_edited>.*?</files_edited>',
|
||||
r'<files_modified>.*?</files_modified>',
|
||||
r'<narrative>.*?</narrative>',
|
||||
r'<learned>.*?</learned>',
|
||||
r'<investigated>.*?</investigated>',
|
||||
r'<completed>.*?</completed>',
|
||||
r'<next_steps>.*?</next_steps>',
|
||||
r'<notes>.*?</notes>',
|
||||
r'<title>.*?</title>',
|
||||
r'<subtitle>.*?</subtitle>',
|
||||
r'<text>.*?</text>',
|
||||
r'<type>.*?</type>',
|
||||
r'<tool_used>.*?</tool_used>',
|
||||
r'<tool_name>.*?</tool_name>',
|
||||
r'<tool_input>.*?</tool_input>',
|
||||
r'<tool_output>.*?</tool_output>',
|
||||
r'<tool_time>.*?</tool_time>',
|
||||
]
|
||||
|
||||
blocks = []
|
||||
for pattern in xml_patterns:
|
||||
matches = re.findall(pattern, text, re.DOTALL)
|
||||
blocks.extend(matches)
|
||||
|
||||
return blocks
|
||||
|
||||
def is_example_xml(xml_block):
|
||||
"""Check if XML block is an example/template"""
|
||||
# Patterns that indicate this is example/template XML
|
||||
example_indicators = [
|
||||
r'\[.*?\]', # Square brackets with placeholders
|
||||
r'\*\*\w+\*\*:', # Bold markdown like **title**:
|
||||
r'\.\.\..*?\.\.\.', # Ellipsis indicating placeholder
|
||||
r'feature\|bugfix\|refactor', # Multiple options separated by |
|
||||
r'change \| discovery \| decision', # Example types
|
||||
r'\{.*?\}', # Curly braces (template variables)
|
||||
r'Concise, self-contained statement', # Literal example text
|
||||
r'Short title capturing',
|
||||
r'One sentence explanation',
|
||||
r'What was the user trying',
|
||||
r'What code/systems did you explore',
|
||||
r'What did you learn',
|
||||
r'What was done',
|
||||
r'What should happen next',
|
||||
r'file1\.ts', # Example filenames
|
||||
r'file2\.ts',
|
||||
r'file3\.ts',
|
||||
r'Any additional context',
|
||||
]
|
||||
|
||||
for pattern in example_indicators:
|
||||
if re.search(pattern, xml_block):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def process_transcript_file(filepath):
|
||||
"""Process a single transcript file and extract only real XML from assistant responses"""
|
||||
results = []
|
||||
|
||||
with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
|
||||
for line in f:
|
||||
try:
|
||||
data = json.loads(line)
|
||||
|
||||
# Get timestamp
|
||||
timestamp = data.get('timestamp', 'unknown')
|
||||
|
||||
# Only process assistant messages
|
||||
message = data.get('message', {})
|
||||
role = message.get('role')
|
||||
|
||||
if role != 'assistant':
|
||||
continue
|
||||
|
||||
content = message.get('content', [])
|
||||
|
||||
if isinstance(content, list):
|
||||
for item in content:
|
||||
if isinstance(item, dict) and item.get('type') == 'text':
|
||||
# This is text in an assistant response, not tool_use
|
||||
text = item.get('text', '')
|
||||
|
||||
# Extract XML blocks
|
||||
xml_blocks = extract_xml_blocks(text)
|
||||
|
||||
for block in xml_blocks:
|
||||
# Filter out example/template XML
|
||||
if not is_example_xml(block):
|
||||
results.append({
|
||||
'timestamp': timestamp,
|
||||
'xml': block
|
||||
})
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
return results
|
||||
|
||||
# Get list of Oct 18 transcript files
|
||||
import subprocess
|
||||
|
||||
transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
|
||||
os.chdir(transcript_dir)
|
||||
|
||||
# Get all transcript files sorted by modification time
|
||||
result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
|
||||
files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
|
||||
|
||||
all_results = []
|
||||
for filename in files:
|
||||
filepath = os.path.join(transcript_dir, filename)
|
||||
print(f"Processing {filename}...")
|
||||
results = process_transcript_file(filepath)
|
||||
all_results.extend(results)
|
||||
print(f" Found {len(results)} actual XML blocks")
|
||||
|
||||
# Write results with timestamps
|
||||
output_file = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
|
||||
with open(output_file, 'w', encoding='utf-8') as f:
|
||||
f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
|
||||
f.write('<!-- Actual XML blocks from assistant responses only -->\n')
|
||||
f.write('<!-- Excludes: tool_use inputs, user prompts, and example/template XML -->\n')
|
||||
f.write('<transcript_extracts>\n\n')
|
||||
|
||||
for i, item in enumerate(all_results, 1):
|
||||
timestamp = item['timestamp']
|
||||
xml = item['xml']
|
||||
|
||||
# Format timestamp nicely if it's ISO format
|
||||
if timestamp != 'unknown' and timestamp:
|
||||
try:
|
||||
dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
|
||||
formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
|
||||
except:
|
||||
formatted_time = timestamp
|
||||
else:
|
||||
formatted_time = 'unknown'
|
||||
|
||||
f.write(f'<!-- Block {i} | {formatted_time} -->\n')
|
||||
f.write(xml)
|
||||
f.write('\n\n')
|
||||
|
||||
f.write('</transcript_extracts>\n')
|
||||
|
||||
print(f"\n{'='*80}")
|
||||
print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {output_file}")
|
||||
print(f"{'='*80}")
|
||||
Reference in New Issue
Block a user