feat: Add XML extraction and import scripts for observations and summaries

2025-10-18 23:34:53 -04:00
parent bc41e367c0
commit 6d68fd44ca
6 changed files with 860 additions and 0 deletions
@@ -0,0 +1,82 @@
+# XML Extraction Scripts
+
+Scripts to extract XML observations and summaries from Claude Code transcript files.
+
+## Scripts
+
+### `filter-actual-xml.py`
+**Recommended for import**
+
+Extracts only actual XML from assistant responses, filtering out:
+- Template/example XML (with placeholders like `[...]` or `**field**:`)
+- XML from tool_use blocks
+- XML from user messages
+
+**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml`
+
+**Usage:**
+```bash
+python3 scripts/extraction/filter-actual-xml.py
+```
+
+### `extract-all-xml.py`
+**For debugging/analysis**
+
+Extracts every recognized XML block from transcripts without filtering, including XML found in `tool_use` inputs and in non-assistant messages.
+
+**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml`
+
+**Usage:**
+```bash
+python3 scripts/extraction/extract-all-xml.py
+```
+
+## Workflow
+
+1. **Extract XML from transcripts:**
+   ```bash
+   cd ~/Scripts/claude-mem
+   python3 scripts/extraction/filter-actual-xml.py
+   ```
+
+2. **Import to database:**
+   ```bash
+   npm run import:xml
+   ```
+
+3. **Clean up duplicates (if needed):**
+   ```bash
+   npm run cleanup:duplicates
+   ```
+
+## Source Data
+
+Scripts read the 62 most recently modified transcripts from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl`
+
+These are Claude Code session transcripts stored in JSONL (JSON Lines) format.
+
+## Output Format
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<transcript_extracts>
+
+<!-- Block 1 | 2025-10-19 03:03:23 UTC -->
+<observation>
+  <type>discovery</type>
+  <title>Example observation</title>
+  ...
+</observation>
+
+<!-- Block 2 | 2025-10-19 03:03:45 UTC -->
+<summary>
+  <request>What was accomplished</request>
+  ...
+</summary>
+
+</transcript_extracts>
+```
+
+Each XML block is preceded by a comment with:
+- Block number
+- Original timestamp from the transcript, formatted as `YYYY-MM-DD HH:MM:SS UTC` (`unknown` if the entry has none)
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+import json
+import re
+from datetime import datetime
+import os
+import subprocess
+
+def extract_xml_blocks(text):
+    """Extract complete XML blocks from text"""
+    xml_patterns = [
+        r'<observation>.*?</observation>',
+        r'<session_summary>.*?</session_summary>',
+        r'<request>.*?</request>',
+        r'<summary>.*?</summary>',
+        r'<facts>.*?</facts>',
+        r'<fact>.*?</fact>',
+        r'<concepts>.*?</concepts>',
+        r'<concept>.*?</concept>',
+        r'<files>.*?</files>',
+        r'<file>.*?</file>',
+        r'<files_read>.*?</files_read>',
+        r'<files_edited>.*?</files_edited>',
+        r'<files_modified>.*?</files_modified>',
+        r'<narrative>.*?</narrative>',
+        r'<learned>.*?</learned>',
+        r'<investigated>.*?</investigated>',
+        r'<completed>.*?</completed>',
+        r'<next_steps>.*?</next_steps>',
+        r'<notes>.*?</notes>',
+        r'<title>.*?</title>',
+        r'<subtitle>.*?</subtitle>',
+        r'<text>.*?</text>',
+        r'<type>.*?</type>',
+    ]
+
+    blocks = []
+    for pattern in xml_patterns:
+        matches = re.findall(pattern, text, re.DOTALL)
+        blocks.extend(matches)
+
+    return blocks
+
+def process_transcript_file(filepath):
+    """Process a single transcript file and extract XML with timestamps"""
+    results = []
+
+    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+        for line in f:
+            try:
+                data = json.loads(line)
+
+                # Get timestamp
+                timestamp = data.get('timestamp', 'unknown')
+
+                # Extract text content from message
+                message = data.get('message', {})
+                content = message.get('content', [])
+
+                if isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, dict):
+                            text = ''
+                            if item.get('type') == 'text':
+                                text = item.get('text', '')
+                            elif item.get('type') == 'tool_use':
+                                # Also check tool_use input fields
+                                tool_input = item.get('input', {})
+                                if isinstance(tool_input, dict):
+                                    text = str(tool_input)
+
+                            if text:
+                                # Extract XML blocks
+                                xml_blocks = extract_xml_blocks(text)
+
+                                for block in xml_blocks:
+                                    results.append({
+                                        'timestamp': timestamp,
+                                        'xml': block
+                                    })
+
+            except json.JSONDecodeError:
+                continue
+
+    return results
+
+# Get list of transcript files
+transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
+os.chdir(transcript_dir)
+
+# Get all transcript files sorted by modification time
+result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
+files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
+
+all_results = []
+for filename in files:
+    filepath = os.path.join(transcript_dir, filename)
+    print(f"Processing {filename}...")
+    results = process_transcript_file(filepath)
+    all_results.extend(results)
+    print(f"  Found {len(results)} XML blocks")
+
+# Write results with timestamps
+output_file = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
+with open(output_file, 'w', encoding='utf-8') as f:
+    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+    f.write('<transcript_extracts>\n\n')
+
+    for i, item in enumerate(all_results, 1):
+        timestamp = item['timestamp']
+        xml = item['xml']
+
+        # Format timestamp nicely if it's ISO format
+        if timestamp != 'unknown' and timestamp:
+            try:
+                dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+                formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+            except:
+                formatted_time = timestamp
+        else:
+            formatted_time = 'unknown'
+
+        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
+        f.write(xml)
+        f.write('\n\n')
+
+    f.write('</transcript_extracts>\n')
+
+print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {output_file}")
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+import json
+import re
+from datetime import datetime
+import os
+
+def extract_xml_blocks(text):
+    """Extract complete XML blocks from text"""
+    xml_patterns = [
+        r'<observation>.*?</observation>',
+        r'<session_summary>.*?</session_summary>',
+        r'<request>.*?</request>',
+        r'<summary>.*?</summary>',
+        r'<facts>.*?</facts>',
+        r'<fact>.*?</fact>',
+        r'<concepts>.*?</concepts>',
+        r'<concept>.*?</concept>',
+        r'<files>.*?</files>',
+        r'<file>.*?</file>',
+        r'<files_read>.*?</files_read>',
+        r'<files_edited>.*?</files_edited>',
+        r'<files_modified>.*?</files_modified>',
+        r'<narrative>.*?</narrative>',
+        r'<learned>.*?</learned>',
+        r'<investigated>.*?</investigated>',
+        r'<completed>.*?</completed>',
+        r'<next_steps>.*?</next_steps>',
+        r'<notes>.*?</notes>',
+        r'<title>.*?</title>',
+        r'<subtitle>.*?</subtitle>',
+        r'<text>.*?</text>',
+        r'<type>.*?</type>',
+        r'<tool_used>.*?</tool_used>',
+        r'<tool_name>.*?</tool_name>',
+        r'<tool_input>.*?</tool_input>',
+        r'<tool_output>.*?</tool_output>',
+        r'<tool_time>.*?</tool_time>',
+    ]
+
+    blocks = []
+    for pattern in xml_patterns:
+        matches = re.findall(pattern, text, re.DOTALL)
+        blocks.extend(matches)
+
+    return blocks
+
+def is_example_xml(xml_block):
+    """Check if XML block is an example/template"""
+    # Patterns that indicate this is example/template XML
+    example_indicators = [
+        r'\[.*?\]',  # Square brackets with placeholders
+        r'\*\*\w+\*\*:',  # Bold markdown like **title**:
+        r'\.\.\..*?\.\.\.',  # Ellipsis indicating placeholder
+        r'feature\|bugfix\|refactor',  # Multiple options separated by |
+        r'change \| discovery \| decision',  # Example types
+        r'\{.*?\}',  # Curly braces (template variables)
+        r'Concise, self-contained statement',  # Literal example text
+        r'Short title capturing',
+        r'One sentence explanation',
+        r'What was the user trying',
+        r'What code/systems did you explore',
+        r'What did you learn',
+        r'What was done',
+        r'What should happen next',
+        r'file1\.ts',  # Example filenames
+        r'file2\.ts',
+        r'file3\.ts',
+        r'Any additional context',
+    ]
+
+    for pattern in example_indicators:
+        if re.search(pattern, xml_block):
+            return True
+
+    return False
+
+def process_transcript_file(filepath):
+    """Process a single transcript file and extract only real XML from assistant responses"""
+    results = []
+
+    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+        for line in f:
+            try:
+                data = json.loads(line)
+
+                # Get timestamp
+                timestamp = data.get('timestamp', 'unknown')
+
+                # Only process assistant messages
+                message = data.get('message', {})
+                role = message.get('role')
+
+                if role != 'assistant':
+                    continue
+
+                content = message.get('content', [])
+
+                if isinstance(content, list):
+                    for item in content:
+                        if isinstance(item, dict) and item.get('type') == 'text':
+                            # This is text in an assistant response, not tool_use
+                            text = item.get('text', '')
+
+                            # Extract XML blocks
+                            xml_blocks = extract_xml_blocks(text)
+
+                            for block in xml_blocks:
+                                # Filter out example/template XML
+                                if not is_example_xml(block):
+                                    results.append({
+                                        'timestamp': timestamp,
+                                        'xml': block
+                                    })
+
+            except json.JSONDecodeError:
+                continue
+
+    return results
+
+# Get list of Oct 18 transcript files
+import subprocess
+
+transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
+os.chdir(transcript_dir)
+
+# Get all transcript files sorted by modification time
+result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
+files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
+
+all_results = []
+for filename in files:
+    filepath = os.path.join(transcript_dir, filename)
+    print(f"Processing {filename}...")
+    results = process_transcript_file(filepath)
+    all_results.extend(results)
+    print(f"  Found {len(results)} actual XML blocks")
+
+# Write results with timestamps
+output_file = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
+with open(output_file, 'w', encoding='utf-8') as f:
+    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
+    f.write('<!-- Actual XML blocks from assistant responses only -->\n')
+    f.write('<!-- Excludes: tool_use inputs, user prompts, and example/template XML -->\n')
+    f.write('<transcript_extracts>\n\n')
+
+    for i, item in enumerate(all_results, 1):
+        timestamp = item['timestamp']
+        xml = item['xml']
+
+        # Format timestamp nicely if it's ISO format
+        if timestamp != 'unknown' and timestamp:
+            try:
+                dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+                formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+            except:
+                formatted_time = timestamp
+        else:
+            formatted_time = 'unknown'
+
+        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
+        f.write(xml)
+        f.write('\n\n')
+
+    f.write('</transcript_extracts>\n')
+
+print(f"\n{'='*80}")
+print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {output_file}")
+print(f"{'='*80}")