feat: Add XML extraction and import scripts for observations and summaries
This commit is contained in:
Executable
+128
@@ -0,0 +1,128 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
def extract_xml_blocks(text):
    """Collect every recognised XML element found in *text*.

    Each known tag is searched for independently with a non-greedy,
    newline-spanning match of the form ``<tag>...</tag>`` (opening tags
    carrying attributes are not matched).

    The returned list is grouped by tag, in the order the tags are listed
    below, and within one tag by position in *text*. Because container and
    child tags are both searched, a child element is returned once inside
    its parent's block and once on its own.
    """
    tag_names = (
        'observation',
        'session_summary',
        'request',
        'summary',
        'facts',
        'fact',
        'concepts',
        'concept',
        'files',
        'file',
        'files_read',
        'files_edited',
        'files_modified',
        'narrative',
        'learned',
        'investigated',
        'completed',
        'next_steps',
        'notes',
        'title',
        'subtitle',
        'text',
        'type',
    )

    found = []
    for tag in tag_names:
        element_re = re.compile(rf'<{tag}>.*?</{tag}>', re.DOTALL)
        found += element_re.findall(text)
    return found
|
||||
|
||||
def _content_item_text(item):
    """Return the text to scan for one message content item, or ''.

    ``text`` items contribute their ``text`` field; ``tool_use`` items
    contribute the ``str()`` rendering of their ``input`` dict. Anything
    else (including non-dict items and non-string text) yields ''.
    """
    if not isinstance(item, dict):
        return ''

    kind = item.get('type')
    if kind == 'text':
        text = item.get('text', '')
        # A non-string here would make the regex search raise TypeError.
        return text if isinstance(text, str) else ''
    if kind == 'tool_use':
        tool_input = item.get('input', {})
        if isinstance(tool_input, dict):
            # NOTE: str() of a dict is its Python repr, so newlines inside
            # values appear as the two characters "\n" in extracted blocks.
            return str(tool_input)
    return ''


def process_transcript_file(filepath):
    """Extract XML blocks, with timestamps, from one JSONL transcript file.

    The file is read line by line; each line is expected to hold one JSON
    object. Lines that are not valid JSON, are not JSON objects, or whose
    ``message`` / ``message.content`` do not have the expected dict / list
    shape are skipped rather than aborting the whole file.

    Args:
        filepath: Path of the transcript file to read (decoded as UTF-8,
            undecodable bytes are dropped).

    Returns:
        A list of ``{'timestamp': ..., 'xml': ...}`` dicts, one per block
        returned by ``extract_xml_blocks``. ``timestamp`` is the line's
        ``timestamp`` field, or ``'unknown'`` when the field is absent.

    Raises:
        OSError: If *filepath* cannot be opened or read.
    """
    results = []

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            # Keep the try body to the only statement that can raise the
            # error we intend to tolerate.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Valid JSON is not necessarily an object (e.g. `[]`, `"x"`, `1`).
            if not isinstance(data, dict):
                continue

            timestamp = data.get('timestamp', 'unknown')

            message = data.get('message')
            if not isinstance(message, dict):
                continue

            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            for item in content:
                text = _content_item_text(item)
                if not text:
                    continue
                results.extend(
                    {'timestamp': timestamp, 'xml': block}
                    for block in extract_xml_blocks(text)
                )

    return results
|
||||
|
||||
# Directory holding the session transcripts (*.jsonl) to scan.
TRANSCRIPT_DIR = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
# Destination file for the combined, timestamp-annotated fragments.
OUTPUT_FILE = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
# Only this many of the most recently modified transcripts are processed.
# NOTE(review): 62 is carried over unchanged; its origin is not documented.
MAX_TRANSCRIPTS = 62


def list_recent_transcripts(directory, limit):
    """Return up to *limit* ``.jsonl`` file names in *directory*, newest first.

    Files are ordered by modification time, most recent first, with ties
    broken by name. Hidden entries (names starting with '.') and anything
    that is not a regular file are skipped. Entries that disappear or
    cannot be stat'ed while listing are ignored.

    Raises:
        OSError: If *directory* itself cannot be listed.
    """
    candidates = []
    with os.scandir(directory) as entries:
        for entry in entries:
            name = entry.name
            if name.startswith('.') or not name.endswith('.jsonl'):
                continue
            try:
                if not entry.is_file():
                    continue
                mtime_ns = entry.stat().st_mtime_ns
            except OSError:
                continue
            candidates.append((mtime_ns, name))

    candidates.sort(key=lambda pair: (-pair[0], pair[1]))
    return [name for _, name in candidates[:limit]]


def format_timestamp(timestamp):
    """Render a transcript timestamp as ``YYYY-MM-DD HH:MM:SS UTC``.

    Empty values and the literal ``'unknown'`` give ``'unknown'``. ISO-8601
    strings (a trailing ``Z`` is accepted) are parsed; offset-aware values
    are converted to UTC before formatting so the ``UTC`` label is correct.
    Values that cannot be parsed are returned as their plain string form.
    """
    from datetime import timezone

    if not timestamp or timestamp == 'unknown':
        return 'unknown'

    try:
        dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not ISO formatted: fall back to the raw value.
        return str(timestamp)

    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc)
    # Naive timestamps are labelled UTC as-is — assumes the transcript
    # writer records UTC; TODO confirm.
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')


def main():
    """Scan the newest transcripts and write every XML fragment to OUTPUT_FILE.

    Progress is printed per transcript. The output is an XML document whose
    root ``<transcript_extracts>`` contains each fragment preceded by a
    comment giving its running number and formatted timestamp.
    """
    all_results = []
    # Absolute paths are used throughout, so the working directory is left
    # untouched.
    for filename in list_recent_transcripts(TRANSCRIPT_DIR, MAX_TRANSCRIPTS):
        filepath = os.path.join(TRANSCRIPT_DIR, filename)
        print(f"Processing {filename}...")
        results = process_transcript_file(filepath)
        all_results.extend(results)
        print(f" Found {len(results)} XML blocks")

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<transcript_extracts>\n\n')

        for i, item in enumerate(all_results, 1):
            formatted_time = format_timestamp(item['timestamp'])
            f.write(f'<!-- Block {i} | {formatted_time} -->\n')
            f.write(item['xml'])
            f.write('\n\n')

        f.write('</transcript_extracts>\n')

    print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {OUTPUT_FILE}")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user