From 6d68fd44ca2f0eb7f0f1a9451797912e659712cd Mon Sep 17 00:00:00 2001 From: Alex Newman Date: Sat, 18 Oct 2025 23:34:53 -0400 Subject: [PATCH] feat: Add XML extraction and import scripts for observations and summaries --- package.json | 2 + scripts/extraction/README.md | 82 +++++ scripts/extraction/extract-all-xml.py | 128 ++++++++ scripts/extraction/filter-actual-xml.py | 168 +++++++++++ src/bin/cleanup-duplicates.ts | 98 ++++++ src/bin/import-xml-observations.ts | 382 ++++++++++++++++++++++++ 6 files changed, 860 insertions(+) create mode 100644 scripts/extraction/README.md create mode 100755 scripts/extraction/extract-all-xml.py create mode 100755 scripts/extraction/filter-actual-xml.py create mode 100644 src/bin/cleanup-duplicates.ts create mode 100644 src/bin/import-xml-observations.ts diff --git a/package.json b/package.json index 534faf03..b75f571d 100644 --- a/package.json +++ b/package.json @@ -41,6 +41,8 @@ "test": "node --test tests/", "test:context": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js 2>/dev/null", "test:context:verbose": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js", + "import:xml": "tsx src/bin/import-xml-observations.ts", + "cleanup:duplicates": "tsx src/bin/cleanup-duplicates.ts", "worker:start": "pm2 start ecosystem.config.cjs", "worker:stop": "pm2 stop claude-mem-worker", "worker:restart": "pm2 restart claude-mem-worker", diff --git a/scripts/extraction/README.md b/scripts/extraction/README.md new file mode 100644 index 00000000..fccf7b03 --- /dev/null +++ b/scripts/extraction/README.md @@ -0,0 +1,82 @@ +# XML Extraction Scripts + +Scripts to extract XML observations and summaries from Claude Code transcript files. 
+ +## Scripts + +### `filter-actual-xml.py` +**Recommended for import** + +Extracts only actual XML from assistant responses, filtering out: +- Template/example XML (with placeholders like `[...]` or `**field**:`) +- XML from tool_use blocks +- XML from user messages + +**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml` + +**Usage:** +```bash +python3 scripts/extraction/filter-actual-xml.py +``` + +### `extract-all-xml.py` +**For debugging/analysis** + +Extracts ALL XML blocks from transcripts without filtering. + +**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml` + +**Usage:** +```bash +python3 scripts/extraction/extract-all-xml.py +``` + +## Workflow + +1. **Extract XML from transcripts:** + ```bash + cd ~/Scripts/claude-mem + python3 scripts/extraction/filter-actual-xml.py + ``` + +2. **Import to database:** + ```bash + npm run import:xml + ``` + +3. **Clean up duplicates (if needed):** + ```bash + npm run cleanup:duplicates + ``` + +## Source Data + +Scripts read from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl` + +These are Claude Code session transcripts stored in JSONL (JSON Lines) format. + +## Output Format + +```xml + + + + + + discovery + Example observation + ... + + + + + What was accomplished + ... 
#!/usr/bin/env python3
import json
import re
from datetime import datetime
import os
import subprocess

# NOTE(review): every pattern in the list this replaces was the bare ``.*?``,
# which matches the empty string at each position, so the intended tag names
# are not recoverable from this copy.  The defaults are the two block types
# the README and the importer talk about (observations and summaries) --
# TODO confirm against the intended tag list.
DEFAULT_XML_TAGS = ('observation', 'summary')


def extract_xml_blocks(text, tag_names=DEFAULT_XML_TAGS):
    """Extract complete XML blocks from text.

    Args:
        text: Arbitrary text that may embed ``<tag>...</tag>`` blocks.
        tag_names: Element names to search for.  Names are escaped, so they
            are matched literally.

    Returns:
        A list of the matched blocks (opening tag through closing tag,
        newlines included), grouped by tag in ``tag_names`` order and, within
        one tag, in order of appearance.
    """
    blocks = []
    for tag in tag_names:
        name = re.escape(tag)
        # Non-greedy + DOTALL: stop at the first closing tag, even when the
        # block spans several lines.
        blocks.extend(re.findall(rf'<{name}>.*?</{name}>', text, re.DOTALL))
    return blocks


def _content_item_text(item):
    """Return the searchable text of one message content item, or ``''``."""
    if not isinstance(item, dict):
        return ''
    kind = item.get('type')
    if kind == 'text':
        return item.get('text', '')
    if kind == 'tool_use':
        tool_input = item.get('input', {})
        if isinstance(tool_input, dict):
            # NOTE: str() of a dict is its Python repr, so newlines inside
            # values show up as a literal backslash-n in the searched text.
            return str(tool_input)
    return ''


def process_transcript_file(filepath):
    """Process a single transcript file and extract XML with timestamps.

    Each line of the file is parsed as one JSON document.  Lines that are not
    valid JSON, are not JSON objects, or carry no dict-valued ``message`` with
    a list-valued ``content`` are skipped instead of aborting the whole run.
    Both ``text`` items and the stringified ``input`` of ``tool_use`` items
    are searched.

    Args:
        filepath: Path of the JSONL transcript to read.

    Returns:
        A list of ``{'timestamp': ..., 'xml': ...}`` dicts, one per extracted
        block.  ``timestamp`` is the line's ``timestamp`` value, or
        ``'unknown'`` when the line has none.
    """
    results = []

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object (e.g. a list or number).
            if not isinstance(data, dict):
                continue

            message = data.get('message')
            if not isinstance(message, dict):
                continue
            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            timestamp = data.get('timestamp', 'unknown')
            for item in content:
                text = _content_item_text(item)
                if not text or not isinstance(text, str):
                    continue
                for block in extract_xml_blocks(text):
                    results.append({'timestamp': timestamp, 'xml': block})

    return results
def list_transcripts(transcript_dir, limit=62):
    """Return up to *limit* ``.jsonl`` file names in *transcript_dir*.

    Names are ordered newest first by modification time (ties broken by
    name).  This replaces shelling out to ``ls -t`` after an ``os.chdir``,
    so the process working directory is no longer changed.

    Raises:
        OSError: If *transcript_dir* cannot be listed (e.g. it is missing).
    """
    with os.scandir(transcript_dir) as entries:
        found = [
            (entry.stat().st_mtime, entry.name)
            for entry in entries
            if entry.name.endswith('.jsonl') and entry.is_file()
        ]
    found.sort(key=lambda pair: (-pair[0], pair[1]))
    return [name for _, name in found[:limit]]


def format_timestamp(timestamp):
    """Render an ISO-8601 timestamp as ``YYYY-MM-DD HH:MM:SS UTC``.

    Returns ``'unknown'`` for a missing/empty/``'unknown'`` value and the
    input unchanged when it cannot be parsed.  Timestamps carrying a UTC
    offset are converted to UTC first, so the ``UTC`` label is accurate;
    offset-less timestamps are labelled as-is.
    """
    from datetime import timezone

    if not timestamp or timestamp == 'unknown':
        return 'unknown'
    try:
        dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return timestamp
    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')


def main():
    """Extract XML blocks from the newest transcripts into one output file.

    Reads the transcript directory under ``~/.claude/projects``, runs
    ``process_transcript_file`` on each selected file and writes every block
    to ``~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml``.
    """
    transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')

    all_results = []
    for filename in list_transcripts(transcript_dir):
        filepath = os.path.join(transcript_dir, filename)
        print(f"Processing {filename}...")
        results = process_transcript_file(filepath)
        all_results.extend(results)
        print(f" Found {len(results)} XML blocks")

    # Write results with timestamps
    output_file = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
    with open(output_file, 'w', encoding='utf-8') as f:
        # NOTE(review): the header/footer writes contain only newlines in
        # this copy; any XML prolog or root element they were meant to emit
        # is not visible here -- confirm.
        f.write('\n')
        f.write('\n\n')

        for i, item in enumerate(all_results, 1):
            formatted_time = format_timestamp(item['timestamp'])
            # The README says each block is preceded by a comment carrying
            # the block number and the original timestamp; previously both
            # values were computed but only a bare newline was written.
            # NOTE(review): exact comment layout is an assumption -- confirm
            # it matches what the importer parses.
            f.write(f'<!-- Block {i} | {formatted_time} -->\n')
            f.write(item['xml'])
            f.write('\n\n')

        f.write('\n')

    print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {output_file}")


if __name__ == '__main__':
    main()
def is_example_xml(xml_block):
    """Check if XML block is an example/template.

    Returns True as soon as any template marker is found in *xml_block*:
    bracket/brace placeholders, markdown-bold field labels, paired ellipses,
    option lists, the literal instruction sentences of the prompt template,
    or the sample file names.  Patterns are searched without ``re.DOTALL``,
    so a placeholder must open and close on the same line to count.
    """
    template_markers = (
        r'\[.*?\]',  # Square brackets with placeholders
        r'\*\*\w+\*\*:',  # Bold markdown like **title**:
        r'\.\.\..*?\.\.\.',  # Ellipsis indicating placeholder
        r'feature\|bugfix\|refactor',  # Multiple options separated by |
        r'change \| discovery \| decision',  # Example types
        r'\{.*?\}',  # Curly braces (template variables)
        r'Concise, self-contained statement',  # Literal example text
        r'Short title capturing',
        r'One sentence explanation',
        r'What was the user trying',
        r'What code/systems did you explore',
        r'What did you learn',
        r'What was done',
        r'What should happen next',
        r'file1\.ts',  # Example filenames
        r'file2\.ts',
        r'file3\.ts',
        r'Any additional context',
    )
    return any(re.search(marker, xml_block) for marker in template_markers)


def process_transcript_file(filepath):
    """Process a single transcript file and extract only real XML from assistant responses.

    Only ``text`` items of messages whose ``role`` is ``'assistant'`` are
    searched; ``tool_use`` items and other roles are ignored.  Lines that are
    not valid JSON, are not JSON objects, or whose ``message`` is not a dict
    are skipped instead of aborting the run.  Blocks flagged by
    ``is_example_xml`` are dropped.

    Args:
        filepath: Path of the JSONL transcript to read.

    Returns:
        A list of ``{'timestamp': ..., 'xml': ...}`` dicts; ``timestamp`` is
        ``'unknown'`` when the line has none.
    """
    results = []

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON is not necessarily an object (e.g. a list or number).
            if not isinstance(data, dict):
                continue

            message = data.get('message')
            if not isinstance(message, dict) or message.get('role') != 'assistant':
                continue
            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            timestamp = data.get('timestamp', 'unknown')
            for item in content:
                if not isinstance(item, dict) or item.get('type') != 'text':
                    continue
                text = item.get('text', '')
                if not isinstance(text, str):
                    continue
                for block in extract_xml_blocks(text):
                    # Filter out example/template XML
                    if not is_example_xml(block):
                        results.append({'timestamp': timestamp, 'xml': block})

    return results


def list_transcripts(transcript_dir, limit=62):
    """Return up to *limit* ``.jsonl`` file names in *transcript_dir*.

    Names are ordered newest first by modification time (ties broken by
    name).  This replaces shelling out to ``ls -t`` after an ``os.chdir``,
    so the process working directory is no longer changed.

    Raises:
        OSError: If *transcript_dir* cannot be listed (e.g. it is missing).
    """
    with os.scandir(transcript_dir) as entries:
        found = [
            (entry.stat().st_mtime, entry.name)
            for entry in entries
            if entry.name.endswith('.jsonl') and entry.is_file()
        ]
    found.sort(key=lambda pair: (-pair[0], pair[1]))
    return [name for _, name in found[:limit]]


def format_timestamp(timestamp):
    """Render an ISO-8601 timestamp as ``YYYY-MM-DD HH:MM:SS UTC``.

    Returns ``'unknown'`` for a missing/empty/``'unknown'`` value and the
    input unchanged when it cannot be parsed.  Timestamps carrying a UTC
    offset are converted to UTC first, so the ``UTC`` label is accurate;
    offset-less timestamps are labelled as-is.
    """
    from datetime import timezone

    if not timestamp or timestamp == 'unknown':
        return 'unknown'
    try:
        dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return timestamp
    if dt.tzinfo is not None:
        dt = dt.astimezone(timezone.utc)
    return dt.strftime('%Y-%m-%d %H:%M:%S UTC')


def main():
    """Extract the non-template XML blocks from the newest transcripts.

    Reads the transcript directory under ``~/.claude/projects``, runs
    ``process_transcript_file`` on each selected file and writes every kept
    block to ``~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml``.
    """
    transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')

    all_results = []
    for filename in list_transcripts(transcript_dir):
        filepath = os.path.join(transcript_dir, filename)
        print(f"Processing {filename}...")
        results = process_transcript_file(filepath)
        all_results.extend(results)
        print(f" Found {len(results)} actual XML blocks")

    # Write results with timestamps
    output_file = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
    with open(output_file, 'w', encoding='utf-8') as f:
        # NOTE(review): the header/footer writes contain only newlines in
        # this copy; any XML prolog, descriptive comments or root element
        # they were meant to emit are not visible here -- confirm.
        f.write('\n')
        f.write('\n')
        f.write('\n')
        f.write('\n\n')

        for i, item in enumerate(all_results, 1):
            formatted_time = format_timestamp(item['timestamp'])
            # The README says each block is preceded by a comment carrying
            # the block number and the original timestamp; previously both
            # values were computed but only a bare newline was written.
            # NOTE(review): exact comment layout is an assumption -- confirm
            # it matches what the importer parses.
            f.write(f'<!-- Block {i} | {formatted_time} -->\n')
            f.write(item['xml'])
            f.write('\n\n')

        f.write('\n')

    print(f"\n{'='*80}")
    print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {output_file}")
    print(f"{'='*80}")


if __name__ == '__main__':
    main()
import { SessionStore } from '../services/sqlite/SessionStore.js';

/** Columns every duplicate-group query returns in addition to its grouping keys. */
interface DuplicateGroup {
  count: number;
  /** Comma-separated row ids of the group, as produced by GROUP_CONCAT(id). */
  ids: string;
}

/**
 * Delete every row of one duplicate group except the one with the lowest id.
 *
 * @param rawDb  The underlying database handle (`db['db']`); only `prepare(sql).run()` is used.
 * @param table  Table the ids belong to. Restricted to the two literals used below because
 *               it is interpolated into the SQL text.
 * @param group  Row returned by the duplicate-group query.
 * @param label  Human-readable first log line for this group.
 * @returns Number of rows deleted.
 */
function pruneDuplicateGroup(
  rawDb: any,
  table: 'observations' | 'session_summaries',
  group: DuplicateGroup,
  label: string
): number {
  // ids come from GROUP_CONCAT over an id column; drop anything that does not
  // parse as an integer so only numbers are ever interpolated into the SQL.
  const ids = group.ids
    .split(',')
    .map(id => parseInt(id, 10))
    .filter(id => Number.isInteger(id));
  const keepId = Math.min(...ids);
  const deleteIds = ids.filter(id => id !== keepId);

  console.log(label);
  console.log(` Found ${group.count} copies, keeping ID ${keepId}, deleting ${deleteIds.length} duplicates`);

  if (deleteIds.length === 0) {
    return 0;
  }
  rawDb.prepare(`DELETE FROM ${table} WHERE id IN (${deleteIds.join(',')})`).run();
  return deleteIds.length;
}

/**
 * Remove duplicate observations and session summaries.
 *
 * Observations are considered duplicates when sdk_session_id, title, subtitle and type
 * match; summaries when sdk_session_id, request, completed and learned match. The row with
 * the smallest id in each group is kept. The store is closed even if a statement throws.
 */
function main() {
  console.log('Starting duplicate cleanup...\n');

  const db = new SessionStore();
  let deletedObs = 0;
  let deletedSum = 0;

  try {
    // Find and delete duplicate observations
    console.log('Finding duplicate observations...');

    const duplicateObs = db['db'].prepare(`
      SELECT sdk_session_id, title, subtitle, type, COUNT(*) as count, GROUP_CONCAT(id) as ids
      FROM observations
      GROUP BY sdk_session_id, title, subtitle, type
      HAVING count > 1
    `).all() as Array<DuplicateGroup & {
      sdk_session_id: string;
      title: string | null;
      subtitle: string | null;
      type: string;
    }>;

    console.log(`Found ${duplicateObs.length} duplicate observation groups\n`);

    for (const dup of duplicateObs) {
      // NOTE(review): title is treated as possibly NULL so the log preview cannot throw —
      // confirm against the table schema.
      const preview = (dup.title ?? '').substring(0, 60);
      deletedObs += pruneDuplicateGroup(db['db'], 'observations', dup, `Observation "${preview}..."`);
    }

    // Find and delete duplicate summaries
    console.log('\n\nFinding duplicate summaries...');

    const duplicateSum = db['db'].prepare(`
      SELECT sdk_session_id, request, completed, learned, COUNT(*) as count, GROUP_CONCAT(id) as ids
      FROM session_summaries
      GROUP BY sdk_session_id, request, completed, learned
      HAVING count > 1
    `).all() as Array<DuplicateGroup & {
      sdk_session_id: string;
      request: string | null;
      completed: string | null;
      learned: string | null;
    }>;

    console.log(`Found ${duplicateSum.length} duplicate summary groups\n`);

    for (const dup of duplicateSum) {
      const preview = (dup.request ?? '').substring(0, 60);
      deletedSum += pruneDuplicateGroup(db['db'], 'session_summaries', dup, `Summary "${preview}..."`);
    }
  } finally {
    db.close();
  }

  console.log('\n' + '='.repeat(60));
  console.log('Cleanup Complete!');
  console.log('='.repeat(60));
  console.log(`🗑️ Deleted: ${deletedObs} duplicate observations`);
  console.log(`🗑️ Deleted: ${deletedSum} duplicate summaries`);
  console.log(`🗑️ Total: ${deletedObs + deletedSum} duplicates removed`);
  console.log('='.repeat(60));
}

// Run if executed directly
if (import.meta.url === `file://${process.argv[1]}`) {
  main();
}
/**
 * Build a map of timestamp (rounded to second) -> session metadata by reading all
 * transcript files.
 *
 * Every `.jsonl` file in the transcript directory is read line by line; each line that
 * parses as JSON and has both `timestamp` and `sessionId` contributes one entry keyed by
 * the ISO string of its timestamp with milliseconds zeroed. The first line seen for a
 * given second wins. `project` is the line's `cwd`, or a fixed fallback path when absent.
 *
 * @returns The timestamp -> { sessionId, project } mapping.
 */
function buildTimestampMap(): TimestampMapping {
  const transcriptDir = join(homedir(), '.claude', 'projects', '-Users-alexnewman-Scripts-claude-mem');
  const map: TimestampMapping = {};

  console.log(`Reading transcript files from ${transcriptDir}...`);

  const files = readdirSync(transcriptDir).filter(f => f.endsWith('.jsonl'));
  console.log(`Found ${files.length} transcript files`);

  for (const filename of files) {
    const filepath = join(transcriptDir, filename);
    const content = readFileSync(filepath, 'utf-8');
    const lines = content.split('\n').filter(l => l.trim());

    for (const line of lines) {
      try {
        const data = JSON.parse(line);
        const timestamp = data.timestamp;
        const sessionId = data.sessionId;
        const project = data.cwd || '/Users/alexnewman/Scripts/claude-mem';

        if (timestamp && sessionId) {
          // Round timestamp to second for matching with XML timestamps.
          // toISOString() throws a RangeError for an unparseable timestamp; that is
          // caught below, so such lines are skipped like invalid JSON.
          const roundedTimestamp = new Date(timestamp);
          roundedTimestamp.setMilliseconds(0);
          const key = roundedTimestamp.toISOString();

          // Only store first occurrence for each second
          if (!map[key]) {
            map[key] = { sessionId, project };
          }
        }
      } catch (e) {
        // Skip invalid JSON lines
      }
    }
  }

  console.log(`Built timestamp map with ${Object.keys(map).length} unique seconds`);
  return map;
}

/**
 * Parse XML text content and extract a tag's value.
 *
 * Matches the first `<tagName>...</tagName>` pair (case-insensitive, content may span
 * lines) and returns its trimmed inner text. Without the closing tag in the pattern the
 * lazy capture group always matched the empty string, so the closing tag is required.
 *
 * @param xml      Text to search.
 * @param tagName  Element name; regex metacharacters in it are matched literally.
 * @returns The trimmed inner text, or '' when no complete pair is present.
 */
function extractTag(xml: string, tagName: string): string {
  const name = tagName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const regex = new RegExp(`<${name}>([\\s\\S]*?)</${name}>`, 'i');
  const match = xml.match(regex);
  return match ? match[1].trim() : '';
}
/**
 * Parse XML array tags (facts, concepts, files, etc.)
 *
 * Finds the first `<containerTag>...</containerTag>` pair and returns the trimmed inner
 * text of every `<itemTag>...</itemTag>` pair inside it, in order. Both closing tags are
 * required; without them the lazy capture groups only ever matched the empty string.
 *
 * @param xml           Text to search.
 * @param containerTag  Name of the wrapping element.
 * @param itemTag       Name of the repeated child element.
 * @returns The item values, or [] when the container is absent or empty.
 */
function extractArrayTags(xml: string, containerTag: string, itemTag: string): string[] {
  const literal = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const container = literal(containerTag);
  const item = literal(itemTag);

  const containerMatch = xml.match(new RegExp(`<${container}>([\\s\\S]*?)</${container}>`, 'i'));
  if (!containerMatch) {
    return [];
  }

  const itemRegex = new RegExp(`<${item}>([\\s\\S]*?)</${item}>`, 'gi');
  const items: string[] = [];
  let match;

  while ((match = itemRegex.exec(containerMatch[1])) !== null) {
    items.push(match[1].trim());
  }

  return items;
}

/**
 * Parse an observation block from XML.
 *
 * @param xml  Text expected to contain one `<observation>...</observation>` block.
 * @returns The parsed fields, or null when the block is incomplete or lacks a type or title.
 */
function parseObservation(xml: string): ObservationData | null {
  // Must be a complete observation block. (Checking for the empty string is always
  // true, so the opening and closing tags are tested explicitly.)
  // NOTE(review): tag name inferred from the function's purpose — confirm.
  if (!xml.includes('<observation>') || !xml.includes('</observation>')) {
    return null;
  }

  const observation: ObservationData = {
    type: extractTag(xml, 'type'),
    title: extractTag(xml, 'title'),
    subtitle: extractTag(xml, 'subtitle'),
    facts: extractArrayTags(xml, 'facts', 'fact'),
    narrative: extractTag(xml, 'narrative'),
    concepts: extractArrayTags(xml, 'concepts', 'concept'),
    files_read: extractArrayTags(xml, 'files_read', 'file'),
    files_modified: extractArrayTags(xml, 'files_modified', 'file'),
  };

  // Validate required fields
  if (!observation.type || !observation.title) {
    return null;
  }

  return observation;
}

/**
 * Parse a summary block from XML.
 *
 * @param xml  Text expected to contain one `<summary>...</summary>` block.
 * @returns The parsed fields (`notes` is null when empty), or null when the block is
 *          incomplete or has no request.
 */
function parseSummary(xml: string): SummaryData | null {
  // Must be a complete summary block.
  // NOTE(review): tag name inferred from the function's purpose — confirm.
  if (!xml.includes('<summary>') || !xml.includes('</summary>')) {
    return null;
  }

  const summary: SummaryData = {
    request: extractTag(xml, 'request'),
    investigated: extractTag(xml, 'investigated'),
    learned: extractTag(xml, 'learned'),
    completed: extractTag(xml, 'completed'),
    next_steps: extractTag(xml, 'next_steps'),
    notes: extractTag(xml, 'notes') || null,
  };

  // Validate required fields
  if (!summary.request) {
    return null;
  }

  return summary;
}

/**
 * Extract timestamp from XML comment.
 *
 * Looks for a `YYYY-MM-DD HH:MM:SS UTC` timestamp (e.g. "2025-10-19 03:03:23 UTC")
 * anywhere in the line and converts it to ISO format. Matching on the timestamp itself
 * keeps this independent of the surrounding comment layout.
 *
 * @param commentLine  The comment line that precedes an XML block.
 * @returns The ISO-8601 string, or null when no valid timestamp is present.
 */
function extractTimestamp(commentLine: string): string | null {
  const match = commentLine.match(/(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) UTC/);
  if (!match) {
    return null;
  }
  const parsed = new Date(`${match[1]}T${match[2]}Z`);
  // toISOString() throws on an invalid date (e.g. month 13); report "no timestamp" instead.
  return Number.isNaN(parsed.getTime()) ? null : parsed.toISOString();
}
+ `); + const existing = existingQuery.get(sessionMeta.sessionId) as { sdk_session_id: string | null } | undefined; + + if (existing && existing.sdk_session_id) { + // Use existing SDK session ID + claudeSessionToSdkSession.set(sessionMeta.sessionId, existing.sdk_session_id); + } else if (existing && !existing.sdk_session_id) { + // Session exists but sdk_session_id is NULL, update it + const dbId = (db['db'].prepare('SELECT id FROM sdk_sessions WHERE claude_session_id = ?').get(sessionMeta.sessionId) as { id: number }).id; + db.updateSDKSessionId(dbId, syntheticSdkSessionId); + claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId); + } else { + // Create new SDK session + const dbId = db.createSDKSession( + sessionMeta.sessionId, + sessionMeta.project, + 'Imported from transcript XML' + ); + + // Update with synthetic SDK session ID + db.updateSDKSessionId(dbId, syntheticSdkSessionId); + + claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId); + } + } + } + + console.log(`Prepared ${claudeSessionToSdkSession.size} SDK sessions\n`); + + // Read XML file + const xmlPath = join(process.cwd(), 'actual_xml_only_with_timestamps.xml'); + console.log(`Reading XML file: ${xmlPath}`); + const xmlContent = readFileSync(xmlPath, 'utf-8'); + + // Split into blocks by comment markers + const blocks = xmlContent.split(/(?=