diff --git a/package.json b/package.json
index 534faf03..b75f571d 100644
--- a/package.json
+++ b/package.json
@@ -41,6 +41,8 @@
"test": "node --test tests/",
"test:context": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js 2>/dev/null",
"test:context:verbose": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js",
+ "import:xml": "tsx src/bin/import-xml-observations.ts",
+ "cleanup:duplicates": "tsx src/bin/cleanup-duplicates.ts",
"worker:start": "pm2 start ecosystem.config.cjs",
"worker:stop": "pm2 stop claude-mem-worker",
"worker:restart": "pm2 restart claude-mem-worker",
diff --git a/scripts/extraction/README.md b/scripts/extraction/README.md
new file mode 100644
index 00000000..fccf7b03
--- /dev/null
+++ b/scripts/extraction/README.md
@@ -0,0 +1,82 @@
+# XML Extraction Scripts
+
+Scripts to extract XML observations and summaries from Claude Code transcript files.
+
+## Scripts
+
+### `filter-actual-xml.py`
+**Recommended for import**
+
+Extracts only actual XML from assistant responses, filtering out:
+- Template/example XML (with placeholders like `[...]` or `**field**:`)
+- XML from tool_use blocks
+- XML from user messages
+
+**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml`
+
+**Usage:**
+```bash
+python3 scripts/extraction/filter-actual-xml.py
+```
+
+### `extract-all-xml.py`
+**For debugging/analysis**
+
+Extracts ALL XML blocks from transcripts without filtering.
+
+**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml`
+
+**Usage:**
+```bash
+python3 scripts/extraction/extract-all-xml.py
+```
+
+## Workflow
+
+1. **Extract XML from transcripts:**
+ ```bash
+ cd ~/Scripts/claude-mem
+ python3 scripts/extraction/filter-actual-xml.py
+ ```
+
+2. **Import to database:**
+ ```bash
+ npm run import:xml
+ ```
+
+3. **Clean up duplicates (if needed):**
+ ```bash
+ npm run cleanup:duplicates
+ ```
+
+## Source Data
+
+Scripts read from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl`
+
+These are Claude Code session transcripts stored in JSONL (JSON Lines) format.
+
+## Output Format
+
+```xml
+
+
+
+
+
+ discovery
+ Example observation
+ ...
+
+
+
+
+ What was accomplished
+ ...
+
+
+
+```
+
+Each XML block includes a comment with:
+- Block number
+- Original timestamp from transcript
diff --git a/scripts/extraction/extract-all-xml.py b/scripts/extraction/extract-all-xml.py
new file mode 100755
index 00000000..a7954b02
--- /dev/null
+++ b/scripts/extraction/extract-all-xml.py
@@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+import json
+import re
+from datetime import datetime
+import os
+import subprocess
+
+def extract_xml_blocks(text):
+ """Extract complete XML blocks from text"""
+ xml_patterns = [
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'
.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ ]
+
+ blocks = []
+ for pattern in xml_patterns:
+ matches = re.findall(pattern, text, re.DOTALL)
+ blocks.extend(matches)
+
+ return blocks
+
+def process_transcript_file(filepath):
+ """Process a single transcript file and extract XML with timestamps"""
+ results = []
+
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+ for line in f:
+ try:
+ data = json.loads(line)
+
+ # Get timestamp
+ timestamp = data.get('timestamp', 'unknown')
+
+ # Extract text content from message
+ message = data.get('message', {})
+ content = message.get('content', [])
+
+ if isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict):
+ text = ''
+ if item.get('type') == 'text':
+ text = item.get('text', '')
+ elif item.get('type') == 'tool_use':
+ # Also check tool_use input fields
+ tool_input = item.get('input', {})
+ if isinstance(tool_input, dict):
+ text = str(tool_input)
+
+ if text:
+ # Extract XML blocks
+ xml_blocks = extract_xml_blocks(text)
+
+ for block in xml_blocks:
+ results.append({
+ 'timestamp': timestamp,
+ 'xml': block
+ })
+
+ except json.JSONDecodeError:
+ continue
+
+ return results
+
+# Get list of transcript files
+transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
+os.chdir(transcript_dir)
+
+# Get all transcript files sorted by modification time
+result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
+files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
+
+all_results = []
+for filename in files:
+ filepath = os.path.join(transcript_dir, filename)
+    print(f"Processing {filename}...")
+ results = process_transcript_file(filepath)
+ all_results.extend(results)
+ print(f" Found {len(results)} XML blocks")
+
+# Write results with timestamps
+output_file = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
+with open(output_file, 'w', encoding='utf-8') as f:
+ f.write('\n')
+ f.write('\n\n')
+
+ for i, item in enumerate(all_results, 1):
+ timestamp = item['timestamp']
+ xml = item['xml']
+
+ # Format timestamp nicely if it's ISO format
+ if timestamp != 'unknown' and timestamp:
+ try:
+ dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+ formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+ except:
+ formatted_time = timestamp
+ else:
+ formatted_time = 'unknown'
+
+        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
+ f.write(xml)
+ f.write('\n\n')
+
+ f.write('\n')
+
+print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {output_file}")
diff --git a/scripts/extraction/filter-actual-xml.py b/scripts/extraction/filter-actual-xml.py
new file mode 100755
index 00000000..ef1344dd
--- /dev/null
+++ b/scripts/extraction/filter-actual-xml.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+import json
+import re
+from datetime import datetime
+import os
+
+def extract_xml_blocks(text):
+ """Extract complete XML blocks from text"""
+ xml_patterns = [
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ r'.*?',
+ ]
+
+ blocks = []
+ for pattern in xml_patterns:
+ matches = re.findall(pattern, text, re.DOTALL)
+ blocks.extend(matches)
+
+ return blocks
+
+def is_example_xml(xml_block):
+ """Check if XML block is an example/template"""
+ # Patterns that indicate this is example/template XML
+ example_indicators = [
+ r'\[.*?\]', # Square brackets with placeholders
+ r'\*\*\w+\*\*:', # Bold markdown like **title**:
+ r'\.\.\..*?\.\.\.', # Ellipsis indicating placeholder
+ r'feature\|bugfix\|refactor', # Multiple options separated by |
+ r'change \| discovery \| decision', # Example types
+ r'\{.*?\}', # Curly braces (template variables)
+ r'Concise, self-contained statement', # Literal example text
+ r'Short title capturing',
+ r'One sentence explanation',
+ r'What was the user trying',
+ r'What code/systems did you explore',
+ r'What did you learn',
+ r'What was done',
+ r'What should happen next',
+ r'file1\.ts', # Example filenames
+ r'file2\.ts',
+ r'file3\.ts',
+ r'Any additional context',
+ ]
+
+ for pattern in example_indicators:
+ if re.search(pattern, xml_block):
+ return True
+
+ return False
+
+def process_transcript_file(filepath):
+ """Process a single transcript file and extract only real XML from assistant responses"""
+ results = []
+
+ with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
+ for line in f:
+ try:
+ data = json.loads(line)
+
+ # Get timestamp
+ timestamp = data.get('timestamp', 'unknown')
+
+ # Only process assistant messages
+ message = data.get('message', {})
+ role = message.get('role')
+
+ if role != 'assistant':
+ continue
+
+ content = message.get('content', [])
+
+ if isinstance(content, list):
+ for item in content:
+ if isinstance(item, dict) and item.get('type') == 'text':
+ # This is text in an assistant response, not tool_use
+ text = item.get('text', '')
+
+ # Extract XML blocks
+ xml_blocks = extract_xml_blocks(text)
+
+ for block in xml_blocks:
+ # Filter out example/template XML
+ if not is_example_xml(block):
+ results.append({
+ 'timestamp': timestamp,
+ 'xml': block
+ })
+
+ except json.JSONDecodeError:
+ continue
+
+ return results
+
+# Get list of Oct 18 transcript files
+import subprocess
+
+transcript_dir = os.path.expanduser('~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/')
+os.chdir(transcript_dir)
+
+# Get all transcript files sorted by modification time
+result = subprocess.run(['ls', '-t'], capture_output=True, text=True)
+files = [f for f in result.stdout.strip().split('\n') if f.endswith('.jsonl')][:62]
+
+all_results = []
+for filename in files:
+ filepath = os.path.join(transcript_dir, filename)
+    print(f"Processing {filename}...")
+ results = process_transcript_file(filepath)
+ all_results.extend(results)
+ print(f" Found {len(results)} actual XML blocks")
+
+# Write results with timestamps
+output_file = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
+with open(output_file, 'w', encoding='utf-8') as f:
+ f.write('\n')
+ f.write('\n')
+ f.write('\n')
+ f.write('\n\n')
+
+ for i, item in enumerate(all_results, 1):
+ timestamp = item['timestamp']
+ xml = item['xml']
+
+ # Format timestamp nicely if it's ISO format
+ if timestamp != 'unknown' and timestamp:
+ try:
+ dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
+ formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
+ except:
+ formatted_time = timestamp
+ else:
+ formatted_time = 'unknown'
+
+        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
+ f.write(xml)
+ f.write('\n\n')
+
+ f.write('\n')
+
+print(f"\n{'='*80}")
+print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {output_file}")
+print(f"{'='*80}")
diff --git a/src/bin/cleanup-duplicates.ts b/src/bin/cleanup-duplicates.ts
new file mode 100644
index 00000000..02fdb408
--- /dev/null
+++ b/src/bin/cleanup-duplicates.ts
@@ -0,0 +1,98 @@
+#!/usr/bin/env node
+/**
+ * Cleanup duplicate observations and summaries from the database
+ * Keeps the earliest entry (MIN(id)) for each duplicate group
+ */
+
+import { SessionStore } from '../services/sqlite/SessionStore.js';
+
+function main() {
+ console.log('Starting duplicate cleanup...\n');
+
+ const db = new SessionStore();
+
+ // Find and delete duplicate observations
+ console.log('Finding duplicate observations...');
+
+ const duplicateObsQuery = db['db'].prepare(`
+ SELECT sdk_session_id, title, subtitle, type, COUNT(*) as count, GROUP_CONCAT(id) as ids
+ FROM observations
+ GROUP BY sdk_session_id, title, subtitle, type
+ HAVING count > 1
+ `);
+
+ const duplicateObs = duplicateObsQuery.all() as Array<{
+ sdk_session_id: string;
+ title: string;
+ subtitle: string;
+ type: string;
+ count: number;
+ ids: string;
+ }>;
+
+ console.log(`Found ${duplicateObs.length} duplicate observation groups\n`);
+
+ let deletedObs = 0;
+ for (const dup of duplicateObs) {
+ const ids = dup.ids.split(',').map(id => parseInt(id, 10));
+ const keepId = Math.min(...ids);
+ const deleteIds = ids.filter(id => id !== keepId);
+
+ console.log(`Observation "${dup.title.substring(0, 60)}..."`);
+ console.log(` Found ${dup.count} copies, keeping ID ${keepId}, deleting ${deleteIds.length} duplicates`);
+
+ const deleteStmt = db['db'].prepare(`DELETE FROM observations WHERE id IN (${deleteIds.join(',')})`);
+ deleteStmt.run();
+ deletedObs += deleteIds.length;
+ }
+
+ // Find and delete duplicate summaries
+ console.log('\n\nFinding duplicate summaries...');
+
+ const duplicateSumQuery = db['db'].prepare(`
+ SELECT sdk_session_id, request, completed, learned, COUNT(*) as count, GROUP_CONCAT(id) as ids
+ FROM session_summaries
+ GROUP BY sdk_session_id, request, completed, learned
+ HAVING count > 1
+ `);
+
+ const duplicateSum = duplicateSumQuery.all() as Array<{
+ sdk_session_id: string;
+ request: string;
+ completed: string;
+ learned: string;
+ count: number;
+ ids: string;
+ }>;
+
+ console.log(`Found ${duplicateSum.length} duplicate summary groups\n`);
+
+ let deletedSum = 0;
+ for (const dup of duplicateSum) {
+ const ids = dup.ids.split(',').map(id => parseInt(id, 10));
+ const keepId = Math.min(...ids);
+ const deleteIds = ids.filter(id => id !== keepId);
+
+ console.log(`Summary "${dup.request.substring(0, 60)}..."`);
+ console.log(` Found ${dup.count} copies, keeping ID ${keepId}, deleting ${deleteIds.length} duplicates`);
+
+ const deleteStmt = db['db'].prepare(`DELETE FROM session_summaries WHERE id IN (${deleteIds.join(',')})`);
+ deleteStmt.run();
+ deletedSum += deleteIds.length;
+ }
+
+ db.close();
+
+ console.log('\n' + '='.repeat(60));
+ console.log('Cleanup Complete!');
+ console.log('='.repeat(60));
+ console.log(`🗑️ Deleted: ${deletedObs} duplicate observations`);
+ console.log(`🗑️ Deleted: ${deletedSum} duplicate summaries`);
+ console.log(`🗑️ Total: ${deletedObs + deletedSum} duplicates removed`);
+ console.log('='.repeat(60));
+}
+
+// Run if executed directly
+if (import.meta.url === `file://${process.argv[1]}`) {
+ main();
+}
diff --git a/src/bin/import-xml-observations.ts b/src/bin/import-xml-observations.ts
new file mode 100644
index 00000000..87c5881d
--- /dev/null
+++ b/src/bin/import-xml-observations.ts
@@ -0,0 +1,382 @@
+#!/usr/bin/env node
+/**
+ * Import XML observations back into the database
+ * Parses actual_xml_only_with_timestamps.xml and inserts observations via SessionStore
+ */
+
+import { readFileSync, readdirSync } from 'fs';
+import { join } from 'path';
+import { homedir } from 'os';
+import { SessionStore } from '../services/sqlite/SessionStore.js';
+
+interface ObservationData {
+ type: string;
+ title: string;
+ subtitle: string;
+ facts: string[];
+ narrative: string;
+ concepts: string[];
+ files_read: string[];
+ files_modified: string[];
+}
+
+interface SummaryData {
+ request: string;
+ investigated: string;
+ learned: string;
+ completed: string;
+ next_steps: string;
+ notes: string | null;
+}
+
+interface SessionMetadata {
+ sessionId: string;
+ project: string;
+}
+
+interface TimestampMapping {
+ [timestamp: string]: SessionMetadata;
+}
+
+/**
+ * Build a map of timestamp (rounded to second) -> session metadata by reading all transcript files
+ * Since XML timestamps are rounded to seconds, we map by second
+ */
+function buildTimestampMap(): TimestampMapping {
+ const transcriptDir = join(homedir(), '.claude', 'projects', '-Users-alexnewman-Scripts-claude-mem');
+ const map: TimestampMapping = {};
+
+ console.log(`Reading transcript files from ${transcriptDir}...`);
+
+ const files = readdirSync(transcriptDir).filter(f => f.endsWith('.jsonl'));
+ console.log(`Found ${files.length} transcript files`);
+
+ for (const filename of files) {
+ const filepath = join(transcriptDir, filename);
+ const content = readFileSync(filepath, 'utf-8');
+ const lines = content.split('\n').filter(l => l.trim());
+
+ for (const line of lines) {
+ try {
+ const data = JSON.parse(line);
+ const timestamp = data.timestamp;
+ const sessionId = data.sessionId;
+ const project = data.cwd || '/Users/alexnewman/Scripts/claude-mem';
+
+ if (timestamp && sessionId) {
+ // Round timestamp to second for matching with XML timestamps
+ const roundedTimestamp = new Date(timestamp);
+ roundedTimestamp.setMilliseconds(0);
+ const key = roundedTimestamp.toISOString();
+
+ // Only store first occurrence for each second (they're all the same session anyway)
+ if (!map[key]) {
+ map[key] = { sessionId, project };
+ }
+ }
+ } catch (e) {
+ // Skip invalid JSON lines
+ }
+ }
+ }
+
+ console.log(`Built timestamp map with ${Object.keys(map).length} unique seconds`);
+ return map;
+}
+
+/**
+ * Parse XML text content and extract tag value
+ */
+function extractTag(xml: string, tagName: string): string {
+ const regex = new RegExp(`<${tagName}>([\\s\\S]*?)${tagName}>`, 'i');
+ const match = xml.match(regex);
+ return match ? match[1].trim() : '';
+}
+
+/**
+ * Parse XML array tags (facts, concepts, files, etc.)
+ */
+function extractArrayTags(xml: string, containerTag: string, itemTag: string): string[] {
+ const containerRegex = new RegExp(`<${containerTag}>([\\s\\S]*?)${containerTag}>`, 'i');
+ const containerMatch = xml.match(containerRegex);
+
+ if (!containerMatch) {
+ return [];
+ }
+
+ const containerContent = containerMatch[1];
+ const itemRegex = new RegExp(`<${itemTag}>([\\s\\S]*?)${itemTag}>`, 'gi');
+ const items: string[] = [];
+ let match;
+
+ while ((match = itemRegex.exec(containerContent)) !== null) {
+ items.push(match[1].trim());
+ }
+
+ return items;
+}
+
+/**
+ * Parse an observation block from XML
+ */
+function parseObservation(xml: string): ObservationData | null {
+ // Must be a complete observation block
+ if (!xml.includes('') || !xml.includes('')) {
+ return null;
+ }
+
+ try {
+ const observation: ObservationData = {
+ type: extractTag(xml, 'type'),
+ title: extractTag(xml, 'title'),
+ subtitle: extractTag(xml, 'subtitle'),
+ facts: extractArrayTags(xml, 'facts', 'fact'),
+ narrative: extractTag(xml, 'narrative'),
+ concepts: extractArrayTags(xml, 'concepts', 'concept'),
+ files_read: extractArrayTags(xml, 'files_read', 'file'),
+ files_modified: extractArrayTags(xml, 'files_modified', 'file'),
+ };
+
+ // Validate required fields
+ if (!observation.type || !observation.title) {
+ return null;
+ }
+
+ return observation;
+ } catch (e) {
+ console.error('Error parsing observation:', e);
+ return null;
+ }
+}
+
+/**
+ * Parse a summary block from XML
+ */
+function parseSummary(xml: string): SummaryData | null {
+ // Must be a complete summary block
+ if (!xml.includes('') || !xml.includes('')) {
+ return null;
+ }
+
+ try {
+ const summary: SummaryData = {
+ request: extractTag(xml, 'request'),
+ investigated: extractTag(xml, 'investigated'),
+ learned: extractTag(xml, 'learned'),
+ completed: extractTag(xml, 'completed'),
+ next_steps: extractTag(xml, 'next_steps'),
+ notes: extractTag(xml, 'notes') || null,
+ };
+
+ // Validate required fields
+ if (!summary.request) {
+ return null;
+ }
+
+ return summary;
+ } catch (e) {
+ console.error('Error parsing summary:', e);
+ return null;
+ }
+}
+
+/**
+ * Extract timestamp from XML comment
+ * Format: <!-- Block 1 | 2025-10-19 03:03:23 UTC -->
+ */
+function extractTimestamp(commentLine: string): string | null {
+  const match = commentLine.match(/<!-- Block \d+ \| (.+?) -->/);
+ if (match) {
+ // Convert "2025-10-19 03:03:23 UTC" to ISO format
+ const dateStr = match[1].replace(' UTC', '').replace(' ', 'T') + 'Z';
+ return new Date(dateStr).toISOString();
+ }
+ return null;
+}
+
+/**
+ * Main import function
+ */
+function main() {
+ console.log('Starting XML observation import...\n');
+
+ // Build timestamp map
+ const timestampMap = buildTimestampMap();
+
+ // Open database connection
+ const db = new SessionStore();
+
+ // Create SDK sessions for all unique Claude Code sessions
+ console.log('\nCreating SDK sessions for imported data...');
+ const claudeSessionToSdkSession = new Map();
+
+ for (const sessionMeta of Object.values(timestampMap)) {
+ if (!claudeSessionToSdkSession.has(sessionMeta.sessionId)) {
+ const syntheticSdkSessionId = `imported-${sessionMeta.sessionId}`;
+
+ // Try to find existing session first
+ const existingQuery = db['db'].prepare(`
+ SELECT sdk_session_id
+ FROM sdk_sessions
+ WHERE claude_session_id = ?
+ `);
+ const existing = existingQuery.get(sessionMeta.sessionId) as { sdk_session_id: string | null } | undefined;
+
+ if (existing && existing.sdk_session_id) {
+ // Use existing SDK session ID
+ claudeSessionToSdkSession.set(sessionMeta.sessionId, existing.sdk_session_id);
+ } else if (existing && !existing.sdk_session_id) {
+ // Session exists but sdk_session_id is NULL, update it
+ const dbId = (db['db'].prepare('SELECT id FROM sdk_sessions WHERE claude_session_id = ?').get(sessionMeta.sessionId) as { id: number }).id;
+ db.updateSDKSessionId(dbId, syntheticSdkSessionId);
+ claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId);
+ } else {
+ // Create new SDK session
+ const dbId = db.createSDKSession(
+ sessionMeta.sessionId,
+ sessionMeta.project,
+ 'Imported from transcript XML'
+ );
+
+ // Update with synthetic SDK session ID
+ db.updateSDKSessionId(dbId, syntheticSdkSessionId);
+
+ claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId);
+ }
+ }
+ }
+
+ console.log(`Prepared ${claudeSessionToSdkSession.size} SDK sessions\n`);
+
+ // Read XML file
+ const xmlPath = join(process.cwd(), 'actual_xml_only_with_timestamps.xml');
+ console.log(`Reading XML file: ${xmlPath}`);
+ const xmlContent = readFileSync(xmlPath, 'utf-8');
+
+ // Split into blocks by comment markers
+ const blocks = xmlContent.split(/(?=