feat: Add XML extraction and import scripts for observations and summaries

2025-10-18 23:34:53 -04:00
parent bc41e367c0
commit 6d68fd44ca
6 changed files with 860 additions and 0 deletions
@@ -0,0 +1,382 @@
+#!/usr/bin/env node
+/**
+ * Import XML observations back into the database
+ * Parses actual_xml_only_with_timestamps.xml and inserts observations via SessionStore
+ */
+
+import { readFileSync, readdirSync } from 'fs';
+import { join } from 'path';
+import { homedir } from 'os';
+import { pathToFileURL } from 'url';
+import { SessionStore } from '../services/sqlite/SessionStore.js';
+
+/**
+ * One observation as extracted from an `<observation>` XML block by parseObservation.
+ * Scalar fields hold the trimmed text of the same-named tag ('' when the tag is absent);
+ * array fields hold the trimmed text of every item tag inside the named container tag.
+ */
+interface ObservationData {
+  /** Text of `<type>`; parseObservation rejects the block when this is empty. */
+  type: string;
+  /** Text of `<title>`; parseObservation rejects the block when this is empty. */
+  title: string;
+  /** Text of `<subtitle>`. */
+  subtitle: string;
+  /** Each `<fact>` inside `<facts>`. */
+  facts: string[];
+  /** Text of `<narrative>`. */
+  narrative: string;
+  /** Each `<concept>` inside `<concepts>`. */
+  concepts: string[];
+  /** Each `<file>` inside `<files_read>`. */
+  files_read: string[];
+  /** Each `<file>` inside `<files_modified>`. */
+  files_modified: string[];
+}
+
+/**
+ * One session summary as extracted from a `<summary>` XML block by parseSummary.
+ * Each field holds the trimmed text of the same-named tag ('' when the tag is absent).
+ */
+interface SummaryData {
+  /** Text of `<request>`; parseSummary rejects the block when this is empty. */
+  request: string;
+  /** Text of `<investigated>`. */
+  investigated: string;
+  /** Text of `<learned>`. */
+  learned: string;
+  /** Text of `<completed>`. */
+  completed: string;
+  /** Text of `<next_steps>`. */
+  next_steps: string;
+  /** Text of `<notes>`, or null when the tag is absent or empty. */
+  notes: string | null;
+}
+
+/**
+ * Session identity recovered from a transcript line by buildTimestampMap.
+ */
+interface SessionMetadata {
+  /** The `sessionId` field of the transcript line; main() matches it against `claude_session_id`. */
+  sessionId: string;
+  /** The `cwd` field of the transcript line, or a fixed fallback path when `cwd` is missing. */
+  project: string;
+}
+
+/**
+ * Lookup table from a timestamp to the session that produced it.
+ * Keys are `Date#toISOString()` strings with milliseconds zeroed (see buildTimestampMap);
+ * each value is the metadata of the first transcript line seen for that second.
+ */
+interface TimestampMapping {
+  [timestamp: string]: SessionMetadata;
+}
+
+/**
+ * Build a map of timestamp (rounded to the second) -> session metadata by reading
+ * every `.jsonl` transcript file in a directory.
+ *
+ * XML block timestamps only carry second precision, so each transcript timestamp has
+ * its milliseconds zeroed before being used as a key. When several transcript lines
+ * fall in the same second, the first one encountered wins.
+ *
+ * Lines that are not valid JSON, are not JSON objects, lack a usable `timestamp` or
+ * `sessionId`, or whose timestamp does not parse as a date are skipped.
+ *
+ * @param transcriptDir Directory containing the transcript files. Defaults to the
+ *   claude-mem project directory under `~/.claude/projects`.
+ * @returns Map keyed by ISO timestamp (milliseconds zeroed).
+ * @throws Whatever `readdirSync` / `readFileSync` throw when the directory or a
+ *   transcript file cannot be read.
+ */
+function buildTimestampMap(
+  transcriptDir: string = join(homedir(), '.claude', 'projects', '-Users-alexnewman-Scripts-claude-mem')
+): TimestampMapping {
+  const map: TimestampMapping = {};
+
+  console.log(`Reading transcript files from ${transcriptDir}...`);
+
+  const files = readdirSync(transcriptDir).filter(f => f.endsWith('.jsonl'));
+  console.log(`Found ${files.length} transcript files`);
+
+  for (const filename of files) {
+    const content = readFileSync(join(transcriptDir, filename), 'utf-8');
+
+    for (const line of content.split('\n')) {
+      if (!line.trim()) {
+        continue;
+      }
+
+      let data: unknown;
+      try {
+        data = JSON.parse(line);
+      } catch {
+        // Skip invalid JSON lines
+        continue;
+      }
+
+      // JSON.parse may yield null or a primitive; only objects can carry the fields we need
+      if (data === null || typeof data !== 'object') {
+        continue;
+      }
+
+      const { timestamp, sessionId, cwd } = data as Record<string, unknown>;
+      if (typeof sessionId !== 'string' || !sessionId) {
+        continue;
+      }
+      if ((typeof timestamp !== 'string' && typeof timestamp !== 'number') || !timestamp) {
+        continue;
+      }
+
+      // Round timestamp to second for matching with XML timestamps
+      const roundedTimestamp = new Date(timestamp);
+      if (Number.isNaN(roundedTimestamp.getTime())) {
+        // toISOString() would throw a RangeError on an unparseable timestamp
+        continue;
+      }
+      roundedTimestamp.setMilliseconds(0);
+      const key = roundedTimestamp.toISOString();
+
+      // Only store first occurrence for each second
+      if (!map[key]) {
+        const project = typeof cwd === 'string' && cwd ? cwd : '/Users/alexnewman/Scripts/claude-mem';
+        map[key] = { sessionId, project };
+      }
+    }
+  }
+
+  console.log(`Built timestamp map with ${Object.keys(map).length} unique seconds`);
+  return map;
+}
+
+/**
+ * Return the trimmed inner text of the first `<tagName>...</tagName>` pair found in
+ * `xml`, or an empty string when no such pair exists. Matching is case-insensitive,
+ * non-greedy, and may span multiple lines.
+ */
+function extractTag(xml: string, tagName: string): string {
+  const pattern = new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, 'i');
+  const found = pattern.exec(xml);
+  if (found === null) {
+    return '';
+  }
+  return found[1].trim();
+}
+
+/**
+ * Collect the trimmed text of every `<itemTag>` element nested inside the first
+ * `<containerTag>` element of `xml` (facts, concepts, files, etc.).
+ * Returns an empty array when the container element is not present.
+ */
+function extractArrayTags(xml: string, containerTag: string, itemTag: string): string[] {
+  const container = new RegExp(`<${containerTag}>([\\s\\S]*?)</${containerTag}>`, 'i').exec(xml);
+  if (container === null) {
+    return [];
+  }
+
+  // Global flag so matchAll walks every item inside the container body
+  const itemPattern = new RegExp(`<${itemTag}>([\\s\\S]*?)</${itemTag}>`, 'gi');
+  return Array.from(container[1].matchAll(itemPattern), (hit) => hit[1].trim());
+}
+
+/**
+ * Turn an XML block into an ObservationData record.
+ *
+ * Returns null when the block lacks either the opening or the closing
+ * `<observation>` tag, when `<type>` or `<title>` is empty, or when extraction throws
+ * (the error is logged).
+ */
+function parseObservation(xml: string): ObservationData | null {
+  // Must be a complete observation block
+  const isCompleteBlock = xml.includes('<observation>') && xml.includes('</observation>');
+  if (!isCompleteBlock) {
+    return null;
+  }
+
+  try {
+    const type = extractTag(xml, 'type');
+    const title = extractTag(xml, 'title');
+    const subtitle = extractTag(xml, 'subtitle');
+    const facts = extractArrayTags(xml, 'facts', 'fact');
+    const narrative = extractTag(xml, 'narrative');
+    const concepts = extractArrayTags(xml, 'concepts', 'concept');
+    const files_read = extractArrayTags(xml, 'files_read', 'file');
+    const files_modified = extractArrayTags(xml, 'files_modified', 'file');
+
+    // Both required fields must be non-empty
+    const hasRequiredFields = Boolean(type) && Boolean(title);
+    if (!hasRequiredFields) {
+      return null;
+    }
+
+    const parsed: ObservationData = {
+      type,
+      title,
+      subtitle,
+      facts,
+      narrative,
+      concepts,
+      files_read,
+      files_modified,
+    };
+    return parsed;
+  } catch (e) {
+    console.error('Error parsing observation:', e);
+    return null;
+  }
+}
+
+/**
+ * Turn an XML block into a SummaryData record.
+ *
+ * Returns null when the block lacks either the opening or the closing `<summary>`
+ * tag, when `<request>` is empty, or when extraction throws (the error is logged).
+ * An absent or empty `<notes>` tag is stored as null.
+ */
+function parseSummary(xml: string): SummaryData | null {
+  // Must be a complete summary block
+  const isCompleteBlock = xml.includes('<summary>') && xml.includes('</summary>');
+  if (!isCompleteBlock) {
+    return null;
+  }
+
+  try {
+    const request = extractTag(xml, 'request');
+    const investigated = extractTag(xml, 'investigated');
+    const learned = extractTag(xml, 'learned');
+    const completed = extractTag(xml, 'completed');
+    const next_steps = extractTag(xml, 'next_steps');
+    const notesText = extractTag(xml, 'notes');
+
+    // The request is the only required field
+    if (request === '') {
+      return null;
+    }
+
+    const parsed: SummaryData = {
+      request,
+      investigated,
+      learned,
+      completed,
+      next_steps,
+      notes: notesText ? notesText : null,
+    };
+    return parsed;
+  } catch (e) {
+    console.error('Error parsing summary:', e);
+    return null;
+  }
+}
+
+/**
+ * Extract the timestamp from a block's XML comment marker.
+ * Format: <!-- Block N | 2025-10-19 03:03:23 UTC -->
+ *
+ * @param commentLine Text containing the marker; the first marker found is used, so
+ *   passing a whole block works as well as passing just the comment line.
+ * @returns The timestamp as an ISO-8601 string (e.g. `2025-10-19T03:03:23.000Z`), or
+ *   null when no marker is present or its timestamp text is not a parseable date.
+ */
+function extractTimestamp(commentLine: string): string | null {
+  const match = commentLine.match(/<!-- Block \d+ \| (.+?) -->/);
+  if (!match) {
+    return null;
+  }
+
+  // Convert "2025-10-19 03:03:23 UTC" to ISO format
+  const dateStr = match[1].replace(' UTC', '').replace(' ', 'T') + 'Z';
+  const parsed = new Date(dateStr);
+
+  // toISOString() throws a RangeError on an invalid Date; report "no timestamp" instead
+  // so a single malformed marker cannot abort the caller.
+  if (Number.isNaN(parsed.getTime())) {
+    return null;
+  }
+  return parsed.toISOString();
+}
+
+/**
+ * Main import function.
+ *
+ * Steps:
+ *  1. Build the timestamp -> session map from the transcript files.
+ *  2. Make sure every Claude session in that map has an SDK session id in
+ *     `sdk_sessions`, reusing an existing id or assigning `imported-<sessionId>`.
+ *  3. Read `actual_xml_only_with_timestamps.xml` from the current working directory,
+ *     split it at each `<!-- Block N` marker, and store every block that parses as an
+ *     observation or a summary, skipping rows that already exist.
+ *  4. Close the database and print the counters.
+ *
+ * The database is closed in a `finally` so the handle is released even when reading
+ * the XML file or a query throws; the error itself still propagates.
+ */
+function main() {
+  console.log('Starting XML observation import...\n');
+
+  // Build timestamp map
+  const timestampMap = buildTimestampMap();
+
+  // Open database connection
+  const db = new SessionStore();
+
+  let importedObs = 0;
+  let importedSum = 0;
+  let skipped = 0;
+  let duplicateObs = 0;
+  let duplicateSum = 0;
+  let noSession = 0;
+
+  try {
+    // Create SDK sessions for all unique Claude Code sessions
+    console.log('\nCreating SDK sessions for imported data...');
+    const claudeSessionToSdkSession = new Map<string, string>();
+
+    // Prepared once and reused for every session. Selecting `id` as well means the
+    // "row exists but sdk_session_id is NULL" case needs no second lookup.
+    const findSession = db['db'].prepare(`
+      SELECT id, sdk_session_id
+      FROM sdk_sessions
+      WHERE claude_session_id = ?
+    `);
+
+    for (const sessionMeta of Object.values(timestampMap)) {
+      // The map has one entry per second, so the same session shows up many times
+      if (claudeSessionToSdkSession.has(sessionMeta.sessionId)) {
+        continue;
+      }
+
+      const existing = findSession.get(sessionMeta.sessionId) as
+        | { id: number; sdk_session_id: string | null }
+        | undefined;
+
+      if (existing?.sdk_session_id) {
+        // Use existing SDK session ID
+        claudeSessionToSdkSession.set(sessionMeta.sessionId, existing.sdk_session_id);
+        continue;
+      }
+
+      // Either the row exists without an SDK session id, or there is no row yet
+      const dbId = existing
+        ? existing.id
+        : db.createSDKSession(
+            sessionMeta.sessionId,
+            sessionMeta.project,
+            'Imported from transcript XML'
+          );
+
+      const syntheticSdkSessionId = `imported-${sessionMeta.sessionId}`;
+      db.updateSDKSessionId(dbId, syntheticSdkSessionId);
+      claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId);
+    }
+
+    console.log(`Prepared ${claudeSessionToSdkSession.size} SDK sessions\n`);
+
+    // Read XML file
+    const xmlPath = join(process.cwd(), 'actual_xml_only_with_timestamps.xml');
+    console.log(`Reading XML file: ${xmlPath}`);
+    const xmlContent = readFileSync(xmlPath, 'utf-8');
+
+    // Split into blocks by comment markers (lookahead keeps each marker with its block)
+    const blocks = xmlContent.split(/(?=<!-- Block \d+)/);
+    console.log(`Found ${blocks.length} blocks in XML file\n`);
+
+    // Duplicate checks, prepared once instead of once per block
+    const findObservation = db['db'].prepare(`
+      SELECT id FROM observations
+      WHERE sdk_session_id = ? AND title = ? AND subtitle = ? AND type = ?
+    `);
+    const findSummary = db['db'].prepare(`
+      SELECT id FROM session_summaries
+      WHERE sdk_session_id = ? AND request = ? AND completed = ? AND learned = ?
+    `);
+
+    for (const block of blocks) {
+      if (!block.trim() || block.startsWith('<?xml') || block.startsWith('<transcript_extracts')) {
+        continue;
+      }
+
+      // Extract timestamp from comment
+      const timestampIso = extractTimestamp(block);
+      if (!timestampIso) {
+        skipped++;
+        continue;
+      }
+
+      // Look up session metadata
+      const sessionMeta = timestampMap[timestampIso];
+      if (!sessionMeta) {
+        noSession++;
+        // Only the first few misses are printed to keep the output readable
+        if (noSession <= 5) {
+          console.log(`⚠️  No session found for timestamp: ${timestampIso}`);
+        }
+        // Blocks without a session are counted in `skipped` as well as `noSession`
+        skipped++;
+        continue;
+      }
+
+      // Get SDK session ID
+      const sdkSessionId = claudeSessionToSdkSession.get(sessionMeta.sessionId);
+      if (!sdkSessionId) {
+        skipped++;
+        continue;
+      }
+
+      // Try parsing as observation first
+      const observation = parseObservation(block);
+      if (observation) {
+        // Check for duplicate
+        const existingObs = findObservation.get(
+          sdkSessionId,
+          observation.title,
+          observation.subtitle,
+          observation.type
+        );
+
+        if (existingObs) {
+          duplicateObs++;
+          continue;
+        }
+
+        try {
+          db.storeObservation(
+            sdkSessionId,
+            sessionMeta.project,
+            observation
+          );
+          importedObs++;
+
+          if (importedObs % 50 === 0) {
+            console.log(`Imported ${importedObs} observations...`);
+          }
+        } catch (e) {
+          // One bad row must not stop the rest of the import
+          console.error(`Error storing observation:`, e);
+          skipped++;
+        }
+        continue;
+      }
+
+      // Try parsing as summary
+      const summary = parseSummary(block);
+      if (summary) {
+        // Check for duplicate
+        const existingSum = findSummary.get(
+          sdkSessionId,
+          summary.request,
+          summary.completed,
+          summary.learned
+        );
+
+        if (existingSum) {
+          duplicateSum++;
+          continue;
+        }
+
+        try {
+          db.storeSummary(
+            sdkSessionId,
+            sessionMeta.project,
+            summary
+          );
+          importedSum++;
+
+          if (importedSum % 10 === 0) {
+            console.log(`Imported ${importedSum} summaries...`);
+          }
+        } catch (e) {
+          // One bad row must not stop the rest of the import
+          console.error(`Error storing summary:`, e);
+          skipped++;
+        }
+        continue;
+      }
+
+      // Neither observation nor summary - skip
+      skipped++;
+    }
+  } finally {
+    // Release the database handle on success and on failure alike
+    db.close();
+  }
+
+  console.log('\n' + '='.repeat(60));
+  console.log('Import Complete!');
+  console.log('='.repeat(60));
+  console.log(`✓ Imported: ${importedObs} observations`);
+  console.log(`✓ Imported: ${importedSum} summaries`);
+  console.log(`✓ Total: ${importedObs + importedSum} items`);
+  console.log(`⊘ Skipped: ${skipped} blocks (not full observations or summaries)`);
+  console.log(`⊘ Duplicates skipped: ${duplicateObs} observations, ${duplicateSum} summaries`);
+  console.log(`⚠️  No session: ${noSession} blocks (timestamp not in transcripts)`);
+  console.log('='.repeat(60));
+}
+
+// Run if executed directly (not when imported as a module).
+// process.argv[1] is a filesystem path while import.meta.url is a percent-encoded
+// file: URL, so the path is converted with pathToFileURL before comparing; plain
+// string concatenation mismatches for paths containing spaces or other characters
+// that need encoding, and for Windows paths.
+if (process.argv[1] && import.meta.url === pathToFileURL(process.argv[1]).href) {
+  main();
+}