#!/usr/bin/env bun /** * Cleanup script for duplicate observations created by the batching bug. * * The bug: When multiple messages were batched together, observations were stored * once per message ID instead of once per observation. For example, if 4 messages * were batched and produced 3 observations, those 3 observations were stored * 12 times (4×3) instead of 3 times. * * This script identifies duplicates by matching on: * - memory_session_id (same session) * - text (same content) * - type (same observation type) * - created_at_epoch within 60 seconds (same batch window) * * Usage: * bun scripts/cleanup-duplicates.ts # Dry run (default) * bun scripts/cleanup-duplicates.ts --execute # Actually delete duplicates */ import { Database } from 'bun:sqlite'; import { homedir } from 'os'; import { join } from 'path'; const DB_PATH = join(homedir(), '.claude-mem', 'claude-mem.db'); // Time window modes for duplicate detection const TIME_WINDOW_MODES = { strict: 5, // 5 seconds - only exact duplicates from same batch normal: 60, // 60 seconds - duplicates within same minute aggressive: 0, // 0 = ignore time entirely, match on session+text+type only }; interface DuplicateGroup { memory_session_id: string; title: string; type: string; epoch_bucket: number; count: number; ids: number[]; keep_id: number; delete_ids: number[]; } interface ObservationRow { id: number; memory_session_id: string; title: string | null; subtitle: string | null; narrative: string | null; type: string; created_at_epoch: number; } function main() { const dryRun = !process.argv.includes('--execute'); const aggressive = process.argv.includes('--aggressive'); const strict = process.argv.includes('--strict'); // Determine time window let windowMode: keyof typeof TIME_WINDOW_MODES = 'normal'; if (aggressive) windowMode = 'aggressive'; if (strict) windowMode = 'strict'; const batchWindowSeconds = TIME_WINDOW_MODES[windowMode]; console.log('='.repeat(60)); console.log('Claude-Mem Duplicate Observation Cleanup'); console.log('='.repeat(60)); console.log(`Mode: ${dryRun ? 'DRY RUN (use --execute to delete)' : 'EXECUTE'}`); console.log(`Database: ${DB_PATH}`); console.log(`Time window: ${windowMode} (${batchWindowSeconds === 0 ? 'ignore time' : batchWindowSeconds + ' seconds'})`); console.log(''); console.log('Options:'); console.log(' --execute Actually delete duplicates (default: dry run)'); console.log(' --strict 5-second window (exact batch duplicates only)'); console.log(' --aggressive Ignore time, match on session+text+type only'); console.log(''); const db = dryRun ? new Database(DB_PATH, { readonly: true }) : new Database(DB_PATH); // Get total observation count const totalCount = db.prepare('SELECT COUNT(*) as count FROM observations').get() as { count: number }; console.log(`Total observations in database: ${totalCount.count}`); // Find all observations and group by content fingerprint const observations = db.prepare(` SELECT id, memory_session_id, title, subtitle, narrative, type, created_at_epoch FROM observations ORDER BY memory_session_id, title, type, created_at_epoch `).all() as ObservationRow[]; console.log(`Analyzing ${observations.length} observations for duplicates...`); console.log(''); // Group observations by fingerprint (session + text + type + time bucket) const groups = new Map(); for (const obs of observations) { // Skip observations without title (can't dedupe without content identifier) if (obs.title === null) continue; // Create content hash from title + subtitle + narrative const contentKey = `${obs.title}|${obs.subtitle || ''}|${obs.narrative || ''}`; // Create fingerprint based on time window mode let fingerprint: string; if (batchWindowSeconds === 0) { // Aggressive mode: ignore time entirely fingerprint = `${obs.memory_session_id}|${obs.type}|${contentKey}`; } else { // Normal/strict mode: include time bucket const epochBucket = Math.floor(obs.created_at_epoch / batchWindowSeconds); fingerprint = `${obs.memory_session_id}|${obs.type}|${epochBucket}|${contentKey}`; } if (!groups.has(fingerprint)) { groups.set(fingerprint, []); } groups.get(fingerprint)!.push(obs); } // Find groups with duplicates const duplicateGroups: DuplicateGroup[] = []; for (const [fingerprint, rows] of groups) { if (rows.length > 1) { // Sort by id to keep the oldest (lowest id) rows.sort((a, b) => a.id - b.id); const keepId = rows[0].id; const deleteIds = rows.slice(1).map(r => r.id); // SAFETY: Never delete all copies - always keep at least one if (deleteIds.length >= rows.length) { throw new Error(`SAFETY VIOLATION: Would delete all ${rows.length} copies! Aborting.`); } if (!deleteIds.every(id => id !== keepId)) { throw new Error(`SAFETY VIOLATION: Delete list contains keep_id ${keepId}! Aborting.`); } const title = rows[0].title || ''; duplicateGroups.push({ memory_session_id: rows[0].memory_session_id, title: title.substring(0, 100) + (title.length > 100 ? '...' : ''), type: rows[0].type, epoch_bucket: batchWindowSeconds > 0 ? Math.floor(rows[0].created_at_epoch / batchWindowSeconds) : 0, count: rows.length, ids: rows.map(r => r.id), keep_id: keepId, delete_ids: deleteIds, }); } } if (duplicateGroups.length === 0) { console.log('No duplicate observations found!'); db.close(); return; } // Calculate stats const totalDuplicates = duplicateGroups.reduce((sum, g) => sum + g.delete_ids.length, 0); const affectedSessions = new Set(duplicateGroups.map(g => g.memory_session_id)).size; console.log('DUPLICATE ANALYSIS:'); console.log('-'.repeat(60)); console.log(`Duplicate groups found: ${duplicateGroups.length}`); console.log(`Total duplicates to remove: ${totalDuplicates}`); console.log(`Affected sessions: ${affectedSessions}`); console.log(`Observations after cleanup: ${totalCount.count - totalDuplicates}`); console.log(''); // Show sample of duplicates console.log('SAMPLE DUPLICATES (first 10 groups):'); console.log('-'.repeat(60)); for (const group of duplicateGroups.slice(0, 10)) { console.log(`Session: ${group.memory_session_id.substring(0, 20)}...`); console.log(`Type: ${group.type}`); console.log(`Count: ${group.count} copies (keeping id=${group.keep_id}, deleting ${group.delete_ids.length})`); console.log(`Title: "${group.title}"`); console.log(''); } if (duplicateGroups.length > 10) { console.log(`... and ${duplicateGroups.length - 10} more groups`); console.log(''); } // Execute deletion if not dry run if (!dryRun) { console.log('EXECUTING DELETION...'); console.log('-'.repeat(60)); const allDeleteIds = duplicateGroups.flatMap(g => g.delete_ids); // Delete in batches of 500 to avoid SQLite limits const BATCH_SIZE = 500; let deleted = 0; db.exec('BEGIN TRANSACTION'); try { for (let i = 0; i < allDeleteIds.length; i += BATCH_SIZE) { const batch = allDeleteIds.slice(i, i + BATCH_SIZE); const placeholders = batch.map(() => '?').join(','); const stmt = db.prepare(`DELETE FROM observations WHERE id IN (${placeholders})`); const result = stmt.run(...batch); deleted += result.changes; console.log(`Deleted batch ${Math.floor(i / BATCH_SIZE) + 1}: ${result.changes} observations`); } db.exec('COMMIT'); console.log(''); console.log(`Successfully deleted ${deleted} duplicate observations!`); // Verify final count const finalCount = db.prepare('SELECT COUNT(*) as count FROM observations').get() as { count: number }; console.log(`Final observation count: ${finalCount.count}`); } catch (error) { db.exec('ROLLBACK'); console.error('Error during deletion, rolled back:', error); process.exit(1); } } else { console.log('DRY RUN COMPLETE'); console.log('-'.repeat(60)); console.log('No changes were made. Run with --execute to delete duplicates.'); } db.close(); } main();