feat: Add XML extraction and import scripts for observations and summaries
This commit is contained in:
@@ -41,6 +41,8 @@
|
|||||||
"test": "node --test tests/",
|
"test": "node --test tests/",
|
||||||
"test:context": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js 2>/dev/null",
|
"test:context": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js 2>/dev/null",
|
||||||
"test:context:verbose": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js",
|
"test:context:verbose": "echo '{\"session_id\":\"test-'$(date +%s)'\",\"cwd\":\"'$(pwd)'\",\"source\":\"startup\"}' | node plugin/scripts/context-hook.js",
|
||||||
|
"import:xml": "tsx src/bin/import-xml-observations.ts",
|
||||||
|
"cleanup:duplicates": "tsx src/bin/cleanup-duplicates.ts",
|
||||||
"worker:start": "pm2 start ecosystem.config.cjs",
|
"worker:start": "pm2 start ecosystem.config.cjs",
|
||||||
"worker:stop": "pm2 stop claude-mem-worker",
|
"worker:stop": "pm2 stop claude-mem-worker",
|
||||||
"worker:restart": "pm2 restart claude-mem-worker",
|
"worker:restart": "pm2 restart claude-mem-worker",
|
||||||
|
|||||||
@@ -0,0 +1,82 @@
|
|||||||
|
# XML Extraction Scripts
|
||||||
|
|
||||||
|
Scripts to extract XML observations and summaries from Claude Code transcript files.
|
||||||
|
|
||||||
|
## Scripts
|
||||||
|
|
||||||
|
### `filter-actual-xml.py`
|
||||||
|
**Recommended for import**
|
||||||
|
|
||||||
|
Extracts only actual XML from assistant responses, filtering out:
|
||||||
|
- Template/example XML (with placeholders like `[...]` or `**field**:`)
|
||||||
|
- XML from tool_use blocks
|
||||||
|
- XML from user messages
|
||||||
|
|
||||||
|
**Output:** `~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml`
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python3 scripts/extraction/filter-actual-xml.py
|
||||||
|
```
|
||||||
|
|
||||||
|
### `extract-all-xml.py`
|
||||||
|
**For debugging/analysis**
|
||||||
|
|
||||||
|
Extracts ALL XML blocks from transcripts without filtering.
|
||||||
|
|
||||||
|
**Output:** `~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml`
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```bash
|
||||||
|
python3 scripts/extraction/extract-all-xml.py
|
||||||
|
```
|
||||||
|
|
||||||
|
## Workflow
|
||||||
|
|
||||||
|
1. **Extract XML from transcripts:**
|
||||||
|
```bash
|
||||||
|
cd ~/Scripts/claude-mem
|
||||||
|
python3 scripts/extraction/filter-actual-xml.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. **Import to database:**
|
||||||
|
```bash
|
||||||
|
npm run import:xml
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Clean up duplicates (if needed):**
|
||||||
|
```bash
|
||||||
|
npm run cleanup:duplicates
|
||||||
|
```
|
||||||
|
|
||||||
|
## Source Data
|
||||||
|
|
||||||
|
Scripts read from: `~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/*.jsonl`
|
||||||
|
|
||||||
|
These are Claude Code session transcripts stored in JSONL (JSON Lines) format.
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
```xml
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<transcript_extracts>
|
||||||
|
|
||||||
|
<!-- Block 1 | 2025-10-19 03:03:23 UTC -->
|
||||||
|
<observation>
|
||||||
|
<type>discovery</type>
|
||||||
|
<title>Example observation</title>
|
||||||
|
...
|
||||||
|
</observation>
|
||||||
|
|
||||||
|
<!-- Block 2 | 2025-10-19 03:03:45 UTC -->
|
||||||
|
<summary>
|
||||||
|
<request>What was accomplished</request>
|
||||||
|
...
|
||||||
|
</summary>
|
||||||
|
|
||||||
|
</transcript_extracts>
|
||||||
|
```
|
||||||
|
|
||||||
|
Each XML block includes a comment with:
|
||||||
|
- Block number
|
||||||
|
- Original timestamp from transcript
|
||||||
Executable
+128
@@ -0,0 +1,128 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
def extract_xml_blocks(text):
    """Return every well-formed <tag>...</tag> span found in *text*.

    Matches are non-greedy and may span newlines (DOTALL). Results are
    grouped by tag in the order of the tuple below — all <observation>
    hits first, then <session_summary>, and so on — not by position in
    the input text.
    """
    # Tag pairs whose contents should be captured, in priority order.
    tags = (
        'observation', 'session_summary', 'request', 'summary',
        'facts', 'fact', 'concepts', 'concept',
        'files', 'file', 'files_read', 'files_edited', 'files_modified',
        'narrative', 'learned', 'investigated', 'completed',
        'next_steps', 'notes', 'title', 'subtitle', 'text', 'type',
    )

    found = []
    for tag in tags:
        found.extend(re.findall(rf'<{tag}>.*?</{tag}>', text, re.DOTALL))
    return found
|
||||||
|
|
||||||
|
def process_transcript_file(filepath):
    """Process a single transcript file and extract XML with timestamps.

    Each JSONL line is parsed independently; lines that are not valid JSON
    are skipped. XML is pulled from both assistant ``text`` items and
    stringified ``tool_use`` inputs. Returns a list of
    ``{'timestamp': str, 'xml': str}`` dicts.
    """
    found = []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                continue

            when = entry.get('timestamp', 'unknown')
            content = entry.get('message', {}).get('content', [])
            if not isinstance(content, list):
                continue

            for part in content:
                if not isinstance(part, dict):
                    continue

                text = ''
                if part.get('type') == 'text':
                    text = part.get('text', '')
                elif part.get('type') == 'tool_use':
                    # tool_use inputs may also carry XML; scan their repr
                    payload = part.get('input', {})
                    if isinstance(payload, dict):
                        text = str(payload)

                if not text:
                    continue

                for block in extract_xml_blocks(text):
                    found.append({'timestamp': when, 'xml': block})

    return found
|
||||||
|
|
||||||
|
# --- Script entry point -----------------------------------------------------

# Directory holding Claude Code session transcripts (JSONL, one per session).
transcript_dir = os.path.expanduser(
    '~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/'
)

# Collect the 62 most recently modified transcripts, newest first.
# Replaces the previous `subprocess.run(['ls', '-t'])` call: sorting by
# st_mtime in Python is portable (no shell dependency) and robust against
# unusual filenames; it also removes the need to os.chdir() into the dir.
_entries = [e for e in os.scandir(transcript_dir) if e.name.endswith('.jsonl')]
_entries.sort(key=lambda e: e.stat().st_mtime, reverse=True)
files = [e.name for e in _entries[:62]]

all_results = []
for filename in files:
    filepath = os.path.join(transcript_dir, filename)
    # (restored f-string placeholder that was mangled to a literal)
    print(f"Processing {filename}...")
    results = process_transcript_file(filepath)
    all_results.extend(results)
    print(f"  Found {len(results)} XML blocks")

# Write results with timestamps
output_file = os.path.expanduser('~/Scripts/claude-mem/all_xml_fragments_with_timestamps.xml')
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<transcript_extracts>\n\n')

    for i, item in enumerate(all_results, 1):
        timestamp = item['timestamp']
        xml = item['xml']

        # Render ISO timestamps as "YYYY-MM-DD HH:MM:SS UTC"; anything
        # unparseable is passed through unchanged (narrowed from a bare
        # except so real bugs are no longer swallowed).
        if timestamp != 'unknown' and timestamp:
            try:
                dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
            except (TypeError, ValueError, AttributeError):
                formatted_time = timestamp
        else:
            formatted_time = 'unknown'

        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
        f.write(xml)
        f.write('\n\n')

    f.write('</transcript_extracts>\n')

print(f"\nExtracted {len(all_results)} XML blocks with timestamps to {output_file}")
|
||||||
Executable
+168
@@ -0,0 +1,168 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
import os
|
||||||
|
|
||||||
|
def extract_xml_blocks(text):
    """Return every well-formed <tag>...</tag> span found in *text*.

    Matches are non-greedy and may span newlines (DOTALL). Results are
    grouped by tag in the order of the tuple below, not by position in
    the input text. This variant also captures tool-related tags.
    """
    # Tag pairs whose contents should be captured, in priority order.
    tags = (
        'observation', 'session_summary', 'request', 'summary',
        'facts', 'fact', 'concepts', 'concept',
        'files', 'file', 'files_read', 'files_edited', 'files_modified',
        'narrative', 'learned', 'investigated', 'completed',
        'next_steps', 'notes', 'title', 'subtitle', 'text', 'type',
        'tool_used', 'tool_name', 'tool_input', 'tool_output', 'tool_time',
    )

    found = []
    for tag in tags:
        found.extend(re.findall(rf'<{tag}>.*?</{tag}>', text, re.DOTALL))
    return found
|
||||||
|
|
||||||
|
def is_example_xml(xml_block):
    """Return True when an XML block looks like template/example content.

    A single hit on any indicator below is enough to classify the block
    as a placeholder (prompt-template) rather than real assistant output.
    """
    indicators = (
        r'\[.*?\]',                          # bracketed placeholders like [...]
        r'\*\*\w+\*\*:',                     # markdown bold labels like **title**:
        r'\.\.\..*?\.\.\.',                  # paired ellipses marking omissions
        r'feature\|bugfix\|refactor',        # option lists copied from prompts
        r'change \| discovery \| decision',  # example type enumerations
        r'\{.*?\}',                          # curly-brace template variables
        r'Concise, self-contained statement',
        r'Short title capturing',
        r'One sentence explanation',
        r'What was the user trying',
        r'What code/systems did you explore',
        r'What did you learn',
        r'What was done',
        r'What should happen next',
        r'file1\.ts',                        # canonical example filenames
        r'file2\.ts',
        r'file3\.ts',
        r'Any additional context',
    )
    # any() short-circuits exactly like the explicit loop-and-return form.
    return any(re.search(pattern, xml_block) for pattern in indicators)
|
||||||
|
|
||||||
|
def process_transcript_file(filepath):
    """Extract real (non-template) XML from assistant responses in one file.

    Only ``text`` content items on messages whose role is ``assistant`` are
    scanned — tool_use payloads and user messages are skipped entirely, and
    template/example XML is filtered out via is_example_xml(). Lines that
    are not valid JSON are ignored. Returns a list of
    ``{'timestamp': str, 'xml': str}`` dicts.
    """
    found = []
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as handle:
        for raw_line in handle:
            try:
                entry = json.loads(raw_line)
            except json.JSONDecodeError:
                continue

            when = entry.get('timestamp', 'unknown')
            message = entry.get('message', {})

            # Only assistant messages can contain real emitted XML.
            if message.get('role') != 'assistant':
                continue

            content = message.get('content', [])
            if not isinstance(content, list):
                continue

            for part in content:
                # Plain text items only — never tool_use blocks.
                if not (isinstance(part, dict) and part.get('type') == 'text'):
                    continue
                for block in extract_xml_blocks(part.get('text', '')):
                    if not is_example_xml(block):
                        found.append({'timestamp': when, 'xml': block})

    return found
|
||||||
|
|
||||||
|
# --- Script entry point -----------------------------------------------------

# Directory holding Claude Code session transcripts (JSONL, one per session).
transcript_dir = os.path.expanduser(
    '~/.claude/projects/-Users-alexnewman-Scripts-claude-mem/'
)

# Collect the 62 most recently modified transcripts, newest first.
# Replaces the previous `subprocess.run(['ls', '-t'])` call: sorting by
# st_mtime in Python is portable (no shell dependency) and robust against
# unusual filenames; it also removes the need to os.chdir() into the dir.
_entries = [e for e in os.scandir(transcript_dir) if e.name.endswith('.jsonl')]
_entries.sort(key=lambda e: e.stat().st_mtime, reverse=True)
files = [e.name for e in _entries[:62]]

all_results = []
for filename in files:
    filepath = os.path.join(transcript_dir, filename)
    # (restored f-string placeholder that was mangled to a literal)
    print(f"Processing {filename}...")
    results = process_transcript_file(filepath)
    all_results.extend(results)
    print(f"  Found {len(results)} actual XML blocks")

# Write results with timestamps
output_file = os.path.expanduser('~/Scripts/claude-mem/actual_xml_only_with_timestamps.xml')
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<!-- Actual XML blocks from assistant responses only -->\n')
    f.write('<!-- Excludes: tool_use inputs, user prompts, and example/template XML -->\n')
    f.write('<transcript_extracts>\n\n')

    for i, item in enumerate(all_results, 1):
        timestamp = item['timestamp']
        xml = item['xml']

        # Render ISO timestamps as "YYYY-MM-DD HH:MM:SS UTC"; anything
        # unparseable is passed through unchanged (narrowed from a bare
        # except so real bugs are no longer swallowed).
        if timestamp != 'unknown' and timestamp:
            try:
                dt = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
                formatted_time = dt.strftime('%Y-%m-%d %H:%M:%S UTC')
            except (TypeError, ValueError, AttributeError):
                formatted_time = timestamp
        else:
            formatted_time = 'unknown'

        f.write(f'<!-- Block {i} | {formatted_time} -->\n')
        f.write(xml)
        f.write('\n\n')

    f.write('</transcript_extracts>\n')

print(f"\n{'='*80}")
print(f"Extracted {len(all_results)} actual XML blocks (filtered) to {output_file}")
print(f"{'='*80}")
|
||||||
@@ -0,0 +1,98 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Cleanup duplicate observations and summaries from the database
|
||||||
|
* Keeps the earliest entry (MIN(id)) for each duplicate group
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { SessionStore } from '../services/sqlite/SessionStore.js';
|
||||||
|
|
||||||
|
function main() {
|
||||||
|
console.log('Starting duplicate cleanup...\n');
|
||||||
|
|
||||||
|
const db = new SessionStore();
|
||||||
|
|
||||||
|
// Find and delete duplicate observations
|
||||||
|
console.log('Finding duplicate observations...');
|
||||||
|
|
||||||
|
const duplicateObsQuery = db['db'].prepare(`
|
||||||
|
SELECT sdk_session_id, title, subtitle, type, COUNT(*) as count, GROUP_CONCAT(id) as ids
|
||||||
|
FROM observations
|
||||||
|
GROUP BY sdk_session_id, title, subtitle, type
|
||||||
|
HAVING count > 1
|
||||||
|
`);
|
||||||
|
|
||||||
|
const duplicateObs = duplicateObsQuery.all() as Array<{
|
||||||
|
sdk_session_id: string;
|
||||||
|
title: string;
|
||||||
|
subtitle: string;
|
||||||
|
type: string;
|
||||||
|
count: number;
|
||||||
|
ids: string;
|
||||||
|
}>;
|
||||||
|
|
||||||
|
console.log(`Found ${duplicateObs.length} duplicate observation groups\n`);
|
||||||
|
|
||||||
|
let deletedObs = 0;
|
||||||
|
for (const dup of duplicateObs) {
|
||||||
|
const ids = dup.ids.split(',').map(id => parseInt(id, 10));
|
||||||
|
const keepId = Math.min(...ids);
|
||||||
|
const deleteIds = ids.filter(id => id !== keepId);
|
||||||
|
|
||||||
|
console.log(`Observation "${dup.title.substring(0, 60)}..."`);
|
||||||
|
console.log(` Found ${dup.count} copies, keeping ID ${keepId}, deleting ${deleteIds.length} duplicates`);
|
||||||
|
|
||||||
|
const deleteStmt = db['db'].prepare(`DELETE FROM observations WHERE id IN (${deleteIds.join(',')})`);
|
||||||
|
deleteStmt.run();
|
||||||
|
deletedObs += deleteIds.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find and delete duplicate summaries
|
||||||
|
console.log('\n\nFinding duplicate summaries...');
|
||||||
|
|
||||||
|
const duplicateSumQuery = db['db'].prepare(`
|
||||||
|
SELECT sdk_session_id, request, completed, learned, COUNT(*) as count, GROUP_CONCAT(id) as ids
|
||||||
|
FROM session_summaries
|
||||||
|
GROUP BY sdk_session_id, request, completed, learned
|
||||||
|
HAVING count > 1
|
||||||
|
`);
|
||||||
|
|
||||||
|
const duplicateSum = duplicateSumQuery.all() as Array<{
|
||||||
|
sdk_session_id: string;
|
||||||
|
request: string;
|
||||||
|
completed: string;
|
||||||
|
learned: string;
|
||||||
|
count: number;
|
||||||
|
ids: string;
|
||||||
|
}>;
|
||||||
|
|
||||||
|
console.log(`Found ${duplicateSum.length} duplicate summary groups\n`);
|
||||||
|
|
||||||
|
let deletedSum = 0;
|
||||||
|
for (const dup of duplicateSum) {
|
||||||
|
const ids = dup.ids.split(',').map(id => parseInt(id, 10));
|
||||||
|
const keepId = Math.min(...ids);
|
||||||
|
const deleteIds = ids.filter(id => id !== keepId);
|
||||||
|
|
||||||
|
console.log(`Summary "${dup.request.substring(0, 60)}..."`);
|
||||||
|
console.log(` Found ${dup.count} copies, keeping ID ${keepId}, deleting ${deleteIds.length} duplicates`);
|
||||||
|
|
||||||
|
const deleteStmt = db['db'].prepare(`DELETE FROM session_summaries WHERE id IN (${deleteIds.join(',')})`);
|
||||||
|
deleteStmt.run();
|
||||||
|
deletedSum += deleteIds.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
db.close();
|
||||||
|
|
||||||
|
console.log('\n' + '='.repeat(60));
|
||||||
|
console.log('Cleanup Complete!');
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
console.log(`🗑️ Deleted: ${deletedObs} duplicate observations`);
|
||||||
|
console.log(`🗑️ Deleted: ${deletedSum} duplicate summaries`);
|
||||||
|
console.log(`🗑️ Total: ${deletedObs + deletedSum} duplicates removed`);
|
||||||
|
console.log('='.repeat(60));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run if executed directly (skipped when this module is imported).
// NOTE(review): comparing import.meta.url to `file://${process.argv[1]}` can
// miss under symlinked invocations or Windows path formats — confirm this
// CLI is only launched via the `cleanup:duplicates` npm script.
if (import.meta.url === `file://${process.argv[1]}`) {
  main();
}
|
||||||
@@ -0,0 +1,382 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Import XML observations back into the database
|
||||||
|
* Parses actual_xml_only_with_timestamps.xml and inserts observations via SessionStore
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { readFileSync, readdirSync } from 'fs';
|
||||||
|
import { join } from 'path';
|
||||||
|
import { homedir } from 'os';
|
||||||
|
import { SessionStore } from '../services/sqlite/SessionStore.js';
|
||||||
|
|
||||||
|
/** Parsed <observation> XML block, shaped for db.storeObservation(). */
interface ObservationData {
  type: string;            // inner text of <type>
  title: string;           // inner text of <title>
  subtitle: string;        // inner text of <subtitle>
  facts: string[];         // one entry per <fact> inside <facts>
  narrative: string;       // inner text of <narrative>
  concepts: string[];      // one entry per <concept> inside <concepts>
  files_read: string[];    // one entry per <file> inside <files_read>
  files_modified: string[]; // one entry per <file> inside <files_modified>
}

/** Parsed <summary> XML block, shaped for db.storeSummary(). */
interface SummaryData {
  request: string;         // required — parseSummary() rejects blocks without it
  investigated: string;
  learned: string;
  completed: string;
  next_steps: string;
  notes: string | null;    // null when <notes> is absent or empty
}

/** Session identity attached to a transcript timestamp. */
interface SessionMetadata {
  sessionId: string;       // `sessionId` field read from the transcript line
  project: string;         // `cwd` field from the transcript (with a fallback default)
}

/** ISO timestamp (truncated to the second) -> owning session metadata. */
interface TimestampMapping {
  [timestamp: string]: SessionMetadata;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build a map of timestamp (rounded to second) -> session metadata by reading all transcript files
|
||||||
|
* Since XML timestamps are rounded to seconds, we map by second
|
||||||
|
*/
|
||||||
|
function buildTimestampMap(): TimestampMapping {
|
||||||
|
const transcriptDir = join(homedir(), '.claude', 'projects', '-Users-alexnewman-Scripts-claude-mem');
|
||||||
|
const map: TimestampMapping = {};
|
||||||
|
|
||||||
|
console.log(`Reading transcript files from ${transcriptDir}...`);
|
||||||
|
|
||||||
|
const files = readdirSync(transcriptDir).filter(f => f.endsWith('.jsonl'));
|
||||||
|
console.log(`Found ${files.length} transcript files`);
|
||||||
|
|
||||||
|
for (const filename of files) {
|
||||||
|
const filepath = join(transcriptDir, filename);
|
||||||
|
const content = readFileSync(filepath, 'utf-8');
|
||||||
|
const lines = content.split('\n').filter(l => l.trim());
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(line);
|
||||||
|
const timestamp = data.timestamp;
|
||||||
|
const sessionId = data.sessionId;
|
||||||
|
const project = data.cwd || '/Users/alexnewman/Scripts/claude-mem';
|
||||||
|
|
||||||
|
if (timestamp && sessionId) {
|
||||||
|
// Round timestamp to second for matching with XML timestamps
|
||||||
|
const roundedTimestamp = new Date(timestamp);
|
||||||
|
roundedTimestamp.setMilliseconds(0);
|
||||||
|
const key = roundedTimestamp.toISOString();
|
||||||
|
|
||||||
|
// Only store first occurrence for each second (they're all the same session anyway)
|
||||||
|
if (!map[key]) {
|
||||||
|
map[key] = { sessionId, project };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Skip invalid JSON lines
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`Built timestamp map with ${Object.keys(map).length} unique seconds`);
|
||||||
|
return map;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse XML text content and extract tag value
|
||||||
|
*/
|
||||||
|
function extractTag(xml: string, tagName: string): string {
|
||||||
|
const regex = new RegExp(`<${tagName}>([\\s\\S]*?)</${tagName}>`, 'i');
|
||||||
|
const match = xml.match(regex);
|
||||||
|
return match ? match[1].trim() : '';
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse XML array tags (facts, concepts, files, etc.)
|
||||||
|
*/
|
||||||
|
function extractArrayTags(xml: string, containerTag: string, itemTag: string): string[] {
|
||||||
|
const containerRegex = new RegExp(`<${containerTag}>([\\s\\S]*?)</${containerTag}>`, 'i');
|
||||||
|
const containerMatch = xml.match(containerRegex);
|
||||||
|
|
||||||
|
if (!containerMatch) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
|
const containerContent = containerMatch[1];
|
||||||
|
const itemRegex = new RegExp(`<${itemTag}>([\\s\\S]*?)</${itemTag}>`, 'gi');
|
||||||
|
const items: string[] = [];
|
||||||
|
let match;
|
||||||
|
|
||||||
|
while ((match = itemRegex.exec(containerContent)) !== null) {
|
||||||
|
items.push(match[1].trim());
|
||||||
|
}
|
||||||
|
|
||||||
|
return items;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse an observation block from XML
|
||||||
|
*/
|
||||||
|
function parseObservation(xml: string): ObservationData | null {
|
||||||
|
// Must be a complete observation block
|
||||||
|
if (!xml.includes('<observation>') || !xml.includes('</observation>')) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const observation: ObservationData = {
|
||||||
|
type: extractTag(xml, 'type'),
|
||||||
|
title: extractTag(xml, 'title'),
|
||||||
|
subtitle: extractTag(xml, 'subtitle'),
|
||||||
|
facts: extractArrayTags(xml, 'facts', 'fact'),
|
||||||
|
narrative: extractTag(xml, 'narrative'),
|
||||||
|
concepts: extractArrayTags(xml, 'concepts', 'concept'),
|
||||||
|
files_read: extractArrayTags(xml, 'files_read', 'file'),
|
||||||
|
files_modified: extractArrayTags(xml, 'files_modified', 'file'),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Validate required fields
|
||||||
|
if (!observation.type || !observation.title) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return observation;
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error parsing observation:', e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse a summary block from XML
|
||||||
|
*/
|
||||||
|
function parseSummary(xml: string): SummaryData | null {
|
||||||
|
// Must be a complete summary block
|
||||||
|
if (!xml.includes('<summary>') || !xml.includes('</summary>')) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
const summary: SummaryData = {
|
||||||
|
request: extractTag(xml, 'request'),
|
||||||
|
investigated: extractTag(xml, 'investigated'),
|
||||||
|
learned: extractTag(xml, 'learned'),
|
||||||
|
completed: extractTag(xml, 'completed'),
|
||||||
|
next_steps: extractTag(xml, 'next_steps'),
|
||||||
|
notes: extractTag(xml, 'notes') || null,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Validate required fields
|
||||||
|
if (!summary.request) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return summary;
|
||||||
|
} catch (e) {
|
||||||
|
console.error('Error parsing summary:', e);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract timestamp from XML comment
|
||||||
|
* Format: <!-- Block N | 2025-10-19 03:03:23 UTC -->
|
||||||
|
*/
|
||||||
|
function extractTimestamp(commentLine: string): string | null {
|
||||||
|
const match = commentLine.match(/<!-- Block \d+ \| (.+?) -->/);
|
||||||
|
if (match) {
|
||||||
|
// Convert "2025-10-19 03:03:23 UTC" to ISO format
|
||||||
|
const dateStr = match[1].replace(' UTC', '').replace(' ', 'T') + 'Z';
|
||||||
|
return new Date(dateStr).toISOString();
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Main import function.
 *
 * Pipeline:
 *   1. Build a timestamp -> session map from the raw transcripts.
 *   2. Ensure each Claude session has an SDK session row (reusing, backfilling,
 *      or creating one with a synthetic `imported-<id>` SDK session id).
 *   3. Walk the extracted XML file block-by-block, resolve each block's
 *      session via its timestamp comment, skip duplicates, and store
 *      observations and summaries through SessionStore.
 */
function main() {
  console.log('Starting XML observation import...\n');

  // Build timestamp map
  const timestampMap = buildTimestampMap();

  // Open database connection
  const db = new SessionStore();

  // Create SDK sessions for all unique Claude Code sessions.
  // claudeSessionToSdkSession caches Claude session id -> SDK session id so
  // each session is resolved against the DB only once.
  console.log('\nCreating SDK sessions for imported data...');
  const claudeSessionToSdkSession = new Map<string, string>();

  for (const sessionMeta of Object.values(timestampMap)) {
    if (!claudeSessionToSdkSession.has(sessionMeta.sessionId)) {
      // Synthetic id marks rows created by this importer.
      const syntheticSdkSessionId = `imported-${sessionMeta.sessionId}`;

      // Try to find existing session first
      // NOTE: db['db'] reaches the store's private better-sqlite3 handle,
      // matching how SessionStore's own code accesses it.
      const existingQuery = db['db'].prepare(`
        SELECT sdk_session_id
        FROM sdk_sessions
        WHERE claude_session_id = ?
      `);
      const existing = existingQuery.get(sessionMeta.sessionId) as { sdk_session_id: string | null } | undefined;

      if (existing && existing.sdk_session_id) {
        // Use existing SDK session ID
        claudeSessionToSdkSession.set(sessionMeta.sessionId, existing.sdk_session_id);
      } else if (existing && !existing.sdk_session_id) {
        // Session exists but sdk_session_id is NULL, update it
        const dbId = (db['db'].prepare('SELECT id FROM sdk_sessions WHERE claude_session_id = ?').get(sessionMeta.sessionId) as { id: number }).id;
        db.updateSDKSessionId(dbId, syntheticSdkSessionId);
        claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId);
      } else {
        // Create new SDK session
        const dbId = db.createSDKSession(
          sessionMeta.sessionId,
          sessionMeta.project,
          'Imported from transcript XML'
        );

        // Update with synthetic SDK session ID
        db.updateSDKSessionId(dbId, syntheticSdkSessionId);

        claudeSessionToSdkSession.set(sessionMeta.sessionId, syntheticSdkSessionId);
      }
    }
  }

  console.log(`Prepared ${claudeSessionToSdkSession.size} SDK sessions\n`);

  // Read XML file produced by filter-actual-xml.py (expected in the cwd).
  const xmlPath = join(process.cwd(), 'actual_xml_only_with_timestamps.xml');
  console.log(`Reading XML file: ${xmlPath}`);
  const xmlContent = readFileSync(xmlPath, 'utf-8');

  // Split into blocks by comment markers. The lookahead keeps each
  // "<!-- Block N | ... -->" comment attached to the block that follows it.
  const blocks = xmlContent.split(/(?=<!-- Block \d+)/);
  console.log(`Found ${blocks.length} blocks in XML file\n`);

  // Counters for the final report.
  let importedObs = 0;
  let importedSum = 0;
  let skipped = 0;
  let duplicateObs = 0;
  let duplicateSum = 0;
  let noSession = 0;

  for (const block of blocks) {
    // Skip empty fragments and the XML prolog / root-element fragments.
    if (!block.trim() || block.startsWith('<?xml') || block.startsWith('<transcript_extracts')) {
      continue;
    }

    // Extract timestamp from comment
    const timestampIso = extractTimestamp(block);
    if (!timestampIso) {
      skipped++;
      continue;
    }

    // Look up session metadata
    const sessionMeta = timestampMap[timestampIso];
    if (!sessionMeta) {
      noSession++;
      // Only print the first 5 misses to keep the log readable.
      if (noSession <= 5) {
        console.log(`⚠️ No session found for timestamp: ${timestampIso}`);
      }
      skipped++;
      continue;
    }

    // Get SDK session ID
    const sdkSessionId = claudeSessionToSdkSession.get(sessionMeta.sessionId);
    if (!sdkSessionId) {
      skipped++;
      continue;
    }

    // Try parsing as observation first
    const observation = parseObservation(block);
    if (observation) {
      // Check for duplicate (same session + title + subtitle + type).
      const existingObs = db['db'].prepare(`
        SELECT id FROM observations
        WHERE sdk_session_id = ? AND title = ? AND subtitle = ? AND type = ?
      `).get(sdkSessionId, observation.title, observation.subtitle, observation.type);

      if (existingObs) {
        duplicateObs++;
        continue;
      }

      try {
        db.storeObservation(
          sdkSessionId,
          sessionMeta.project,
          observation
        );
        importedObs++;

        // Progress heartbeat every 50 observations.
        if (importedObs % 50 === 0) {
          console.log(`Imported ${importedObs} observations...`);
        }
      } catch (e) {
        // Storage failure counts the block as skipped, not imported.
        console.error(`Error storing observation:`, e);
        skipped++;
      }
      continue;
    }

    // Try parsing as summary
    const summary = parseSummary(block);
    if (summary) {
      // Check for duplicate (same session + request + completed + learned).
      const existingSum = db['db'].prepare(`
        SELECT id FROM session_summaries
        WHERE sdk_session_id = ? AND request = ? AND completed = ? AND learned = ?
      `).get(sdkSessionId, summary.request, summary.completed, summary.learned);

      if (existingSum) {
        duplicateSum++;
        continue;
      }

      try {
        db.storeSummary(
          sdkSessionId,
          sessionMeta.project,
          summary
        );
        importedSum++;

        // Progress heartbeat every 10 summaries.
        if (importedSum % 10 === 0) {
          console.log(`Imported ${importedSum} summaries...`);
        }
      } catch (e) {
        // Storage failure counts the block as skipped, not imported.
        console.error(`Error storing summary:`, e);
        skipped++;
      }
      continue;
    }

    // Neither observation nor summary - skip
    skipped++;
  }

  db.close();

  // Final report.
  console.log('\n' + '='.repeat(60));
  console.log('Import Complete!');
  console.log('='.repeat(60));
  console.log(`✓ Imported: ${importedObs} observations`);
  console.log(`✓ Imported: ${importedSum} summaries`);
  console.log(`✓ Total: ${importedObs + importedSum} items`);
  console.log(`⊘ Skipped: ${skipped} blocks (not full observations or summaries)`);
  console.log(`⊘ Duplicates skipped: ${duplicateObs} observations, ${duplicateSum} summaries`);
  console.log(`⚠️ No session: ${noSession} blocks (timestamp not in transcripts)`);
  console.log('='.repeat(60));
}
|
||||||
|
|
||||||
|
// Run if executed directly (skipped when this module is imported).
// NOTE(review): comparing import.meta.url to `file://${process.argv[1]}` can
// miss under symlinked invocations or Windows path formats — confirm this
// CLI is only launched via the `import:xml` npm script.
if (import.meta.url === `file://${process.argv[1]}`) {
  main();
}
|
||||||
Reference in New Issue
Block a user