fix: align IDs with metadatas in ChromaSearchStrategy

ChromaSync.queryChroma() returns deduplicated sqlite_ids but the
metadatas array contains multiple entries per observation (narrative +
facts). The filterByRecency() method was iterating over metadatas and
using the index to access ids, causing array out-of-bounds access.

The fix builds a Map from sqlite_id to metadata, then iterates over
the deduplicated ids array to ensure proper alignment.

Symptoms before fix:
- Semantic search returning incorrect/empty results
- Search only working with near-exact queries
- Recent items (same day) not being found

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Abdelkarim Mateos Sanchez
2026-02-02 20:12:00 +01:00
committed by Alex Newman
parent f62b5d248c
commit 9bd56c993c
2 changed files with 101 additions and 4 deletions
@@ -167,6 +167,13 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt
/**
* Filter results by recency (90-day window)
*
* IMPORTANT: ChromaSync.queryChroma() returns deduplicated `ids` (unique sqlite_ids)
* but the `metadatas` array may contain multiple entries per sqlite_id (e.g., one
* observation can have narrative + multiple facts as separate Chroma documents).
*
* This method iterates over the deduplicated `ids` and finds the first matching
* metadata for each ID to avoid array misalignment issues.
*/
private filterByRecency(chromaResults: {
ids: number[];
@@ -174,10 +181,19 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt
}): Array<{ id: number; meta: ChromaMetadata }> {
const cutoff = Date.now() - SEARCH_CONSTANTS.RECENCY_WINDOW_MS;
return chromaResults.metadatas
.map((meta, idx) => ({
id: chromaResults.ids[idx],
meta
// Build a map from sqlite_id to first metadata for efficient lookup
const metadataByIdMap = new Map<number, ChromaMetadata>();
for (const meta of chromaResults.metadatas) {
if (meta?.sqlite_id !== undefined && !metadataByIdMap.has(meta.sqlite_id)) {
metadataByIdMap.set(meta.sqlite_id, meta);
}
}
// Iterate over deduplicated ids and get corresponding metadata
return chromaResults.ids
.map(id => ({
id,
meta: metadataByIdMap.get(id) as ChromaMetadata
}))
.filter(item => item.meta && item.meta.created_at_epoch > cutoff);
}
@@ -295,6 +295,87 @@ describe('ChromaSearchStrategy', () => {
expect(result.usedChroma).toBe(false); // Error occurred
expect(result.results.observations).toHaveLength(0);
});
it('should correctly align IDs with metadatas when Chroma returns duplicate sqlite_ids (multiple docs per observation)', async () => {
// BUG SCENARIO: One observation (id=100) has 3 documents in Chroma (narrative + 2 facts)
// Another observation (id=200) has 1 document
// Chroma returns 4 metadatas but after deduplication we have 2 unique IDs
// The metadatas MUST be deduplicated/aligned to match the unique IDs
const recentEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago
mockChromaSync.queryChroma = mock(() => Promise.resolve({
// After deduplication in ChromaSync.queryChroma, ids should be [100, 200]
// But metadatas array has 4 elements - THIS IS THE BUG
ids: [100, 200], // Deduplicated
distances: [0.3, 0.4, 0.5, 0.6], // Original 4 distances
metadatas: [
// Original 4 metadatas - not aligned with deduplicated ids!
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
{ sqlite_id: 200, doc_type: 'observation', created_at_epoch: recentEpoch }
]
}));
// Mock that returns observations when called with correct IDs
const mockObs100 = { ...mockObservation, id: 100 };
const mockObs200 = { ...mockObservation, id: 200, title: 'Second observation' };
mockSessionStore.getObservationsByIds = mock((ids: number[]) => {
// Should receive [100, 200]
return ids.map(id => id === 100 ? mockObs100 : mockObs200);
});
const options: StrategySearchOptions = {
query: 'test query',
searchType: 'observations'
};
const result = await strategy.search(options);
// The strategy should correctly identify BOTH observations
// Before the fix: idx=2 and idx=3 would access ids[2] and ids[3] which are undefined
expect(result.usedChroma).toBe(true);
expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
// Verify the correct IDs were passed to SQLite hydration
const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
expect(calledWith).toContain(100);
expect(calledWith).toContain(200);
expect(calledWith.length).toBe(2); // Should have exactly 2 unique IDs
});
it('should handle misaligned arrays gracefully without undefined access', async () => {
// Edge case: metadatas array longer than ids array
// This simulates the actual bug condition
const recentEpoch = Date.now() - 1000 * 60 * 60 * 24;
mockChromaSync.queryChroma = mock(() => Promise.resolve({
ids: [100], // Only 1 ID after deduplication
distances: [0.3, 0.4, 0.5], // 3 distances
metadatas: [
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }
] // 3 metadatas for same observation
}));
mockSessionStore.getObservationsByIds = mock(() => [mockObservation]);
const options: StrategySearchOptions = {
query: 'test query',
searchType: 'observations'
};
// Before fix: This would try to access ids[1], ids[2] which are undefined
// causing incorrect filtering or crashes
const result = await strategy.search(options);
expect(result.usedChroma).toBe(true);
// Should still find the one observation correctly
expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
expect(calledWith).toEqual([100]);
});
});
describe('strategy name', () => {