fix: align IDs with metadatas in ChromaSearchStrategy
ChromaSync.queryChroma() returns deduplicated sqlite_ids but the metadatas array contains multiple entries per observation (narrative + facts). The filterByRecency() method was iterating over metadatas and using the index to access ids, causing array out-of-bounds access. The fix builds a Map from sqlite_id to metadata, then iterates over the deduplicated ids array to ensure proper alignment. Symptoms before fix: - Semantic search returning incorrect/empty results - Search only working with near-exact queries - Recent items (same day) not being found Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Alex Newman
parent
f62b5d248c
commit
9bd56c993c
@@ -167,6 +167,13 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Filter results by recency (90-day window)
|
* Filter results by recency (90-day window)
|
||||||
|
*
|
||||||
|
* IMPORTANT: ChromaSync.queryChroma() returns deduplicated `ids` (unique sqlite_ids)
|
||||||
|
* but the `metadatas` array may contain multiple entries per sqlite_id (e.g., one
|
||||||
|
* observation can have narrative + multiple facts as separate Chroma documents).
|
||||||
|
*
|
||||||
|
* This method iterates over the deduplicated `ids` and finds the first matching
|
||||||
|
* metadata for each ID to avoid array misalignment issues.
|
||||||
*/
|
*/
|
||||||
private filterByRecency(chromaResults: {
|
private filterByRecency(chromaResults: {
|
||||||
ids: number[];
|
ids: number[];
|
||||||
@@ -174,10 +181,19 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt
|
|||||||
}): Array<{ id: number; meta: ChromaMetadata }> {
|
}): Array<{ id: number; meta: ChromaMetadata }> {
|
||||||
const cutoff = Date.now() - SEARCH_CONSTANTS.RECENCY_WINDOW_MS;
|
const cutoff = Date.now() - SEARCH_CONSTANTS.RECENCY_WINDOW_MS;
|
||||||
|
|
||||||
return chromaResults.metadatas
|
// Build a map from sqlite_id to first metadata for efficient lookup
|
||||||
.map((meta, idx) => ({
|
const metadataByIdMap = new Map<number, ChromaMetadata>();
|
||||||
id: chromaResults.ids[idx],
|
for (const meta of chromaResults.metadatas) {
|
||||||
meta
|
if (meta?.sqlite_id !== undefined && !metadataByIdMap.has(meta.sqlite_id)) {
|
||||||
|
metadataByIdMap.set(meta.sqlite_id, meta);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate over deduplicated ids and get corresponding metadata
|
||||||
|
return chromaResults.ids
|
||||||
|
.map(id => ({
|
||||||
|
id,
|
||||||
|
meta: metadataByIdMap.get(id) as ChromaMetadata
|
||||||
}))
|
}))
|
||||||
.filter(item => item.meta && item.meta.created_at_epoch > cutoff);
|
.filter(item => item.meta && item.meta.created_at_epoch > cutoff);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -295,6 +295,87 @@ describe('ChromaSearchStrategy', () => {
|
|||||||
expect(result.usedChroma).toBe(false); // Error occurred
|
expect(result.usedChroma).toBe(false); // Error occurred
|
||||||
expect(result.results.observations).toHaveLength(0);
|
expect(result.results.observations).toHaveLength(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('should correctly align IDs with metadatas when Chroma returns duplicate sqlite_ids (multiple docs per observation)', async () => {
|
||||||
|
// BUG SCENARIO: One observation (id=100) has 3 documents in Chroma (narrative + 2 facts)
|
||||||
|
// Another observation (id=200) has 1 document
|
||||||
|
// Chroma returns 4 metadatas but after deduplication we have 2 unique IDs
|
||||||
|
// The metadatas MUST be deduplicated/aligned to match the unique IDs
|
||||||
|
const recentEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago
|
||||||
|
|
||||||
|
mockChromaSync.queryChroma = mock(() => Promise.resolve({
|
||||||
|
// After deduplication in ChromaSync.queryChroma, ids should be [100, 200]
|
||||||
|
// But metadatas array has 4 elements - THIS IS THE BUG
|
||||||
|
ids: [100, 200], // Deduplicated
|
||||||
|
distances: [0.3, 0.4, 0.5, 0.6], // Original 4 distances
|
||||||
|
metadatas: [
|
||||||
|
// Original 4 metadatas - not aligned with deduplicated ids!
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
|
||||||
|
{ sqlite_id: 200, doc_type: 'observation', created_at_epoch: recentEpoch }
|
||||||
|
]
|
||||||
|
}));
|
||||||
|
|
||||||
|
// Mock that returns observations when called with correct IDs
|
||||||
|
const mockObs100 = { ...mockObservation, id: 100 };
|
||||||
|
const mockObs200 = { ...mockObservation, id: 200, title: 'Second observation' };
|
||||||
|
mockSessionStore.getObservationsByIds = mock((ids: number[]) => {
|
||||||
|
// Should receive [100, 200]
|
||||||
|
return ids.map(id => id === 100 ? mockObs100 : mockObs200);
|
||||||
|
});
|
||||||
|
|
||||||
|
const options: StrategySearchOptions = {
|
||||||
|
query: 'test query',
|
||||||
|
searchType: 'observations'
|
||||||
|
};
|
||||||
|
|
||||||
|
const result = await strategy.search(options);
|
||||||
|
|
||||||
|
// The strategy should correctly identify BOTH observations
|
||||||
|
// Before the fix: idx=2 and idx=3 would access ids[2] and ids[3] which are undefined
|
||||||
|
expect(result.usedChroma).toBe(true);
|
||||||
|
expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
|
||||||
|
|
||||||
|
// Verify the correct IDs were passed to SQLite hydration
|
||||||
|
const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
|
||||||
|
expect(calledWith).toContain(100);
|
||||||
|
expect(calledWith).toContain(200);
|
||||||
|
expect(calledWith.length).toBe(2); // Should have exactly 2 unique IDs
|
||||||
|
});
|
||||||
|
|
||||||
|
it('should handle misaligned arrays gracefully without undefined access', async () => {
|
||||||
|
// Edge case: metadatas array longer than ids array
|
||||||
|
// This simulates the actual bug condition
|
||||||
|
const recentEpoch = Date.now() - 1000 * 60 * 60 * 24;
|
||||||
|
|
||||||
|
mockChromaSync.queryChroma = mock(() => Promise.resolve({
|
||||||
|
ids: [100], // Only 1 ID after deduplication
|
||||||
|
distances: [0.3, 0.4, 0.5], // 3 distances
|
||||||
|
metadatas: [
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
|
||||||
|
{ sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }
|
||||||
|
] // 3 metadatas for same observation
|
||||||
|
}));
|
||||||
|
|
||||||
|
mockSessionStore.getObservationsByIds = mock(() => [mockObservation]);
|
||||||
|
|
||||||
|
const options: StrategySearchOptions = {
|
||||||
|
query: 'test query',
|
||||||
|
searchType: 'observations'
|
||||||
|
};
|
||||||
|
|
||||||
|
// Before fix: This would try to access ids[1], ids[2] which are undefined
|
||||||
|
// causing incorrect filtering or crashes
|
||||||
|
const result = await strategy.search(options);
|
||||||
|
|
||||||
|
expect(result.usedChroma).toBe(true);
|
||||||
|
// Should still find the one observation correctly
|
||||||
|
expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
|
||||||
|
const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
|
||||||
|
expect(calledWith).toEqual([100]);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe('strategy name', () => {
|
describe('strategy name', () => {
|
||||||
|
|||||||
Reference in New Issue
Block a user