diff --git a/src/services/worker/search/strategies/ChromaSearchStrategy.ts b/src/services/worker/search/strategies/ChromaSearchStrategy.ts index 05273f53..633032c0 100644 --- a/src/services/worker/search/strategies/ChromaSearchStrategy.ts +++ b/src/services/worker/search/strategies/ChromaSearchStrategy.ts @@ -167,6 +167,13 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt /** * Filter results by recency (90-day window) + * + * IMPORTANT: ChromaSync.queryChroma() returns deduplicated `ids` (unique sqlite_ids) + * but the `metadatas` array may contain multiple entries per sqlite_id (e.g., one + * observation can have narrative + multiple facts as separate Chroma documents). + * + * This method iterates over the deduplicated `ids` and finds the first matching + * metadata for each ID to avoid array misalignment issues. */ private filterByRecency(chromaResults: { ids: number[]; @@ -174,10 +181,19 @@ export class ChromaSearchStrategy extends BaseSearchStrategy implements SearchSt }): Array<{ id: number; meta: ChromaMetadata }> { const cutoff = Date.now() - SEARCH_CONSTANTS.RECENCY_WINDOW_MS; - return chromaResults.metadatas - .map((meta, idx) => ({ - id: chromaResults.ids[idx], - meta + // Build a map from sqlite_id to first metadata for efficient lookup + const metadataByIdMap = new Map(); + for (const meta of chromaResults.metadatas) { + if (meta?.sqlite_id !== undefined && !metadataByIdMap.has(meta.sqlite_id)) { + metadataByIdMap.set(meta.sqlite_id, meta); + } + } + + // Iterate over deduplicated ids and get corresponding metadata + return chromaResults.ids + .map(id => ({ + id, + meta: metadataByIdMap.get(id) as ChromaMetadata })) .filter(item => item.meta && item.meta.created_at_epoch > cutoff); } diff --git a/tests/worker/search/strategies/chroma-search-strategy.test.ts b/tests/worker/search/strategies/chroma-search-strategy.test.ts index 1823a5b7..09a8078c 100644 --- a/tests/worker/search/strategies/chroma-search-strategy.test.ts +++ b/tests/worker/search/strategies/chroma-search-strategy.test.ts @@ -295,6 +295,87 @@ describe('ChromaSearchStrategy', () => { expect(result.usedChroma).toBe(false); // Error occurred expect(result.results.observations).toHaveLength(0); }); + + it('should correctly align IDs with metadatas when Chroma returns duplicate sqlite_ids (multiple docs per observation)', async () => { + // BUG SCENARIO: One observation (id=100) has 3 documents in Chroma (narrative + 2 facts) + // Another observation (id=200) has 1 document + // Chroma returns 4 metadatas but after deduplication we have 2 unique IDs + // The metadatas MUST be deduplicated/aligned to match the unique IDs + const recentEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago + + mockChromaSync.queryChroma = mock(() => Promise.resolve({ + // After deduplication in ChromaSync.queryChroma, ids should be [100, 200] + // But metadatas array has 4 elements - THIS IS THE BUG + ids: [100, 200], // Deduplicated + distances: [0.3, 0.4, 0.5, 0.6], // Original 4 distances + metadatas: [ + // Original 4 metadatas - not aligned with deduplicated ids! + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }, + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }, + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }, + { sqlite_id: 200, doc_type: 'observation', created_at_epoch: recentEpoch } + ] + })); + + // Mock that returns observations when called with correct IDs + const mockObs100 = { ...mockObservation, id: 100 }; + const mockObs200 = { ...mockObservation, id: 200, title: 'Second observation' }; + mockSessionStore.getObservationsByIds = mock((ids: number[]) => { + // Should receive [100, 200] + return ids.map(id => id === 100 ? mockObs100 : mockObs200); + }); + + const options: StrategySearchOptions = { + query: 'test query', + searchType: 'observations' + }; + + const result = await strategy.search(options); + + // The strategy should correctly identify BOTH observations + // Before the fix: idx=2 and idx=3 would access ids[2] and ids[3] which are undefined + expect(result.usedChroma).toBe(true); + expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled(); + + // Verify the correct IDs were passed to SQLite hydration + const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0]; + expect(calledWith).toContain(100); + expect(calledWith).toContain(200); + expect(calledWith.length).toBe(2); // Should have exactly 2 unique IDs + }); + + it('should handle misaligned arrays gracefully without undefined access', async () => { + // Edge case: metadatas array longer than ids array + // This simulates the actual bug condition + const recentEpoch = Date.now() - 1000 * 60 * 60 * 24; + + mockChromaSync.queryChroma = mock(() => Promise.resolve({ + ids: [100], // Only 1 ID after deduplication + distances: [0.3, 0.4, 0.5], // 3 distances + metadatas: [ + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }, + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }, + { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch } + ] // 3 metadatas for same observation + })); + + mockSessionStore.getObservationsByIds = mock(() => [mockObservation]); + + const options: StrategySearchOptions = { + query: 'test query', + searchType: 'observations' + }; + + // Before fix: This would try to access ids[1], ids[2] which are undefined + // causing incorrect filtering or crashes + const result = await strategy.search(options); + + expect(result.usedChroma).toBe(true); + // Should still find the one observation correctly + expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled(); + const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0]; + expect(calledWith).toEqual([100]); + }); }); describe('strategy name', () => {