0a40c4c596
When searching with a project parameter, the ChromaDB vector query was not filtering by project. It only filtered by doc_type. This caused larger projects to dominate the top-N results returned by ChromaDB, effectively crowding out results from smaller projects before the post-hoc SQLite project filter could take effect. For example, with project A having 19,000 embeddings and project B having 700, a search scoped to project B would return mostly project A results from ChromaDB. After SQLite filtered by project, only 1-3 results from B would survive instead of the expected 20+. The fix adds the project to the ChromaDB where clause using $and when both doc_type and project filters are needed. This is applied in both ChromaSearchStrategy.buildWhereFilter() and SearchManager.search(). Co-authored-by: TARS <tars@openclaw.local>
433 lines
14 KiB
TypeScript
433 lines
14 KiB
TypeScript
import { describe, it, expect, mock, beforeEach } from 'bun:test';
|
|
import { ChromaSearchStrategy } from '../../../../src/services/worker/search/strategies/ChromaSearchStrategy.js';
|
|
import type { StrategySearchOptions, ObservationSearchResult, SessionSummarySearchResult, UserPromptSearchResult } from '../../../../src/services/worker/search/types.js';
|
|
|
|
// Mock observation data
|
|
/**
 * Creation time (epoch ms) of the observation fixture: one day before module load.
 * Kept in one place so the fixture's ISO string and epoch fields cannot drift apart.
 */
const mockObservationCreatedAtEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago

/**
 * Observation row returned by the mocked session store's `getObservationsByIds`.
 * JSON-looking fields (`facts`, `concepts`, `files_read`, `files_modified`) are
 * stored as serialized strings, not arrays.
 */
const mockObservation: ObservationSearchResult = {
  id: 1,
  memory_session_id: 'session-123',
  project: 'test-project',
  text: 'Test observation text',
  type: 'decision',
  title: 'Test Decision',
  subtitle: 'A test subtitle',
  facts: '["fact1", "fact2"]',
  narrative: 'Test narrative',
  concepts: '["concept1", "concept2"]',
  files_read: '["file1.ts"]',
  files_modified: '["file2.ts"]',
  prompt_number: 1,
  discovery_tokens: 100,
  // Derived from the same instant as created_at_epoch so both fields agree.
  created_at: new Date(mockObservationCreatedAtEpoch).toISOString(),
  created_at_epoch: mockObservationCreatedAtEpoch
};
|
|
|
|
/**
 * Creation time (epoch ms) of the session-summary fixture: one day before module load.
 * Kept in one place so the fixture's ISO string and epoch fields cannot drift apart.
 */
const mockSessionCreatedAtEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago

/**
 * Session-summary row returned by the mocked session store's
 * `getSessionSummariesByIds`. `files_read` / `files_edited` are serialized
 * JSON strings, not arrays.
 */
const mockSession: SessionSummarySearchResult = {
  id: 2,
  memory_session_id: 'session-123',
  project: 'test-project',
  request: 'Test request',
  investigated: 'Test investigated',
  learned: 'Test learned',
  completed: 'Test completed',
  next_steps: 'Test next steps',
  files_read: '["file1.ts"]',
  files_edited: '["file2.ts"]',
  notes: 'Test notes',
  prompt_number: 1,
  discovery_tokens: 500,
  // Derived from the same instant as created_at_epoch so both fields agree.
  created_at: new Date(mockSessionCreatedAtEpoch).toISOString(),
  created_at_epoch: mockSessionCreatedAtEpoch
};
|
|
|
|
/**
 * Creation time (epoch ms) of the user-prompt fixture: one day before module load.
 * Kept in one place so the fixture's ISO string and epoch fields cannot drift apart.
 */
const mockPromptCreatedAtEpoch = Date.now() - 1000 * 60 * 60 * 24; // 1 day ago

/**
 * User-prompt row returned by the mocked session store's `getUserPromptsByIds`.
 */
const mockPrompt: UserPromptSearchResult = {
  id: 3,
  content_session_id: 'content-session-123',
  prompt_number: 1,
  prompt_text: 'Test prompt text',
  // Derived from the same instant as created_at_epoch so both fields agree.
  created_at: new Date(mockPromptCreatedAtEpoch).toISOString(),
  created_at_epoch: mockPromptCreatedAtEpoch
};
|
|
|
|
/**
 * Unit tests for ChromaSearchStrategy.
 *
 * The strategy is constructed with two mocked collaborators:
 *  - `mockChromaSync.queryChroma` stands in for the vector query and resolves
 *    to `{ ids, distances, metadatas }`.
 *  - `mockSessionStore.get*ByIds` stand in for SQLite hydration and return the
 *    module-level fixtures.
 *
 * Each test gets fresh mocks from `beforeEach`; individual tests overwrite a
 * mock when they need a different Chroma response or a failure.
 */
describe('ChromaSearchStrategy', () => {
  /** Milliseconds in one day; fixture ages below are expressed in days. */
  const DAY_MS = 1000 * 60 * 60 * 24;

  let strategy: ChromaSearchStrategy;
  // Kept as `any`: tests reassign individual mock functions and read `.mock.calls`.
  let mockChromaSync: any;
  let mockSessionStore: any;

  beforeEach(() => {
    const recentEpoch = Date.now() - DAY_MS; // 1 day ago (within 90-day window)

    // Default Chroma response: one hit of each doc_type, all recent.
    mockChromaSync = {
      queryChroma: mock(() => Promise.resolve({
        ids: [1, 2, 3],
        distances: [0.1, 0.2, 0.3],
        metadatas: [
          { sqlite_id: 1, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 2, doc_type: 'session_summary', created_at_epoch: recentEpoch },
          { sqlite_id: 3, doc_type: 'user_prompt', created_at_epoch: recentEpoch }
        ]
      }))
    };

    mockSessionStore = {
      getObservationsByIds: mock(() => [mockObservation]),
      getSessionSummariesByIds: mock(() => [mockSession]),
      getUserPromptsByIds: mock(() => [mockPrompt])
    };

    strategy = new ChromaSearchStrategy(mockChromaSync, mockSessionStore);
  });

  // canHandle: the strategy only applies when there is non-empty query text.
  describe('canHandle', () => {
    it('should return true when query text is present', () => {
      const options: StrategySearchOptions = {
        query: 'semantic search query'
      };
      expect(strategy.canHandle(options)).toBe(true);
    });

    it('should return false for filter-only (no query)', () => {
      const options: StrategySearchOptions = {
        project: 'test-project'
      };
      expect(strategy.canHandle(options)).toBe(false);
    });

    it('should return false when query is empty string', () => {
      const options: StrategySearchOptions = {
        query: ''
      };
      expect(strategy.canHandle(options)).toBe(false);
    });

    it('should return false when query is undefined', () => {
      const options: StrategySearchOptions = {};
      expect(strategy.canHandle(options)).toBe(false);
    });
  });

  describe('search', () => {
    it('should call Chroma with query text', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        limit: 10
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100, // CHROMA_BATCH_SIZE
        undefined // no where filter for 'all'
      );
    });

    it('should return usedChroma: true on success', async () => {
      const options: StrategySearchOptions = {
        query: 'test query'
      };

      const result = await strategy.search(options);

      expect(result.usedChroma).toBe(true);
      expect(result.fellBack).toBe(false);
      expect(result.strategy).toBe('chroma');
    });

    // --- Hydration: Chroma hits are resolved to full rows via the session store ---

    it('should hydrate observations from SQLite', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      const result = await strategy.search(options);

      expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
      expect(result.results.observations).toHaveLength(1);
    });

    it('should hydrate sessions from SQLite', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'sessions'
      };

      await strategy.search(options);

      expect(mockSessionStore.getSessionSummariesByIds).toHaveBeenCalled();
    });

    it('should hydrate prompts from SQLite', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'prompts'
      };

      await strategy.search(options);

      expect(mockSessionStore.getUserPromptsByIds).toHaveBeenCalled();
    });

    // --- Where-clause construction: searchType maps to doc_type ---

    it('should filter by doc_type when searchType is observations', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { doc_type: 'observation' }
      );
    });

    it('should filter by doc_type when searchType is sessions', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'sessions'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { doc_type: 'session_summary' }
      );
    });

    it('should filter by doc_type when searchType is prompts', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'prompts'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { doc_type: 'user_prompt' }
      );
    });

    // --- Where-clause construction: project scoping is pushed into the Chroma query ---

    it('should include project in Chroma where clause when specified', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        project: 'my-project'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { project: 'my-project' }
      );
    });

    it('should combine doc_type and project with $and when both specified', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations',
        project: 'my-project'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { $and: [{ doc_type: 'observation' }, { project: 'my-project' }] }
      );
    });

    it('should not include project filter when project is not specified', async () => {
      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      await strategy.search(options);

      expect(mockChromaSync.queryChroma).toHaveBeenCalledWith(
        'test query',
        100,
        { doc_type: 'observation' }
      );
    });

    // --- Empty and filtered-out results ---

    it('should return empty result when no query provided', async () => {
      const options: StrategySearchOptions = {
        query: undefined
      };

      const result = await strategy.search(options);

      expect(result.results.observations).toHaveLength(0);
      expect(result.results.sessions).toHaveLength(0);
      expect(result.results.prompts).toHaveLength(0);
      expect(mockChromaSync.queryChroma).not.toHaveBeenCalled();
    });

    it('should return empty result when Chroma returns no matches', async () => {
      mockChromaSync.queryChroma = mock(() => Promise.resolve({
        ids: [],
        distances: [],
        metadatas: []
      }));

      const options: StrategySearchOptions = {
        query: 'no matches query'
      };

      const result = await strategy.search(options);

      expect(result.results.observations).toHaveLength(0);
      expect(result.usedChroma).toBe(true); // Still used Chroma, just no results
    });

    it('should filter out old results (beyond 90-day window)', async () => {
      const oldEpoch = Date.now() - DAY_MS * 100; // 100 days ago

      mockChromaSync.queryChroma = mock(() => Promise.resolve({
        ids: [1],
        distances: [0.1],
        metadatas: [
          { sqlite_id: 1, doc_type: 'observation', created_at_epoch: oldEpoch }
        ]
      }));

      const options: StrategySearchOptions = {
        query: 'old data query'
      };

      const result = await strategy.search(options);

      // Old results should be filtered out: nothing is hydrated and nothing is returned
      expect(mockSessionStore.getObservationsByIds).not.toHaveBeenCalled();
      expect(result.results.observations).toHaveLength(0);
    });

    // --- Failure handling: errors yield an empty result flagged usedChroma: false ---

    it('should handle Chroma errors gracefully (returns usedChroma: false)', async () => {
      mockChromaSync.queryChroma = mock(() => Promise.reject(new Error('Chroma connection failed')));

      const options: StrategySearchOptions = {
        query: 'test query'
      };

      const result = await strategy.search(options);

      expect(result.usedChroma).toBe(false);
      expect(result.fellBack).toBe(false);
      expect(result.results.observations).toHaveLength(0);
      expect(result.results.sessions).toHaveLength(0);
      expect(result.results.prompts).toHaveLength(0);
    });

    it('should handle SQLite hydration errors gracefully', async () => {
      mockSessionStore.getObservationsByIds = mock(() => {
        throw new Error('SQLite error');
      });

      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      const result = await strategy.search(options);

      expect(result.usedChroma).toBe(false); // Error occurred
      expect(result.results.observations).toHaveLength(0);
    });

    // --- Regression tests: ids are deduplicated but metadatas/distances are not ---

    it('should correctly align IDs with metadatas when Chroma returns duplicate sqlite_ids (multiple docs per observation)', async () => {
      // BUG SCENARIO: One observation (id=100) has 3 documents in Chroma (narrative + 2 facts)
      // Another observation (id=200) has 1 document
      // Chroma returns 4 metadatas but after deduplication we have 2 unique IDs
      // The metadatas MUST be deduplicated/aligned to match the unique IDs
      const recentEpoch = Date.now() - DAY_MS; // 1 day ago

      mockChromaSync.queryChroma = mock(() => Promise.resolve({
        // After deduplication in ChromaSync.queryChroma, ids should be [100, 200]
        // But metadatas array has 4 elements - THIS IS THE BUG
        ids: [100, 200], // Deduplicated
        distances: [0.3, 0.4, 0.5, 0.6], // Original 4 distances
        metadatas: [
          // Original 4 metadatas - not aligned with deduplicated ids!
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 200, doc_type: 'observation', created_at_epoch: recentEpoch }
        ]
      }));

      // Mock that returns observations when called with correct IDs
      const mockObs100 = { ...mockObservation, id: 100 };
      const mockObs200 = { ...mockObservation, id: 200, title: 'Second observation' };
      mockSessionStore.getObservationsByIds = mock((ids: number[]) => {
        // Should receive [100, 200]
        return ids.map(id => id === 100 ? mockObs100 : mockObs200);
      });

      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      const result = await strategy.search(options);

      // The strategy should correctly identify BOTH observations
      // Before the fix: idx=2 and idx=3 would access ids[2] and ids[3] which are undefined
      expect(result.usedChroma).toBe(true);
      expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();

      // Verify the correct IDs were passed to SQLite hydration
      const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
      expect(calledWith).toContain(100);
      expect(calledWith).toContain(200);
      expect(calledWith.length).toBe(2); // Should have exactly 2 unique IDs
    });

    it('should handle misaligned arrays gracefully without undefined access', async () => {
      // Edge case: metadatas array longer than ids array
      // This simulates the actual bug condition
      const recentEpoch = Date.now() - DAY_MS;

      mockChromaSync.queryChroma = mock(() => Promise.resolve({
        ids: [100], // Only 1 ID after deduplication
        distances: [0.3, 0.4, 0.5], // 3 distances
        metadatas: [
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch },
          { sqlite_id: 100, doc_type: 'observation', created_at_epoch: recentEpoch }
        ] // 3 metadatas for same observation
      }));

      mockSessionStore.getObservationsByIds = mock(() => [mockObservation]);

      const options: StrategySearchOptions = {
        query: 'test query',
        searchType: 'observations'
      };

      // Before fix: This would try to access ids[1], ids[2] which are undefined
      // causing incorrect filtering or crashes
      const result = await strategy.search(options);

      expect(result.usedChroma).toBe(true);
      // Should still find the one observation correctly
      expect(mockSessionStore.getObservationsByIds).toHaveBeenCalled();
      const calledWith = mockSessionStore.getObservationsByIds.mock.calls[0][0];
      expect(calledWith).toEqual([100]);
    });
  });

  describe('strategy name', () => {
    it('should have name "chroma"', () => {
      expect(strategy.name).toBe('chroma');
    });
  });
});
|