fix: add content-hash dedup to batch observation store methods (#1302)

storeObservations() and storeObservationsAndMarkComplete() were missing
the content-hash deduplication that storeObservation() (singular) already
had via computeObservationContentHash() and findDuplicateObservation().

This caused the Gemini provider (and potentially others that return
multiple observations per response) to insert 2-10x duplicate rows per
tool use, since the batch methods inserted unconditionally without
checking content_hash.

The fix adds the same dedup pattern from storeObservation() to both
batch methods:
1. Compute content hash via computeObservationContentHash()
2. Check for an existing observation within a 30-second window via findDuplicateObservation()
3. Skip the insert and reuse the existing observation's ID if a duplicate is found
4. Include content_hash column in INSERT statement

Fixes #1158 (duplicate observations with Gemini provider)

Co-authored-by: Enzo Ricciulli <e.ricciulli@systhema.ai>
This commit is contained in:
enzoricciulli
2026-03-13 04:01:53 +01:00
committed by GitHub
parent ad902bedd9
commit e7ba9acaa7
+24 -6
View File
@@ -1659,15 +1659,23 @@ export class SessionStore {
const storeTx = this.db.transaction(() => {
const observationIds: number[] = [];
// 1. Store all observations
// 1. Store all observations (with content-hash deduplication)
const obsStmt = this.db.prepare(`
INSERT INTO observations
(memory_session_id, project, type, title, subtitle, facts, narrative, concepts,
files_read, files_modified, prompt_number, discovery_tokens, created_at, created_at_epoch)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
files_read, files_modified, prompt_number, discovery_tokens, content_hash, created_at, created_at_epoch)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
for (const observation of observations) {
// Content-hash deduplication (same logic as storeObservation singular)
const contentHash = computeObservationContentHash(memorySessionId, observation.title, observation.narrative);
const existing = findDuplicateObservation(this.db, contentHash, timestampEpoch);
if (existing) {
observationIds.push(existing.id);
continue;
}
const result = obsStmt.run(
memorySessionId,
project,
@@ -1681,6 +1689,7 @@ export class SessionStore {
JSON.stringify(observation.files_modified),
promptNumber || null,
discoveryTokens,
contentHash,
timestampIso,
timestampEpoch
);
@@ -1779,15 +1788,23 @@ export class SessionStore {
const storeAndMarkTx = this.db.transaction(() => {
const observationIds: number[] = [];
// 1. Store all observations
// 1. Store all observations (with content-hash deduplication)
const obsStmt = this.db.prepare(`
INSERT INTO observations
(memory_session_id, project, type, title, subtitle, facts, narrative, concepts,
files_read, files_modified, prompt_number, discovery_tokens, created_at, created_at_epoch)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
files_read, files_modified, prompt_number, discovery_tokens, content_hash, created_at, created_at_epoch)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
for (const observation of observations) {
// Content-hash deduplication (same logic as storeObservation singular)
const contentHash = computeObservationContentHash(memorySessionId, observation.title, observation.narrative);
const existing = findDuplicateObservation(this.db, contentHash, timestampEpoch);
if (existing) {
observationIds.push(existing.id);
continue;
}
const result = obsStmt.run(
memorySessionId,
project,
@@ -1801,6 +1818,7 @@ export class SessionStore {
JSON.stringify(observation.files_modified),
promptNumber || null,
discoveryTokens,
contentHash,
timestampIso,
timestampEpoch
);