fix: resolve 3 upstream bugs (summarize, ChromaSync, HealthMonitor) (#1566)

* fix: resolve 3 upstream bugs in summarize, ChromaSync, and HealthMonitor

1. summarize.ts: Skip summary when transcript has no assistant message.
   Prevents error loop where empty transcripts cause repeated failed
   summarize attempts (~30 errors/day observed in production).

2. ChromaSync.ts: Fall back to chroma_update_documents when add fails
   with "IDs already exist". Handles partial writes after MCP timeout
   without waiting for next backfill cycle.

3. HealthMonitor.ts: Replace HTTP-based isPortInUse with atomic socket
   bind on Unix. Eliminates TOCTOU race when two sessions start
   simultaneously (HTTP check is non-atomic — both see "port free"
   before either completes listen()). Updated tests accordingly.

All three bugs are pre-existing in v10.5.5. Confirmed via log analysis
of 543K lines over 17 days of production usage across two servers.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* chore: add CONTRIB_NOTES.md to gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: address CodeRabbit review on PR #1566

- HealthMonitor: add APPROVED OVERRIDE annotation for Win32 HTTP fallback
- ChromaSync: replace chroma_update_documents with delete+add for proper
  upsert (update only modifies existing IDs, silently ignores missing ones)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Alessandro Costa <alessandro@claudio.dev>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alessandro Costa
2026-04-04 19:15:08 -03:00
committed by GitHub
parent 5a27420809
commit 64cce2bf10
5 changed files with 177 additions and 52 deletions
+10
View File
@@ -52,6 +52,16 @@ export const summarizeHandler: EventHandler = {
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
// Skip summary if transcript has no assistant message (prevents repeated
// empty summarize requests that pollute logs — upstream bug)
if (!lastAssistantMessage || !lastAssistantMessage.trim()) {
logger.debug('HOOK', 'No assistant message in transcript - skipping summary', {
sessionId,
transcriptPath
});
return { continue: true, suppressOutput: true, exitCode: HOOK_EXIT_CODES.SUCCESS };
}
logger.dataIn('HOOK', 'Stop: Requesting summary', {
hasLastAssistantMessage: !!lastAssistantMessage
});
+35 -8
View File
@@ -10,6 +10,7 @@
*/
import path from 'path';
import net from 'net';
import { readFileSync } from 'fs';
import { logger } from '../../utils/logger.js';
import { MARKETPLACE_ROOT } from '../../shared/paths.js';
@@ -35,17 +36,43 @@ async function httpRequestToWorker(
}
/**
* Check if a port is in use by querying the health endpoint
* Check if a port is in use by attempting an atomic socket bind.
* More reliable than HTTP health check for daemon spawn guards —
* prevents TOCTOU race where two daemons both see "port free" via
* HTTP and then both try to listen() (upstream bug workaround).
*
* Falls back to HTTP health check on Windows where socket bind
* behavior differs.
*/
export async function isPortInUse(port: number): Promise<boolean> {
// NOTE(review): the try/catch directly below and the platform-specific code after it
// each read like a complete implementation, and the braces do not balance as shown —
// presumably the first is the superseded HTTP-only version displayed next to its
// replacement. Confirm which lines are live before editing.
try {
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
const response = await fetch(`http://127.0.0.1:${port}/api/health`);
return response.ok;
} catch (error) {
// [ANTI-PATTERN IGNORED]: Health check polls every 500ms, logging would flood
return false;
if (process.platform === 'win32') {
// APPROVED OVERRIDE: Windows keeps HTTP health check because socket bind
// semantics differ (SO_REUSEADDR defaults, firewall prompts). The TOCTOU
// race remains on Windows but is an accepted limitation — the atomic
// socket approach would cause false positives or UAC popups.
try {
// No timeout is set on this request; any fetch failure is reported as "not in use".
const response = await fetch(`http://127.0.0.1:${port}/api/health`);
// Only a successful (2xx) reply counts as "in use"; a non-OK reply yields false.
return response.ok;
} catch {
return false;
}
}
// Unix: atomic socket bind check — no TOCTOU race
// NOTE(review): the probe server below is closed before the promise resolves false,
// so the port is released again before the caller can bind it — confirm the caller
// does not rely on this check alone to win a simultaneous-startup race.
return new Promise((resolve) => {
const server = net.createServer();
server.once('error', (err: NodeJS.ErrnoException) => {
// EADDRINUSE is the only bind failure treated as "in use".
if (err.code === 'EADDRINUSE') {
resolve(true);
} else {
// Every other listen error also resolves false ("not in use") and is not logged.
resolve(false);
}
});
server.once('listening', () => {
// Bind succeeded, so the port was free: release it, then report false once close completes.
server.close(() => resolve(false));
});
// Probes the loopback interface only (127.0.0.1).
server.listen(port, '127.0.0.1');
});
}
/**
+35 -5
View File
@@ -283,11 +283,41 @@ export class ChromaSync {
metadatas: cleanMetadatas
});
} catch (error) {
logger.error('CHROMA_SYNC', 'Batch add failed, continuing with remaining batches', {
collection: this.collectionName,
batchStart: i,
batchSize: batch.length
}, error as Error);
const errMsg = error instanceof Error ? error.message : String(error);
// APPROVED OVERRIDE: Duplicate IDs from partial write before timeout/crash.
// chroma_update_documents only updates *existing* IDs — it silently ignores
// missing ones. So we delete-then-add to guarantee all IDs are written.
if (errMsg.includes('already exist')) {
try {
await chromaMcp.callTool('chroma_delete_documents', {
collection_name: this.collectionName,
ids: batch.map(d => d.id)
});
await chromaMcp.callTool('chroma_add_documents', {
collection_name: this.collectionName,
ids: batch.map(d => d.id),
documents: batch.map(d => d.document),
metadatas: cleanMetadatas
});
logger.info('CHROMA_SYNC', 'Batch reconciled via delete+add after duplicate conflict', {
collection: this.collectionName,
batchStart: i,
batchSize: batch.length
});
} catch (reconcileError) {
logger.error('CHROMA_SYNC', 'Batch reconcile (delete+add) failed', {
collection: this.collectionName,
batchStart: i,
batchSize: batch.length
}, reconcileError as Error);
}
} else {
logger.error('CHROMA_SYNC', 'Batch add failed, continuing with remaining batches', {
collection: this.collectionName,
batchStart: i,
batchSize: batch.length
}, error as Error);
}
}
}