fix: rename save_memory and fix MCP search instructions + startup hook (#1210)

* fix: rename save_memory to save_observation and fix MCP search instructions

Stop the primary agent from proactively saving memories by renaming
save_memory to save_observation with a neutral description. Remove
"Saving Memories" section from SKILL.md. Update context formatters
and output styles to reference the mem-search skill instead of raw
MCP tool names.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: split SessionStart hooks so smart-install failure doesn't block worker start

smart-install.js and worker-start were in the same hook group, so if
smart-install exited non-zero the worker never started. Split into
separate hook groups so they run independently.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* fix: worker startup waits for readiness before hooks fire

Move initializationCompleteFlag to set after DB/search init (not MCP),
add waitForReadiness() polling /api/readiness, and extract shared
pollEndpointUntilOk helper to DRY up health/readiness checks.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-02-23 03:30:31 -05:00
committed by GitHub
parent e4e735d3ff
commit 7966c6cba9
12 changed files with 343 additions and 323 deletions
+2 -2
View File
@@ -235,8 +235,8 @@ NEVER fetch full details without filtering first. 10x token savings.`,
}
},
{
name: 'save_memory',
description: 'Save a manual memory/observation for semantic search. Use this to remember important information.',
name: 'save_observation',
description: 'Save an observation to the database. Params: text (required), title, project',
inputSchema: {
type: 'object',
properties: {
@@ -74,8 +74,8 @@ export function renderColorContextIndex(): string[] {
`${colors.dim}Context Index: This semantic index (titles, types, files, tokens) is usually sufficient to understand past work.${colors.reset}`,
'',
`${colors.dim}When you need implementation details, rationale, or debugging context:${colors.reset}`,
`${colors.dim} - Use MCP tools (search, get_observations) to fetch full observations on-demand${colors.reset}`,
`${colors.dim} - Critical types ( bugfix, decision) often need detailed fetching${colors.reset}`,
`${colors.dim} - Fetch by ID: get_observations([IDs]) for observations visible in this index${colors.reset}`,
`${colors.dim} - Search history: Use the mem-search skill for past decisions, bugs, and deeper research${colors.reset}`,
`${colors.dim} - Trust this index over re-reading code for past decisions and learnings${colors.reset}`,
''
];
@@ -72,8 +72,8 @@ export function renderMarkdownContextIndex(): string[] {
`**Context Index:** This semantic index (titles, types, files, tokens) is usually sufficient to understand past work.`,
'',
`When you need implementation details, rationale, or debugging context:`,
`- Use MCP tools (search, get_observations) to fetch full observations on-demand`,
`- Critical types ( bugfix, decision) often need detailed fetching`,
`- Fetch by ID: get_observations([IDs]) for observations visible in this index`,
`- Search history: Use the mem-search skill for past decisions, bugs, and deeper research`,
`- Trust this index over re-reading code for past decisions and learnings`,
''
];
+29 -11
View File
@@ -29,31 +29,49 @@ export async function isPortInUse(port: number): Promise<boolean> {
}
/**
* Wait for the worker HTTP server to become responsive (liveness check)
* Uses /api/health instead of /api/readiness because:
* - /api/health returns 200 as soon as HTTP server is listening
* - /api/readiness waits for full initialization (MCP connection can take 5+ minutes)
* See: https://github.com/thedotmack/claude-mem/issues/811
* @param port Worker port to check
* @param timeoutMs Maximum time to wait in milliseconds
* @returns true if worker became responsive, false if timeout
* Poll a localhost endpoint until it returns 200 OK or timeout.
* Shared implementation for liveness and readiness checks.
*/
export async function waitForHealth(port: number, timeoutMs: number = 30000): Promise<boolean> {
async function pollEndpointUntilOk(
port: number,
endpointPath: string,
timeoutMs: number,
retryLogMessage: string
): Promise<boolean> {
const start = Date.now();
while (Date.now() - start < timeoutMs) {
try {
// Note: Removed AbortSignal.timeout to avoid Windows Bun cleanup issue (libuv assertion)
const response = await fetch(`http://127.0.0.1:${port}/api/health`);
const response = await fetch(`http://127.0.0.1:${port}${endpointPath}`);
if (response.ok) return true;
} catch (error) {
// [ANTI-PATTERN IGNORED]: Retry loop - expected failures during startup, will retry
logger.debug('SYSTEM', 'Service not ready yet, will retry', { port }, error as Error);
logger.debug('SYSTEM', retryLogMessage, { port }, error as Error);
}
await new Promise(r => setTimeout(r, 500));
}
return false;
}
/**
* Wait for the worker HTTP server to become responsive (liveness check).
* Uses /api/health which returns 200 as soon as the HTTP server is listening.
* For full initialization (DB + search), use waitForReadiness() instead.
*/
export function waitForHealth(port: number, timeoutMs: number = 30000): Promise<boolean> {
return pollEndpointUntilOk(port, '/api/health', timeoutMs, 'Service not ready yet, will retry');
}
/**
* Wait for the worker to be fully initialized (DB + search ready).
* Uses /api/readiness which returns 200 only after core initialization completes.
* Now that initializationCompleteFlag is set after DB/search init (not MCP),
* this typically completes in a few seconds.
*/
export function waitForReadiness(port: number, timeoutMs: number = 30000): Promise<boolean> {
return pollEndpointUntilOk(port, '/api/readiness', timeoutMs, 'Worker not ready yet, will retry');
}
/**
* Wait for a port to become free (no longer responding to health checks)
* Used after shutdown to confirm the port is available for restart
+16 -5
View File
@@ -79,6 +79,7 @@ import {
import {
isPortInUse,
waitForHealth,
waitForReadiness,
waitForPortFree,
httpShutdown,
checkVersionMatch
@@ -416,6 +417,13 @@ export class WorkerService {
this.server.registerRoutes(this.searchRoutes);
logger.info('WORKER', 'SearchManager initialized and search routes registered');
// DB and search are ready — mark initialization complete so hooks can proceed.
// MCP connection is tracked separately via mcpReady and is NOT required for
// the worker to serve context/search requests.
this.initializationCompleteFlag = true;
this.resolveInitialization();
logger.info('SYSTEM', 'Core initialization complete (DB + search ready)');
// Auto-backfill Chroma for all projects if out of sync with SQLite (fire-and-forget)
if (this.chromaMcpManager) {
ChromaSync.backfillAllProjects().then(() => {
@@ -441,11 +449,7 @@ export class WorkerService {
await Promise.race([mcpConnectionPromise, timeoutPromise]);
this.mcpReady = true;
logger.success('WORKER', 'Connected to MCP server');
this.initializationCompleteFlag = true;
this.resolveInitialization();
logger.info('SYSTEM', 'Background initialization complete');
logger.success('WORKER', 'MCP server connected');
// Start orphan reaper to clean up zombie processes (Issue #737)
this.stopOrphanReaper = startOrphanReaper(() => {
@@ -945,6 +949,13 @@ async function ensureWorkerStarted(port: number): Promise<boolean> {
return false;
}
// Health passed (HTTP listening). Now wait for DB + search initialization
// so hooks that run immediately after can actually use the worker.
const ready = await waitForReadiness(port, getPlatformTimeout(HOOK_TIMEOUTS.READINESS_WAIT));
if (!ready) {
logger.warn('SYSTEM', 'Worker is alive but readiness timed out — proceeding anyway');
}
clearWorkerSpawnAttempted();
logger.info('SYSTEM', 'Worker started successfully');
return true;
+1
View File
@@ -2,6 +2,7 @@ export const HOOK_TIMEOUTS = {
DEFAULT: 300000, // Standard HTTP timeout (5 min for slow systems)
HEALTH_CHECK: 3000, // Worker health check (3s — healthy worker responds in <100ms)
POST_SPAWN_WAIT: 5000, // Wait for daemon to start after spawn (starts in <1s on Linux)
READINESS_WAIT: 30000, // Wait for DB + search init after spawn (typically <5s)
PORT_IN_USE_WAIT: 3000, // Wait when port occupied but health failing
WORKER_STARTUP_WAIT: 1000,
PRE_RESTART_SETTLE_DELAY: 2000, // Give files time to sync before restart