Merge branch 'thedotmack/add-lang-parsers' into integration/validation-batch

Expands smart-explore to 24 bundled languages by adding 14 new parsers: Kotlin,
Swift, Elixir, Lua, Scala, Bash, Haskell, Zig, CSS, SCSS, TOML, YAML, SQL, Markdown.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Newman
2026-04-07 13:50:46 -07:00
7 changed files with 635 additions and 33 deletions
+14
View File
@@ -124,6 +124,12 @@
"zod-to-json-schema": "^3.24.6"
},
"devDependencies": {
"@derekstride/tree-sitter-sql": "^0.3.11",
"@tree-sitter-grammars/tree-sitter-lua": "^0.4.1",
"@tree-sitter-grammars/tree-sitter-markdown": "^0.3.2",
"@tree-sitter-grammars/tree-sitter-toml": "^0.7.0",
"@tree-sitter-grammars/tree-sitter-yaml": "^0.7.1",
"@tree-sitter-grammars/tree-sitter-zig": "^1.1.2",
"@types/cors": "^2.8.19",
"@types/dompurify": "^3.0.5",
"@types/express": "^4.17.21",
@@ -132,16 +138,24 @@
"@types/react-dom": "^18.3.0",
"esbuild": "^0.27.2",
"np": "^11.0.2",
"tree-sitter-bash": "^0.25.1",
"tree-sitter-c": "^0.24.1",
"tree-sitter-cli": "^0.26.5",
"tree-sitter-cpp": "^0.23.4",
"tree-sitter-css": "^0.25.0",
"tree-sitter-elixir": "^0.3.5",
"tree-sitter-go": "^0.25.0",
"tree-sitter-haskell": "^0.23.1",
"tree-sitter-java": "^0.23.5",
"tree-sitter-javascript": "^0.25.0",
"tree-sitter-kotlin": "^0.3.8",
"tree-sitter-php": "^0.24.2",
"tree-sitter-python": "^0.25.0",
"tree-sitter-ruby": "^0.23.1",
"tree-sitter-rust": "^0.24.0",
"tree-sitter-scala": "^0.24.0",
"tree-sitter-scss": "^1.0.0",
"tree-sitter-swift": "^0.7.1",
"tree-sitter-typescript": "^0.23.2",
"tsx": "^4.20.6",
"typescript": "^5.3.0"
+16 -1
View File
@@ -14,7 +14,22 @@
"tree-sitter-python": "^0.25.0",
"tree-sitter-ruby": "^0.23.1",
"tree-sitter-rust": "^0.24.0",
"tree-sitter-typescript": "^0.23.2"
"tree-sitter-typescript": "^0.23.2",
"tree-sitter-kotlin": "^0.3.8",
"tree-sitter-swift": "^0.7.1",
"tree-sitter-php": "^0.24.2",
"tree-sitter-elixir": "^0.3.5",
"@tree-sitter-grammars/tree-sitter-lua": "^0.4.1",
"tree-sitter-scala": "^0.24.0",
"tree-sitter-bash": "^0.25.1",
"tree-sitter-haskell": "^0.23.1",
"@tree-sitter-grammars/tree-sitter-zig": "^1.1.2",
"tree-sitter-css": "^0.25.0",
"tree-sitter-scss": "^1.0.0",
"@tree-sitter-grammars/tree-sitter-toml": "^0.7.0",
"@tree-sitter-grammars/tree-sitter-yaml": "^0.7.1",
"@derekstride/tree-sitter-sql": "^0.3.11",
"@tree-sitter-grammars/tree-sitter-markdown": "^0.3.2"
},
"engines": {
"node": ">=18.0.0",
+48
View File
@@ -125,3 +125,51 @@ get_observations(ids=[11131, 10942, 10855], orderBy="date_desc")
- **Full observation:** ~500-1000 tokens each
- **Batch fetch:** 1 HTTP request vs N individual requests
- **10x token savings** by filtering before fetching
## Smart-Explore Language Support
Smart-explore tools (`smart_search`, `smart_outline`, `smart_unfold`) use tree-sitter AST parsing. The following languages are supported out of the box.
### 24 Bundled Languages
JS, TS, Python, Go, Rust, Ruby, Java, C, C++, Kotlin, Swift, PHP, Elixir, Lua, Scala, Bash, Haskell, Zig, CSS, SCSS, TOML, YAML, SQL, Markdown
### Markdown Special Support
Markdown files get structure-aware parsing beyond generic tree-sitter:
- **Heading hierarchy** -- `#`/`##`/`###` headings are extracted as nested symbols (sections contain subsections)
- **Code block detection** -- fenced code blocks are surfaced as `code` symbols with language annotation
- **Section-aware unfold** -- `smart_unfold` on a heading returns the full section content (heading through all subsections until the next heading of equal or higher level)
### User-Installable Grammars via `.claude-mem.json`
Add custom tree-sitter grammars for languages not in the bundled set. Place `.claude-mem.json` in the project root:
```json
{
"grammars": {
"gleam": {
"package": "tree-sitter-gleam",
"extensions": [".gleam"]
},
"protobuf": {
"package": "tree-sitter-proto",
"extensions": [".proto"],
"query": ".claude-mem/queries/proto.scm"
}
}
}
```
**Fields:**
- `package` (string, required) -- npm package name for the tree-sitter grammar
- `extensions` (array of strings, required) -- file extensions to associate with this language, including the leading dot (e.g. `".gleam"`)
- `query` (string, optional) -- path, relative to the project root, to a custom `.scm` query file for symbol extraction. If omitted, a generic query is used.
**Rules:**
- User grammars do NOT override bundled languages. If a language is already bundled, the entry is ignored.
- The npm package must be installed in the project (`npm install tree-sitter-gleam`).
- Config is cached per project root. Changes to `.claude-mem.json` take effect on next worker restart.
File diff suppressed because one or more lines are too long
+30
View File
@@ -111,6 +111,21 @@ async function buildHooks() {
'tree-sitter-ruby': '^0.23.1',
'tree-sitter-rust': '^0.24.0',
'tree-sitter-typescript': '^0.23.2',
'tree-sitter-kotlin': '^0.3.8',
'tree-sitter-swift': '^0.7.1',
'tree-sitter-php': '^0.24.2',
'tree-sitter-elixir': '^0.3.5',
'@tree-sitter-grammars/tree-sitter-lua': '^0.4.1',
'tree-sitter-scala': '^0.24.0',
'tree-sitter-bash': '^0.25.1',
'tree-sitter-haskell': '^0.23.1',
'@tree-sitter-grammars/tree-sitter-zig': '^1.1.2',
'tree-sitter-css': '^0.25.0',
'tree-sitter-scss': '^1.0.0',
'@tree-sitter-grammars/tree-sitter-toml': '^0.7.0',
'@tree-sitter-grammars/tree-sitter-yaml': '^0.7.1',
'@derekstride/tree-sitter-sql': '^0.3.11',
'@tree-sitter-grammars/tree-sitter-markdown': '^0.3.2',
},
engines: {
node: '>=18.0.0',
@@ -197,6 +212,21 @@ async function buildHooks() {
'tree-sitter-java',
'tree-sitter-c',
'tree-sitter-cpp',
'tree-sitter-kotlin',
'tree-sitter-swift',
'tree-sitter-php',
'tree-sitter-elixir',
'@tree-sitter-grammars/tree-sitter-lua',
'tree-sitter-scala',
'tree-sitter-bash',
'tree-sitter-haskell',
'@tree-sitter-grammars/tree-sitter-zig',
'tree-sitter-css',
'tree-sitter-scss',
'@tree-sitter-grammars/tree-sitter-toml',
'@tree-sitter-grammars/tree-sitter-yaml',
'@derekstride/tree-sitter-sql',
'@tree-sitter-grammars/tree-sitter-markdown',
],
define: {
'__DEFAULT_PACKAGE_VERSION__': `"${version}"`
+485 -15
View File
@@ -3,13 +3,15 @@
*
* No native bindings. No WASM. Just the CLI binary + query patterns.
*
* Supported: JS, TS, Python, Go, Rust, Ruby, Java, C, C++, PHP
* Supported: JS, TS, Python, Go, Rust, Ruby, Java, C, C++,
* Kotlin, Swift, PHP, Elixir, Lua, Scala, Bash, Haskell, Zig,
* CSS, SCSS, TOML, YAML, SQL, Markdown
*
* by Copter Labs
*/
import { execFileSync } from "node:child_process";
import { writeFileSync, mkdtempSync, rmSync, existsSync } from "node:fs";
import { writeFileSync, readFileSync, mkdtempSync, rmSync, existsSync } from "node:fs";
import { join, dirname } from "node:path";
import { tmpdir } from "node:os";
import { createRequire } from "node:module";
@@ -25,7 +27,7 @@ const _require = typeof __filename !== 'undefined'
export interface CodeSymbol {
name: string;
kind: "function" | "class" | "method" | "interface" | "type" | "const" | "variable" | "export" | "struct" | "enum" | "trait" | "impl" | "property" | "getter" | "setter";
kind: "function" | "class" | "method" | "interface" | "type" | "const" | "variable" | "export" | "struct" | "enum" | "trait" | "impl" | "property" | "getter" | "setter" | "mixin" | "section" | "code" | "metadata" | "reference";
signature: string;
jsdoc?: string;
lineStart: number;
@@ -66,7 +68,28 @@ const LANG_MAP: Record<string, string> = {
".cxx": "cpp",
".hpp": "cpp",
".hh": "cpp",
".kt": "kotlin",
".kts": "kotlin",
".swift": "swift",
".php": "php",
".ex": "elixir",
".exs": "elixir",
".lua": "lua",
".scala": "scala",
".sc": "scala",
".sh": "bash",
".bash": "bash",
".zsh": "bash",
".hs": "haskell",
".zig": "zig",
".css": "css",
".scss": "scss",
".toml": "toml",
".yml": "yaml",
".yaml": "yaml",
".sql": "sql",
".md": "markdown",
".mdx": "markdown",
};
export function detectLanguage(filePath: string): string {
@@ -74,6 +97,135 @@ export function detectLanguage(filePath: string): string {
return LANG_MAP[ext] || "unknown";
}
/**
* Detect language with fallback to user-configured grammar extensions.
* Bundled LANG_MAP takes priority.
*/
/**
 * Resolve a file's language, consulting user-configured grammar extensions
 * when the bundled map has no entry. The bundled LANG_MAP always wins.
 */
function detectLanguageWithUserGrammars(filePath: string, userConfig: UserGrammarConfig): string {
  const dotIndex = filePath.lastIndexOf(".");
  const ext = filePath.slice(dotIndex);
  return LANG_MAP[ext] || userConfig.extensionToLanguage[ext] || "unknown";
}
/**
* Get the query key for a language, checking user config for custom queries.
*/
/**
 * Resolve the query key for a language. A user-registered custom query key
 * takes precedence; otherwise the bundled mapping applies.
 */
function getUserAwareQueryKey(language: string, userConfig: UserGrammarConfig): string {
  const customKey = userConfig.languageToQueryKey[language];
  return customKey ? customKey : getQueryKey(language);
}
// --- User-installable grammars via .claude-mem.json ---

/** One entry under `grammars` in a project's .claude-mem.json. */
export interface UserGrammarEntry {
  /** npm package name of the tree-sitter grammar (required) */
  package: string;
  /** file extensions to associate with this language, leading dot included (required) */
  extensions: string[];
  /** optional path to a custom .scm query file, relative to the project root */
  query?: string;
}

/** Parsed and validated user grammar configuration for one project root. */
export interface UserGrammarConfig {
  /** language name → grammar entry */
  grammars: Record<string, UserGrammarEntry>;
  /** file extension → language name (for user-defined extensions only) */
  extensionToLanguage: Record<string, string>;
  /** language name → key into QUERIES ("user_<lang>" for custom queries, or "generic") */
  languageToQueryKey: Record<string, string>;
}

// Per-project-root cache; entries live for the worker's lifetime, so config
// edits take effect on the next worker restart.
const userGrammarCache = new Map<string, UserGrammarConfig>();

// Shared sentinel returned (and cached) whenever no valid config exists.
const EMPTY_USER_GRAMMAR_CONFIG: UserGrammarConfig = {
  grammars: {},
  extensionToLanguage: {},
  languageToQueryKey: {},
};
/**
 * Load user grammar configuration from .claude-mem.json in a project root.
 *
 * Cached per project root. Returns the empty config when the file is missing,
 * unreadable, not valid JSON, not a JSON object, or has no usable `grammars`
 * object. User entries do NOT override bundled grammars: bundled language
 * names and bundled file extensions always win.
 *
 * Side effect: custom .scm query files are loaded into the shared QUERIES
 * registry under a "user_<language>" key.
 *
 * @param projectRoot project directory containing .claude-mem.json
 */
export function loadUserGrammars(projectRoot: string): UserGrammarConfig {
  if (userGrammarCache.has(projectRoot)) return userGrammarCache.get(projectRoot)!;

  // Cache + return the shared empty config for every rejection path.
  const cacheEmpty = (): UserGrammarConfig => {
    userGrammarCache.set(projectRoot, EMPTY_USER_GRAMMAR_CONFIG);
    return EMPTY_USER_GRAMMAR_CONFIG;
  };

  const configPath = join(projectRoot, ".claude-mem.json");
  let rawConfig: unknown;
  try {
    rawConfig = JSON.parse(readFileSync(configPath, "utf-8"));
  } catch {
    // Missing file or malformed JSON — treat as "no user grammars".
    return cacheEmpty();
  }

  // BUG FIX: JSON.parse may return null, an array, or a primitive. The
  // previous code accessed `.grammars` outside the try block, so a config
  // file containing just `null` crashed with a TypeError instead of
  // falling back to the empty config.
  if (rawConfig === null || typeof rawConfig !== "object" || Array.isArray(rawConfig)) {
    return cacheEmpty();
  }

  const grammarsRaw = (rawConfig as Record<string, unknown>).grammars;
  if (!grammarsRaw || typeof grammarsRaw !== "object" || Array.isArray(grammarsRaw)) {
    return cacheEmpty();
  }

  const config: UserGrammarConfig = {
    grammars: {},
    extensionToLanguage: {},
    languageToQueryKey: {},
  };

  for (const [language, entry] of Object.entries(grammarsRaw as Record<string, unknown>)) {
    // Bundled languages cannot be overridden.
    if (GRAMMAR_PACKAGES[language]) continue;
    if (!entry || typeof entry !== "object" || Array.isArray(entry)) continue;

    const typedEntry = entry as Record<string, unknown>;
    const pkg = typedEntry.package;
    const extensions = typedEntry.extensions;
    const queryPath = typedEntry.query;

    // `package` and `extensions` are required; silently skip malformed entries.
    if (typeof pkg !== "string" || !Array.isArray(extensions)) continue;
    if (!extensions.every((e: unknown) => typeof e === "string")) continue;

    config.grammars[language] = {
      package: pkg,
      extensions: extensions as string[],
      query: typeof queryPath === "string" ? queryPath : undefined,
    };

    // Associate extensions with this language, but never shadow bundled ones.
    for (const ext of extensions as string[]) {
      if (!LANG_MAP[ext]) {
        config.extensionToLanguage[ext] = language;
      }
    }

    // Resolve the custom query (if any) into the shared QUERIES registry.
    if (typeof queryPath === "string") {
      const fullQueryPath = join(projectRoot, queryPath);
      try {
        const queryContent = readFileSync(fullQueryPath, "utf-8");
        // Prefixed key avoids collisions with built-in query names.
        const queryKey = `user_${language}`;
        QUERIES[queryKey] = queryContent;
        config.languageToQueryKey[language] = queryKey;
      } catch {
        console.error(`[smart-file-read] Custom query file not found: ${fullQueryPath}, falling back to generic`);
        config.languageToQueryKey[language] = "generic";
      }
    } else {
      config.languageToQueryKey[language] = "generic";
    }
  }

  userGrammarCache.set(projectRoot, config);
  return config;
}
// --- Grammar path resolution ---
const GRAMMAR_PACKAGES: Record<string, string> = {
@@ -87,12 +239,45 @@ const GRAMMAR_PACKAGES: Record<string, string> = {
java: "tree-sitter-java",
c: "tree-sitter-c",
cpp: "tree-sitter-cpp",
kotlin: "tree-sitter-kotlin",
swift: "tree-sitter-swift",
php: "tree-sitter-php/php",
elixir: "tree-sitter-elixir",
lua: "@tree-sitter-grammars/tree-sitter-lua",
scala: "tree-sitter-scala",
bash: "tree-sitter-bash",
haskell: "tree-sitter-haskell",
zig: "@tree-sitter-grammars/tree-sitter-zig",
css: "tree-sitter-css",
scss: "tree-sitter-scss",
toml: "@tree-sitter-grammars/tree-sitter-toml",
yaml: "@tree-sitter-grammars/tree-sitter-yaml",
sql: "@derekstride/tree-sitter-sql",
markdown: "@tree-sitter-grammars/tree-sitter-markdown",
};
// Grammars where the parser source lives in a subdirectory of the npm package root,
// AND that subdirectory lacks its own package.json (so require.resolve won't find it).
// Maps language → subdirectory name under the package root.
const GRAMMAR_SUBDIR: Record<string, string> = {
  // resolveGrammarPath appends this and expects <root>/tree-sitter-markdown/src
  markdown: "tree-sitter-markdown",
};
function resolveGrammarPath(language: string): string | null {
const pkg = GRAMMAR_PACKAGES[language];
if (!pkg) return null;
const subdir = GRAMMAR_SUBDIR[language];
if (subdir) {
// Package root has no sub-package.json — resolve root then append subdir
try {
const rootPkgPath = _require.resolve(pkg + "/package.json");
const resolved = join(dirname(rootPkgPath), subdir);
if (existsSync(join(resolved, "src"))) return resolved;
} catch { /* fall through */ }
return null;
}
try {
const packageJsonPath = _require.resolve(pkg + "/package.json");
return dirname(packageJsonPath);
@@ -101,6 +286,37 @@ function resolveGrammarPath(language: string): string | null {
}
}
/**
 * Resolve a grammar path, falling back to user-installed grammars.
 * Bundled grammars are checked first; otherwise the configured package is
 * looked up in the project's own node_modules.
 */
export function resolveGrammarPathWithFallback(language: string, projectRoot?: string): string | null {
  const bundledPath = resolveGrammarPath(language);
  if (bundledPath) return bundledPath;

  // No bundled grammar — consult the project's .claude-mem.json config.
  if (!projectRoot) return null;
  const entry = loadUserGrammars(projectRoot).grammars[language];
  if (!entry) return null;

  try {
    const pkgJsonPath = join(projectRoot, "node_modules", entry.package, "package.json");
    if (existsSync(pkgJsonPath)) {
      const grammarDir = dirname(pkgJsonPath);
      // The tree-sitter CLI requires the grammar's src/ directory.
      if (existsSync(join(grammarDir, "src"))) return grammarDir;
    }
  } catch {
    // Package not installed or unreadable — fall through to the error below.
  }

  console.error(`[smart-file-read] Grammar package not found for "${language}": ${entry.package} (install it in your project's node_modules)`);
  return null;
}
// --- Query patterns (declarative symbol extraction) ---
const QUERIES: Record<string, string> = {
@@ -152,6 +368,104 @@ const QUERIES: Record<string, string> = {
(interface_declaration name: (identifier) @name) @iface
(enum_declaration name: (identifier) @name) @enm
(import_declaration) @imp
`,
kotlin: `
(function_declaration (simple_identifier) @name) @func
(class_declaration (type_identifier) @name) @cls
(object_declaration (type_identifier) @name) @cls
(import_header) @imp
`,
swift: `
(function_declaration name: (simple_identifier) @name) @func
(class_declaration name: (type_identifier) @name) @cls
(protocol_declaration name: (type_identifier) @name) @iface
(import_declaration) @imp
`,
php: `
(function_definition name: (name) @name) @func
(class_declaration name: (name) @name) @cls
(interface_declaration name: (name) @name) @iface
(trait_declaration name: (name) @name) @trait_def
(method_declaration name: (name) @name) @method
(namespace_use_declaration) @imp
`,
lua: `
(function_declaration name: (identifier) @name) @func
(function_declaration name: (dot_index_expression) @name) @func
(function_declaration name: (method_index_expression) @name) @func
`,
scala: `
(function_definition name: (identifier) @name) @func
(class_definition name: (identifier) @name) @cls
(object_definition name: (identifier) @name) @cls
(trait_definition name: (identifier) @name) @trait_def
(import_declaration) @imp
`,
bash: `
(function_definition name: (word) @name) @func
`,
haskell: `
(function name: (variable) @name) @func
(type_synomym name: (name) @name) @tdef
(newtype name: (name) @name) @tdef
(data_type name: (name) @name) @tdef
(class name: (name) @name) @cls
(import) @imp
`,
zig: `
(function_declaration name: (identifier) @name) @func
(test_declaration) @func
`,
css: `
(rule_set (selectors) @name) @func
(media_statement) @cls
(keyframes_statement (keyframes_name) @name) @cls
(import_statement) @imp
`,
scss: `
(rule_set (selectors) @name) @func
(media_statement) @cls
(keyframes_statement (keyframes_name) @name) @cls
(import_statement) @imp
(mixin_statement name: (identifier) @name) @mixin_def
(function_statement name: (identifier) @name) @func
(include_statement) @imp
`,
toml: `
(table (bare_key) @name) @cls
(table (dotted_key) @name) @cls
(table_array_element (bare_key) @name) @cls
(table_array_element (dotted_key) @name) @cls
`,
yaml: `
(block_mapping_pair key: (flow_node) @name) @func
`,
sql: `
(create_table (object_reference) @name) @cls
(create_function (object_reference) @name) @func
(create_view (object_reference) @name) @cls
`,
markdown: `
(atx_heading heading_content: (inline) @name) @heading
(setext_heading heading_content: (paragraph) @name) @heading
(fenced_code_block (info_string (language) @name)) @code_block
(fenced_code_block) @code_block
(minus_metadata) @frontmatter
(link_reference_definition (link_label) @name) @ref
`,
generic: `
@@ -184,7 +498,21 @@ function getQueryKey(language: string): string {
case "rust": return "rust";
case "ruby": return "ruby";
case "java": return "java";
case "kotlin": return "kotlin";
case "swift": return "swift";
case "php": return "php";
case "elixir": return "generic";
case "lua": return "lua";
case "scala": return "scala";
case "bash": return "bash";
case "haskell": return "haskell";
case "zig": return "zig";
case "css": return "css";
case "scss": return "scss";
case "toml": return "toml";
case "yaml": return "yaml";
case "sql": return "sql";
case "markdown": return "markdown";
default: return "generic";
}
}
@@ -320,6 +648,11 @@ const KIND_MAP: Record<string, CodeSymbol["kind"]> = {
struct_def: "struct",
trait_def: "trait",
impl_def: "impl",
mixin_def: "mixin",
heading: "section",
code_block: "code",
frontmatter: "metadata",
ref: "reference",
};
const CONTAINER_KINDS = new Set(["class", "struct", "impl", "trait"]);
@@ -419,18 +752,36 @@ function buildSymbols(matches: RawMatch[], lines: string[], language: string): {
const nameCapture = match.captures.find(c => c.tag === "name");
if (!kindCapture) continue;
const name = nameCapture?.text || "anonymous";
const startRow = kindCapture.startRow;
const endRow = kindCapture.endRow;
const kind = KIND_MAP[kindCapture.tag];
const name = nameCapture?.text || "anonymous";
const comment = findCommentAbove(lines, startRow);
// Markdown-specific: extract heading level and build signature
let signature: string;
if (language === "markdown" && kind === "section") {
const headingLine = lines[startRow] || "";
const hashMatch = headingLine.match(/^(#{1,6})\s/);
const level = hashMatch ? hashMatch[1].length : 1;
signature = `${"#".repeat(level)} ${name}`;
} else if (language === "markdown" && kind === "code") {
const langTag = name !== "anonymous" ? name : "";
signature = langTag ? "```" + langTag : "```";
} else if (language === "markdown" && kind === "metadata") {
signature = "---frontmatter---";
} else if (language === "markdown" && kind === "reference") {
signature = lines[startRow]?.trim() || name;
} else {
signature = extractSignatureFromLines(lines, startRow, endRow);
}
const comment = language === "markdown" ? undefined : findCommentAbove(lines, startRow);
const docstring = language === "python" ? findPythonDocstringFromLines(lines, startRow, endRow) : undefined;
const sym: CodeSymbol = {
name,
kind,
signature: extractSignatureFromLines(lines, startRow, endRow),
signature,
jsdoc: comment || docstring,
lineStart: startRow,
lineEnd: endRow,
@@ -445,6 +796,34 @@ function buildSymbols(matches: RawMatch[], lines: string[], language: string): {
symbols.push(sym);
}
// Markdown: deduplicate code_block matches. The catch-all `(fenced_code_block) @code_block`
// pattern and the language-specific pattern both match the same block. Keep the named one.
if (language === "markdown") {
const codeBlocksByRange = new Map<string, CodeSymbol>();
const duplicateCodeBlocks = new Set<CodeSymbol>();
for (const sym of symbols) {
if (sym.kind !== "code") continue;
const rangeKey = `${sym.lineStart}:${sym.lineEnd}`;
const existing = codeBlocksByRange.get(rangeKey);
if (existing) {
// Prefer the named version (has actual language tag vs "anonymous")
if (sym.name !== "anonymous") {
duplicateCodeBlocks.add(existing);
codeBlocksByRange.set(rangeKey, sym);
} else {
duplicateCodeBlocks.add(sym);
}
} else {
codeBlocksByRange.set(rangeKey, sym);
}
}
if (duplicateCodeBlocks.size > 0) {
const filtered = symbols.filter(s => !duplicateCodeBlocks.has(s));
symbols.length = 0;
symbols.push(...filtered);
}
}
// Nest methods inside containers
const nested = new Set<CodeSymbol>();
for (const container of containers) {
@@ -463,11 +842,12 @@ function buildSymbols(matches: RawMatch[], lines: string[], language: string): {
// --- Main parse functions ---
export function parseFile(content: string, filePath: string): FoldedFile {
const language = detectLanguage(filePath);
export function parseFile(content: string, filePath: string, projectRoot?: string): FoldedFile {
const userConfig = projectRoot ? loadUserGrammars(projectRoot) : EMPTY_USER_GRAMMAR_CONFIG;
const language = detectLanguageWithUserGrammars(filePath, userConfig);
const lines = content.split("\n");
const grammarPath = resolveGrammarPath(language);
const grammarPath = resolveGrammarPathWithFallback(language, projectRoot);
if (!grammarPath) {
return {
filePath, language, symbols: [], imports: [],
@@ -475,7 +855,7 @@ export function parseFile(content: string, filePath: string): FoldedFile {
};
}
const queryKey = getQueryKey(language);
const queryKey = getUserAwareQueryKey(language, userConfig);
const queryFile = getQueryFile(queryKey);
// Write content to temp file with correct extension for language detection
@@ -510,20 +890,22 @@ export function parseFile(content: string, filePath: string): FoldedFile {
* Much faster than calling parseFile() per file (one process spawn per language vs per file).
*/
export function parseFilesBatch(
files: Array<{ absolutePath: string; relativePath: string; content: string }>
files: Array<{ absolutePath: string; relativePath: string; content: string }>,
projectRoot?: string
): Map<string, FoldedFile> {
const results = new Map<string, FoldedFile>();
const userConfig = projectRoot ? loadUserGrammars(projectRoot) : EMPTY_USER_GRAMMAR_CONFIG;
// Group files by language (and thus by query + grammar)
const languageGroups = new Map<string, typeof files>();
for (const file of files) {
const language = detectLanguage(file.relativePath);
const language = detectLanguageWithUserGrammars(file.relativePath, userConfig);
if (!languageGroups.has(language)) languageGroups.set(language, []);
languageGroups.get(language)!.push(file);
}
for (const [language, groupFiles] of languageGroups) {
const grammarPath = resolveGrammarPath(language);
const grammarPath = resolveGrammarPathWithFallback(language, projectRoot);
if (!grammarPath) {
// No grammar — return empty results for these files
for (const file of groupFiles) {
@@ -536,7 +918,7 @@ export function parseFilesBatch(
continue;
}
const queryKey = getQueryKey(language);
const queryKey = getUserAwareQueryKey(language, userConfig);
const queryFile = getQueryFile(queryKey);
// Run one batch query for all files of this language
@@ -570,6 +952,10 @@ export function parseFilesBatch(
// --- Formatting ---
export function formatFoldedView(file: FoldedFile): string {
if (file.language === "markdown") {
return formatMarkdownFoldedView(file);
}
const parts: string[] = [];
parts.push(`📁 ${file.filePath} (${file.language}, ${file.totalLines} lines)`);
@@ -593,6 +979,64 @@ export function formatFoldedView(file: FoldedFile): string {
return parts.join("\n");
}
function formatMarkdownFoldedView(file: FoldedFile): string {
const parts: string[] = [];
// Total width for the content column (before the line range)
const COL_WIDTH = 56;
parts.push(`📄 ${file.filePath} (${file.language}, ${file.totalLines} lines)`);
for (const sym of file.symbols) {
if (sym.kind === "section") {
// Extract heading level from the signature (count leading # characters)
const hashMatch = sym.signature.match(/^(#{1,6})\s/);
const level = hashMatch ? hashMatch[1].length : 1;
const indent = " ".repeat(level);
const lineRange = `L${sym.lineStart + 1}`;
const content = `${indent}${sym.signature}`;
parts.push(`${content.padEnd(COL_WIDTH)}${lineRange}`);
} else if (sym.kind === "code") {
// Find containing heading level for indentation
const containingLevel = findContainingHeadingLevel(file.symbols, sym.lineStart);
const indent = " ".repeat(containingLevel + 1);
const lineRange = sym.lineStart === sym.lineEnd
? `L${sym.lineStart + 1}`
: `L${sym.lineStart + 1}-${sym.lineEnd + 1}`;
const content = `${indent}${sym.signature}`;
parts.push(`${content.padEnd(COL_WIDTH)}${lineRange}`);
} else if (sym.kind === "metadata") {
const lineRange = sym.lineStart === sym.lineEnd
? `L${sym.lineStart + 1}`
: `L${sym.lineStart + 1}-${sym.lineEnd + 1}`;
const content = ` ${sym.signature}`;
parts.push(`${content.padEnd(COL_WIDTH)}${lineRange}`);
} else if (sym.kind === "reference") {
const containingLevel = findContainingHeadingLevel(file.symbols, sym.lineStart);
const indent = " ".repeat(containingLevel + 1);
const lineRange = `L${sym.lineStart + 1}`;
const content = `${indent}${sym.name}`;
parts.push(`${content.padEnd(COL_WIDTH)}${lineRange}`);
}
}
return parts.join("\n");
}
/**
 * Level of the most recent section heading that starts before `lineStart`.
 * Returns 0 when no heading precedes the line.
 */
function findContainingHeadingLevel(symbols: CodeSymbol[], lineStart: number): number {
  let level = 0;
  for (const sym of symbols) {
    if (sym.kind !== "section" || sym.lineStart >= lineStart) continue;
    const hashes = sym.signature.match(/^(#{1,6})\s/);
    level = hashes ? hashes[1].length : 1;
  }
  return level;
}
function formatSymbol(sym: CodeSymbol, indent: string): string {
const parts: string[] = [];
@@ -633,7 +1077,8 @@ function getSymbolIcon(kind: CodeSymbol["kind"]): string {
function: "ƒ", method: "ƒ", class: "◆", interface: "◇",
type: "◇", const: "●", variable: "○", export: "→",
struct: "◆", enum: "▣", trait: "◇", impl: "◈",
property: "○", getter: "⇢", setter: "⇠",
property: "○", getter: "⇢", setter: "⇠", mixin: "◈",
section: "§", code: "⌘", metadata: "◊", reference: "↗",
};
return icons[kind] || "·";
}
@@ -659,6 +1104,31 @@ export function unfoldSymbol(content: string, filePath: string, symbolName: stri
const lines = content.split("\n");
// Markdown section unfold: return from heading to next heading of same or higher level
if (file.language === "markdown" && symbol.kind === "section") {
const hashMatch = symbol.signature.match(/^(#{1,6})\s/);
const level = hashMatch ? hashMatch[1].length : 1;
const start = symbol.lineStart;
// Find the next heading at same or higher (lower number) level
let end = lines.length - 1;
for (const sym of file.symbols) {
if (sym.kind === "section" && sym.lineStart > start) {
const otherHashMatch = sym.signature.match(/^(#{1,6})\s/);
const otherLevel = otherHashMatch ? otherHashMatch[1].length : 1;
if (otherLevel <= level) {
end = sym.lineStart - 1;
// Trim trailing blank lines
while (end > start && lines[end].trim() === "") end--;
break;
}
}
}
const extracted = lines.slice(start, end + 1).join("\n");
return `<!-- 📍 ${filePath} L${start + 1}-${end + 1} -->\n${extracted}`;
}
// Include preceding comments/decorators
let start = symbol.lineStart;
for (let i = symbol.lineStart - 1; i >= 0; i--) {
+33 -8
View File
@@ -12,7 +12,7 @@
import { readFile, readdir, stat } from "node:fs/promises";
import { join, relative } from "node:path";
import { parseFilesBatch, formatFoldedView, type FoldedFile } from "./parser.js";
import { parseFilesBatch, formatFoldedView, loadUserGrammars, type FoldedFile } from "./parser.js";
const CODE_EXTENSIONS = new Set([
".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs",
@@ -22,11 +22,22 @@ const CODE_EXTENSIONS = new Set([
".rb",
".java",
".cs",
".cpp", ".c", ".h", ".hpp",
".cpp", ".cc", ".cxx", ".c", ".h", ".hpp", ".hh",
".swift",
".kt",
".kt", ".kts",
".php",
".vue", ".svelte",
".ex", ".exs",
".lua",
".scala", ".sc",
".sh", ".bash", ".zsh",
".hs",
".zig",
".css", ".scss",
".toml",
".yml", ".yaml",
".sql",
".md", ".mdx",
]);
const IGNORE_DIRS = new Set([
@@ -59,8 +70,9 @@ export interface SymbolMatch {
/**
* Walk a directory recursively, yielding file paths.
* extraExtensions: additional file extensions to include (from user grammar config).
*/
async function* walkDir(dir: string, rootDir: string, maxDepth: number = 20): AsyncGenerator<string> {
async function* walkDir(dir: string, rootDir: string, maxDepth: number = 20, extraExtensions?: Set<string>): AsyncGenerator<string> {
if (maxDepth <= 0) return;
let entries;
@@ -77,10 +89,10 @@ async function* walkDir(dir: string, rootDir: string, maxDepth: number = 20): As
const fullPath = join(dir, entry.name);
if (entry.isDirectory()) {
yield* walkDir(fullPath, rootDir, maxDepth - 1);
yield* walkDir(fullPath, rootDir, maxDepth - 1, extraExtensions);
} else if (entry.isFile()) {
const ext = entry.name.slice(entry.name.lastIndexOf("."));
if (CODE_EXTENSIONS.has(ext)) {
if (CODE_EXTENSIONS.has(ext) || (extraExtensions && extraExtensions.has(ext))) {
yield fullPath;
}
}
@@ -121,16 +133,29 @@ export async function searchCodebase(
maxResults?: number;
includeImports?: boolean;
filePattern?: string;
projectRoot?: string;
} = {}
): Promise<SearchResult> {
const maxResults = options.maxResults || 20;
const queryLower = query.toLowerCase();
const queryParts = queryLower.split(/[\s_\-./]+/).filter(p => p.length > 0);
// Load user grammar config for extra file extensions
const projectRoot = options.projectRoot || rootDir;
const userConfig = loadUserGrammars(projectRoot);
const extraExtensions = new Set<string>();
for (const entry of Object.values(userConfig.grammars)) {
for (const ext of entry.extensions) {
if (!CODE_EXTENSIONS.has(ext)) {
extraExtensions.add(ext);
}
}
}
// Phase 1: Collect files
const filesToParse: Array<{ absolutePath: string; relativePath: string; content: string }> = [];
for await (const filePath of walkDir(rootDir, rootDir)) {
for await (const filePath of walkDir(rootDir, rootDir, 20, extraExtensions.size > 0 ? extraExtensions : undefined)) {
if (options.filePattern) {
const relPath = relative(rootDir, filePath);
if (!relPath.toLowerCase().includes(options.filePattern.toLowerCase())) continue;
@@ -147,7 +172,7 @@ export async function searchCodebase(
}
// Phase 2: Batch parse (one CLI call per language)
const parsedFiles = parseFilesBatch(filesToParse);
const parsedFiles = parseFilesBatch(filesToParse, projectRoot);
// Phase 3: Match query against symbols
const foldedFiles: FoldedFile[] = [];