diff --git a/docs/wiki/Configuration.md b/docs/wiki/Configuration.md index 0f70bd5..9ccdaa3 100644 --- a/docs/wiki/Configuration.md +++ b/docs/wiki/Configuration.md @@ -137,6 +137,48 @@ Files matching `.gitignore` are also skipped automatically. --- +## Custom File Extensions + +By default the indexer recognises common source file extensions (`.py`, `.ts`, `.go`, `.html`, `.css`, …) and routes each to the right tree-sitter parser when one is available. If your project uses an extension CCE doesn't know about — a template language, a rebranded JS extension, a config DSL — register it under `indexer.extensions`: + +```yaml +indexer: + extensions: + .tpl: html # alias to an existing parser + .mjs: javascript + .cts: typescript + .liquid: "" # index as plaintext (no AST chunking) + .erb: "" +``` + +**Rules:** + +- **Keys** must start with `.` and are matched case-insensitively against file suffixes (`.HTML` and `.html` resolve the same way). +- **Values** are language strings — anything in the built-in `_LANGUAGE_MAP` works (`html`, `javascript`, `typescript`, `python`, `go`, `rust`, `java`, `php`, etc.). Unknown values are accepted and fall back to plaintext at chunk time. +- **Empty string or `null`** indexes the file as a single plaintext chunk. Useful when you want the file searchable but know there's no parser for it. +- **User entries override built-ins.** For example, force `.h` to be parsed as C++ instead of C: + + ```yaml + indexer: + extensions: + .h: cpp + ``` + +**Where to put it:** + +- Global default: `~/.cce/config.yaml` +- Project-specific: `.context-engine.yaml` in the project root (overrides the global entry per-extension) + +**After editing**, re-run indexing so existing files get re-chunked under the new mapping: + +```bash +cce index --full +``` + +**Parsers with full AST chunking** (semantic chunks for functions, classes, blocks): Python, JavaScript, TypeScript/TSX, PHP, Go, Rust, Java, HTML. 
Other languages (`css`, `markdown`, `json`, `yaml`, …) are mapped for metadata but indexed as a single plaintext chunk per file. + +--- + ## Changing the Embedding Model ```yaml diff --git a/pyproject.toml b/pyproject.toml index c200e8a..6f67553 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "tree-sitter-go>=0.23", "tree-sitter-rust>=0.23", "tree-sitter-java>=0.23", + "tree-sitter-html>=0.20", "watchdog>=4.0", "mcp>=1.0", "httpx>=0.27", diff --git a/src/context_engine/config.py b/src/context_engine/config.py index afdbab5..c6ed42f 100644 --- a/src/context_engine/config.py +++ b/src/context_engine/config.py @@ -75,6 +75,14 @@ class Config: # index. See indexer/secrets.py for the full pattern list. Default # True; users on non-sensitive corpora can opt out. indexer_redact_secrets: bool = True + # Extra extension → language mappings layered over the indexer's built-in + # _LANGUAGE_MAP. Keys must start with "." and are matched case-insensitively + # against file suffixes; values are language strings (e.g. "html", + # "javascript") that pick the tree-sitter parser, or "" for plaintext. + # Unknown language strings are accepted and fall back to plaintext at + # chunk time — same behavior as built-in entries without a parser (md, + # css, json, …). + indexer_extensions: dict[str, str] = field(default_factory=dict) # When True, memory.db writes (decisions, code_areas, turn_summaries, # session rollups) get PII scrubbed before storage: emails, IPs, # credit cards (Luhn-validated), SSNs, phone numbers. 
Free-form @@ -127,6 +135,7 @@ def _deep_merge(base: dict, override: dict) -> dict: "indexer_debounce_ms": int, "indexer_ignore": list, "indexer_redact_secrets": bool, + "indexer_extensions": dict, "memory_redact_pii": bool, "audit_log_enabled": bool, "storage_path": str, @@ -148,6 +157,7 @@ def _apply_dict_to_config(config: Config, data: dict) -> None: ("indexer", "debounce_ms"): "indexer_debounce_ms", ("indexer", "ignore"): "indexer_ignore", ("indexer", "redact_secrets"): "indexer_redact_secrets", + ("indexer", "extensions"): "indexer_extensions", ("memory", "redact_pii"): "memory_redact_pii", ("audit", "enabled"): "audit_log_enabled", ("storage", "path"): "storage_path", @@ -176,6 +186,23 @@ def _apply_dict_to_config(config: Config, data: dict) -> None: if item not in merged: merged.append(item) setattr(config, attr, merged) + elif attr == "indexer_extensions" and isinstance(value, dict): + normalized: dict[str, str] = {} + for ext, lang in value.items(): + if not isinstance(ext, str) or not ext.startswith("."): + raise ValueError( + f"Config indexer.extensions: key {ext!r} must be a " + "string starting with '.' (e.g. 
'.tpl')" + ) + if lang is None: + lang = "" + if not isinstance(lang, str): + raise ValueError( + f"Config indexer.extensions[{ext!r}]: language must " + f"be a string or null, got {type(lang).__name__}" + ) + normalized[ext.lower()] = lang + setattr(config, attr, normalized) else: setattr(config, attr, value) diff --git a/src/context_engine/indexer/chunker.py b/src/context_engine/indexer/chunker.py index ee6e8d2..033069d 100644 --- a/src/context_engine/indexer/chunker.py +++ b/src/context_engine/indexer/chunker.py @@ -8,6 +8,7 @@ import tree_sitter_go as tsgo import tree_sitter_rust as tsrust import tree_sitter_java as tsjava +import tree_sitter_html as tshtml from tree_sitter import Language, Parser from context_engine.models import Chunk, ChunkType @@ -28,6 +29,13 @@ "import_declaration", # TypeScript, Go, Java "use_declaration", # PHP, Rust } +# HTML chunks at + + + +""" + +HTML_PLAIN = """ +

Just text

no script or style here

+""" + + +def test_chunk_html_extracts_script_and_style(chunker): + chunks = chunker.chunk(HTML_WITH_BLOCKS, file_path="page.html", language="html") + types = [c.chunk_type for c in chunks] + # script and style become MODULE chunks + assert types.count(ChunkType.MODULE) >= 2 + contents = [c.content for c in chunks if c.chunk_type == ChunkType.MODULE] + assert any("console.log" in c for c in contents) + assert any("color: red" in c for c in contents) + + +def test_chunk_html_without_blocks_falls_back_to_whole_file(chunker): + chunks = chunker.chunk(HTML_PLAIN, file_path="plain.html", language="html") + # No script/style → fallback path returns single MODULE chunk for the file + assert len(chunks) == 1 + assert chunks[0].chunk_type == ChunkType.MODULE + assert "Just text" in chunks[0].content diff --git a/tests/indexer/test_language_resolution.py b/tests/indexer/test_language_resolution.py new file mode 100644 index 0000000..d70f46a --- /dev/null +++ b/tests/indexer/test_language_resolution.py @@ -0,0 +1,35 @@ +"""Unit tests for `_resolve_language` — the indexer hook that lets users add +custom file-extension → language mappings via `indexer.extensions` in +`.context-engine.yaml`. +""" +from context_engine.indexer.pipeline import _resolve_language + + +def test_builtin_extension_resolves_to_known_language(): + assert _resolve_language(".py", {}) == "python" + + +def test_unknown_extension_falls_back_to_plaintext(): + assert _resolve_language(".xyz", {}) == "plaintext" + + +def test_custom_alias_overrides_builtin(): + # .h normally maps to c; custom mapping flips it to cpp. + assert _resolve_language(".h", {".h": "cpp"}) == "cpp" + + +def test_custom_alias_for_unknown_extension(): + assert _resolve_language(".tpl", {".tpl": "html"}) == "html" + + +def test_custom_empty_value_means_plaintext(): + # User opts into indexing the file but knows there's no parser. 
+ assert _resolve_language(".liquid", {".liquid": ""}) == "plaintext" + + +def test_lookup_is_case_insensitive(): + # Extension comes from Path.suffix which preserves case (.HTML on Windows + # mounts, .R for R files); custom map keys are normalised to lowercase + # at config load time, so the lookup must lowercase the suffix too. + assert _resolve_language(".HTML", {}) == "html" + assert _resolve_language(".TPL", {".tpl": "html"}) == "html" diff --git a/tests/test_config.py b/tests/test_config.py index 70e65fe..ece5dcc 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -85,3 +85,55 @@ def test_ollama_url_yaml_type_validation(tmp_path): })) with pytest.raises(ValueError, match="ollama_url"): load_config(global_path=config_file) + + +def test_indexer_extensions_default_empty(): + assert Config().indexer_extensions == {} + + +def test_indexer_extensions_loads_and_normalizes(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "html", ".MJS": "javascript", ".liquid": "", ".erb": None}}, + })) + config = load_config(global_path=config_file) + # Keys lowercased, null coerced to empty string. 
+ assert config.indexer_extensions == { + ".tpl": "html", + ".mjs": "javascript", + ".liquid": "", + ".erb": "", + } + + +def test_indexer_extensions_rejects_key_without_dot(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {"tpl": "html"}}, + })) + with pytest.raises(ValueError, match="must be a string starting with"): + load_config(global_path=config_file) + + +def test_indexer_extensions_rejects_non_string_value(tmp_path): + config_file = tmp_path / "config.yaml" + config_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": 123}}, + })) + with pytest.raises(ValueError, match="must be a string or null"): + load_config(global_path=config_file) + + +def test_indexer_extensions_project_overrides_global(tmp_path): + global_file = tmp_path / "config.yaml" + global_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "html"}}, + })) + project_file = tmp_path / ".context-engine.yaml" + project_file.write_text(yaml.dump({ + "indexer": {"extensions": {".tpl": "javascript", ".vue": "vue"}}, + })) + config = load_config(global_path=global_file, project_path=project_file) + # Project entry wins for .tpl; .vue is defined only in the project file and must also be present. + assert config.indexer_extensions[".tpl"] == "javascript" + assert config.indexer_extensions[".vue"] == "vue"