diff --git a/docs/wiki/Configuration.md b/docs/wiki/Configuration.md
index 0f70bd5..9ccdaa3 100644
--- a/docs/wiki/Configuration.md
+++ b/docs/wiki/Configuration.md
@@ -137,6 +137,48 @@ Files matching `.gitignore` are also skipped automatically.
---
+## Custom File Extensions
+
+By default the indexer recognises common source file extensions (`.py`, `.ts`, `.go`, `.html`, `.css`, …) and routes each to the right tree-sitter parser when one is available. If your project uses an extension CCE doesn't know about — a template language, a rebranded JS extension, a config DSL — register it under `indexer.extensions`:
+
+```yaml
+indexer:
+  extensions:
+    .tpl: html        # alias to an existing parser
+    .mjs: javascript
+    .cts: typescript
+    .liquid: ""       # index as plaintext (no AST chunking)
+    .erb: ""
+```
+
+**Rules:**
+
+- **Keys** must start with `.` and are matched case-insensitively against file suffixes (`.HTML` and `.html` resolve the same way).
+- **Values** are language names — any language that appears as a value in the built-in `_LANGUAGE_MAP` works (`html`, `javascript`, `typescript`, `python`, `go`, `rust`, `java`, `php`, etc.). Unknown values are accepted and fall back to plaintext at chunk time.
+- **Empty string or `null`** indexes the file as a single plaintext chunk. Useful when you want the file searchable but know there's no parser for it.
+- **User entries override built-ins.** For example, force `.h` to be treated as C++ instead of C:
+
+  ```yaml
+  indexer:
+    extensions:
+      .h: cpp
+  ```
+
+**Where to put it:**
+
+- Global default: `~/.cce/config.yaml`
+- Project-specific: `.context-engine.yaml` in the project root (overrides the global entry per-extension)
+
+**After editing**, re-run indexing so existing files get re-chunked under the new mapping:
+
+```bash
+cce index --full
+```
+
+**Parsers with full AST chunking** (semantic chunks for functions, classes, blocks): Python, JavaScript, TypeScript/TSX, PHP, Go, Rust, Java, HTML. Other languages (`css`, `markdown`, `json`, `yaml`, …) are mapped for metadata but indexed as a single plaintext chunk per file.
+
+---
+
## Changing the Embedding Model
```yaml
diff --git a/pyproject.toml b/pyproject.toml
index c200e8a..6f67553 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
"tree-sitter-go>=0.23",
"tree-sitter-rust>=0.23",
"tree-sitter-java>=0.23",
+ "tree-sitter-html>=0.20",
"watchdog>=4.0",
"mcp>=1.0",
"httpx>=0.27",
diff --git a/src/context_engine/config.py b/src/context_engine/config.py
index afdbab5..c6ed42f 100644
--- a/src/context_engine/config.py
+++ b/src/context_engine/config.py
@@ -75,6 +75,14 @@ class Config:
# index. See indexer/secrets.py for the full pattern list. Default
# True; users on non-sensitive corpora can opt out.
indexer_redact_secrets: bool = True
+ # Extra extension → language mappings layered over the indexer's built-in
+ # _LANGUAGE_MAP. Keys must start with "." and are matched case-insensitively
+ # against file suffixes; values are language strings (e.g. "html",
+ # "javascript") that pick the tree-sitter parser, or "" for plaintext.
+ # Unknown language strings are accepted and fall back to plaintext at
+ # chunk time — same behavior as built-in entries without a parser (md,
+ # css, json, …).
+ indexer_extensions: dict[str, str] = field(default_factory=dict)
# When True, memory.db writes (decisions, code_areas, turn_summaries,
# session rollups) get PII scrubbed before storage: emails, IPs,
# credit cards (Luhn-validated), SSNs, phone numbers. Free-form
@@ -127,6 +135,7 @@ def _deep_merge(base: dict, override: dict) -> dict:
"indexer_debounce_ms": int,
"indexer_ignore": list,
"indexer_redact_secrets": bool,
+ "indexer_extensions": dict,
"memory_redact_pii": bool,
"audit_log_enabled": bool,
"storage_path": str,
@@ -148,6 +157,7 @@ def _apply_dict_to_config(config: Config, data: dict) -> None:
("indexer", "debounce_ms"): "indexer_debounce_ms",
("indexer", "ignore"): "indexer_ignore",
("indexer", "redact_secrets"): "indexer_redact_secrets",
+ ("indexer", "extensions"): "indexer_extensions",
("memory", "redact_pii"): "memory_redact_pii",
("audit", "enabled"): "audit_log_enabled",
("storage", "path"): "storage_path",
@@ -176,6 +186,23 @@ def _apply_dict_to_config(config: Config, data: dict) -> None:
if item not in merged:
merged.append(item)
setattr(config, attr, merged)
+ elif attr == "indexer_extensions" and isinstance(value, dict):
+ normalized: dict[str, str] = {}
+ for ext, lang in value.items():
+ if not isinstance(ext, str) or not ext.startswith("."):
+ raise ValueError(
+ f"Config indexer.extensions: key {ext!r} must be a "
+ "string starting with '.' (e.g. '.tpl')"
+ )
+ if lang is None:
+ lang = ""
+ if not isinstance(lang, str):
+ raise ValueError(
+ f"Config indexer.extensions[{ext!r}]: language must "
+ f"be a string or null, got {type(lang).__name__}"
+ )
+ normalized[ext.lower()] = lang
+ setattr(config, attr, normalized)
else:
setattr(config, attr, value)
diff --git a/src/context_engine/indexer/chunker.py b/src/context_engine/indexer/chunker.py
index ee6e8d2..033069d 100644
--- a/src/context_engine/indexer/chunker.py
+++ b/src/context_engine/indexer/chunker.py
@@ -8,6 +8,7 @@
import tree_sitter_go as tsgo
import tree_sitter_rust as tsrust
import tree_sitter_java as tsjava
+import tree_sitter_html as tshtml
from tree_sitter import Language, Parser
from context_engine.models import Chunk, ChunkType
@@ -28,6 +29,13 @@
"import_declaration", # TypeScript, Go, Java
"use_declaration", # PHP, Rust
}
+# HTML chunks at
+
+