diff --git a/AGENTS.md b/AGENTS.md index 24756bb..5e9c37e 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -46,4 +46,4 @@ ## Security & Configuration Tips - Never commit secrets. The default local provider needs no key; OpenAI is opt-in. -- The index/database live in `CODERAG_STORE_DIR` (default `./.coderag/`, gitignored). +- The index/database live in `CODERAG_STORE_DIR` (default `<watched_dir>/.coderag/`, derived from the watched dir rather than the cwd, gitignored). diff --git a/README.md b/README.md index 24956a4..bdb4708 100644 --- a/README.md +++ b/README.md @@ -123,7 +123,8 @@ coderag search "where are duplicate vectors removed on file change" --watched-di """Incremental indexing orchestration. ...the critical correctness property…""" ``` -By default the index lives in `./.coderag/`. Set `CODERAG_WATCHED_DIR` / `CODERAG_STORE_DIR` +By default the index lives in `<watched_dir>/.coderag/` (next to the code it indexes, so it's +found no matter where you run `coderag` from). Set `CODERAG_WATCHED_DIR` / `CODERAG_STORE_DIR` (or copy `example.env` to `.env`) to avoid repeating flags. ## 🧑‍💻 The surfaces @@ -384,7 +385,7 @@ table is in [`docs/configuration.md`](docs/configuration.md). 
| `CODERAG_PROVIDER` | `fastembed` | Embedding backend: `fastembed` (local) · `openai` (OpenAI API **or** any OpenAI-compatible/local server) · `fake` | | `CODERAG_MODEL` | `BAAI/bge-small-en-v1.5` | Local embedding model (`coderag eval --list-models`) | | `CODERAG_WATCHED_DIR` | cwd | Codebase to index | -| `CODERAG_STORE_DIR` | `./.coderag` | Where the LanceDB store lives | +| `CODERAG_STORE_DIR` | `<watched_dir>/.coderag` | Where the LanceDB store lives (derived from the watched dir, not the cwd) | | `CODERAG_TOP_K` | `8` | Results returned | | `OPENAI_BASE_URL` | – | Point at a self-hosted / local OpenAI-compatible server (Ollama, vLLM, LM Studio, LocalAI) — enables local embeddings **and** local answers | | `OPENAI_API_KEY` | – | OpenAI **cloud** embeddings / answers (optional for a local server) | diff --git a/coderag/config.py b/coderag/config.py index 049777d..145a822 100644 --- a/coderag/config.py +++ b/coderag/config.py @@ -113,6 +113,18 @@ def _env_path(key: str, default: Path) -> Path: return Path(raw).expanduser() +def _env_path_opt(key: str) -> Path | None: + """Like :func:`_env_path` but returns ``None`` when the var is unset/blank. + + Used for ``store_dir`` so an absent ``CODERAG_STORE_DIR`` leaves it unset and lets it be + derived from ``watched_dir`` (see :meth:`Config.__post_init__`) rather than the cwd. + """ + raw = os.getenv(key) + if raw is None or not raw.strip(): + return None + return Path(raw).expanduser() + + def _env_tuple(key: str, default: Tuple[str, ...]) -> Tuple[str, ...]: """Parse a comma-separated env var into a tuple of trimmed, non-empty values.""" raw = os.getenv(key) @@ -139,7 +151,14 @@ class Config: # --- Locations --- watched_dir: Path = field(default_factory=Path.cwd) - store_dir: Path = field(default_factory=lambda: Path.cwd() / ".coderag") + # Defaults to ``<watched_dir>/.coderag`` (resolved in ``__post_init__``), so the index + # lives next to the code it indexes. 
It deliberately does NOT default to the *current + # working directory*: a cwd-relative store silently pointed at a different (often empty) + # ``.coderag`` whenever a command ran from a directory other than the one indexed — e.g. + # ``coderag index --watched-dir /home/me`` would write to ``/home/me/.coderag`` only if + # you happened to be standing in ``/home/me``, and ``coderag status`` from elsewhere then + # found nothing. Set it explicitly with ``--store-dir`` / ``CODERAG_STORE_DIR`` to override. + store_dir: Path = None # type: ignore[assignment] # filled in by __post_init__ # --- What to index --- languages: Tuple[str, ...] = DEFAULT_LANGUAGES @@ -254,8 +273,31 @@ class Config: demo_max_answers: int = 5 # LLM answers allowed per browser session demo_cooldown_seconds: int = 20 # minimum seconds between answers in a session + def __post_init__(self) -> None: + # Derive the store location from the watched dir when it wasn't set explicitly, so the + # index is found regardless of the current working directory. Runs on every + # construction — including the ``replace()`` inside ``with_overrides`` — so overriding + # ``watched_dir`` without a ``store_dir`` re-derives the store from the new watched dir. + if self.store_dir is None: + # Frozen dataclass: assign through object.__setattr__. + object.__setattr__(self, "store_dir", self.watched_dir / ".coderag") + def with_overrides(self, **kwargs: object) -> "Config": - """Return a copy with the given fields replaced (config stays immutable).""" + """Return a copy with the given fields replaced (config stays immutable). + + Overriding ``watched_dir`` alone also moves the store: ``replace`` re-runs + ``__post_init__``, but only when ``store_dir`` is left unset here. To keep an + auto-derived store from following a new watched dir, pass ``store_dir`` explicitly. 
+ """ + # If watched_dir is being moved but store_dir isn't given *and* the current store_dir + # was auto-derived from the old watched_dir, clear it so __post_init__ re-derives it + # against the new watched_dir. + if ( + "watched_dir" in kwargs + and "store_dir" not in kwargs + and self.store_dir == self.watched_dir / ".coderag" + ): + kwargs = {**kwargs, "store_dir": None} return replace(self, **kwargs) # type: ignore[arg-type] @classmethod @@ -272,7 +314,8 @@ def from_env(cls, **overrides: object) -> "Config": "CODERAG_CACHE_DIR", Path.home() / ".cache" / "coderag" ), watched_dir=_env_path("CODERAG_WATCHED_DIR", Path.cwd()), - store_dir=_env_path("CODERAG_STORE_DIR", Path.cwd() / ".coderag"), + # None => derived from watched_dir in __post_init__ (never the cwd). + store_dir=_env_path_opt("CODERAG_STORE_DIR"), # type: ignore[arg-type] top_k=_env_int("CODERAG_TOP_K", cls.top_k), fetch_k=_env_int("CODERAG_FETCH_K", cls.fetch_k), rrf_k=_env_int("CODERAG_RRF_K", cls.rrf_k), diff --git a/coderag/surfaces/cli.py b/coderag/surfaces/cli.py index 75116bf..ecf892f 100644 --- a/coderag/surfaces/cli.py +++ b/coderag/surfaces/cli.py @@ -368,7 +368,8 @@ def _env_port(key: str, default: int) -> int: def _add_common(p: argparse.ArgumentParser) -> None: p.add_argument("--watched-dir", help="Codebase root to index/search.") p.add_argument( - "--store-dir", help="Where the index/database live (default ./.coderag)." + "--store-dir", + help="Where the index/database live (default /.coderag).", ) p.add_argument( "--provider", diff --git a/docs/configuration.md b/docs/configuration.md index 1762959..6ec11b0 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -158,7 +158,7 @@ optional. | Variable | Default | Meaning | | --- | --- | --- | | `CODERAG_WATCHED_DIR` | cwd | Codebase to index/search. | -| `CODERAG_STORE_DIR` | `./.coderag` | Where the LanceDB store lives. | +| `CODERAG_STORE_DIR` | `<watched_dir>/.coderag` | Where the LanceDB store lives. 
Derived from the watched dir (not the cwd) so the index is found no matter where you run `coderag` from. | | `CODERAG_INDEX_ALL_TEXT` | `false` | Index any UTF-8 text file (docs/config/extensionless), not just code. Binary files are always skipped. | ### Retrieval & quality diff --git a/example.env b/example.env index f83a8f2..4987818 100644 --- a/example.env +++ b/example.env @@ -14,8 +14,10 @@ CODERAG_MODEL=BAAI/bge-small-en-v1.5 # --- Locations --- # The codebase to index/search (defaults to the current directory). CODERAG_WATCHED_DIR=/path/to/your/codebase -# Where the LanceDB store is kept (defaults to ./.coderag). -# CODERAG_STORE_DIR=./.coderag +# Where the LanceDB store is kept. Defaults to <watched_dir>/.coderag — i.e. next to the +# code it indexes, so the index is found no matter which directory you run coderag from +# (not the current working directory). Uncomment to put it somewhere else. +# CODERAG_STORE_DIR=/path/to/your/codebase/.coderag # --- Retrieval --- # CODERAG_TOP_K=8 diff --git a/tests/test_config_and_providers.py b/tests/test_config_and_providers.py index a06cd2c..4e2b8b7 100644 --- a/tests/test_config_and_providers.py +++ b/tests/test_config_and_providers.py @@ -14,6 +14,53 @@ def test_config_defaults_and_derived_paths(tmp_path): assert cfg.store_dir == tmp_path / ".coderag" +def test_store_dir_derives_from_watched_dir_not_cwd(tmp_path, monkeypatch): + """The store lives under the *watched* dir, not the shell's cwd. + + Regression: a cwd-relative default silently pointed `status`/`search` at a different + (empty) ``.coderag`` whenever a command ran from a directory other than the indexed one. + """ + watched = tmp_path / "project" + watched.mkdir() + elsewhere = tmp_path / "elsewhere" + elsewhere.mkdir() + monkeypatch.chdir(elsewhere) # stand somewhere other than the watched dir + + cfg = Config(watched_dir=watched) + assert cfg.store_dir == watched / ".coderag" + + # Same result via from_env, and regardless of where the process is launched from. 
+ cfg_env = Config.from_env(watched_dir=watched) + assert cfg_env.store_dir == watched / ".coderag" + + +def test_explicit_store_dir_overrides_derivation(tmp_path): + watched = tmp_path / "project" + explicit = tmp_path / "custom-store" + cfg = Config(watched_dir=watched, store_dir=explicit) + assert cfg.store_dir == explicit # explicit wins; not <watched_dir>/.coderag + + +def test_store_dir_from_env_var(tmp_path, monkeypatch): + monkeypatch.setenv("CODERAG_STORE_DIR", str(tmp_path / "env-store")) + cfg = Config.from_env(watched_dir=tmp_path / "project") + assert cfg.store_dir == tmp_path / "env-store" + + +def test_overriding_watched_dir_moves_derived_store(tmp_path): + """Re-pointing watched_dir on an auto-derived config moves the store with it.""" + cfg = Config(watched_dir=tmp_path / "a") + assert cfg.store_dir == tmp_path / "a" / ".coderag" + moved = cfg.with_overrides(watched_dir=tmp_path / "b") + assert moved.store_dir == tmp_path / "b" / ".coderag" + # But an explicit store_dir is sticky across a watched_dir move. + pinned = Config(watched_dir=tmp_path / "a", store_dir=tmp_path / "store") + assert ( + pinned.with_overrides(watched_dir=tmp_path / "b").store_dir + == tmp_path / "store" + ) + + def test_config_is_immutable_and_copies(): cfg = Config() updated = cfg.with_overrides(top_k=42)