diff --git a/.agents/skills/raincloud-add-kaggle-tos/SKILL.md b/.agents/skills/raincloud-add-kaggle-tos/SKILL.md index 254d1fa..747e301 100644 --- a/.agents/skills/raincloud-add-kaggle-tos/SKILL.md +++ b/.agents/skills/raincloud-add-kaggle-tos/SKILL.md @@ -25,7 +25,7 @@ Steps: Use the [Python load-edit-dump pattern](../../context/AGENTS.md#safe-ways-to-edit-sourcesjson) — never `sed`. -2. **Confirm Kaggle creds are set up:** `~/.kaggle/kaggle.json` with `chmod 600`, and the project synced via `uv sync --extra kaggle`. +2. **Confirm Kaggle creds are set up:** `~/.kaggle/kaggle.json` with `chmod 600`, and the project synced via `uv sync --extra kaggle --inexact`. 3. **Try the first build via `/raincloud-build --loose`.** Pre-flight will print `kaggle (ToS-gated): ...`. Expect a 403 on the first try. The error message will point at the exact Kaggle URL the user must visit. diff --git a/.agents/skills/raincloud-build/SKILL.md b/.agents/skills/raincloud-build/SKILL.md index c81ab21..4cd2006 100644 --- a/.agents/skills/raincloud-build/SKILL.md +++ b/.agents/skills/raincloud-build/SKILL.md @@ -24,7 +24,7 @@ Modifiers: Before running: - **Confirm with the user** before triggering anything non-trivial. JSONBench 100M ≈ 6 h, Wikipedia Structured Contents → 34 GB parquet, OSM Germany ~45 min per kind. Small (<100 MB) parquets are fine without asking. (See [AGENTS.md "Rebuilding is expensive"](../../context/AGENTS.md).) - For large builds, set `RAINCLOUD_DUCKDB_MEMORY_LIMIT` and `RAINCLOUD_DUCKDB_TEMP_DIRECTORY` — see `/raincloud-large-build` for the full pattern. -- For Kaggle/HF datasets, ensure `uv sync --extra kaggle` (or `--extra huggingface`) was run. +- For Kaggle/HF datasets, ensure `uv sync --extra kaggle --inexact` (or `--extra huggingface --inexact`) was run. The `--inexact` flag is important: without it, syncing one extra removes the others. After a successful build, suggest running `/raincloud-docs` to regenerate derived docs. 
diff --git a/AGENTS.md b/AGENTS.md index 7108e85..cdfae1c 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,7 +12,7 @@ On a fresh clone `outputs/` is empty — that's expected. The `outputs/v1/ python -m scripts.pipeline.status --fast --missing-only ``` -It loads `sources.json`, walks the manifest, and prints per-slug filesystem state in seconds with no side effects. If it errors, fix the env (`uv sync`) before running any build. +It loads `sources.json`, walks the manifest, and prints per-slug filesystem state in seconds with no side effects. If it errors, fix the env (`uv sync --inexact`) before running any build. Always pass `--inexact` to `uv sync`: without it, syncing one extra (e.g. `--extra dev`) silently uninstalls the others (kaggle, huggingface, tui), so a subsequent build of an HF/Kaggle slug will fail. For a manifest sanity check that doesn't touch the filesystem at all: @@ -33,12 +33,12 @@ python -m scripts.pipeline.list_datasets --grep '\bgeo' --long Filters compose with AND across `--family`, `--handler`, `--license`, `--fetch-type`, `--reader`, `--vortex` / `--no-vortex`, `--kaggle-tos`, `--grep`. Output modes: default (one slug per line), `--long` (wide table), `--json` (jq-friendly), `--count`. -If the user wants to *browse* interactively rather than query, point them at `python -m scripts.pipeline.browse` (read-only Textual TUI over the same data; requires `uv sync --extra tui`). It's a human-facing tool — don't try to run it from an agent context, since it won't render and will hang waiting for keystrokes. +If the user wants to *browse* interactively rather than query, point them at `python -m scripts.pipeline.browse` (read-only Textual TUI over the same data; requires `uv sync --extra tui --inexact`). It's a human-facing tool — don't try to run it from an agent context, since it won't render and will hang waiting for keystrokes. 
For a slightly broader regression net, the `tests/` directory carries a sub-second pytest smoke suite (manifest shape, schema self-consistency, handler registry, example template). Run it after any change to the manifest, the schema, or the handler registry: ```bash -uv sync --extra dev # one-time — installs pytest +uv sync --extra dev --inexact # one-time — installs pytest, preserves other extras pytest ``` diff --git a/CHANGELOG.md b/CHANGELOG.md index 2662bad..81c9420 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.2] - 2026-05-10 + +### Fixed + +- All `uv sync` instructions across the docs (README, AGENTS, CONTRIBUTING, + SKILLS, in-code install hints, and skill files) now pass `--inexact` so + installing one extra no longer uninstalls the others. Without this, the + documented sequential setup (`uv sync --extra tui` → bare `uv sync` → + `uv sync --extra huggingface`) silently left the user with only the last + extra installed, and subsequent builds of HF/Kaggle slugs failed with + `ImportError`. uv has no project-level toggle for this — `--inexact` is + per-command — so the fix is documentation-wide. + +### Changed + +- TUI build action (`python -m scripts.pipeline.browse`, then `b` on a row) + now runs `uv sync --extra <kaggle|huggingface> --inexact` automatically + before the build subprocess when the dataset's `fetch.type` requires an + upstream-fetch backend. Sync output streams into the same RichLog as the + build; sync failure aborts the build with a visible exit code. Pure-HTTP + and custom-fetch slugs see the same flow as before (no extra sync). + `BuildConfirmModal` surfaces the sync command line above the build command + line so the user sees both before confirming. 
+ ## [0.1.1] - 2026-05-07 ### Added @@ -82,5 +106,6 @@ This release bundles: this repository" button in the repo sidebar with BibTeX / APA / Chicago exports. +[0.1.2]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.2 [0.1.1]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.1 [0.1.0]: https://github.com/spiraldb/raincloud/releases/tag/v0.1.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f9db3bd..80d263e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -10,11 +10,14 @@ pipeline itself, see [`README.md`](README.md), [`AGENTS.md`](AGENTS.md), and ```bash git clone git@github.com:spiraldb/raincloud.git cd raincloud -uv sync --extra dev +uv sync --extra dev --inexact ``` `--extra dev` pulls in `pytest`. Add `--extra kaggle` or `--extra huggingface` if your work touches those upstream types, or `--extra all` for everything. +Always pass `--inexact` — without it, each `uv sync --extra X` removes the +extras from the previous one (e.g. syncing `--extra dev` after `--extra +huggingface` uninstalls `huggingface_hub`). ## Before you open a PR diff --git a/README.md b/README.md index d9b36a1..0c2d595 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Nothing downstream of `sources.json` is hand-maintained; `docs/datasets.md` and **Browse the catalog at a glance** — sortable columns, parquet/vortex presence per slug, no builds required: ```bash -uv sync --extra tui +uv sync --extra tui --inexact python -m scripts.pipeline.browse ``` @@ -33,7 +33,7 @@ A read-only Textual TUI over `sources.json`. Click any column header to sort; ri **Tell Raincloud which dataset you want; get back a Parquet + Vortex file on disk.** ```bash -uv sync +uv sync --inexact python -m scripts.pipeline.status --fast --missing-only # read-only env check python -m scripts.pipeline.build countries-of-the-world ``` @@ -59,23 +59,25 @@ python -m scripts.pipeline.build --family public-bi # all 46 Public BI w ```bash # Kaggle-hosted (33 slugs). 
One-time credential setup: -uv sync --extra kaggle +uv sync --extra kaggle --inexact mkdir -p ~/.kaggle && mv /path/to/kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json # Hugging Face-hosted (59 slugs): -uv sync --extra huggingface +uv sync --extra huggingface --inexact # Everything: -uv sync --extra all +uv sync --extra all --inexact ``` +The `--inexact` flag matters: by default `uv sync` removes any extras you installed previously. Pass it on every `uv sync` so the tui / kaggle / huggingface / dev extras accumulate instead of overwriting each other. + ## For AI coding agents If you're an AI coding agent landing in this repo: 1. Read [`AGENTS.md`](AGENTS.md) (auto-loaded from `CLAUDE.md → AGENTS.md`) for the invariants and architecture. 2. Run `python -m scripts.pipeline.status --fast --missing-only` to verify the env, then `python -m scripts.pipeline.validate_manifest` to confirm `sources.json` is well-formed. Both are sub-second and side-effect-free. -3. Run `pytest` (after `uv sync --extra dev`) for a regression net before any non-trivial change to the manifest, schema, or handler registry. +3. Run `pytest` (after `uv sync --extra dev --inexact`) for a regression net before any non-trivial change to the manifest, schema, or handler registry. 4. For catalog questions ("which slugs use handler X", "what's CC0-licensed"), use `python -m scripts.pipeline.list_datasets` rather than greping `sources.json` or scrolling [`docs/v1/datasets.md`](docs/v1/datasets.md). 5. Copy-pasteable templates for new manifest entries and streaming handlers live in [`examples/`](examples/). 6. Harnesses that follow the [Agent Skills](https://agentskills.io) standard get 16 invokable skills under [`.agents/skills/`](.agents/skills/) (the `.claude → .agents` symlink means Claude Code sees the same files). Tracked safe-default permissions in [`.agents/settings.json`](.agents/settings.json) — see [`.agents/README.md`](.agents/README.md) for the full layout. 
@@ -175,7 +177,7 @@ python -m scripts.pipeline.list_datasets --grep '\bgeo' --long python -m scripts.pipeline.browse # interactive TUI (requires --extra tui) # Run the test suite (sub-second, no fetch / no build) -uv sync --extra dev && pytest +uv sync --extra dev --inexact && pytest ``` Each stage is independently invokable — e.g. `python -m scripts.pipeline.fetch <slug>` to download raw bytes without running the rest. Stages are idempotent: fetch skips when `expected_bytes`/`expected_sha256` already matches on disk, write skips when the output parquet is already current. diff --git a/SKILLS.md b/SKILLS.md index 67e1b74..b1f5df8 100644 --- a/SKILLS.md +++ b/SKILLS.md @@ -2,7 +2,7 @@ Playbooks for common operations in this repo. Each section is a self-contained recipe — copy and adapt. -Prereqs: Python 3.11+ and [uv](https://docs.astral.sh/uv/). Run `uv sync` in the repo root to install the pinned core deps (`pyarrow`, `duckdb`, `vortex-data`, `zstandard`, `py7zr`, `unlzw3`, `pandas`, `openpyxl`, `pyreadstat`, `osmium`, `jsonschema`). Add `--extra kaggle` for Kaggle-hosted datasets, `--extra huggingface` for Hugging Face ones, or `--extra dev` for `pytest` — see [`README.md`](README.md#upstream-specific-extras). Invoke Python as `.venv/bin/python` (or activate the venv). +Prereqs: Python 3.11+ and [uv](https://docs.astral.sh/uv/). Run `uv sync --inexact` in the repo root to install the pinned core deps (`pyarrow`, `duckdb`, `vortex-data`, `zstandard`, `py7zr`, `unlzw3`, `pandas`, `openpyxl`, `pyreadstat`, `osmium`, `jsonschema`). Add `--extra kaggle` for Kaggle-hosted datasets, `--extra huggingface` for Hugging Face ones, or `--extra dev` for `pytest` — see [`README.md`](README.md#upstream-specific-extras). Always pass `--inexact` so subsequent extras accumulate instead of overwriting prior ones (uv's default is "exact" sync, which removes anything not requested by the current invocation). Invoke Python as `.venv/bin/python` (or activate the venv). 
## Index @@ -51,8 +51,8 @@ Env vars the helper honours (see [`README.md`](README.md#duckdb-resource-limits) ## Running the test suite ```bash -uv sync --extra dev # one-time — installs pytest -pytest # ~0.5 s on the full suite +uv sync --extra dev --inexact # one-time — installs pytest, preserves other extras +pytest # ~0.5 s on the full suite ``` `tests/` carries a sub-second smoke suite for the manifest, the schema, the handler registry, and the example templates. No fetch, no build, no filesystem writes. Run after any change to `sources.json`, `sources.schema.json`, `scripts/pipeline/handlers/__init__.py`, or `examples/`. Tests exercise the same `validate_manifest` codepath the `/raincloud-validate-manifest` skill runs. diff --git a/pyproject.toml b/pyproject.toml index f7ebebc..7487b5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "raincloud" -version = "0.1.1" +version = "0.1.2" description = "Client-reproducible pipeline for building a curated catalog of public datasets as Parquet + Vortex files." readme = "README.md" requires-python = ">=3.11" diff --git a/scripts/pipeline/browse.py b/scripts/pipeline/browse.py index e8bd35f..6e389dc 100644 --- a/scripts/pipeline/browse.py +++ b/scripts/pipeline/browse.py @@ -7,7 +7,7 @@ streams output into a modal log; cancellation kills the subprocess. Run: `python -m scripts.pipeline.browse` -Install: `uv sync --extra tui` +Install: `uv sync --extra tui --inexact` Keybindings: q — quit (orphans any in-flight build subprocesses) @@ -37,7 +37,7 @@ from textual.widgets import DataTable, Footer, Header, RichLog, Static except ImportError: print( - "textual is not installed. Install with: uv sync --extra tui", + "textual is not installed. 
Install with: uv sync --extra tui --inexact", file=sys.stderr, ) raise SystemExit(2) @@ -216,6 +216,30 @@ def _read_column_stats(parquet: Path) -> list[dict] | None: return read_column_stats(parquet) +def _required_extras(spec: dict) -> list[str]: + """Optional pyproject extras the build needs based on fetch.type. + + Handler-specific format deps (pandas, openpyxl, pyreadstat, osmium, + zstandard, py7zr, unlzw3) all live in core deps, so the only extras + we ever need to pull in on demand are the upstream-fetch backends: + `kaggle` for fetch.type=kaggle, `huggingface` for fetch.type=huggingface. + Returns [] for http / custom — no sync needed before build. + """ + ftype = (spec.get("fetch") or {}).get("type") + if ftype == "kaggle": return ["kaggle"] + if ftype == "huggingface": return ["huggingface"] + return [] + + +def _uv_sync_command(extras: list[str]) -> list[str]: + """Argv for `uv sync --extra X [--extra Y...] --inexact`.""" + cmd = ["uv", "sync"] + for e in extras: + cmd += ["--extra", e] + cmd.append("--inexact") + return cmd + + def _build_time_estimate(spec: dict, snapshot: dict | None = None) -> str: """Heuristic build-time bracket. Prefers fetch.expected_bytes when set (most accurate, but rare in the manifest); falls back to expect.rows, @@ -549,6 +573,18 @@ def compose(self) -> ComposeResult: if advisory else "" ) + extras = _required_extras(spec) + sync_line = ( + f" [reverse] {' '.join(_uv_sync_command(extras))} [/reverse]\n" + if extras else "" + ) + sync_note = ( + f"[dim]First syncs the {'/'.join(extras)} extra (preserving any " + f"others installed) so the {(spec.get('fetch') or {}).get('type')} " + f"backend is available; then runs the build.[/dim]\n\n" + if extras else "" + ) + body = ( f"[dim]{full_name}[/dim]\n\n" f"{description}\n\n" @@ -557,7 +593,9 @@ def compose(self) -> ComposeResult: f"[b]est. 
time[/b] {est}\n" f"{advisory_block}\n" f"Will run from the repo root:\n" + f"{sync_line}" f" [reverse] python -m scripts.pipeline.build {self.slug} [/reverse]\n\n" + f"{sync_note}" f"[dim]The TUI will stream the subprocess output. Cancelling the " f"build modal terminates the subprocess. Quitting the TUI " f"orphans any in-flight builds — use the CLI for hours-long " @@ -622,9 +660,10 @@ class BuildLogModal(ModalScreen): Binding("q", "request_close", "cancel + close"), ] - def __init__(self, slug: str) -> None: + def __init__(self, slug: str, spec: dict | None = None) -> None: super().__init__() self.slug = slug + self.spec = spec or {} self._process: asyncio.subprocess.Process | None = None self._task: asyncio.Task | None = None @@ -641,23 +680,41 @@ def compose(self) -> ComposeResult: def on_mount(self) -> None: self._task = asyncio.create_task(self._run_build()) + async def _stream_subprocess(self, argv: list[str], log: "RichLog") -> int: + """Spawn argv, stream stdout/stderr line-by-line into `log`, return exit code. 
+ Stores the process on self so cancellation can SIGTERM/SIGKILL it.""" + self._process = await asyncio.create_subprocess_exec( + *argv, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + cwd=str(REPO_ROOT), + ) + assert self._process.stdout is not None + while True: + line = await self._process.stdout.readline() + if not line: + break + log.write(line.decode("utf-8", errors="replace").rstrip("\n")) + return await self._process.wait() + async def _run_build(self) -> None: log = self.query_one("#build-log", RichLog) status = self.query_one("#status", Static) try: - self._process = await asyncio.create_subprocess_exec( - sys.executable, "-u", "-m", "scripts.pipeline.build", self.slug, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.STDOUT, - cwd=str(REPO_ROOT), - ) - assert self._process.stdout is not None - while True: - line = await self._process.stdout.readline() - if not line: - break - log.write(line.decode("utf-8", errors="replace").rstrip("\n")) - rc = await self._process.wait() + extras = _required_extras(self.spec) + if extras: + sync_cmd = _uv_sync_command(extras) + status.update(f"[yellow]syncing {'/'.join(extras)}…[/yellow]") + log.write(f"$ {' '.join(sync_cmd)}") + rc = await self._stream_subprocess(sync_cmd, log) + if rc != 0: + status.update(f"[red]✗ uv sync failed (exit {rc}) — build skipped[/red]") + return + log.write("") # blank line between sync and build output + status.update("[yellow]running…[/yellow]") + build_cmd = [sys.executable, "-u", "-m", "scripts.pipeline.build", self.slug] + log.write(f"$ {' '.join(build_cmd)}") + rc = await self._stream_subprocess(build_cmd, log) if rc == 0: status.update("[green]✓ build succeeded[/green]") else: @@ -887,7 +944,7 @@ def _on_confirm(confirmed: bool | None) -> None: _vortex_cell(spec, parquet, vortex), _hydrate_cell(spec, hydrated), ) - self.push_screen(BuildLogModal(slug)) + self.push_screen(BuildLogModal(slug, spec)) self.push_screen(BuildConfirmModal(slug, spec, 
snapshot=self._snapshot), _on_confirm) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 974abcd..748f861 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -6,7 +6,7 @@ build, no filesystem writes. They're the regression net for changes to the manifest, the schema, or the handler registry. -Run: `uv sync --extra dev && pytest`. +Run: `uv sync --extra dev --inexact && pytest`. """ from __future__ import annotations