diff --git a/Cargo.lock b/Cargo.lock index bf8c74d7058..9520911ca5b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10438,6 +10438,7 @@ dependencies = [ "flate2", "insta", "maud", + "parking_lot", "reqwest 0.13.3", "serde", "serde_json", diff --git a/benchmarks-website/AGENTS.md b/benchmarks-website/AGENTS.md index bf00e48b855..4ff2196822c 100644 --- a/benchmarks-website/AGENTS.md +++ b/benchmarks-website/AGENTS.md @@ -14,10 +14,12 @@ covers Rust style, test layout, commit conventions. Until the cutover PR lands, the top-level v2 files (`server.js`, `src/`, `index.html`, `vite.config.js`, `package.json`, `package-lock.json`, `public/`, the top-level `Dockerfile`, -`docker-compose.yml`, `ec2-init.txt`) and the `benchmarks-website` service -in `docker-compose.yml` and the `publish-benchmarks-website.yml` workflow +`docker-compose.yml`) and the `publish-benchmarks-website.yml` workflow are production. Don't edit them as part of unrelated work. +The v3 deploy lives entirely under `server/`, `migrate/`, and `ops/`. +The operator runbook is [`ops/README.md`](ops/README.md). + ## v3 specifics - **Wire shapes are a coordinated change.** [`server/src/records.rs`](server/src/records.rs), diff --git a/benchmarks-website/README.md b/benchmarks-website/README.md index 63604539279..98f775d980e 100644 --- a/benchmarks-website/README.md +++ b/benchmarks-website/README.md @@ -9,16 +9,16 @@ The website behind `bench.vortex.dev`. The directory currently houses **two implementations side by side**, run together until the v3 cutover lands: - **v2** (top-level files: `server.js`, `src/`, `index.html`, `vite.config.js`, - `package.json`, `Dockerfile`, `docker-compose.yml`, `ec2-init.txt`, - `public/`). The Node + React stack that has shipped to production for the - life of the site. Built and published by + `package.json`, `Dockerfile`, `docker-compose.yml`, `public/`). The Node + + React stack that has shipped to production for the life of the site. Built + and published by [`.github/workflows/publish-benchmarks-website.yml`](../.github/workflows/publish-benchmarks-website.yml). -- **v3** (`server/` + `migrate/`). A single Rust binary — +- **v3** (`server/` + `migrate/` + `ops/`). A single Rust binary — [`vortex-bench-server`](server/) — that owns a DuckDB file on local disk, serves the API, and renders the HTML. Compiles all static assets (`chart.umd.js`, `chart-init.js`, `style.css`) into the binary so deploys - are one file plus a database. Container image at - `ghcr.io/vortex-data/vortex/vortex-bench-server:latest`. + are one file plus a database. Built directly on the EC2 host by + [`ops/deploy.sh`](ops/deploy.sh) — see [`ops/README.md`](ops/README.md). [`migrate/`](migrate/) is a one-shot tool that loads v2's S3 dataset into a v3 DuckDB; it is throwaway and goes away after cutover. @@ -79,12 +79,22 @@ npm run dev ## Deployment -`docker-compose.yml` runs both stacks side by side: v2 on `:80` and v3 on -`:3001`. `watchtower` polls GHCR every 60s so a fresh image push lands -automatically. v3 reads `INGEST_BEARER_TOKEN` from -`/etc/vortex-bench/secrets.env`, persists DuckDB to -`/opt/benchmarks-website/data/bench.duckdb`, and binds `0.0.0.0:3000` so the -container's `:3001` host port forwards through. +v3 runs as a systemd service on a single EC2 host. The full operator +runbook (first-time install, day-to-day, failure modes) is in +[`ops/README.md`](ops/README.md). Summary: + +- A `vortex-bench-deploy.timer` polls `origin/develop` every 60s. 
If commits
+  in the range touch `benchmarks-website/server/`, `benchmarks-website/migrate/`,
+  `Cargo.toml`, or `Cargo.lock`, it builds and atomically swaps the binary,
+  then verifies `/health`. Otherwise it fast-forwards the working tree and
+  exits silently.
+- A `vortex-bench-backup.timer` fires hourly: it asks the server to
+  `EXPORT DATABASE` via the bearer-gated `/api/admin/snapshot` endpoint,
+  `tar czf`s the snapshot output into `<ts>.tar.gz`, uploads it to
+  `s3://vortex-benchmark-results-database/v3-backups/`, and deletes the local
+  copies.
+- For ad-hoc reads against the live DB, `ops/inspect.sh` calls a
+  bearer-gated `/api/admin/sql` endpoint — no server stop required.
 
 The v3 server is throwaway-friendly: every request runs against the local
 DuckDB file, and a fresh boot reapplies the schema DDL idempotently. The
@@ -96,13 +106,12 @@ re-running `vortex-bench-migrate run --output ...` is safe.
 The work to flip `bench.vortex.dev` from v2 to v3 is tracked outside this
 repo. The relevant code-side bits:
 
-- v3 runs alongside v2 on the same EC2 host today (v2 on `:80`, v3 on
-  `:3001`) and is fed by CI's dual-write `--gh-json-v3` path.
+- v3 runs alongside v2 on the same EC2 host today and is fed by CI's
+  dual-write `--gh-json-v3` path.
 - v2 keeps shipping unchanged until DNS flips. **Do not touch the
   top-level v2 files unless you are doing the cleanup PR opened
   post-flip.**
 - The v2 cleanup PR removes everything top-level under
   `benchmarks-website/` that belongs to v2 (`server.js`, `src/`,
   `index.html`, `vite.config.js`, `package.json`, `package-lock.json`,
   `public/`, the top-level `Dockerfile`,
-  `docker-compose.yml`, `ec2-init.txt`, and the
-  `publish-benchmarks-website.yml` workflow). The v3 tree under `server/` and
-  `migrate/` is untouched.
+  `docker-compose.yml`, and the `publish-benchmarks-website.yml` workflow).
+  The v3 tree under `server/`, `migrate/`, and `ops/` is untouched.
diff --git a/benchmarks-website/ec2-init.txt b/benchmarks-website/ec2-init.txt
deleted file mode 100644
index 4e1377cc014..00000000000
--- a/benchmarks-website/ec2-init.txt
+++ /dev/null
@@ -1,70 +0,0 @@
- 1. Install Docker
-    # Amazon Linux 2023
-    sudo yum install -y docker
-    sudo systemctl enable --now docker
-    sudo usermod -aG docker $USER
-    newgrp docker
-
- 2. Install Docker Compose plugin
-    sudo mkdir -p /usr/local/lib/docker/cli-plugins
-    sudo curl -SL https://github.com/docker/compose/releases/latest/download/docker-compose-linux-aarch64 -o /usr/local/lib/docker/cli-plugins/docker-compose
-    sudo chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
-
- 3. Set up and start the app
-    sudo mkdir -p /opt/benchmarks-website
-    sudo cp docker-compose.yml /opt/benchmarks-website/
-    cd /opt/benchmarks-website
-    docker compose up -d
-
-    ====================================================================
-    v3 (vortex-bench-server) — additive setup, runs alongside v2
-    ====================================================================
-
-    v2 stays on port 80 until DNS is flipped. v3 runs on port 3001 from
-    the same docker-compose.yml on this host.
-
- 4. Create the bearer-token env file (root:root, mode 600)
-    sudo mkdir -p /etc/vortex-bench
-    sudo install -m 600 -o root -g root /dev/null /etc/vortex-bench/secrets.env
-    # Edit and set INGEST_BEARER_TOKEN=:
-    sudo vi /etc/vortex-bench/secrets.env
-    # File contents:
-    # INGEST_BEARER_TOKEN=
-
- 5. Create the EBS-backed DuckDB data directory
-    # Assumes an EBS volume is already mounted at /opt/benchmarks-website/data.
-    sudo mkdir -p /opt/benchmarks-website/data
-    sudo chown root:root /opt/benchmarks-website/data
-    sudo chmod 755 /opt/benchmarks-website/data
-
- 6. Pull and start v3 (watchtower already polls ghcr.io for refreshes)
-    cd /opt/benchmarks-website
-    docker compose pull vortex-bench-server
-    docker compose up -d vortex-bench-server
-    # Smoke-check on the host:
-    curl -sf http://127.0.0.1:3001/health || echo "v3 not responding"
-
- 7. Install the daily DuckDB backup cron
-    # Copy the backup script from the repo checkout to a stable location.
-    sudo install -m 755 -o root -g root \
-      benchmarks-website/server/scripts/backup.sh \
-      /usr/local/bin/vortex-bench-backup.sh
-    # Cron entry: 06:00 UTC daily, after the nightly bench finishes.
-    sudo tee /etc/cron.d/vortex-bench-backup >/dev/null <<'CRON'
-    0 6 * * * root /usr/local/bin/vortex-bench-backup.sh >> /var/log/vortex-bench-backup.log 2>&1
-    CRON
-    sudo chmod 644 /etc/cron.d/vortex-bench-backup
-    # The instance IAM role already permits writes to
-    # s3://vortex-ci-benchmark-results/ (same role v2's cat-s3.sh uses).
-
- 8. Bearer-token rotation procedure
-    # When rotating INGEST_BEARER_TOKEN:
-    # a. Generate a new token (e.g. `openssl rand -hex 32`).
-    # b. Update the GitHub Actions Environment secret INGEST_BEARER_TOKEN
-    #    so CI dual-writes use the new value.
-    # c. On this EC2 host, edit the env file and restart only the v3
-    #    container so v2 traffic on port 80 is unaffected:
-    #    sudo vi /etc/vortex-bench/secrets.env
-    #    cd /opt/benchmarks-website
-    #    docker compose up -d --force-recreate vortex-bench-server
-    # d. Verify with `curl` against /health and a token-gated endpoint.
\ No newline at end of file
diff --git a/benchmarks-website/ops/README.md b/benchmarks-website/ops/README.md
new file mode 100644
index 00000000000..cf026a0b947
--- /dev/null
+++ b/benchmarks-website/ops/README.md
@@ -0,0 +1,647 @@
+
+# vortex-bench-server — operations runbook
+
+This is the canonical guide for deploying and operating the v3
+benchmarks site (`bench.vortex.dev`) on EC2. It targets a fresh admin
+who has SSH access to the box and has never seen the system before.
+
+The contents of this directory are everything the EC2 host needs to
+build, run, deploy, back up, and inspect the server. There is no
+out-of-tree state — every script and unit lives in
+`benchmarks-website/ops/` and gets installed onto the host by
+[`install.sh`](install.sh).
+
+## TL;DR
+
+- One Rust binary (`vortex-bench-server`), one DuckDB file
+  (`/var/lib/vortex-bench/bench.duckdb`).
+- A systemd timer polls `origin/develop` every 60s. If commits in the
+  range touch website-relevant paths it builds, atomically swaps the
+  binary, and restarts the server. Otherwise it fast-forwards the
+  working tree and exits.
+- A second timer fires hourly, asks the server to write a per-table
+  Vortex snapshot (`schema.sql` + one `.vortex` per table),
+  `tar czf`s it, and uploads to
+  `s3://vortex-benchmark-results-database/v3-backups/<ts>.tar.gz`.
+  The vortex DuckDB extension is auto-installed from the community
+  repo on first call. Vortex compresses the BIGINT[] runtime arrays
+  and string columns roughly an order of magnitude better than
+  gzipped CSV — and dogfoods the project's own format.
+- For ad-hoc reads, `inspect.sh` calls a bearer-gated `/api/admin/sql`
+  endpoint instead of stopping the server.
+- For DB-replacing operations (re-running the v2→v3 migration),
+  `migrate.sh` stops the server, snapshots the current DB to
+  `bench.prev-<ts>.duckdb`, runs the migration, and starts back up.
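+
+A quick read-only smoke pass over that loop once everything is
+installed — each command is covered in detail in the sections below:
+
+```bash
+systemctl list-timers 'vortex-bench-*'          # both timers scheduled
+curl -fsS http://127.0.0.1:3000/health | jq .   # server answering
+cat /var/lib/vortex-bench/last-deployed-sha     # what last shipped
+```
+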
+## Architecture
+
+```
+┌──────────────────────────────────────────────────────────────────────┐
+│ EC2 host (Amazon Linux 2023, ec2-user)                               │
+│                                                                      │
+│  /home/ec2-user/vortex/          ← git checkout (build context only) │
+│                                                                      │
+│  /var/lib/vortex-bench/                                              │
+│    bench.duckdb                  ← live DB                           │
+│    bench.duckdb.wal                                                  │
+│    bench.prev-<ts>.duckdb        ← pre-migration backup, last 1-2    │
+│    bin/                                                              │
+│      vortex-bench-server         ← symlink → versioned binary        │
+│      vortex-bench-server.<ts>    ← versioned, last $KEEP_BINARIES (3)│
+│    snapshots/<ts>/               ← transient vortex-snapshot landing │
+│    last-deployed-sha             ← stamp file for the deploy timer   │
+│    .deploy.lock                  ← flock guard                       │
+│    ops -> /home/ec2-user/vortex/benchmarks-website/ops               │
+│                                                                      │
+│  /etc/vortex-bench.env           ← secrets, mode 0600                │
+│  /etc/sudoers.d/vortex-bench     ← lets ec2-user systemctl restart   │
+│                                    the server with no password       │
+│  /etc/systemd/system/                                                │
+│    vortex-bench-server.service   ← serves :3000                      │
+│    vortex-bench-deploy.service   ← oneshot, runs deploy.sh           │
+│    vortex-bench-deploy.timer     ← every 60s                         │
+│    vortex-bench-backup.service   ← oneshot, runs backup.sh           │
+│    vortex-bench-backup.timer     ← hourly                            │
+│                                                                      │
+│  Logs: journalctl -u vortex-bench-{server,deploy,backup}             │
+└──────────────────────────────────────────────────────────────────────┘
+                    │
+                    │ aws s3 cp
+                    ▼
+        ┌─────────────────────────────────────────┐
+        │ s3://vortex-benchmark-results-database/ │
+        │   v3-backups/                           │
+        │     <ts>.tar.gz                         │
+        │       <ts>/                             │
+        │         schema.sql                      │
+        │         <table>
.vortex │ + └───────────────────────────────────────┘ +``` + +## Files in this directory + +| Path | Role | +|--------------------------------------------|------------------------------------------------------------------| +| [`install.sh`](install.sh) | One-time bootstrap on a fresh host. Idempotent. | +| [`deploy.sh`](deploy.sh) | Pull → build (if needed) → atomic restart. Called by timer. | +| [`migrate.sh`](migrate.sh) | Manual: stop, snapshot prev DB, run migrate, restart. | +| [`backup.sh`](backup.sh) | Hourly: trigger `/api/admin/snapshot`, sync to S3, prune local. | +| [`inspect.sh`](inspect.sh) | Read-only SQL via `/api/admin/sql`, no server stop. | +| [`config/vortex-bench.env.example`](config/vortex-bench.env.example) | Template for `/etc/vortex-bench.env`. | +| [`systemd/`](systemd/) | Unit files installed into `/etc/systemd/system/`. | + +## First-time install (on a fresh EC2 host) + +This guide walks an admin who has never seen the system before from +"empty box + AWS account" to "site up, hourly backups landing in S3". +There are two parts: cloud-side setup (IAM role, bucket lifecycle) and +host-side setup (`install.sh`, env file, migration). Do them in that +order — the host-side scripts assume the IAM role is already attached. + +### Host prereqs + +- Amazon Linux 2023 (or any Linux with systemd, sudo, and curl). +- ec2-user has sudo (default on AL2023). +- Rust toolchain installed for the run user — `curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh` if not already. +- `aws` CLI on PATH (Amazon Linux ships with it). +- `git`, `curl`, `jq` (or `python3`), `flock` (`util-linux`), `gcc`/`g++`, + `cmake`, `pkg-config` (the duckdb-sys build needs these). +- The repo's `origin` remote must be the **HTTPS** URL + (`https://github.com/vortex-data/vortex.git`), not `git@github.com:…`. + The deploy timer runs as the unprivileged service user with no SSH + agent, so SSH-based fetches fail with `Permission denied (publickey)`. + Public-repo HTTPS reads are unauthenticated and just work. + +### AWS setup (do this once, from the AWS console) + +The server reads and writes a single S3 prefix — +`s3://vortex-benchmark-results-database/v3-backups/`. Configure two +things in AWS before touching the EC2 box: + +**(a) An IAM role for the EC2 instance.** Least-privilege — only what +the runtime actually needs (read/write objects, list backups). Bucket +admin actions (lifecycle, policy) are intentionally not granted; you +manage those separately from the console. + +In **IAM → Policies → Create policy**, paste this JSON and name it +`VortexBenchV3Backups`: + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ListBucket", + "Effect": "Allow", + "Action": "s3:ListBucket", + "Resource": "arn:aws:s3:::vortex-benchmark-results-database" + }, + { + "Sid": "ReadWriteV3Backups", + "Effect": "Allow", + "Action": ["s3:GetObject", "s3:PutObject", "s3:DeleteObject"], + "Resource": "arn:aws:s3:::vortex-benchmark-results-database/v3-backups/*" + } + ] +} +``` + +In **IAM → Roles → Create role**, pick "AWS service" → "EC2", attach +the `VortexBenchV3Backups` policy, name it `VortexBenchServerRole`. + +In **EC2 → Instances → bench instance → Actions → Security → Modify +IAM role**, pick `VortexBenchServerRole` and Update. Wait ~15s for the +instance metadata service to refresh. 
+ +Verify on the EC2 box: + +```bash +aws sts get-caller-identity # Arn should end in /VortexBenchServerRole/ +echo probe > /tmp/probe.txt +aws s3 cp /tmp/probe.txt s3://vortex-benchmark-results-database/v3-backups/_probe.txt +aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ +aws s3 rm s3://vortex-benchmark-results-database/v3-backups/_probe.txt +rm /tmp/probe.txt +``` + +If any of those four fail with `AccessDenied`, double-check (1) the +policy is actually attached to the role, (2) the instance is using the +new role (`aws sts get-caller-identity` shows the right name), and +(3) there isn't a bucket-level deny in +`S3 → bucket → Permissions → Bucket policy`. + +**(b) An S3 lifecycle rule** so hourly snapshots don't accumulate +forever. The runtime role can't manage lifecycle (by design — it's +admin metadata, not runtime data), so do this in the console once: + +In **S3 → Buckets → vortex-benchmark-results-database → Management → +Lifecycle rules → Create lifecycle rule**: + +- Name: `v3-backups-7d` +- Status: Enabled +- Filter scope: Prefix `v3-backups/` +- Action: "Expire current versions of objects" → **7 days** after creation + +Adjust the retention to taste (7 days × 24 hourly snapshots ≈ 170 +tarballs). The bucket isn't versioned so you can ignore the +noncurrent-version sections. + +### Host setup + +```bash +# 1. Clone the repo (anywhere, but the env file's REPO_DIR must point at it). +# Must be the HTTPS URL — the deploy timer has no SSH agent. +cd ~ && git clone https://github.com/vortex-data/vortex.git +cd vortex +# If you already cloned over SSH, fix the remote in place: +# git remote set-url origin https://github.com/vortex-data/vortex.git + +# 2. Run the installer. It needs sudo for /etc/, /var/lib/, and systemd. +./benchmarks-website/ops/install.sh + +# 3. Fill in the env file the installer staged. +sudo $EDITOR /etc/vortex-bench.env +# Generate the two tokens: +# openssl rand -hex 32 +# Store INGEST_BEARER_TOKEN in the GitHub Actions Environment used by +# .github/workflows/.yml so CI can keep posting. +# ADMIN_BEARER_TOKEN never leaves the box (used only by ops/* scripts). + +# 4. Wait ~90s. The deploy timer's first fire builds the binary and +# starts the server. Tail it: +journalctl -fu vortex-bench-deploy.service + +# 5. Smoke check (server is up but the DB is empty — schema applied, +# no rows). +curl -fsS http://127.0.0.1:3000/health | jq +./benchmarks-website/ops/inspect.sh "SELECT COUNT(*) FROM commits;" + +# 6. Populate the DB. migrate.sh stops the server, runs the migrator, +# and restarts it. The deploy timer never does this — populating +# the DB is a one-time admin action, distinct from deploying code. +/var/lib/vortex-bench/ops/migrate.sh run --output /var/lib/vortex-bench/bench.duckdb + +# 7. Verify the backup loop end-to-end. Fire one backup manually and +# confirm a tarball lands in S3. +sudo systemctl start vortex-bench-backup.service +journalctl -u vortex-bench-backup.service --since '2 min ago' --no-pager +aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ | tail -3 + +# 8. (Alternative to step 6: preserve an existing $HOME/bench.duckdb +# instead of re-migrating.) +sudo systemctl stop vortex-bench-server +sudo -u ec2-user mv ~/bench.duckdb /var/lib/vortex-bench/bench.duckdb +sudo systemctl start vortex-bench-server +``` + +After step 7, the system is fully self-driving: deploys happen +automatically within 60s of merge to develop, snapshots upload +automatically every hour, and the lifecycle rule expires old ones. 
+You don't need to SSH in for routine operations.
+
+## Day-to-day operations
+
+### "I pushed a website change — when does it ship?"
+
+Within 60s of merge to `develop`. The deploy timer fires every minute,
+notices the new SHA, checks whether the diff touches
+`benchmarks-website/server/`, `benchmarks-website/migrate/`,
+`benchmarks-website/Cargo.toml`, `Cargo.toml`, or `Cargo.lock`. If
+yes, it builds, atomically swaps the binary, restarts, and confirms
+`/health` is happy.
+
+If the build fails or `/health` doesn't respond within 30s, the symlink
+rolls back to the previous binary and the server restarts on the old
+version. The stamp file is *not* updated, so the next timer fire
+retries — fix the bug, push again.
+
+Watch a deploy live:
+
+```bash
+journalctl -fu vortex-bench-deploy.service
+```
+
+Force a deploy right now (don't wait for the next tick):
+
+```bash
+sudo systemctl start vortex-bench-deploy.service
+```
+
+### "Which build is actually running?"
+
+Three identifiers, in increasing order of authority:
+
+```bash
+# What the deploy timer last successfully rolled out:
+cat /var/lib/vortex-bench/last-deployed-sha
+
+# Which versioned binary the symlink currently points at:
+readlink /var/lib/vortex-bench/bin/vortex-bench-server
+# → /var/lib/vortex-bench/bin/vortex-bench-server.<ts>
+
+# What the live process baked in at compile time:
+curl -fsS http://127.0.0.1:3000/health | jq '{build_sha, db_path, schema_version}'
+```
+
+`build_sha` is the source of truth — it's the git SHA `cargo build`
+saw when it produced the running binary. If it disagrees with
+`last-deployed-sha`, the running process is stale (e.g. a manual
+binary swap, or systemd is still running an older PID).
+
+### "How do I manually rebuild and restart, outside the timer?"
+
+You shouldn't normally need this — the deploy timer covers all
+ordinary cases — but it's useful when you want to test an unmerged
+branch or recover from a stuck timer. Three knobs:
+
+**(a) Restart the running binary, no rebuild.** Cheapest restart;
+useful after editing `/etc/vortex-bench.env` or recovering from a
+hung connection.
+
+```bash
+sudo systemctl restart vortex-bench-server
+journalctl -fu vortex-bench-server            # confirm it came up
+curl -fsS http://127.0.0.1:3000/health | jq   # build_sha unchanged
+```
+
+**(b) Force a deploy of the configured branch right now.** Triggers
+exactly the same flow the timer runs, including build, atomic symlink
+swap, and `/health` rollback if anything fails.
+
+```bash
+sudo systemctl start vortex-bench-deploy.service
+journalctl -fu vortex-bench-deploy.service    # watch it
+```
+
+**(c) Manually build a binary from the current working tree and
+install it.** Use this to test a branch that isn't `$DEPLOY_BRANCH`
+without flipping the env file. The deploy timer will overwrite your
+manual binary on the next tick that sees a relevant change, so you
+probably want to pause it first:
+
+```bash
+. /etc/vortex-bench.env
+sudo systemctl stop vortex-bench-deploy.timer   # pause auto-deploy
+cd "$REPO_DIR"
+git fetch origin
+git checkout --force --detach origin/<branch>   # pin to whatever you want
+cargo build --release -p vortex-bench-server
+ts=$(date -u +%Y%m%dT%H%M%SZ)
+sudo install -m 0755 -o ec2-user -g ec2-user \
+  target/release/vortex-bench-server \
+  "/var/lib/vortex-bench/bin/vortex-bench-server.manual-${ts}"
+ln -sfnT "/var/lib/vortex-bench/bin/vortex-bench-server.manual-${ts}" \
+  /var/lib/vortex-bench/bin/vortex-bench-server
+sudo systemctl restart vortex-bench-server
+curl -fsS http://127.0.0.1:3000/health | jq .build_sha   # verify new SHA
+# When done testing:
+sudo systemctl start vortex-bench-deploy.timer  # resume auto-deploy
+```
+
+The timer's next fire (within 60s) will overwrite your manual binary
+with whatever `origin/$DEPLOY_BRANCH` produces, which is usually what
+you want — manual binaries are scratch space, not long-term state.
+
+### "A vortex-array PR landed — does the website rebuild?"
+
+No. The path filter ignores anything outside the directories listed
+above. The working tree still fast-forwards (so a future website
+change builds against the latest deps) but the server keeps running.
+
+If you ever want to force a rebuild against a non-website change, push
+a no-op commit that touches `benchmarks-website/server/` (e.g. a
+whitespace edit in `README.md`).
+
+### "How do I re-run the v2→v3 migration?"
+
+`migrate.sh` passes its args straight through to `cargo run -p
+vortex-bench-migrate --`, so the migrator's CLI is whatever it is on
+the current branch. As of writing the invocation is:
+
+```bash
+/var/lib/vortex-bench/ops/migrate.sh run --output "$VORTEX_BENCH_DB"
+```
+
+The script stops the server, snapshots the current DB to
+`/var/lib/vortex-bench/bench.prev-<ts>.duckdb` for instant rollback,
+runs the migrator, and starts the server back up. Total downtime is
+roughly one rebuild cycle.
+
+If the migrate fails partway, the script leaves the server stopped and
+prints the rollback command. To roll back manually:
+
+```bash
+mv /var/lib/vortex-bench/bench.prev-<ts>.duckdb /var/lib/vortex-bench/bench.duckdb
+sudo systemctl start vortex-bench-server
+```
+
+### "What's in the database right now?"
+
+```bash
+./benchmarks-website/ops/inspect.sh "
+  SELECT dataset, COUNT(*) AS n
+  FROM compression_times
+  GROUP BY dataset
+  ORDER BY n DESC;
+"
+```
+
+Server-side validation only allows `SELECT`, `WITH`, `PRAGMA`, `SHOW`,
+`DESCRIBE`, and `EXPLAIN`. Anything else is rejected with 403 — a
+fat-fingered `UPDATE` or `DROP` cannot run through this path.
+
+For the raw JSON (handier in pipelines):
+
+```bash
+./benchmarks-website/ops/inspect.sh -j "SELECT * FROM commits LIMIT 1" | jq
+```
+
+### "Where are the backups, and how do I restore?"
+
+Hourly, automatic. List the most recent snapshots:
+
+```bash
+aws s3 ls s3://vortex-benchmark-results-database/v3-backups/ | tail -20
+```
+
+Each `<ts>.tar.gz` archive contains a single directory `<ts>/` with
+a `schema.sql` (verbatim DDL the server applies on boot) and one
+`<table>.vortex` per table. Restore on a fresh box:
+
+```bash
+sudo systemctl stop vortex-bench-server
+cd /tmp
+ts=<ts>                 # e.g. 20260508T010000Z
+aws s3 cp "s3://vortex-benchmark-results-database/v3-backups/${ts}.tar.gz" .
+tar xzf "${ts}.tar.gz"  # extracts ./<ts>/
+sudo -u ec2-user rm -f /var/lib/vortex-bench/bench.duckdb \
+  /var/lib/vortex-bench/bench.duckdb.wal
+duckdb /var/lib/vortex-bench/bench.duckdb <<SQL
+IMPORT DATABASE '/tmp/${ts}';
+SQL
+sudo systemctl start vortex-bench-server
+```
+
+Spot-check row counts with `inspect.sh` before deleting the extracted
+`/tmp/<ts>/`.
+
+### "Token rotation"
+
+`INGEST_BEARER_TOKEN`:
+
+1. Generate a new value: `openssl rand -hex 32`.
+2. Update the GitHub Actions Environment secret so CI uses the new value.
+3. SSH in, edit `/etc/vortex-bench.env`, then `sudo systemctl restart vortex-bench-server`.
+
+`ADMIN_BEARER_TOKEN`:
+
+1. `openssl rand -hex 32`.
+2. Edit `/etc/vortex-bench.env`, restart the server.
+3. The next backup timer fire will use the new value (read from the env
+   file at script invocation).
+
+The two tokens are independent — rotating one doesn't affect the other.
+
+### "Adding another admin"
+
+There's no separate admin database — being an admin means three things,
+each granted independently:
+
+1. **SSH access to the EC2 box.** Append the new admin's SSH public key
+   to `/home/ec2-user/.ssh/authorized_keys` (mode 0600 owned by ec2-user)
+   on the live host. They'll be able to log in as `ec2-user`, which is
+   the same identity systemd runs the service as. Alternatively, enable
+   AWS Systems Manager Session Manager for the instance and add the new
+   admin's IAM principal to the instance's SSM connect IAM policy —
+   that avoids managing SSH keys at all.
+
+2. **AWS console access** for the bits the runtime role can't reach
+   (creating IAM roles/policies, editing the bucket lifecycle rule,
+   running setup-time admin commands). Grant the new admin an IAM user
+   or SSO role that can read/write IAM and the
+   `vortex-benchmark-results-database` bucket. The exact scope is your
+   call — read-only on IAM is enough to *audit* the setup; full write
+   is needed to *change* it.
+
+3. **The bearer tokens**, if they need to call the admin endpoints from
+   their laptop or run `inspect.sh` directly. The tokens live in
+   `/etc/vortex-bench.env` (mode 0600 owned by ec2-user); once they have
+   SSH access they can read it. To revoke an admin's access to the
+   tokens specifically, rotate `ADMIN_BEARER_TOKEN` (see above) — every
+   admin who knew the old value loses access immediately.
+
+The repo itself is the source of truth for *how* to operate the system
+— every script and unit lives in [`benchmarks-website/ops/`](.).
+A new admin who can SSH in and read `/etc/vortex-bench.env` has
+everything they need to run the existing operations; the runbook above
+covers the full surface.
+
+To remove an admin: revoke their SSH key (delete the line from
+`authorized_keys`), revoke their AWS console access, and rotate the
+admin token. CI's `INGEST_BEARER_TOKEN` is unaffected — it's a separate
+token tied to the GitHub Actions Environment, not to any individual.
+
+## Wire APIs the ops scripts depend on
+
+These are the only server endpoints the operator scripts touch. They
+also constitute the public admin contract for any future tooling.
+
+| Method + path                                                     | Bearer | Notes                                                                                              |
+|-------------------------------------------------------------------|--------|----------------------------------------------------------------------------------------------------|
+| `GET /health`                                                     | none   | `deploy.sh` polls for liveness after a restart.                                                    |
+| `POST /api/admin/snapshot?ts=<ts>`                                | admin  | Writes `schema.sql` + per-table `.vortex` files. `ts` must match `[A-Za-z0-9_-]{1,64}`. 409 if the dir exists. |
+| `POST /api/admin/sql` (body `{"sql": …}`, `?format=json\|table`)  | admin  | Read-only SQL only — `SELECT`/`WITH`/`PRAGMA`/`SHOW`/`DESCRIBE`/`EXPLAIN`.                          |
+| `POST /api/ingest`                                                | ingest | Used by CI, not by these scripts. Documented under [`crate::ingest`].                               |
+
+The admin router is mounted only when `ADMIN_BEARER_TOKEN` is set. With
+the env unset (e.g. in local dev) the routes 404 and the backup script
+fails fast — there's no silent "backups disabled" mode.
+
+See [`server/src/admin.rs`](../server/src/admin.rs) for the full
+contract and the validation rules.
+
+## Failure modes & recovery
+
+### Deploy keeps failing
+
+Symptom: `journalctl -fu vortex-bench-deploy.service` shows repeated
+build or `/health` failures, server stays on the old binary.
+
+What's happening: the script's stamp file is only written on success,
+so every tick retries the same SHA. Inspect:
+
+```bash
+sudo cat /var/lib/vortex-bench/last-deployed-sha
+journalctl -u vortex-bench-deploy.service --since '15 min ago'
+```
+
+Recovery: fix the bug and push (the timer will pick it up). To stop
+the retry loop while you investigate:
+
+```bash
+sudo systemctl stop vortex-bench-deploy.timer
+# … debug …
+sudo systemctl start vortex-bench-deploy.timer
+```
+
+### Server is up but `/health` is slow
+
+`/health` runs five `SELECT COUNT(*)`s under the connection mutex. If
+ingest is in flight it'll wait. Over 1s is normal during the nightly
+bench window; over 30s means the connection mutex is stuck.
+
+```bash
+journalctl -u vortex-bench-server --since '5 min ago'
+sudo systemctl restart vortex-bench-server
+```
+
+### Disk filling up under `/var/lib/vortex-bench/`
+
+Likely culprits and the order to check:
+
+```bash
+du -sh /var/lib/vortex-bench/* | sort -h
+```
+
+- `bench.duckdb` itself growing — expected; ~hundreds of MB after the
+  v2 migration.
+- `snapshots/` not being cleaned up — `backup.sh` deletes after a
+  successful S3 upload. If the IAM role broke, hourly snapshots will
+  pile up. `journalctl -u vortex-bench-backup.service` will show the
+  upload errors.
+- `bin/vortex-bench-server.*` accumulation — `deploy.sh` keeps the
+  most recent `KEEP_BINARIES` (default 3). To prune harder, edit the
+  env file and add `KEEP_BINARIES=1`, then trigger a deploy.
+- `bench.prev-<ts>.duckdb` from old migrations — these are kept on
+  purpose for rollback. Delete by hand once you've verified the
+  current DB is good.
+
+### Backup hasn't run
+
+```bash
+systemctl list-timers vortex-bench-backup.timer
+journalctl -u vortex-bench-backup.service --since '4 hours ago'
+```
+
+Run one by hand:
+
+```bash
+sudo systemctl start vortex-bench-backup.service
+journalctl -fu vortex-bench-backup.service
+```
+
+If the script reports a 404 from `/api/admin/snapshot`, the server
+started without `ADMIN_BEARER_TOKEN`. Edit the env file, restart, retry.
+
+### Migrating to a new EC2 host
+
+1. Stand the new host up. Run `install.sh`. Fill the env file.
+2. On the *old* host, take a final snapshot:
+   `sudo systemctl start vortex-bench-backup.service` and wait.
+3. On the *new* host, restore from S3 (see "Where are the backups").
+4. Cut DNS over.
+
+Total RPO is the gap between the last hourly snapshot and the cutover
+moment — bounded by an hour by default, and it can be tightened by
+adding extra `OnCalendar=` lines to the backup timer.
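+
+As a sketch, a drop-in like the following (illustrative file name;
+extra `OnCalendar=` lines merge additively with the unit's own) adds a
+second fire at 23:30 UTC ahead of a planned cutover:
+
+```ini
+# /etc/systemd/system/vortex-bench-backup.timer.d/cutover.conf (hypothetical)
+[Timer]
+OnCalendar=*-*-* 23:30:00 UTC
+```
+
+Apply with `sudo systemctl daemon-reload`, then confirm the upcoming
+fires with `systemctl list-timers vortex-bench-backup.timer`.
+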
+## Local development
+
+You don't need any of this to run the server locally:
+
+```bash
+INGEST_BEARER_TOKEN=dev \
+ADMIN_BEARER_TOKEN=dev \
+VORTEX_BENCH_DB=/tmp/bench.duckdb \
+cargo run -p vortex-bench-server
+```
+
+The admin endpoints work the same as in production. The hourly timer
+and the deploy timer are systemd-only — they have no local equivalent
+and don't need one.
+
+## What's intentionally not here
+
+- **Docker.** A previous iteration ran the server under
+  `docker compose` with `watchtower` polling GHCR. We removed it: the
+  binary is small enough that a build-on-host model is simpler, and
+  systemd gives us atomic restarts and rollback for free. The v2 React
+  site retains its image-based deploy (separate `Dockerfile` and CI
+  workflow); v3 does not.
+- **A push-based deploy.** A GitHub Actions workflow could push via
+  SSM or SSH on every merge. We chose polling because (a) zero inbound
+  surface on the EC2 box, (b) no shared secret to manage in CI, and
+  (c) 60s is well under any reasonable expectation for a benchmarks
+  site. If the polling becomes unworkable, swap `vortex-bench-deploy.timer`
+  for an SSM-triggered ExecStart and the rest of `deploy.sh` doesn't
+  change.
+- **A dedicated SQL endpoint user.** `/api/admin/sql` is gated by the
+  same admin token as `/api/admin/snapshot`. If you want per-operator
+  audit, run a reverse proxy that adds a header and log it on the way
+  through.
diff --git a/benchmarks-website/ops/backup.sh b/benchmarks-website/ops/backup.sh
new file mode 100755
index 00000000000..c5ad544cf37
--- /dev/null
+++ b/benchmarks-website/ops/backup.sh
@@ -0,0 +1,85 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Hourly snapshot to S3, called by vortex-bench-backup.timer.
+#
+# Asks the running server to write a per-table Vortex snapshot via
+# /api/admin/snapshot (so the writer uses the same DuckDB process
+# that owns the file — no stop required), `tar czf`s the resulting
+# directory into a single archive, uploads it to
+# $S3_BACKUP_PREFIX/<ts>.tar.gz, and deletes the local copies.
+#
+# Vortex compresses our shape (mostly BIGINT[] runtime arrays + short
+# strings) far better than gzipped CSV; the additional gzip on the
+# tarball is largely catching schema.sql and tar metadata, not the
+# data files themselves.
+#
+# The instance IAM role must already permit s3:PutObject under
+# $S3_BACKUP_PREFIX. (Same bucket the v2 backup script used.)
+
+set -euo pipefail
+
+ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}"
+if [ ! -f "$ENV_FILE" ]; then
-f "$ENV_FILE" ]; then + echo "ERROR: missing ${ENV_FILE}" >&2 + exit 2 +fi +set -a +# shellcheck disable=SC1090 +source "$ENV_FILE" +set +a +: "${ADMIN_BEARER_TOKEN:?ADMIN_BEARER_TOKEN must be set in ${ENV_FILE}}" +: "${VORTEX_BENCH_SNAPSHOT_DIR:?VORTEX_BENCH_SNAPSHOT_DIR must be set}" +: "${S3_BACKUP_PREFIX:?S3_BACKUP_PREFIX must be set in ${ENV_FILE}}" +: "${SERVER_URL:=http://127.0.0.1:3000}" + +log() { printf '[backup %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; } + +ts="$(date -u +%Y%m%dT%H%M%SZ)" +local_dir="${VORTEX_BENCH_SNAPSHOT_DIR}/${ts}" +archive="${VORTEX_BENCH_SNAPSHOT_DIR}/${ts}.tar.gz" +remote="${S3_BACKUP_PREFIX}/${ts}.tar.gz" + +log "triggering /api/admin/snapshot?ts=${ts}" +http_status=$(curl -sS -o /tmp/snapshot.out -w '%{http_code}' \ + -X POST \ + -H "Authorization: Bearer ${ADMIN_BEARER_TOKEN}" \ + "${SERVER_URL}/api/admin/snapshot?ts=${ts}" || echo "000") +if [ "$http_status" != "200" ]; then + echo "ERROR: /api/admin/snapshot returned ${http_status}" >&2 + cat /tmp/snapshot.out >&2 || true + exit 3 +fi +rm -f /tmp/snapshot.out + +if [ ! -d "$local_dir" ]; then + echo "ERROR: server reported success but ${local_dir} does not exist" >&2 + exit 4 +fi + +# Compress the export directory into a single tar.gz. `tar -C` so paths +# inside the archive are relative to the snapshot id (i.e. `/foo.csv`), +# which matches the layout expected by the restore docs. +log "compressing ${local_dir} → ${archive}" +if ! tar -C "$VORTEX_BENCH_SNAPSHOT_DIR" -czf "$archive" "$ts"; then + echo "ERROR: tar czf failed" >&2 + rm -f "$archive" + exit 5 +fi + +orig_bytes=$(du -sb "$local_dir" | awk '{print $1}') +gz_bytes=$(stat -c %s "$archive") +log "compressed ${orig_bytes} → ${gz_bytes} bytes ($(( orig_bytes / (gz_bytes > 0 ? gz_bytes : 1) ))x)" + +log "uploading ${archive} → s3://${remote#s3://}" +if ! aws s3 cp --quiet "${archive}" "${remote}"; then + echo "ERROR: aws s3 cp failed; keeping ${archive} and ${local_dir} for manual recovery" >&2 + exit 6 +fi + +log "deleting local copies (${archive}, ${local_dir})" +rm -f "$archive" +rm -rf "$local_dir" + +log "snapshot ${ts} ok → ${remote}" diff --git a/benchmarks-website/ops/config/vortex-bench.env.example b/benchmarks-website/ops/config/vortex-bench.env.example new file mode 100644 index 00000000000..b6c2e3860a4 --- /dev/null +++ b/benchmarks-website/ops/config/vortex-bench.env.example @@ -0,0 +1,52 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Template for /etc/vortex-bench.env on the EC2 host. +# +# install.sh copies this template into place if /etc/vortex-bench.env is +# missing; the operator then fills in the secrets. The file must be +# mode 0600 owned by the user systemd runs the service as (ec2-user by +# default). systemd's EnvironmentFile= reads this for both the server +# unit and the deploy/backup timers. + +# Bearer token CI presents on POST /api/ingest. Constant-time compared. +# Generate with: openssl rand -hex 32 +INGEST_BEARER_TOKEN= + +# Bearer token operators present on /api/admin/snapshot and /api/admin/sql. +# Independent of INGEST_BEARER_TOKEN so the two can rotate separately. +# Generate with: openssl rand -hex 32 +ADMIN_BEARER_TOKEN= + +# DuckDB file the server opens. Lives outside any git checkout so a +# `git pull` never touches it. +VORTEX_BENCH_DB=/var/lib/vortex-bench/bench.duckdb + +# Where /api/admin/snapshot writes EXPORT DATABASE output. backup.sh +# uploads the contents to S3 then deletes them, so this dir is transient. 
+VORTEX_BENCH_SNAPSHOT_DIR=/var/lib/vortex-bench/snapshots
+
+# `host:port` the server binds to. Behind a reverse proxy (or just
+# exposed directly on the EC2 SG), 0.0.0.0 is correct.
+VORTEX_BENCH_BIND=0.0.0.0:3000
+
+# tracing-subscriber env filter spec.
+VORTEX_BENCH_LOG=info,vortex_bench_server=info
+
+# --- ops scripts only (not consumed by the server itself) --------------
+
+# Repo checkout the deploy timer pulls and builds from. Owned by the
+# same user as the systemd services so `git pull` and `cargo build` don't
+# need sudo.
+REPO_DIR=/home/ec2-user/vortex
+
+# Branch the deploy timer tracks.
+DEPLOY_BRANCH=develop
+
+# S3 prefix backup.sh syncs hourly snapshots to. The instance IAM role
+# must already permit s3:PutObject under this prefix.
+S3_BACKUP_PREFIX=s3://vortex-benchmark-results-database/v3-backups
+
+# URL of the running server, used by ops scripts (deploy health check,
+# backup admin snapshot trigger, inspect SQL).
+SERVER_URL=http://127.0.0.1:3000
diff --git a/benchmarks-website/ops/deploy.sh b/benchmarks-website/ops/deploy.sh
new file mode 100755
index 00000000000..b635fc18a7a
--- /dev/null
+++ b/benchmarks-website/ops/deploy.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Idempotent rebuild + restart, called by vortex-bench-deploy.timer
+# every 60s. Cheap and silent on the common path (no new commits).
+#
+# Flow:
+#   1. flock on a state file (concurrent runs bail).
+#   2. git fetch origin $DEPLOY_BRANCH.
+#   3. If origin SHA == last-deployed SHA → exit 0.
+#   4. Else: git diff against a path filter. If nothing in the filter
+#      changed, sync the working tree (destructive checkout) to the
+#      new SHA, update the stamp, exit 0. (Skips a build for monorepo
+#      changes that don't touch the server.)
+#   5. Else: sync working tree + cargo build --release -p vortex-bench-server.
+#   6. Compare new binary's sha256 to the currently-running symlink target.
+#      If unchanged (cargo did no real work), update stamp + exit 0.
+#   7. Else: copy to bin/vortex-bench-server.<ts>, atomically swap the
+#      symlink, sudo systemctl restart vortex-bench-server.
+#   8. Wait for /health. On failure: revert symlink, restart, error out
+#      (do NOT update the stamp — next tick retries).
+#   9. On success: update stamp, prune binary versions older than $KEEP_BINARIES.
+#
+# The working-tree sync is `git checkout --force --detach <sha>`, not
+# `git pull --ff-only`, so the script survives force-pushes on the
+# tracked branch.
+#
+# Exit codes:
+#   0  success (either a real deploy or a clean no-op)
+#   1  another deploy is in progress (lock held)
+#   2  config error (missing env file, REPO_DIR, etc.)
+#   3  git fetch failed
+#   4  cargo build failed
+#   5  systemctl restart failed
+#   6  /health check failed (rolled back to previous binary)
+
+set -euo pipefail
+
+ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}"
+STATE_DIR="${STATE_DIR:-/var/lib/vortex-bench}"
+LOCK_FILE="${LOCK_FILE:-${STATE_DIR}/.deploy.lock}"
+STAMP_FILE="${STAMP_FILE:-${STATE_DIR}/last-deployed-sha}"
+BIN_DIR="${BIN_DIR:-${STATE_DIR}/bin}"
+BIN_SYMLINK="${BIN_DIR}/vortex-bench-server"
+KEEP_BINARIES="${KEEP_BINARIES:-3}"
+
+log() { printf '[deploy %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; }
+err() { printf '[deploy %s] ERROR: %s\n' "$(date -u +%H:%M:%SZ)" "$*" >&2; }
+
+# --- Load env ---
+if [ ! 
-f "$ENV_FILE" ]; then + err "missing ${ENV_FILE}" + exit 2 +fi +set -a +# shellcheck disable=SC1090 +source "$ENV_FILE" +set +a +: "${REPO_DIR:?REPO_DIR must be set in ${ENV_FILE}}" +: "${DEPLOY_BRANCH:=develop}" +: "${SERVER_URL:=http://127.0.0.1:3000}" + +if [ ! -d "${REPO_DIR}/.git" ]; then + err "${REPO_DIR} is not a git checkout" + exit 2 +fi + +# --- Lock --- +mkdir -p "$(dirname "$LOCK_FILE")" +exec 200>"$LOCK_FILE" +if ! flock -n 200; then + log "another deploy is in progress; bailing" + exit 1 +fi + +# Pick up cargo from the user's profile if not on PATH already. +# shellcheck disable=SC1091 +. "$HOME/.cargo/env" 2>/dev/null || true + +cd "$REPO_DIR" + +last_sha="" +[ -f "$STAMP_FILE" ] && last_sha="$(cat "$STAMP_FILE")" + +# --- Fetch --- +if ! git fetch --quiet origin "$DEPLOY_BRANCH"; then + err "git fetch origin ${DEPLOY_BRANCH} failed" + exit 3 +fi +new_sha="$(git rev-parse "origin/${DEPLOY_BRANCH}")" + +if [ "$new_sha" = "$last_sha" ]; then + # Common case: nothing new since last fire. Silent on stdout to + # keep the journal clean. + exit 0 +fi + +# --- Path filter --- +# Rebuild + restart only when commits in the range touch website code, +# the workspace lockfile, or workspace Cargo manifests. Other changes +# (e.g. vortex-array fixes) update the working tree but don't restart. +filter_paths=( + benchmarks-website/server + benchmarks-website/migrate + benchmarks-website/Cargo.toml + Cargo.lock + Cargo.toml +) + +if [ -z "$last_sha" ] || ! git cat-file -e "${last_sha}^{commit}" 2>/dev/null; then + # First run, or stamp points at a commit we no longer have. Treat + # as "must rebuild" so we don't silently skip a real change. + log "first run / unknown stamp '${last_sha:-}'; full rebuild" + relevant_changed=1 +else + if git diff --name-only "${last_sha}" "${new_sha}" -- "${filter_paths[@]}" | grep -q .; then + relevant_changed=1 + else + relevant_changed=0 + fi +fi + +# --- Sync the working tree to origin/$DEPLOY_BRANCH --- +# `git pull --ff-only` breaks the moment the tracked branch is +# force-pushed (typical during PR iteration). The deploy worker's +# checkout is build-only — no human edits live here — so a destructive +# `git checkout --force --detach $new_sha` is the right semantics. +# Detached HEAD avoids any local-branch ref drift. +if ! git checkout --quiet --force --detach "$new_sha"; then + err "git checkout --force --detach ${new_sha} failed" + exit 3 +fi + +if [ "$relevant_changed" = "0" ]; then + log "no website-relevant paths changed in ${last_sha:0:7}..${new_sha:0:7}; skipping rebuild" + echo "$new_sha" > "$STAMP_FILE" + exit 0 +fi + +# --- Build --- +prev_short="${last_sha:0:7}" +log "building ${new_sha:0:7} (was ${prev_short:-})" +if ! cargo build --release --quiet -p vortex-bench-server; then + err "cargo build -p vortex-bench-server failed" + exit 4 +fi +new_binary="${REPO_DIR}/target/release/vortex-bench-server" +if [ ! 
-x "$new_binary" ]; then + err "expected binary not found at ${new_binary}" + exit 4 +fi + +# --- Compare hashes; skip restart if cargo produced byte-identical output --- +new_hash="$(sha256sum "$new_binary" | awk '{print $1}')" +current_hash="" +if [ -L "$BIN_SYMLINK" ] && [ -e "$BIN_SYMLINK" ]; then + current_hash="$(sha256sum "$BIN_SYMLINK" | awk '{print $1}')" +fi +if [ "$new_hash" = "$current_hash" ]; then + log "binary unchanged (sha256 ${new_hash:0:12}); skipping restart" + echo "$new_sha" > "$STAMP_FILE" + exit 0 +fi + +# --- Install + atomic symlink swap --- +ts="$(date -u +%Y%m%dT%H%M%SZ)" +versioned="${BIN_DIR}/vortex-bench-server.${ts}" +install -m 0755 "$new_binary" "$versioned" +prev_target="" +if [ -L "$BIN_SYMLINK" ]; then + prev_target="$(readlink "$BIN_SYMLINK")" +fi +ln -sfnT "$versioned" "$BIN_SYMLINK" +log "swapped symlink → ${versioned}" + +# --- Restart + verify --- +if ! sudo /bin/systemctl restart vortex-bench-server; then + err "systemctl restart failed" + if [ -n "$prev_target" ]; then + ln -sfnT "$prev_target" "$BIN_SYMLINK" + sudo /bin/systemctl restart vortex-bench-server || true + fi + exit 5 +fi + +# Give it a moment to come up, then poll /health. +deadline=$(( $(date +%s) + 30 )) +healthy=0 +while [ "$(date +%s)" -lt "$deadline" ]; do + if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then + healthy=1 + break + fi + sleep 1 +done +if [ "$healthy" != "1" ]; then + err "/health did not respond within 30s — rolling back" + if [ -n "$prev_target" ]; then + ln -sfnT "$prev_target" "$BIN_SYMLINK" + sudo /bin/systemctl restart vortex-bench-server || true + log "rolled back symlink to ${prev_target}" + else + err "no previous binary to roll back to" + fi + exit 6 +fi + +# --- Success: update stamp, prune old binaries --- +echo "$new_sha" > "$STAMP_FILE" +log "deploy ok: ${new_sha:0:7} → live (binary ${ts})" + +# Keep the most recent $KEEP_BINARIES versioned binaries, drop the rest. +# Sort by name (timestamp prefix is sortable), keep the tail. +mapfile -t binaries < <(ls -1 "${BIN_DIR}"/vortex-bench-server.* 2>/dev/null | sort) +if [ "${#binaries[@]}" -gt "$KEEP_BINARIES" ]; then + drop_count=$(( ${#binaries[@]} - KEEP_BINARIES )) + for b in "${binaries[@]:0:$drop_count}"; do + # Never delete what the symlink currently points at. + if [ "$b" != "$(readlink -f "$BIN_SYMLINK")" ]; then + rm -f "$b" + log "pruned ${b}" + fi + done +fi diff --git a/benchmarks-website/ops/inspect.sh b/benchmarks-website/ops/inspect.sh new file mode 100755 index 00000000000..9434c3145ce --- /dev/null +++ b/benchmarks-website/ops/inspect.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Run a read-only SQL query against the live DuckDB without stopping +# the server. Calls /api/admin/sql and prints the duckdb-cli-style +# table. +# +# Usage: +# ./inspect.sh "SELECT COUNT(*) FROM commits;" +# echo "PRAGMA table_info('commits');" | ./inspect.sh +# ./inspect.sh -j "SELECT * FROM compression_sizes LIMIT 3" # raw json +# +# The server allows SELECT, WITH, PRAGMA, SHOW, DESCRIBE, EXPLAIN. +# Anything else is rejected with 403 by the server (so a typo'd UPDATE +# can't run). + +set -euo pipefail + +ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" +if [ ! 
-f "$ENV_FILE" ]; then + echo "ERROR: missing ${ENV_FILE}" >&2 + exit 2 +fi +set -a +# shellcheck disable=SC1090 +source "$ENV_FILE" +set +a +: "${ADMIN_BEARER_TOKEN:?ADMIN_BEARER_TOKEN must be set in ${ENV_FILE}}" +: "${SERVER_URL:=http://127.0.0.1:3000}" + +format=table +if [ "${1:-}" = "-j" ] || [ "${1:-}" = "--json" ]; then + format=json + shift +fi + +if [ -n "${1:-}" ]; then + sql="$1" +else + sql="$(cat)" +fi + +# Build the JSON body with a here-doc so quoting in the SQL is a non-issue. +body=$(jq -nc --arg sql "$sql" '{sql: $sql}' 2>/dev/null) || { + # Fallback when jq isn't installed: hand-escape with python or printf. + if command -v python3 >/dev/null; then + body=$(python3 -c 'import json,sys; print(json.dumps({"sql": sys.argv[1]}))' "$sql") + else + echo "ERROR: install jq or python3 to call /api/admin/sql safely" >&2 + exit 2 + fi +} + +curl -fsS \ + -X POST \ + -H "Authorization: Bearer ${ADMIN_BEARER_TOKEN}" \ + -H "Content-Type: application/json" \ + --data-binary "$body" \ + "${SERVER_URL}/api/admin/sql?format=${format}" +echo diff --git a/benchmarks-website/ops/install.sh b/benchmarks-website/ops/install.sh new file mode 100755 index 00000000000..277c33d132a --- /dev/null +++ b/benchmarks-website/ops/install.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# One-time bootstrap of vortex-bench-server on a fresh EC2 host. +# Idempotent — safe to re-run after editing units or to recover from +# partial state. See ops/README.md for the full operator runbook. +# +# Run as a user with sudo (typically ec2-user). The script will: +# 1. Create state and log directories under /var/lib/vortex-bench +# and /var/log/vortex-bench, owned by $RUN_USER. +# 2. Drop a sudoers fragment that lets $RUN_USER restart the server +# service without a password (so the deploy timer can run as the +# service user). +# 3. Copy /etc/vortex-bench.env from the template if missing (mode 0600). +# 4. Install the systemd units and reload systemd. +# 5. Symlink the ops/ directory into /var/lib/vortex-bench so the +# systemd units have a stable path (the repo can move). +# 6. Enable + start the server, deploy timer, and backup timer. +# +# Usage: +# ./benchmarks-website/ops/install.sh +# RUN_USER=ec2-user REPO_DIR=$HOME/vortex ./benchmarks-website/ops/install.sh + +set -euo pipefail + +RUN_USER="${RUN_USER:-ec2-user}" +RUN_GROUP="${RUN_GROUP:-${RUN_USER}}" +REPO_DIR="${REPO_DIR:-$HOME/vortex}" +STATE_DIR="${STATE_DIR:-/var/lib/vortex-bench}" +LOG_DIR="${LOG_DIR:-/var/log/vortex-bench}" +ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}" +SYSTEMD_DIR="${SYSTEMD_DIR:-/etc/systemd/system}" +SUDOERS_FILE="${SUDOERS_FILE:-/etc/sudoers.d/vortex-bench}" + +ops_dir="${REPO_DIR}/benchmarks-website/ops" +if [ ! -d "$ops_dir" ]; then + echo "ERROR: ${ops_dir} not found. Set REPO_DIR=." >&2 + exit 2 +fi + +# The deploy timer runs as ${RUN_USER} with no SSH agent, so an SSH +# remote fails with "Permission denied (publickey)" on every fire. +# Public-repo HTTPS reads need no auth — warn early so this is not the +# first surprise out of the gate. +if [ -d "${REPO_DIR}/.git" ]; then + origin_url="$(git -C "$REPO_DIR" remote get-url origin 2>/dev/null || true)" + case "$origin_url" in + git@*|ssh://*) + echo "WARNING: ${REPO_DIR}'s origin is ${origin_url}." >&2 + echo " The deploy timer cannot fetch over SSH (no agent). 
Fix with:" >&2 + echo " git -C ${REPO_DIR} remote set-url origin https://github.com/vortex-data/vortex.git" >&2 + ;; + esac +fi + +log() { printf '[install] %s\n' "$*"; } + +# --- 1. State + log directories --- +log "creating ${STATE_DIR} and ${LOG_DIR} (owner ${RUN_USER}:${RUN_GROUP})" +sudo install -d -m 0755 -o "$RUN_USER" -g "$RUN_GROUP" \ + "$STATE_DIR" \ + "${STATE_DIR}/bin" \ + "${STATE_DIR}/snapshots" \ + "$LOG_DIR" + +# --- 2. Sudoers fragment --- +# Let RUN_USER restart/start/stop only vortex-bench-server, no password. +# The script that uses this is ops/deploy.sh (atomic restart after build). +log "writing sudoers fragment to ${SUDOERS_FILE}" +sudo tee "$SUDOERS_FILE" >/dev/null </dev/null + +# --- 3. Env file --- +if [ ! -f "$ENV_FILE" ]; then + log "creating ${ENV_FILE} from template (mode 0600 owned by ${RUN_USER})" + sudo install -m 0600 -o "$RUN_USER" -g "$RUN_GROUP" \ + "${ops_dir}/config/vortex-bench.env.example" \ + "$ENV_FILE" + log "EDIT ${ENV_FILE} to set INGEST_BEARER_TOKEN, ADMIN_BEARER_TOKEN, REPO_DIR" +else + log "${ENV_FILE} already present — leaving alone" +fi + +# --- 4. Symlink ops/ into the state dir --- +# Gives systemd units a stable path that doesn't depend on the repo +# checkout location moving. +log "symlinking ${ops_dir} -> ${STATE_DIR}/ops" +sudo ln -sfnT "$ops_dir" "${STATE_DIR}/ops" + +# --- 5. systemd units --- +log "installing systemd units to ${SYSTEMD_DIR}" +for unit in \ + vortex-bench-server.service \ + vortex-bench-deploy.service \ + vortex-bench-deploy.timer \ + vortex-bench-backup.service \ + vortex-bench-backup.timer +do + sudo install -m 0644 -o root -g root \ + "${ops_dir}/systemd/${unit}" \ + "${SYSTEMD_DIR}/${unit}" +done +sudo systemctl daemon-reload + +# --- 6. Enable + start --- +# The server unit needs a binary at /var/lib/vortex-bench/bin/vortex-bench-server +# before it can start. If the symlink isn't there yet, the deploy timer +# will lay one down on its first run; until then the server will fail. +if [ ! -e "${STATE_DIR}/bin/vortex-bench-server" ]; then + log "no binary at ${STATE_DIR}/bin/vortex-bench-server yet" + log " → the first deploy-timer fire (within 90s) will build + install one." + log " → tail it with: journalctl -fu vortex-bench-deploy.service" +fi + +log "enabling + starting timers" +sudo systemctl enable --now vortex-bench-deploy.timer +sudo systemctl enable --now vortex-bench-backup.timer + +log "enabling vortex-bench-server (will start once a binary is in place)" +sudo systemctl enable vortex-bench-server.service +sudo systemctl start vortex-bench-server.service || \ + log " server didn't start — likely no binary yet; deploy timer will handle it" + +log "" +log "install complete. Next steps:" +log " 1. Edit ${ENV_FILE} (chmod 0600, owned by ${RUN_USER})" +log " - INGEST_BEARER_TOKEN=$(openssl rand -hex 32)" +log " - ADMIN_BEARER_TOKEN=$(openssl rand -hex 32)" +log " - confirm REPO_DIR points at the actual checkout" +log " 2. Wait ~90s for the first deploy-timer fire to build the binary" +log " and start the server with an empty DuckDB:" +log " journalctl -fu vortex-bench-deploy.service" +log " curl http://127.0.0.1:3000/health" +log " 3. Populate the DB with the v2→v3 migration (server is stopped" +log " and restarted automatically):" +log " ${STATE_DIR}/ops/migrate.sh run --output \"${STATE_DIR}/bench.duckdb\"" +log " 4. 
+log "     re-migrating, copy it into place before step 3:"
+log "       sudo systemctl stop vortex-bench-server"
+log "       sudo -u ${RUN_USER} mv \$HOME/bench.duckdb ${STATE_DIR}/bench.duckdb"
+log "       sudo systemctl start vortex-bench-server"
+log "     and skip step 3.)"
diff --git a/benchmarks-website/ops/migrate.sh b/benchmarks-website/ops/migrate.sh
new file mode 100755
index 00000000000..5353a7a30dc
--- /dev/null
+++ b/benchmarks-website/ops/migrate.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Manual v2→v3 migration wrapper. The migration tool needs exclusive
+# access to the DB file, so the server is stopped first, the current DB
+# is snapshotted to bench.prev-<ts>.duckdb for instant rollback, the
+# migrate binary runs, and the server is started back up.
+#
+# Run from any directory while SSH'd onto the EC2 host. The args are
+# passed through verbatim to `cargo run -p vortex-bench-migrate --`, so
+# the operator owns the migrator's CLI surface (which has been changing
+# while v3 stabilises). The wrapper only handles stop / snapshot prev
+# DB / restart.
+#
+# Examples:
+#   /var/lib/vortex-bench/ops/migrate.sh run --output "$VORTEX_BENCH_DB"
+#
+# (Run as ec2-user is fine — we sudo only for systemctl.)
+
+set -euo pipefail
+
+ENV_FILE="${ENV_FILE:-/etc/vortex-bench.env}"
+if [ ! -f "$ENV_FILE" ]; then
+  echo "ERROR: missing ${ENV_FILE}" >&2
+  exit 2
+fi
+set -a
+# shellcheck disable=SC1090
+source "$ENV_FILE"
+set +a
+: "${REPO_DIR:?REPO_DIR must be set in ${ENV_FILE}}"
+: "${VORTEX_BENCH_DB:?VORTEX_BENCH_DB must be set in ${ENV_FILE}}"
+: "${SERVER_URL:=http://127.0.0.1:3000}"
+
+log() { printf '[migrate %s] %s\n' "$(date -u +%H:%M:%SZ)" "$*"; }
+
+if [ ! -d "${REPO_DIR}/.git" ]; then
+  echo "ERROR: REPO_DIR=${REPO_DIR} is not a git checkout" >&2
+  exit 2
+fi
+
+# shellcheck disable=SC1091
+. "$HOME/.cargo/env" 2>/dev/null || true
+
+log "stopping vortex-bench-server"
+sudo /bin/systemctl stop vortex-bench-server
+
+# Snapshot the current DB so a botched migration can be reverted with
+# one mv. WAL is folded in by DuckDB on next clean shutdown; if it
+# survives a stop, copy it too.
+ts="$(date -u +%Y%m%dT%H%M%SZ)"
+prev="${VORTEX_BENCH_DB%.duckdb}.prev-${ts}.duckdb"
+if [ -f "$VORTEX_BENCH_DB" ]; then
+  log "snapshotting ${VORTEX_BENCH_DB} → ${prev}"
+  cp -p "$VORTEX_BENCH_DB" "$prev"
+  [ -f "${VORTEX_BENCH_DB}.wal" ] && cp -p "${VORTEX_BENCH_DB}.wal" "${prev}.wal"
+fi
+
+log "running cargo run --release -p vortex-bench-migrate -- $*"
+pushd "$REPO_DIR" >/dev/null
+# Pass through whatever args the operator gave us. Don't inject a path
+# flag — the migrator's CLI is owned by that crate.
+if ! cargo run --release --quiet -p vortex-bench-migrate -- "$@"; then
+  popd >/dev/null
+  echo "ERROR: migration failed. Server is still stopped." >&2
+  echo "       Restore previous DB with: mv \"$prev\" \"$VORTEX_BENCH_DB\"" >&2
+  echo "       Then: sudo systemctl start vortex-bench-server" >&2
+  exit 3
+fi
+popd >/dev/null
+
+log "starting vortex-bench-server"
+sudo /bin/systemctl start vortex-bench-server
+
+# Give it a few seconds to come up.
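+# (Poll once a second for up to 30s — the same post-restart budget
+# deploy.sh allows before declaring a deploy failed.)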
+deadline=$(( $(date +%s) + 30 ))
+while [ "$(date +%s)" -lt "$deadline" ]; do
+  if curl -fsS --max-time 3 "${SERVER_URL}/health" >/dev/null 2>&1; then
+    log "migrate ok — server is up"
+    log "  prev DB kept at ${prev} (delete when you've verified data)"
+    exit 0
+  fi
+  sleep 1
+done
+echo "ERROR: server did not respond on /health within 30s" >&2
+echo "  prev DB kept at ${prev} for rollback" >&2
+exit 1
diff --git a/benchmarks-website/ops/systemd/vortex-bench-backup.service b/benchmarks-website/ops/systemd/vortex-bench-backup.service
new file mode 100644
index 00000000000..78d081c2500
--- /dev/null
+++ b/benchmarks-website/ops/systemd/vortex-bench-backup.service
@@ -0,0 +1,22 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Oneshot unit invoked by vortex-bench-backup.timer hourly. Calls
+# /api/admin/snapshot to land a per-table snapshot (schema.sql plus
+# one .vortex file per table) under $VORTEX_BENCH_SNAPSHOT_DIR, then
+# `aws s3 sync` uploads it to $S3_BACKUP_PREFIX/<ts>/ and deletes the
+# local copy.
+
+[Unit]
+Description=Vortex bench v3 hourly DB snapshot to S3
+After=vortex-bench-server.service network-online.target
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+User=ec2-user
+Group=ec2-user
+EnvironmentFile=/etc/vortex-bench.env
+WorkingDirectory=/var/lib/vortex-bench
+ExecStart=/var/lib/vortex-bench/ops/backup.sh
+StandardOutput=journal
+StandardError=journal
diff --git a/benchmarks-website/ops/systemd/vortex-bench-backup.timer b/benchmarks-website/ops/systemd/vortex-bench-backup.timer
new file mode 100644
index 00000000000..3c1910fe892
--- /dev/null
+++ b/benchmarks-website/ops/systemd/vortex-bench-backup.timer
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Hourly snapshot timer. Persistent=true means a missed hour catches up
+# after a reboot.
+
+[Unit]
+Description=Vortex bench v3 hourly DB snapshot timer
+Requires=vortex-bench-backup.service
+
+[Timer]
+# Top of every hour, UTC.
+OnCalendar=hourly
+Persistent=true
+RandomizedDelaySec=2min
+Unit=vortex-bench-backup.service
+
+[Install]
+WantedBy=timers.target
diff --git a/benchmarks-website/ops/systemd/vortex-bench-deploy.service b/benchmarks-website/ops/systemd/vortex-bench-deploy.service
new file mode 100644
index 00000000000..8f4f313d8aa
--- /dev/null
+++ b/benchmarks-website/ops/systemd/vortex-bench-deploy.service
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright the Vortex contributors
+#
+# Oneshot unit invoked by vortex-bench-deploy.timer every 60s. Runs
+# ops/deploy.sh, which is a no-op when origin/$DEPLOY_BRANCH hasn't
+# moved, or has moved without touching benchmarks-website code.
+#
+# This unit deliberately runs as ec2-user — `git pull` and `cargo build`
+# don't need root. The script asks for sudo only to `systemctl restart`
+# vortex-bench-server, and the sudoers fragment ops/install.sh writes
+# allows that single command without a password.
+
+[Unit]
+Description=Vortex bench v3 auto-deploy
+# Run after the server unit is up; if the server is failing, restarting
+# it from the deploy script is what we want anyway.
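+# (systemd's After= is ordering-only: it sequences this unit behind the
+# server at boot but pulls nothing in; Wants= below is what adds the
+# soft dependency on network-online.target.)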
+After=vortex-bench-server.service network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +User=ec2-user +Group=ec2-user +EnvironmentFile=/etc/vortex-bench.env +WorkingDirectory=/var/lib/vortex-bench +ExecStart=/var/lib/vortex-bench/ops/deploy.sh +StandardOutput=journal +StandardError=journal +# A failed deploy isn't fatal — keep the timer running. +SuccessExitStatus=0 diff --git a/benchmarks-website/ops/systemd/vortex-bench-deploy.timer b/benchmarks-website/ops/systemd/vortex-bench-deploy.timer new file mode 100644 index 00000000000..be453d62d57 --- /dev/null +++ b/benchmarks-website/ops/systemd/vortex-bench-deploy.timer @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Timer that fires the auto-deploy oneshot every 60s. AccuracySec=10s +# keeps the wakeups from coalescing too aggressively; the work itself +# is a no-op when nothing relevant changed. + +[Unit] +Description=Vortex bench v3 auto-deploy timer +Requires=vortex-bench-deploy.service + +[Timer] +# Fire 30s after boot, then every 60s. +OnBootSec=30s +OnUnitActiveSec=60s +AccuracySec=10s +Unit=vortex-bench-deploy.service + +[Install] +WantedBy=timers.target diff --git a/benchmarks-website/ops/systemd/vortex-bench-server.service b/benchmarks-website/ops/systemd/vortex-bench-server.service new file mode 100644 index 00000000000..eac80a8e0cd --- /dev/null +++ b/benchmarks-website/ops/systemd/vortex-bench-server.service @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# systemd unit for vortex-bench-server. Owns the DuckDB file at +# $VORTEX_BENCH_DB and serves :3000. +# +# Installed by ops/install.sh into /etc/systemd/system/. Restart on +# crash, never give up. Start order: after a successful binary install +# at /var/lib/vortex-bench/bin/vortex-bench-server (the deploy timer +# atomically swaps this symlink in place; this unit is `restart`ed via +# the sudoers fragment ops/install.sh writes). + +[Unit] +Description=Vortex bench v3 server (bench.vortex.dev) +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +User=ec2-user +Group=ec2-user +WorkingDirectory=/var/lib/vortex-bench +EnvironmentFile=/etc/vortex-bench.env +ExecStart=/var/lib/vortex-bench/bin/vortex-bench-server +Restart=on-failure +RestartSec=2 +# `journalctl -u vortex-bench-server` is the canonical log. Keep stdout +# unbuffered so `tail -f` works. +Environment=RUST_BACKTRACE=1 +StandardOutput=journal +StandardError=journal + +# Conservative hardening — server only needs DB writes and outbound TCP +# (none in steady state, but cargo build's hyper would). 
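+# ProtectSystem=strict mounts the entire file system read-only for this
+# unit (apart from the /dev, /proc and /sys API trees); ReadWritePaths=
+# then re-opens the one directory the DuckDB file and snapshots live in.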
+ProtectSystem=strict +ReadWritePaths=/var/lib/vortex-bench +ProtectHome=read-only +PrivateTmp=true +NoNewPrivileges=true + +[Install] +WantedBy=multi-user.target diff --git a/benchmarks-website/server/Cargo.toml b/benchmarks-website/server/Cargo.toml index 420ee082ff9..abc94925fb5 100644 --- a/benchmarks-website/server/Cargo.toml +++ b/benchmarks-website/server/Cargo.toml @@ -29,6 +29,7 @@ base64 = "0.22" # track vortex-duckdb's bundled engine version (build.rs) duckdb = { version = "1.10502", features = ["bundled"] } maud = { version = "0.27", features = ["axum"] } +parking_lot = { workspace = true } serde = { workspace = true, features = ["derive"] } serde_json = { workspace = true } subtle = "2.6" diff --git a/benchmarks-website/server/Dockerfile b/benchmarks-website/server/Dockerfile deleted file mode 100644 index 81c2c4860b9..00000000000 --- a/benchmarks-website/server/Dockerfile +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors -# -# Build context: repository root (the server is a workspace member). -# Build: docker build -f benchmarks-website/server/Dockerfile . -# Toolchain pinned to match rust-toolchain.toml. - -FROM rust:1.91-bookworm AS build - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - pkg-config \ - && rm -rf /var/lib/apt/lists/* - -WORKDIR /build -COPY . . - -RUN cargo build --release -p vortex-bench-server --bin vortex-bench-server - -FROM debian:bookworm-slim - -# Keep this in lockstep with libduckdb-sys in Cargo.lock. -ARG DUCKDB_VERSION=1.5.2 - -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - ca-certificates \ - libstdc++6 \ - unzip \ - wget \ - && wget -q "https://github.com/duckdb/duckdb/releases/download/v${DUCKDB_VERSION}/duckdb_cli-linux-aarch64.zip" -O /tmp/duckdb.zip \ - && unzip -q /tmp/duckdb.zip -d /usr/local/bin/ \ - && chmod +x /usr/local/bin/duckdb \ - && rm /tmp/duckdb.zip \ - && apt-get purge -y --auto-remove unzip wget \ - && rm -rf /var/lib/apt/lists/* - -COPY --from=build /build/target/release/vortex-bench-server /usr/local/bin/vortex-bench-server - -WORKDIR /app/data - -EXPOSE 3000 - -CMD ["/usr/local/bin/vortex-bench-server"] diff --git a/benchmarks-website/server/build.rs b/benchmarks-website/server/build.rs index 37bb34d013a..056f23c910f 100644 --- a/benchmarks-website/server/build.rs +++ b/benchmarks-website/server/build.rs @@ -1,8 +1,30 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::process::Command; + fn main() { if std::env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows") { println!("cargo:rustc-link-lib=dylib=rstrtmgr"); } + + // Capture the git SHA at build time so /health can confirm the + // running process matches what the deploy timer last saw. Falls + // back to "unknown" outside a git checkout (e.g. shallow CI + // clones, source tarballs) so the build never fails on this. + let sha = Command::new("git") + .args(["rev-parse", "--short=12", "HEAD"]) + .output() + .ok() + .filter(|o| o.status.success()) + .map(|o| String::from_utf8_lossy(&o.stdout).trim().to_owned()) + .unwrap_or_else(|| "unknown".to_owned()); + println!("cargo:rustc-env=VORTEX_BENCH_BUILD_SHA={sha}"); + + // HEAD covers the common deploy.sh path + // (`git checkout --force --detach `); refs/heads/* covers + // local branches if anyone runs the binary from a checked-out + // branch. Both are no-ops if the file doesn't exist. 
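+    // (On hosts where refs are packed, e.g. after `git gc`, the branch
+    // tip lives in .git/packed-refs and refs/heads/<branch> has no loose
+    // file; the deploy path's detached checkout still rewrites .git/HEAD,
+    // so the embedded SHA stays fresh either way.)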
+    println!("cargo:rerun-if-changed=../../.git/HEAD");
+    println!("cargo:rerun-if-changed=../../.git/refs/heads");
 }
diff --git a/benchmarks-website/server/scripts/backup.sh b/benchmarks-website/server/scripts/backup.sh
deleted file mode 100755
index ca4a35f891f..00000000000
--- a/benchmarks-website/server/scripts/backup.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright the Vortex contributors
-#
-# Daily DuckDB backup for the vortex-bench-server v3 instance.
-# Runs on the EC2 host via cron (see benchmarks-website/ec2-init.txt).
-#
-# Exports the running container's DuckDB to a local directory and uploads
-# it to s3://vortex-ci-benchmark-results/v3-backups/<date>/. The instance
-# IAM role already grants write access to that bucket (it is the same
-# bucket cat-s3.sh uses for v2).
-#
-# At alpha this is a convenience backup: the data is also reproducible
-# from CI dual-writes to the v3 ingest endpoint, so RPO is bounded by
-# what CI has posted, not by this script's cadence.
-
-set -euo pipefail
-
-CONTAINER="${CONTAINER:-vortex-bench-server}"
-DB_PATH="${DB_PATH:-/app/data/bench.duckdb}"
-DATA_DIR="${DATA_DIR:-/opt/benchmarks-website/data}"
-S3_PREFIX="${S3_PREFIX:-s3://vortex-ci-benchmark-results/v3-backups}"
-
-date_stamp="$(date -u +%Y%m%d)"
-export_dir="backup-${date_stamp}"
-host_export_dir="${DATA_DIR}/${export_dir}"
-
-# Run EXPORT DATABASE inside the container so we hit the same DuckDB
-# build that wrote the file. The container path mirrors the host path
-# under /app/data, so the export lands on the EBS volume.
-docker exec "${CONTAINER}" \
-  duckdb "${DB_PATH}" \
-  -c "EXPORT DATABASE '/app/data/${export_dir}'"
-
-aws s3 cp \
-  --recursive \
-  "${host_export_dir}" \
-  "${S3_PREFIX}/${date_stamp}/"
-
-# Keep the latest local export, drop older ones to bound disk use.
-find "${DATA_DIR}" \
-  -maxdepth 1 \
-  -type d \
-  -name "backup-*" \
-  ! -path "${host_export_dir}" \
-  -exec rm -rf {} +
diff --git a/benchmarks-website/server/src/admin.rs b/benchmarks-website/server/src/admin.rs
new file mode 100644
index 00000000000..622ef8451ff
--- /dev/null
+++ b/benchmarks-website/server/src/admin.rs
@@ -0,0 +1,426 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Admin endpoints — bearer-gated DuckDB snapshot and read-only SQL.
+//!
+//! Mounted at `/api/admin/*` only when `ADMIN_BEARER_TOKEN` is set on the
+//! server, surfaced through [`crate::app::AppState::with_admin`]. Both routes
+//! require an `Authorization: Bearer <token>` header — the
+//! `INGEST_BEARER_TOKEN` will not work here, so the two can rotate
+//! independently. The operator workflow is documented in
+//! `benchmarks-website/ops/README.md`.
+//!
+//! ## Routes
+//!
+//! ### `POST /api/admin/snapshot?ts=<ts>`
+//!
+//! Writes a snapshot directory `<snapshot_dir>/<ts>/` containing:
+//! - `schema.sql` — verbatim copy of [`crate::schema::SCHEMA_DDL`], so a
+//!   restore knows how to recreate the tables before bulk-loading.
+//! - `<table>.vortex` for every table in [`crate::schema::TABLES`] —
+//!   each produced by a `COPY (SELECT * FROM <table>) TO …
+//!   (FORMAT vortex)`. The vortex DuckDB extension is auto-installed
+//!   from the community repo on first call, then `LOAD`ed.
+//!
+//! Vortex compresses the BIGINT[] runtime arrays and string columns
+//! roughly an order of magnitude better than gzipped CSV on this shape;
+//! it is also the project's own format, which is the obvious dogfood.
+//!
+//! `ts` must match `[A-Za-z0-9_-]{1,64}`; the snapshot script
+//! conventionally passes a UTC timestamp like `20260508T010000Z`. The
+//! target subdirectory must not already exist (409 otherwise). All
+//! per-table COPY statements run on a connection cloned from the
+//! shared handle, so concurrent ingest writes are not blocked.
+//!
+//! ### `POST /api/admin/sql`
+//!
+//! Body: `{ "sql": "SELECT ..." }`. Query: `?format=json|table` (default
+//! `json`). Only `SELECT`, `WITH`, `PRAGMA`, `SHOW`, `DESCRIBE`, and
+//! `EXPLAIN` statements are allowed — anything else is rejected with 403.
+//! Each call runs on a connection cloned from the shared handle, so a
+//! slow SELECT does not block concurrent ingest writes.
+
+use std::fmt::Write as _;
+use std::path::PathBuf;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use axum::Json;
+use axum::extract::Query;
+use axum::extract::Request;
+use axum::extract::State;
+use axum::http::StatusCode;
+use axum::http::header::AUTHORIZATION;
+use axum::http::header::CONTENT_TYPE;
+use axum::middleware::Next;
+use axum::response::IntoResponse;
+use axum::response::Response;
+use duckdb::Connection;
+use duckdb::types::ValueRef;
+use serde::Deserialize;
+use serde::Serialize;
+use serde_json::Value;
+use serde_json::json;
+use subtle::ConstantTimeEq;
+use thiserror::Error;
+
+use crate::app::AppState;
+use crate::db;
+use crate::schema;
+
+/// Errors surfaced by `/api/admin/*` handlers. Auth (401) is handled by
+/// [`require_admin_bearer`] and never reaches a handler.
+#[derive(Debug, Error)]
+pub enum AdminError {
+    /// 400 — request shape is malformed (bad `ts`, bad SQL JSON body, …).
+    #[error("bad request: {0}")]
+    BadRequest(String),
+    /// 403 — request is well-formed but the SQL statement is not on the
+    /// read-only allow-list.
+    #[error("forbidden: {0}")]
+    Forbidden(String),
+    /// 409 — snapshot target directory already exists.
+    #[error("conflict: {0}")]
+    Conflict(String),
+    /// 500 — anything else (DB error, IO error, …).
+    #[error("internal server error")]
+    Internal(#[from] anyhow::Error),
+}
+
+impl IntoResponse for AdminError {
+    fn into_response(self) -> Response {
+        let (status, body) = match &self {
+            Self::BadRequest(msg) => (
+                StatusCode::BAD_REQUEST,
+                json!({ "error": "bad_request", "message": msg }),
+            ),
+            Self::Forbidden(msg) => (
+                StatusCode::FORBIDDEN,
+                json!({ "error": "forbidden", "message": msg }),
+            ),
+            Self::Conflict(msg) => (
+                StatusCode::CONFLICT,
+                json!({ "error": "conflict", "message": msg }),
+            ),
+            Self::Internal(err) => {
+                tracing::error!(error = ?err, "admin internal error");
+                (
+                    StatusCode::INTERNAL_SERVER_ERROR,
+                    json!({ "error": "internal" }),
+                )
+            }
+        };
+        (status, Json(body)).into_response()
+    }
+}
+
+/// Axum middleware enforcing the admin bearer token on `/api/admin/*`.
+/// 401 if the header is missing, malformed, or wrong; 503 if the server
+/// was started without `ADMIN_BEARER_TOKEN` (the admin router is unmounted
+/// in that case, so this is just a defensive belt-and-braces check).
+pub async fn require_admin_bearer(
+    State(state): State<AppState>,
+    req: Request,
+    next: Next,
+) -> Result<Response, Response> {
+    let Some(expected) = state.admin_bearer_token.as_ref() else {
+        return Err((
+            StatusCode::SERVICE_UNAVAILABLE,
+            Json(json!({ "error": "admin_not_configured" })),
+        )
+            .into_response());
+    };
+    let unauthorized = || {
+        (
+            StatusCode::UNAUTHORIZED,
+            Json(json!({ "error": "unauthorized" })),
+        )
+            .into_response()
+    };
+    let header = req
+        .headers()
+        .get(AUTHORIZATION)
+        .ok_or_else(unauthorized)?
+        .to_str()
+        .map_err(|_| unauthorized())?;
+    let presented = header
+        .strip_prefix("Bearer ")
+        .ok_or_else(unauthorized)?
+        .as_bytes();
+    if presented.ct_eq(expected.as_bytes()).into() {
+        Ok(next.run(req).await)
+    } else {
+        Err(unauthorized())
+    }
+}
+
+#[derive(Debug, Deserialize)]
+pub struct SnapshotQuery {
+    /// Operator-supplied identifier for the snapshot, used as the leaf
+    /// directory name. Must match `[A-Za-z0-9_-]{1,64}`.
+    pub ts: String,
+}
+
+#[derive(Debug, Serialize)]
+pub struct SnapshotResponse {
+    /// Absolute path of the directory the export landed in.
+    pub snapshot_dir: String,
+}
+
+/// Handler for `POST /api/admin/snapshot?ts=<ts>`. Writes
+/// `schema.sql` plus one `<table>.vortex` file per fact/dim table into
+/// a fresh subdirectory under [`AppState::snapshot_dir`].
+pub async fn snapshot(
+    State(state): State<AppState>,
+    Query(q): Query<SnapshotQuery>,
+) -> Result<Json<SnapshotResponse>, AdminError> {
+    validate_ts(&q.ts)?;
+    let target: PathBuf = state.snapshot_dir.join(&q.ts);
+    if target.exists() {
+        return Err(AdminError::Conflict(format!(
+            "snapshot directory already exists: {}",
+            target.display()
+        )));
+    }
+    std::fs::create_dir_all(&target)
+        .with_context(|| format!("creating snapshot dir {}", target.display()))?;
+
+    // Schema is just our DDL string verbatim; restore reads this with
+    // `duckdb -init schema.sql` (or `.read schema.sql`) before
+    // bulk-loading the per-table vortex files.
+    std::fs::write(target.join("schema.sql"), schema::SCHEMA_DDL)
+        .with_context(|| format!("writing schema.sql under {}", target.display()))?;
+
+    let target_for_db = target.clone();
+    db::run_blocking(&state.db, move |conn| {
+        // Idempotent — `INSTALL` is a no-op if the extension is already
+        // present, `LOAD` is cheap once the binary is on disk. The
+        // bundled libduckdb-sys has autoload enabled, so the very first
+        // call also auto-fetches the extension from the DuckDB
+        // community repo. Subsequent calls are entirely local.
+        conn.execute_batch("INSTALL vortex FROM community; LOAD vortex;")
+            .context("INSTALL/LOAD vortex extension")?;
+        for table in schema::TABLES {
+            // Single-quote escaping is a non-issue: `target_for_db`
+            // is composed from the operator-configured snapshot dir +
+            // a validated [A-Za-z0-9_-] timestamp, and table names
+            // come from the closed const list in schema.rs.
+            let path = target_for_db.join(format!("{table}.vortex"));
+            let path_str = path
+                .to_str()
+                .ok_or_else(|| anyhow::anyhow!("snapshot path is not UTF-8: {}", path.display()))?;
+            let sql = format!("COPY (SELECT * FROM {table}) TO '{path_str}' (FORMAT vortex)");
+            conn.execute_batch(&sql)
+                .with_context(|| format!("COPY {table} TO {path_str}"))?;
+        }
+        Ok(())
+    })
+    .await
+    .map_err(AdminError::Internal)?;
+    Ok(Json(SnapshotResponse {
+        snapshot_dir: target.display().to_string(),
+    }))
+}
+
+fn validate_ts(ts: &str) -> Result<(), AdminError> {
+    if ts.is_empty() || ts.len() > 64 {
+        return Err(AdminError::BadRequest("ts must be 1..=64 chars".into()));
+    }
+    if !ts
+        .chars()
+        .all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
+    {
+        return Err(AdminError::BadRequest(
+            "ts must match [A-Za-z0-9_-]+".into(),
+        ));
+    }
+    Ok(())
+}
+
+#[derive(Debug, Deserialize)]
+pub struct SqlBody {
+    pub sql: String,
+}
+
+#[derive(Debug, Deserialize, Default)]
+pub struct SqlQuery {
+    #[serde(default)]
+    pub format: SqlFormat,
+}
+
+#[derive(Debug, Deserialize, Default, Clone, Copy)]
+#[serde(rename_all = "lowercase")]
+pub enum SqlFormat {
+    /// Returns `{ columns, rows, row_count }` JSON.
+    #[default]
+    Json,
+    /// Returns a `text/plain` ASCII table similar to `duckdb` CLI output.
+    Table,
+}
+
+/// Handler for `POST /api/admin/sql`.
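+/// A representative operator call, with an illustrative host and token
+/// (the real values live in /etc/vortex-bench.env):
+///
+/// ```text
+/// curl -s -X POST 'http://127.0.0.1:3000/api/admin/sql?format=table' \
+///   -H "Authorization: Bearer $ADMIN_BEARER_TOKEN" \
+///   -H 'Content-Type: application/json' \
+///   -d '{"sql": "SELECT COUNT(*) AS n FROM commits"}'
+/// ```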
+pub async fn sql(
+    State(state): State<AppState>,
+    Query(q): Query<SqlQuery>,
+    Json(body): Json<SqlBody>,
+) -> Result<Response, AdminError> {
+    validate_read_only(&body.sql)?;
+    let format = q.format;
+    let sql_text = body.sql;
+    let result = db::run_blocking(&state.db, move |conn| run_select(conn, &sql_text))
+        .await
+        .map_err(AdminError::Internal)?;
+    Ok(match format {
+        SqlFormat::Json => Json(json!({
+            "columns": result.columns,
+            "rows": result.rows,
+            "row_count": result.rows.len(),
+        }))
+        .into_response(),
+        SqlFormat::Table => (
+            [(CONTENT_TYPE, "text/plain; charset=utf-8")],
+            format_table(&result),
+        )
+            .into_response(),
+    })
+}
+
+fn validate_read_only(sql: &str) -> Result<(), AdminError> {
+    let trimmed = sql.trim_start_matches(|c: char| c.is_whitespace() || c == '(' || c == ';');
+    let first_word: String = trimmed
+        .chars()
+        .take_while(|c| c.is_ascii_alphabetic())
+        .collect::<String>()
+        .to_ascii_uppercase();
+    const ALLOWED: &[&str] = &["SELECT", "WITH", "PRAGMA", "SHOW", "DESCRIBE", "EXPLAIN"];
+    if !ALLOWED.contains(&first_word.as_str()) {
+        return Err(AdminError::Forbidden(format!(
+            "only {ALLOWED:?} statements are allowed; got {first_word:?}"
+        )));
+    }
+    Ok(())
+}
+
+struct QueryResult {
+    columns: Vec<String>,
+    rows: Vec<Vec<Value>>,
+}
+
+fn run_select(conn: &Connection, sql: &str) -> Result<QueryResult> {
+    let mut stmt = conn.prepare(sql).context("prepare SQL")?;
+    let mut rows_iter = stmt.query([]).context("execute SQL")?;
+    // duckdb-rs panics on Statement::column_names() if the statement has not
+    // executed yet — schema is only populated after `query()` runs. Pull it
+    // off the live `Rows` iterator instead.
+    let columns: Vec<String> = rows_iter
+        .as_ref()
+        .map(|s| s.column_names())
+        .unwrap_or_default();
+    let column_count = columns.len();
+    let mut rows: Vec<Vec<Value>> = Vec::new();
+    while let Some(row) = rows_iter.next().context("row iter")? {
+        let mut out = Vec::with_capacity(column_count);
+        for i in 0..column_count {
+            let v = row.get_ref(i).context("get col")?;
+            out.push(value_ref_to_json(v));
+        }
+        rows.push(out);
+    }
+    Ok(QueryResult { columns, rows })
+}
+
+fn value_ref_to_json(v: ValueRef<'_>) -> Value {
+    match v {
+        ValueRef::Null => Value::Null,
+        ValueRef::Boolean(b) => Value::Bool(b),
+        ValueRef::TinyInt(i) => Value::from(i),
+        ValueRef::SmallInt(i) => Value::from(i),
+        ValueRef::Int(i) => Value::from(i),
+        ValueRef::BigInt(i) => Value::from(i),
+        ValueRef::HugeInt(i) => Value::String(i.to_string()),
+        ValueRef::UTinyInt(i) => Value::from(i),
+        ValueRef::USmallInt(i) => Value::from(i),
+        ValueRef::UInt(i) => Value::from(i),
+        ValueRef::UBigInt(i) => Value::String(i.to_string()),
+        ValueRef::Float(f) => f64::from(f).into(),
+        ValueRef::Double(f) => f.into(),
+        ValueRef::Text(bytes) => Value::String(String::from_utf8_lossy(bytes).into_owned()),
+        ValueRef::Blob(_) => Value::String("<blob>".into()),
+        other => Value::String(format!("{other:?}")),
+    }
+}
+
+fn format_table(r: &QueryResult) -> String {
+    if r.columns.is_empty() {
+        return "(no columns)\n".into();
+    }
+    let row_strings: Vec<Vec<String>> = r
+        .rows
+        .iter()
+        .map(|row| row.iter().map(value_display).collect())
+        .collect();
+    let mut widths: Vec<usize> = r.columns.iter().map(|c| c.chars().count()).collect();
+    for row in &row_strings {
+        for (i, cell) in row.iter().enumerate() {
+            let w = cell.chars().count();
+            if w > widths[i] {
+                widths[i] = w;
+            }
+        }
+    }
+    let mut out = String::new();
+    write_separator(&mut out, &widths, '┌', '┬', '┐');
+    write_row(&mut out, &r.columns, &widths);
+    write_separator(&mut out, &widths, '├', '┼', '┤');
+    for row in &row_strings {
+        write_row(&mut out, row, &widths);
+    }
+    write_separator(&mut out, &widths, '└', '┴', '┘');
+    let _ = writeln!(
+        out,
+        "({} row{})",
+        r.rows.len(),
+        if r.rows.len() == 1 { "" } else { "s" }
+    );
+    out
+}
+
+fn value_display(v: &Value) -> String {
+    match v {
+        Value::Null => "NULL".into(),
+        Value::String(s) => s.clone(),
+        Value::Number(n) => n.to_string(),
+        Value::Bool(b) => b.to_string(),
+        other => other.to_string(),
+    }
+}
+
+fn write_row<S: AsRef<str>>(out: &mut String, cells: &[S], widths: &[usize]) {
+    out.push('│');
+    for (i, cell) in cells.iter().enumerate() {
+        let s = cell.as_ref();
+        let pad = widths[i].saturating_sub(s.chars().count());
+        out.push(' ');
+        out.push_str(s);
+        for _ in 0..pad {
+            out.push(' ');
+        }
+        out.push(' ');
+        out.push('│');
+    }
+    out.push('\n');
+}
+
+fn write_separator(out: &mut String, widths: &[usize], left: char, mid: char, right: char) {
+    out.push(left);
+    for (i, w) in widths.iter().enumerate() {
+        if i > 0 {
+            out.push(mid);
+        }
+        for _ in 0..(*w + 2) {
+            out.push('─');
+        }
+    }
+    out.push(right);
+    out.push('\n');
+}
diff --git a/benchmarks-website/server/src/api/dto.rs b/benchmarks-website/server/src/api/dto.rs
index 0ffe3752497..801a72bbd0f 100644
--- a/benchmarks-website/server/src/api/dto.rs
+++ b/benchmarks-website/server/src/api/dto.rs
@@ -333,6 +333,12 @@ pub struct HealthResponse {
     pub db_path: String,
     /// Schema version the server was compiled against.
     pub schema_version: i32,
+    /// Git SHA the binary was built from (12-char short form, or
+    /// `"unknown"` outside a git checkout). Compare against
+    /// `/var/lib/vortex-bench/last-deployed-sha` on the host to
+    /// confirm the live process is the one the deploy timer last
+    /// rolled out.
+    pub build_sha: &'static str,
     /// Most recent `commits.timestamp`, or `None` if the table is empty.
     pub latest_commit_timestamp: Option<String>,
     /// Per-fact-table row counts for smoke tests.
diff --git a/benchmarks-website/server/src/api/mod.rs b/benchmarks-website/server/src/api/mod.rs
index 7a3058f691d..58b8ce95552 100644
--- a/benchmarks-website/server/src/api/mod.rs
+++ b/benchmarks-website/server/src/api/mod.rs
@@ -137,6 +137,7 @@ fn collect_health(conn: &Connection, db_path: String) -> Result<HealthResponse>
         status: "ok",
         db_path,
         schema_version: crate::schema::SCHEMA_VERSION,
+        build_sha: env!("VORTEX_BENCH_BUILD_SHA"),
         latest_commit_timestamp,
         row_counts,
     })
diff --git a/benchmarks-website/server/src/app.rs b/benchmarks-website/server/src/app.rs
index d013bfe9ad7..53d51318a3a 100644
--- a/benchmarks-website/server/src/app.rs
+++ b/benchmarks-website/server/src/app.rs
@@ -7,6 +7,9 @@
 //! - `/api/groups`, `/api/chart/{slug}`, `/api/group/{slug}`, `/health`
 //!   (read API)
 //! - `/api/ingest` (gated by [`crate::auth::require_bearer`])
+//! - `/api/admin/snapshot`, `/api/admin/sql` — only when
+//!   [`AppState::with_admin`] has been called (gated by
+//!   [`crate::admin::require_admin_bearer`])
 //! - HTML routes contributed by [`crate::html::router`]
 //!
 //! All responses pass through [`CompressionLayer`] so HTML, JSON, and the
@@ -24,6 +27,7 @@
 use axum::routing::get;
 use axum::routing::post;
 use tower_http::compression::CompressionLayer;
 
+use crate::admin;
 use crate::api;
 use crate::auth::require_bearer;
 use crate::db::DbHandle;
@@ -35,25 +39,53 @@ use crate::ingest;
 /// or a small `String`).
 #[derive(Clone)]
 pub struct AppState {
-    /// Mutex-guarded DuckDB connection. See [`crate::db`].
+    /// Shared DuckDB handle. See [`crate::db`].
     pub db: DbHandle,
     /// Bearer token expected on `/api/ingest`. Compared via constant-time eq.
     pub bearer_token: Arc<String>,
+    /// Bearer token expected on `/api/admin/*`. `None` disables the admin
+    /// router entirely. Set via [`AppState::with_admin`].
+    pub admin_bearer_token: Option<Arc<String>>,
     /// On-disk path of the DuckDB file. Surfaced on `/health`.
     pub db_path: Arc<PathBuf>,
+    /// Directory `EXPORT DATABASE` writes into. Defaults to
+    /// `<db dir>/snapshots`. Override via [`AppState::with_snapshot_dir`].
+    pub snapshot_dir: Arc<PathBuf>,
 }
 
 impl AppState {
     /// Open the DuckDB at `db_path`, apply the schema, and return shared state.
+    /// Admin endpoints are unmounted by default; call [`AppState::with_admin`]
+    /// to enable them.
     pub fn open<P: AsRef<Path>>(db_path: P, bearer_token: String) -> Result<Self> {
         let path = db_path.as_ref().to_path_buf();
+        let snapshot_dir = path
+            .parent()
+            .unwrap_or_else(|| Path::new("."))
+            .join("snapshots");
         let db = db::open(&path)?;
         Ok(Self {
             db,
             bearer_token: Arc::new(bearer_token),
+            admin_bearer_token: None,
             db_path: Arc::new(path),
+            snapshot_dir: Arc::new(snapshot_dir),
         })
     }
+
+    /// Enable the `/api/admin/*` router, gated by `admin_bearer_token`.
+    /// Without this call, the admin router is not mounted at all.
+    pub fn with_admin(mut self, admin_bearer_token: String) -> Self {
+        self.admin_bearer_token = Some(Arc::new(admin_bearer_token));
+        self
+    }
+
+    /// Override the directory `EXPORT DATABASE` writes into. Defaults to
+    /// `<db dir>/snapshots`.
+    pub fn with_snapshot_dir(mut self, dir: PathBuf) -> Self {
+        self.snapshot_dir = Arc::new(dir);
+        self
+    }
 }
 
 /// Build the full Axum router for the bench server.
@@ -71,10 +103,21 @@ pub fn router(state: AppState) -> Router {
         .route("/api/group/{slug}", get(api::group))
         .route("/health", get(api::health));
 
-    Router::new()
+    let mut router = Router::new()
         .merge(ingest_routes)
         .merge(read_routes)
-        .merge(html::router())
-        .layer(CompressionLayer::new())
-        .with_state(state)
+        .merge(html::router());
+
+    if state.admin_bearer_token.is_some() {
+        let admin_routes = Router::new()
+            .route("/api/admin/snapshot", post(admin::snapshot))
+            .route("/api/admin/sql", post(admin::sql))
+            .route_layer(axum::middleware::from_fn_with_state(
+                state.clone(),
+                admin::require_admin_bearer,
+            ));
+        router = router.merge(admin_routes);
+    }
+
+    router.layer(CompressionLayer::new()).with_state(state)
 }
diff --git a/benchmarks-website/server/src/db.rs b/benchmarks-website/server/src/db.rs
index ce503a701ae..3ed7952f1e5 100644
--- a/benchmarks-website/server/src/db.rs
+++ b/benchmarks-website/server/src/db.rs
@@ -3,9 +3,10 @@
 //! DuckDB connection management plus the deterministic `measurement_id` hash.
 //!
-//! The server holds a single [`duckdb::Connection`] inside an async
-//! [`tokio::sync::Mutex`]. All DB work runs inside `spawn_blocking` so the
-//! Tokio runtime is never blocked on synchronous DuckDB calls.
+//! The server keeps one root [`duckdb::Connection`] and clones a fresh
+//! connection from it for each blocking DB task. All DB work runs inside
+//! `spawn_blocking` so the Tokio runtime is never blocked on synchronous
+//! DuckDB calls.
 //!
 //! `measurement_id` is a server-internal xxhash64 over `commit_sha` plus
 //! each table's dimensional tuple. Including `commit_sha` makes every
@@ -21,7 +22,7 @@
 use std::sync::Arc;
 
 use anyhow::Context as _;
 use anyhow::Result;
 use duckdb::Connection;
-use tokio::sync::Mutex;
+use parking_lot::Mutex;
 use twox_hash::XxHash64;
 
 use crate::records::CompressionSize;
@@ -31,8 +32,25 @@ use crate::records::RandomAccessTime;
 use crate::records::VectorSearchRun;
 use crate::schema::SCHEMA_DDL;
 
-/// A connection guard the rest of the crate hands around.
-pub type DbHandle = Arc<Mutex<Connection>>;
+/// Shared DuckDB handle. Cloning the handle is cheap; each DB task clones a
+/// task-local [`Connection`] before doing work.
+#[derive(Clone)]
+pub struct DbHandle {
+    root: Arc<Mutex<Connection>>,
+}
+
+impl DbHandle {
+    fn new(root: Connection) -> Self {
+        Self {
+            root: Arc::new(Mutex::new(root)),
+        }
+    }
+
+    fn connection(&self) -> Result<Connection> {
+        let root = self.root.lock();
+        root.try_clone().context("cloning DuckDB connection")
+    }
+}
 
 /// Open the DuckDB file at `path` (creating it if absent) and apply the
 /// schema DDL. Returns a handle ready to be cloned into the Axum state.
@@ -41,11 +59,11 @@ pub fn open<P: AsRef<Path>>(path: P) -> Result<DbHandle> {
         .with_context(|| format!("opening DuckDB at {}", path.as_ref().display()))?;
     conn.execute_batch(SCHEMA_DDL)
         .context("applying schema DDL")?;
-    Ok(Arc::new(Mutex::new(conn)))
+    Ok(DbHandle::new(conn))
 }
 
-/// Run a synchronous DB operation on the blocking pool, holding the connection
-/// mutex for the duration of the call.
+/// Run a synchronous DB operation on the blocking pool using a task-local
+/// DuckDB connection cloned from the shared database handle.
 pub async fn run_blocking<T, F>(handle: &DbHandle, f: F) -> Result<T>
 where
     F: FnOnce(&mut Connection) -> Result<T> + Send + 'static,
@@ -53,8 +71,8 @@ where
 {
     let handle = handle.clone();
     tokio::task::spawn_blocking(move || {
-        let mut guard = handle.blocking_lock();
-        f(&mut guard)
+        let mut conn = handle.connection()?;
+        f(&mut conn)
     })
     .await
    .context("DB task panicked")?
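A minimal sketch (not part of the diff) of what the cloned-connection handle buys a caller; it assumes a `state: AppState` in scope inside an async handler, and uses duckdb-rs's `query_row` convenience:

    // The mutex guards only the cheap try_clone(); after that this task
    // owns a private DuckDB connection to the same database object, so a
    // slow read here no longer queues behind an ingest transaction.
    let n: i64 = db::run_blocking(&state.db, |conn| {
        conn.query_row("SELECT COUNT(*) FROM commits", [], |row| row.get(0))
            .context("count commits")
    })
    .await?;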
diff --git a/benchmarks-website/server/src/ingest.rs b/benchmarks-website/server/src/ingest.rs
index b97401bea8a..b5bd6d8bc4d 100644
--- a/benchmarks-website/server/src/ingest.rs
+++ b/benchmarks-website/server/src/ingest.rs
@@ -57,6 +57,9 @@ use crate::records::Record;
 use crate::records::VectorSearchRun;
 use crate::schema::SCHEMA_VERSION;
 
+// Unless we start merging 128 PRs per second we are not hitting this max.
+const WRITE_CONFLICT_ATTEMPTS: usize = 128;
+
 /// Successful ingest response body.
 #[derive(Debug, Serialize)]
 pub struct IngestResponse {
@@ -102,7 +105,38 @@ fn validate_envelope(env: &Envelope) -> Result<(), IngestError> {
     Ok(())
 }
 
+fn retry_write_conflicts<T, F>(mut op: F) -> Result<T>
+where
+    F: FnMut() -> Result<T>,
+{
+    for attempt in 1..=WRITE_CONFLICT_ATTEMPTS {
+        match op() {
+            Ok(value) => return Ok(value),
+            Err(err) if attempt < WRITE_CONFLICT_ATTEMPTS && is_retryable_write_conflict(&err) => {
+                std::thread::yield_now();
+            }
+            Err(err) => return Err(err),
+        }
+    }
+    unreachable!("loop either returns a value or the final error")
+}
+
+fn is_retryable_write_conflict(err: &anyhow::Error) -> bool {
+    err.chain().any(|cause| {
+        let message = cause.to_string().to_ascii_lowercase();
+        message.contains("conflict")
+            && (message.contains("transaction")
+                || message.contains("write")
+                || message.contains("tuple")
+                || message.contains("update"))
+    })
+}
+
 fn apply_envelope(conn: &mut Connection, env: Envelope) -> Result<IngestResponse> {
+    retry_write_conflicts(|| apply_envelope_once(conn, &env))
+}
+
+fn apply_envelope_once(conn: &mut Connection, env: &Envelope) -> Result<IngestResponse> {
     let tx = conn.transaction().context("begin transaction")?;
 
     upsert_commit(&tx, &env.commit).context("upsert commit")?;
diff --git a/benchmarks-website/server/src/lib.rs b/benchmarks-website/server/src/lib.rs
index 3174aa50696..2690aa4587e 100644
--- a/benchmarks-website/server/src/lib.rs
+++ b/benchmarks-website/server/src/lib.rs
@@ -28,6 +28,9 @@
 //! - `GET /health` — liveness probe + per-table row counts.
 //! - `POST /api/ingest` — bearer-gated ingest. See [`ingest`] for the HTTP
 //!   matrix and [`auth`] for the bearer middleware.
+//! - `POST /api/admin/snapshot`, `POST /api/admin/sql` — admin-bearer-gated
+//!   snapshot trigger and read-only SQL. Mounted only when
+//!   [`app::AppState::with_admin`] has been called. See [`admin`].
 //!
 //! ## Module map
 //!
@@ -35,7 +38,8 @@
 //! |---------------|---------------------------------------------------------------------------------------------|
 //! | [`app`]       | [`app::AppState`] (DB handle + bearer + path) and the Axum router composition.               |
 //! | [`auth`]      | Bearer-token middleware for `/api/ingest`.                                                   |
-//! | [`db`]        | [`db::DbHandle`] connection wrapper + the per-fact-table `measurement_id_*` hash functions.  |
+//! | [`admin`]     | `/api/admin/*` handlers + admin-bearer middleware. See `ops/README.md` for the operator flow. |
+//! | [`db`]        | [`db::DbHandle`] task-local connection cloning + the per-fact-table `measurement_id_*` hash functions. |
 //! | [`schema`]    | DuckDB DDL ([`schema::SCHEMA_DDL`]) and the wire schema version.                             |
 //! | [`records`]   | Wire shapes for `POST /api/ingest`.                                                          |
 //! | [`ingest`]    | `POST /api/ingest` handler — envelope validation, transaction, upsert dispatch.              |
@@ -51,8 +55,8 @@
 //!    routes skip auth.
 //! 3. The handler parses body / path / query into typed inputs (e.g.
 //!    [`slug::ChartKey::from_slug`]).
-//! 4. The handler hands a closure to [`db::run_blocking`], which acquires
-//!    the connection mutex and runs the synchronous DuckDB call on
+//! 4. The handler hands a closure to [`db::run_blocking`], which clones a
+//!    task-local DuckDB connection and runs the synchronous call on
 //!    `tokio::task::spawn_blocking` so the runtime stays free.
 //! 5. The closure returns `Result`. Errors are mapped
 //!    into [`error::IngestError`] / [`error::ApiError`] with the right
@@ -62,6 +66,7 @@
 //! 7. Every response passes through [`tower_http::compression::CompressionLayer`]
 //!    on the way out.
 
+pub mod admin;
 pub mod api;
 pub mod app;
 pub mod auth;
diff --git a/benchmarks-website/server/src/main.rs b/benchmarks-website/server/src/main.rs
index 00ff8418874..e6c62d61a1c 100644
--- a/benchmarks-website/server/src/main.rs
+++ b/benchmarks-website/server/src/main.rs
@@ -3,13 +3,22 @@
 //! Binary entrypoint for `vortex-bench-server`.
 //!
-//! Reads four environment variables before handing off to
+//! Reads the following environment variables before handing off to
 //! [`vortex_bench_server::app::router`]:
 //!
 //! - `INGEST_BEARER_TOKEN` — required. Token presented by ingest clients
 //!   on `Authorization: Bearer <token>`. Compared in constant time.
+//! - `ADMIN_BEARER_TOKEN` — optional. When set, mounts the
+//!   `/api/admin/snapshot` and `/api/admin/sql` endpoints; both expect
+//!   this token in the `Authorization: Bearer …` header. Without it the
+//!   admin router is not mounted at all (404). The `INGEST_BEARER_TOKEN`
+//!   does not work on admin routes — keep them separate so they rotate
+//!   independently.
 //! - `VORTEX_BENCH_DB` — DuckDB file path. Default: `bench.duckdb` in the
 //!   working directory.
+//! - `VORTEX_BENCH_SNAPSHOT_DIR` — directory `EXPORT DATABASE` writes to
+//!   when an operator hits `/api/admin/snapshot`. Default:
+//!   `<db dir>/snapshots`.
 //! - `VORTEX_BENCH_BIND` — `host:port` to listen on. Default
 //!   `127.0.0.1:3000`. Override to `0.0.0.0:3000` for container deploys.
 //! - `VORTEX_BENCH_LOG` — `tracing-subscriber` env filter spec. Default
@@ -35,10 +44,23 @@ async fn main() -> Result<()> {
         .into();
     let bearer_token =
         env::var("INGEST_BEARER_TOKEN").context("INGEST_BEARER_TOKEN env var must be set")?;
+    let admin_bearer_token = env::var("ADMIN_BEARER_TOKEN").ok();
     let bind_addr = env::var("VORTEX_BENCH_BIND").unwrap_or_else(|_| "127.0.0.1:3000".to_string());
 
-    let state = vortex_bench_server::app::AppState::open(&db_path, bearer_token)
+    let mut state = vortex_bench_server::app::AppState::open(&db_path, bearer_token)
         .with_context(|| format!("opening DuckDB at {}", db_path.display()))?;
+    if let Some(token) = admin_bearer_token {
+        state = state.with_admin(token);
+    } else {
+        tracing::warn!(
+            "ADMIN_BEARER_TOKEN is unset — /api/admin/* will return 404 \
+             (snapshot + read-only SQL disabled)"
+        );
+    }
+    if let Ok(dir) = env::var("VORTEX_BENCH_SNAPSHOT_DIR") {
+        state = state.with_snapshot_dir(PathBuf::from(dir));
+    }
+    let snapshot_dir = state.snapshot_dir.clone();
 
     let app = vortex_bench_server::app::router(state);
     let listener = tokio::net::TcpListener::bind(&bind_addr)
@@ -47,6 +69,7 @@
     tracing::info!(
         addr = %listener.local_addr()?,
         db = %db_path.display(),
+        snapshot_dir = %snapshot_dir.display(),
         "bench server listening"
     );
     axum::serve(listener, app).await?;
diff --git a/benchmarks-website/server/src/schema.rs b/benchmarks-website/server/src/schema.rs
index 70727ab7fab..f6199af8472 100644
--- a/benchmarks-website/server/src/schema.rs
+++ b/benchmarks-website/server/src/schema.rs
@@ -183,3 +183,16 @@ CREATE TABLE IF NOT EXISTS vector_search_runs (
 /// Schema version expected by the server. The ingest envelope's
 /// `run_meta.schema_version` must match this exactly at alpha.
 pub const SCHEMA_VERSION: i32 = 1;
+
+/// Every table in the schema, in the order a fresh boot creates them.
+/// Used by the snapshot endpoint to drive a per-table `COPY ... TO`
+/// across the whole DB and by the restore docs to document the same
+/// list. `commits` is the dim table; the rest are facts.
+pub const TABLES: &[&str] = &[
+    "commits",
+    "query_measurements",
+    "compression_times",
+    "compression_sizes",
+    "random_access_times",
+    "vector_search_runs",
+];
diff --git a/benchmarks-website/server/tests/admin.rs b/benchmarks-website/server/tests/admin.rs
new file mode 100644
index 00000000000..53afe3747b7
--- /dev/null
+++ b/benchmarks-website/server/tests/admin.rs
@@ -0,0 +1,345 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Integration tests for `/api/admin/*` — round-trips the bearer check,
+//! the read-only SQL allow-list, the snapshot endpoint's path validation,
+//! and verifies that admin routes 404 when `ADMIN_BEARER_TOKEN` is unset.
+
+use std::net::SocketAddr;
+
+use anyhow::Context as _;
+use anyhow::Result;
+use serde_json::Value;
+use serde_json::json;
+use tempfile::TempDir;
+use tokio::net::TcpListener;
+use tokio::task::JoinHandle;
+use vortex_bench_server::app::AppState;
+use vortex_bench_server::app::router;
+
+const INGEST_TOKEN: &str = "ingest-test-token";
+const ADMIN_TOKEN: &str = "admin-test-token";
+
+struct Server {
+    addr: SocketAddr,
+    snapshot_dir: std::path::PathBuf,
+    _tmp: TempDir,
+    handle: JoinHandle<()>,
+}
+
+impl Server {
+    /// Start a server with the admin router enabled.
+    async fn start_with_admin() -> Result<Self> {
+        Self::start_inner(true).await
+    }
+
+    /// Start a server without the admin router (verifies admin routes 404).
+    async fn start_no_admin() -> Result<Self> {
+        Self::start_inner(false).await
+    }
+
+    async fn start_inner(enable_admin: bool) -> Result<Self> {
+        let tmp = TempDir::new()?;
+        let db_path = tmp.path().join("bench.duckdb");
+        let snapshot_dir = tmp.path().join("snapshots");
+        let mut state = AppState::open(&db_path, INGEST_TOKEN.to_string())?
+            .with_snapshot_dir(snapshot_dir.clone());
+        if enable_admin {
+            state = state.with_admin(ADMIN_TOKEN.to_string());
+        }
+        let app = router(state);
+        let listener = TcpListener::bind("127.0.0.1:0").await?;
+        let addr = listener.local_addr()?;
+        let handle = tokio::spawn(async move {
+            axum::serve(listener, app).await.unwrap();
+        });
+        Ok(Self {
+            addr,
+            snapshot_dir,
+            _tmp: tmp,
+            handle,
+        })
+    }
+
+    fn url(&self, path: &str) -> String {
+        format!("http://{}{}", self.addr, path)
+    }
+}
+
+impl Drop for Server {
+    fn drop(&mut self) {
+        self.handle.abort();
+    }
+}
+
+#[tokio::test]
+async fn admin_sql_select_round_trips() -> Result<()> {
+    let server = Server::start_with_admin().await?;
+    let client = reqwest::Client::new();
+
+    // The schema is applied on AppState::open, so commits exists with 0 rows.
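+    // (reqwest's bearer_auth() sends `Authorization: Bearer <token>`,
+    // the exact prefix require_admin_bearer strips before comparing.)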
+ let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(ADMIN_TOKEN) + .json(&json!({ "sql": "SELECT COUNT(*) AS n FROM commits" })) + .send() + .await?; + assert_eq!(resp.status(), 200); + let body: Value = resp.json().await?; + assert_eq!(body["columns"], json!(["n"])); + assert_eq!(body["rows"], json!([[0]])); + assert_eq!(body["row_count"], json!(1)); + Ok(()) +} + +#[tokio::test] +async fn admin_sql_table_format_renders_ascii() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + let resp = client + .post(server.url("/api/admin/sql?format=table")) + .bearer_auth(ADMIN_TOKEN) + .json(&json!({ "sql": "SELECT 1 AS x, 'hello' AS y" })) + .send() + .await?; + assert_eq!(resp.status(), 200); + let ct = resp + .headers() + .get(reqwest::header::CONTENT_TYPE) + .and_then(|h| h.to_str().ok()) + .unwrap_or("") + .to_string(); + assert!(ct.starts_with("text/plain"), "got content-type {ct:?}"); + let body = resp.text().await?; + assert!( + body.contains("│ x │ y"), + "missing column header row in:\n{body}" + ); + assert!( + body.contains("│ 1 │ hello │"), + "missing data row in:\n{body}" + ); + assert!(body.contains("(1 row)"), "missing row count in:\n{body}"); + Ok(()) +} + +#[tokio::test] +async fn admin_sql_rejects_writes() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + for sql in [ + "DELETE FROM commits", + "UPDATE commits SET sha = 'x'", + "DROP TABLE commits", + "INSERT INTO commits VALUES ('x')", + "CREATE TABLE foo (a INT)", + "ATTACH ':memory:' AS bar", + ] { + let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(ADMIN_TOKEN) + .json(&json!({ "sql": sql })) + .send() + .await?; + assert_eq!(resp.status(), 403, "expected 403 for {sql:?}"); + let body: Value = resp.json().await?; + assert_eq!(body["error"], json!("forbidden")); + } + Ok(()) +} + +#[tokio::test] +async fn admin_sql_allows_pragma_show_describe_explain_with() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + for sql in [ + "PRAGMA database_size", + "SHOW TABLES", + "DESCRIBE commits", + "EXPLAIN SELECT 1", + "WITH x AS (SELECT 1 AS a) SELECT * FROM x", + ] { + let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(ADMIN_TOKEN) + .json(&json!({ "sql": sql })) + .send() + .await?; + assert_eq!(resp.status(), 200, "{sql:?} should be allowed"); + } + Ok(()) +} + +#[tokio::test] +async fn admin_requires_admin_bearer_not_ingest_bearer() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + let body = json!({ "sql": "SELECT 1" }); + + // No header. + let resp = client + .post(server.url("/api/admin/sql")) + .json(&body) + .send() + .await?; + assert_eq!(resp.status(), 401); + + // Wrong token. + let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth("wrong") + .json(&body) + .send() + .await?; + assert_eq!(resp.status(), 401); + + // Ingest token explicitly does NOT work on admin routes. + let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(INGEST_TOKEN) + .json(&body) + .send() + .await?; + assert_eq!(resp.status(), 401); + + // Right token. 
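+    // (The comparison is constant-time, subtle::ConstantTimeEq over the
+    // raw bytes, so a wrong token of the right length doesn't leak a
+    // timing signal.)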
+ let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(ADMIN_TOKEN) + .json(&body) + .send() + .await?; + assert_eq!(resp.status(), 200); + Ok(()) +} + +#[tokio::test] +async fn admin_unmounted_when_admin_token_absent() -> Result<()> { + let server = Server::start_no_admin().await?; + let client = reqwest::Client::new(); + + let resp = client + .post(server.url("/api/admin/sql")) + .bearer_auth(ADMIN_TOKEN) + .json(&json!({ "sql": "SELECT 1" })) + .send() + .await?; + // Without with_admin, the route is not registered at all → 404. + assert_eq!(resp.status(), 404); + + let resp = client + .post(server.url("/api/admin/snapshot?ts=20260101T000000Z")) + .bearer_auth(ADMIN_TOKEN) + .send() + .await?; + assert_eq!(resp.status(), 404); + Ok(()) +} + +// The snapshot endpoint INSTALLs and LOADs the vortex DuckDB community +// extension on first call; that needs outbound network to +// `community-extensions.duckdb.org` which sandboxed CI environments +// generally don't allow. Run manually before merge: +// cargo test -p vortex-bench-server --test admin -- --ignored +#[tokio::test] +#[ignore = "needs network to install the vortex DuckDB community extension"] +async fn admin_snapshot_creates_export_directory() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + let ts = "20260101T000000Z"; + let resp = client + .post(server.url(&format!("/api/admin/snapshot?ts={ts}"))) + .bearer_auth(ADMIN_TOKEN) + .send() + .await?; + let status = resp.status(); + let text = resp.text().await?; + assert_eq!(status, 200, "snapshot failed: {text}"); + let body: Value = serde_json::from_str(&text)?; + let dir = body["snapshot_dir"] + .as_str() + .context("snapshot_dir field")?; + let dir_path = std::path::PathBuf::from(dir); + assert!(dir_path.exists(), "{dir} should exist"); + // schema.sql is written verbatim from SCHEMA_DDL. + assert!( + dir_path.join("schema.sql").exists(), + "{dir}/schema.sql should exist" + ); + // One .vortex file per table — `commits` is the dim table and is + // present even when the DB is otherwise empty (the schema was + // applied at AppState::open). + assert!( + dir_path.join("commits.vortex").exists(), + "{dir}/commits.vortex should exist" + ); + assert!( + dir_path.join("query_measurements.vortex").exists(), + "{dir}/query_measurements.vortex should exist" + ); + // And the directory should be under the configured snapshot dir. + assert!( + dir_path.starts_with(&server.snapshot_dir), + "{dir} not under {:?}", + server.snapshot_dir + ); + Ok(()) +} + +#[tokio::test] +#[ignore = "needs network to install the vortex DuckDB community extension"] +async fn admin_snapshot_rejects_existing_directory() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + let ts = "20260102T000000Z"; + let resp = client + .post(server.url(&format!("/api/admin/snapshot?ts={ts}"))) + .bearer_auth(ADMIN_TOKEN) + .send() + .await?; + assert_eq!(resp.status(), 200); + + // Second call with same ts → 409. 
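+    // (The 409 comes from the `target.exists()` guard in admin::snapshot,
+    // which refuses to overwrite an existing export directory.)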
+ let resp = client + .post(server.url(&format!("/api/admin/snapshot?ts={ts}"))) + .bearer_auth(ADMIN_TOKEN) + .send() + .await?; + assert_eq!(resp.status(), 409); + Ok(()) +} + +#[tokio::test] +async fn admin_snapshot_validates_ts() -> Result<()> { + let server = Server::start_with_admin().await?; + let client = reqwest::Client::new(); + + let too_long = "x".repeat(65); + for bad_ts in ["", "../oops", "with space", too_long.as_str()] { + let url = server.url(&format!("/api/admin/snapshot?ts={}", urlencoding(bad_ts))); + let resp = client.post(&url).bearer_auth(ADMIN_TOKEN).send().await?; + assert_eq!(resp.status(), 400, "expected 400 for ts={bad_ts:?}"); + } + Ok(()) +} + +/// Tiny URL-encoder so the test doesn't grow another dep. Only handles the +/// characters our bad-ts cases produce. +fn urlencoding(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for b in s.bytes() { + match b { + b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b'.' | b'~' => { + out.push(b as char); + } + _ => out.push_str(&format!("%{b:02X}")), + } + } + out +}