From 1e7bffa6873dabba6aec31891e6478a482152588 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Thu, 18 Jun 2026 17:05:31 -0700 Subject: [PATCH 1/2] feat(datasets)!: remove datasets commands and dataset feature surface Removes the datasets feature from the CLI entirely: - Delete the `datasets` command group and src/datasets.rs - Drop the `--dataset-id` index scope (IndexScope::Dataset, list_one_dataset, dataset path arms); indexes now scope to connection tables / managed-database catalogs only - Remove `dataset_refresh` and `create_dataset_index` from `jobs --job-type` - Repoint the SDK X-Workspace-Id header regression test off `datasets().list` onto `jobs().list` - Scrub README and all skill docs of dataset commands, `--dataset-id`, removed job types, and `datasets.main.*` query examples BREAKING CHANGE: `hotdata datasets ...`, `hotdata indexes --dataset-id`, and the `dataset_refresh`/`create_dataset_index` job-type filters are removed. Also fixes pre-existing doc drift: `indexes create --connection-id` (not a valid flag) corrected to `--catalog <alias>`.
--- README.md | 44 +- skills/hotdata-analytics/SKILL.md | 20 +- .../hotdata-analytics/references/WORKFLOWS.md | 27 +- skills/hotdata-search/SKILL.md | 13 +- skills/hotdata-search/references/INDEXES.md | 7 +- skills/hotdata/SKILL.md | 80 +--- .../hotdata/references/DATA_MODEL.template.md | 10 +- skills/hotdata/references/MODEL_BUILD.md | 14 +- skills/hotdata/references/WORKFLOWS.md | 66 +-- src/command.rs | 125 +---- src/databases.rs | 2 +- src/datasets.rs | 441 ------------------ src/indexes.rs | 46 +- src/jobs.rs | 2 - src/main.rs | 181 ++----- src/sdk.rs | 13 +- tests/workspace_env.rs | 6 +- 17 files changed, 126 insertions(+), 971 deletions(-) delete mode 100644 src/datasets.rs diff --git a/README.md b/README.md index 6bfa99d..17d01e4 100644 --- a/README.md +++ b/README.md @@ -67,12 +67,11 @@ API key priority (lowest to highest): config file → `HOTDATA_API_KEY` env var | `connections` | `list`, `create`, `refresh`, `new` | Manage connections | | `databases` | `list`, `create`, `delete`, `tables` | Managed databases (create and load tables via parquet) | | `tables` | `list` | List tables and columns | -| `datasets` | `list`, `create`, `update` | Manage uploaded datasets | | `context` | `list`, `show`, `pull`, `push` | Workspace Markdown context (e.g. 
data model `DATAMODEL`) via the context API | | `query` | | Execute a SQL query | | `queries` | `list` | Inspect query run history | | `search` | | Full-text search across a table column | -| `indexes` | `list`, `create`, `delete` | Manage indexes on a table or dataset | +| `indexes` | `list`, `create`, `delete` | Manage indexes on a table | | `embedding-providers` | `list`, `get`, `create`, `update`, `delete` | Manage embedding providers used by vector indexes | | `results` | `list` | Retrieve stored query results | | `jobs` | `list` | Manage background jobs | @@ -155,7 +154,7 @@ hotdata databases tables delete [--database ] [--schema publ - `load` (top-level shorthand) — loads a parquet file into `--catalog.--schema.--table`. If the table was not declared at create time, the CLI automatically deletes and recreates the database with the table declared, then retries the load. - `tables load` uploads a **parquet** file (or uses a staged `upload_id` from `POST /v1/files`) and publishes it as the table generation (`replace` mode). - `run` mints a database-scoped JWT and execs `` with `HOTDATA_DATABASE_TOKEN`, `HOTDATA_DATABASE_REFRESH_TOKEN`, `HOTDATA_DATABASE`, `HOTDATA_WORKSPACE`, and `HOTDATA_API_URL` injected into its environment. -- For CSV/JSON uploads without a managed database, use `hotdata datasets create` instead (`datasets.main.*`). +- Managed table loads accept **parquet** only — convert CSV/JSON to parquet first. Example: @@ -176,26 +175,6 @@ hotdata tables list [--workspace-id ] [--connection-id ] [--schema ..
` — use this format in SQL queries. -## Datasets - -```sh -hotdata datasets list [--workspace-id ] [--limit ] [--offset ] [--format table|json|yaml] -hotdata datasets [--workspace-id ] [--format table|json|yaml] -hotdata datasets create --file data.csv [--label "My Dataset"] [--table-name my_dataset] -hotdata datasets create --sql "SELECT ..." --label "My Dataset" -hotdata datasets create --url "https://example.com/data.parquet" --label "My Dataset" -hotdata datasets update [--label "New Label"] [--table-name new_table] -hotdata datasets refresh [--workspace-id ] [--async] -``` - -- Datasets are queryable as `datasets.main.`. -- `--file`, `--sql`, `--query-id`, and `--url` are mutually exclusive. -- `--url` imports data directly from a URL (supports csv, json, parquet). -- Format is auto-detected from file extension or content. -- Piped stdin is supported: `cat data.csv | hotdata datasets create --label "My Dataset"` -- `refresh` re-runs the dataset's source (URL fetch or saved query) and creates a new version. Not supported for upload-source datasets. -- `--async` submits the refresh as a background job and returns a job ID; poll with `hotdata jobs `. - ## Workspace context Named Markdown documents for a workspace (data model, glossary, etc.) are stored in the **context API**. The CLI treats the server as the **source of truth**; local files are only used where the tool requires a path on disk. @@ -258,25 +237,20 @@ hotdata search "" --table
[--type vector] [--column --schema --table
\ --column --type bm25|vector|sorted \ [--name ] [--metric l2|cosine|dot] [--async] \ [--embedding-provider-id ] [--dimensions ] [--output-column ] [--description ] -# Connection-table scope (for non-managed connections) -hotdata indexes list --connection-id --schema --table
[-o table|json|yaml] -hotdata indexes create --connection-id --schema --table
\ - --column --type sorted|bm25|vector [--name ] ... -hotdata indexes delete --connection-id --schema --table
--name +# List — workspace scan, optionally filtered by connection / schema / table +hotdata indexes list [--connection-id ] [--schema ] [--table
] [-o table|json|yaml] -# Dataset scope -hotdata indexes list --dataset-id [-o table|json|yaml] -hotdata indexes create --dataset-id --column --type sorted|bm25|vector [--name ] ... -hotdata indexes delete --dataset-id --name +# Delete — connection scope (--connection-id + --schema + --table) +hotdata indexes delete --connection-id --schema --table
--name ``` - `--type` is **required** — choose `sorted` (B-tree-like), `bm25` (full-text), or `vector` (similarity). @@ -317,7 +291,7 @@ hotdata jobs [--workspace-id ] [--format table|json|yaml] ``` - `list` shows only active jobs (`pending` and `running`) by default. Use `--all` to see all jobs. -- `--job-type` accepts: `data_refresh_table`, `data_refresh_connection`, `dataset_refresh`, `create_index`, `create_dataset_index`. +- `--job-type` accepts: `data_refresh_table`, `data_refresh_connection`, `create_index`. - `--status` accepts: `pending`, `running`, `succeeded`, `partially_succeeded`, `failed`. ## Configuration diff --git a/skills/hotdata-analytics/SKILL.md b/skills/hotdata-analytics/SKILL.md index 6e18249..da479ed 100644 --- a/skills/hotdata-analytics/SKILL.md +++ b/skills/hotdata-analytics/SKILL.md @@ -1,6 +1,6 @@ --- name: hotdata-analytics -description: Use this skill when the user wants OLAP-style SQL analytics in Hotdata — aggregations, GROUP BY, JOINs, reporting, exploratory queries, query run history, stored results, or materialized follow-up tables (Chain via datasets or managed databases). Activate for "analyze", "aggregate", "rollup", "pivot", "report", "metrics", "GROUP BY", "query history", "past queries", "query runs", "stored results", "materialize", "chain", "intermediate table", or sorted indexes for filters/range scans. Do not load for BM25/vector search or geospatial SQL — use hotdata-search or hotdata-geospatial. Requires the core hotdata skill for connections, tables, datasets, and auth. +description: Use this skill when the user wants OLAP-style SQL analytics in Hotdata — aggregations, GROUP BY, JOINs, reporting, exploratory queries, query run history, stored results, or materialized follow-up tables (Chain into managed databases). 
Activate for "analyze", "aggregate", "rollup", "pivot", "report", "metrics", "GROUP BY", "query history", "past queries", "query runs", "stored results", "materialize", "chain", "intermediate table", or sorted indexes for filters/range scans. Do not load for BM25/vector search or geospatial SQL — use hotdata-search or hotdata-geospatial. Requires the core hotdata skill for connections, tables, and auth. version: 0.5.0 --- @@ -8,7 +8,7 @@ version: 0.5.0 **OLAP-style analytics** in Hotdata: PostgreSQL-dialect SQL, query execution, run history, stored results, **Chain** materializations, and **sorted** indexes for filters and joins. -**Prerequisites:** Authenticate, workspace, and catalog discovery via the **`hotdata`** skill (`connections`, `tables`, `datasets`, `databases`). +**Prerequisites:** Authenticate, workspace, and catalog discovery via the **`hotdata`** skill (`connections`, `tables`, `databases`). **Related skills:** **`hotdata-search`** (BM25, vector, retrieval indexes), **`hotdata-geospatial`** (spatial SQL). @@ -23,7 +23,7 @@ hotdata query status [--output table|json|csv] - **PostgreSQL dialect.** Quote mixed-case identifiers: `"CustomerName"`. - Use **`hotdata tables list`** for schema discovery — not `information_schema` via `query`. -- Fully qualified names: `..
`, `datasets..
`, `..
`. +- Fully qualified names: `..
`, `..
`. - Long-running queries may return `query_run_id` → poll with **`query status`** (exit `2` = still running). Do not re-run identical heavy SQL while polling. - For **workspace-wide** joins and naming, load **context:DATAMODEL** when listed (`hotdata context list` → `show DATAMODEL`) — see **`hotdata`** skill. @@ -79,24 +79,16 @@ hotdata results [--workspace-id ] [--output table|json hotdata query status # if async ``` -2. **Materialize** (pick one) - - ```bash - hotdata datasets create --name chain_slice [--description "chain slice"] --sql "SELECT ..." - hotdata datasets create --name chain_from_saved [--description "from saved"] --query-id - ``` - - Or managed parquet: +2. **Materialize** into a managed database (parquet) ```bash hotdata databases create --catalog analytics hotdata databases load --catalog analytics --table slice --file ./slice.parquet ``` -3. **Chain query** — use printed **`full_name`** or `datasets list` **FULL NAME** column: +3. **Chain query** — use the catalog-qualified name `.public.
`: ```bash - hotdata query "SELECT * FROM datasets.main.chain_slice WHERE ..." hotdata query "SELECT * FROM analytics.public.slice WHERE ..." ``` @@ -111,7 +103,7 @@ Full procedure: [references/WORKFLOWS.md](references/WORKFLOWS.md). For equality, range, and sort-heavy OLAP — not full-text or vector (see **`hotdata-search`**): ```bash -hotdata indexes create --connection-id --schema --table
\ +hotdata indexes create --catalog --schema --table
\ --name idx_orders_created --column created_at --type sorted [--async] ``` diff --git a/skills/hotdata-analytics/references/WORKFLOWS.md b/skills/hotdata-analytics/references/WORKFLOWS.md index 8542635..2485283 100644 --- a/skills/hotdata-analytics/references/WORKFLOWS.md +++ b/skills/hotdata-analytics/references/WORKFLOWS.md @@ -2,7 +2,7 @@ OLAP-style SQL, **History** (query runs and stored results), and **Chain** (materialized follow-ups). Requires **`hotdata`** for auth, workspaces, and catalog commands. -**Related:** **`hotdata-search`** for BM25/vector indexes and `hotdata search`; **`hotdata`** [WORKFLOWS.md](../../hotdata/references/WORKFLOWS.md) for datasets vs managed databases. +**Related:** **`hotdata-search`** for BM25/vector indexes and `hotdata search`; **`hotdata`** [WORKFLOWS.md](../../hotdata/references/WORKFLOWS.md) for managed databases. --- @@ -64,43 +64,32 @@ hotdata query "SELECT ..." ### 2. Materialize -Land a smaller table — pick one: - -**Datasets** (SQL query or saved query → `datasets..
`): - -```bash -hotdata datasets create --name chain_revenue_slice [--description "chain revenue slice"] --sql "SELECT ..." -hotdata datasets create --name chain_from_saved [--description "from saved"] --query-id -``` - -**Managed database** (parquet → `..
`): +Land a smaller table in a **managed database** (parquet → `..
`): ```bash hotdata databases create --catalog chain_db hotdata databases load --catalog chain_db --table revenue_slice --file ./revenue_slice.parquet ``` -Note the printed **`full_name`** (e.g. `datasets.main.chain_revenue_slice` or `chain_db.public.revenue_slice`). For datasets, **`FULL NAME`** from `datasets list` is authoritative. +The table is then addressable as `chain_db.public.revenue_slice`. Confirm with `hotdata databases tables list`. ### 3. Chain query -Query using the actual `full_name` from create or list — do not hardcode `datasets.main`; use whatever qualified name was printed: +Query using the catalog-qualified name `.public.
`: ```bash -hotdata datasets list -hotdata query "SELECT * FROM datasets.main.chain_revenue_slice WHERE ..." -# Managed database: -# hotdata query "SELECT * FROM chain_db.public.revenue_slice WHERE ..." +hotdata databases tables list +hotdata query "SELECT * FROM chain_db.public.revenue_slice WHERE ..." ``` ### Naming and documentation - Prefer predictable `--name` values: `chain__`. -- Record long-lived chains in **context:DATAMODEL → Derived tables (Chain)** with the **full** SQL name you use (`datasets.…` or `database.schema.table`). +- Record long-lived chains in **context:DATAMODEL → Derived tables (Chain)** with the **full** SQL name you use (`database.schema.table`). - Promote join/grain findings to **context:DATAMODEL** when they should be shared or persisted (**`hotdata`** skill). ### Guardrails - Materialize when the base scan is large and the follow-up runs many times. - Keep Chain tables focused; avoid wide `SELECT *` materializations when a narrow projection suffices. -- For upload format choice (datasets vs databases), see **`hotdata`** WORKFLOWS — [Datasets vs managed databases](../../hotdata/references/WORKFLOWS.md#datasets-vs-managed-databases). +- For managed-database uploads, see **`hotdata`** WORKFLOWS — [Managed databases](../../hotdata/references/WORKFLOWS.md#managed-databases). diff --git a/skills/hotdata-search/SKILL.md b/skills/hotdata-search/SKILL.md index ef7a2ae..015c952 100644 --- a/skills/hotdata-search/SKILL.md +++ b/skills/hotdata-search/SKILL.md @@ -42,25 +42,20 @@ hotdata search "" --table [--type vector] [--co ## Indexes (BM25 and vector) -Indexes attach to a **managed database table** (`--catalog`) or a **dataset** (`--dataset-id`). Create is not supported on raw connection tables via CLI. `list` and `delete` accept `--connection-id` for connection-scoped operations. +Create attaches to a table via its `--catalog` alias (a managed-database catalog or a connection name). 
`list` and `delete` accept `--connection-id` (+ `--schema` + `--table`) for connection-scoped operations. ```bash -# List — workspace scan (filter by connection, schema, table, or dataset) +# List — workspace scan (filter by connection, schema, or table) hotdata indexes list [--connection-id ] [--schema ] [--table
] [--workspace-id ] [--output table|json|yaml] -hotdata indexes list --dataset-id [--workspace-id ] [--output table|json|yaml] -# Create — managed database table (catalog alias) +# Create — by catalog alias (resolves a managed-database catalog or a connection name) hotdata indexes create --catalog --schema --table
\ --column --type bm25|vector \ [--name ] [--metric l2|cosine|dot] [--async] \ [--embedding-provider-id ] [--dimensions ] [--output-column ] [--description ] -# Create — dataset -hotdata indexes create --dataset-id --column --type bm25|vector [--name ] ... - -# Delete — connection table or dataset +# Delete — connection table (--connection-id + --schema + --table) hotdata indexes delete --connection-id --schema --table
--name -hotdata indexes delete --dataset-id --name ``` - **`--type` is required** on create: `bm25` (one text column) or `vector` (exactly one column; often embeddings or auto-embedded text). diff --git a/skills/hotdata-search/references/INDEXES.md b/skills/hotdata-search/references/INDEXES.md index fff424b..49844ce 100644 --- a/skills/hotdata-search/references/INDEXES.md +++ b/skills/hotdata-search/references/INDEXES.md @@ -23,7 +23,6 @@ High-cardinality **text** (`title`, `body`, …) → **bm25**. **Embedding** / f ```bash hotdata indexes list [--connection-id ] [--schema ] [--table
] -hotdata indexes list --dataset-id ``` Skip duplicates (same table, column, and purpose). @@ -40,13 +39,13 @@ hotdata indexes create --catalog --schema --table
\ --column embedding --type vector --metric cosine ``` -For regular connections (explicit connection ID): +For a regular connection, pass its name or ID to `--catalog`: ```bash -hotdata indexes create --connection-id --schema --table
\ +hotdata indexes create --catalog --schema --table
\ --name idx_posts_body_bm25 --column body --type bm25 -hotdata indexes create --connection-id --schema --table
\ +hotdata indexes create --catalog --schema --table
\ --name idx_chunks_embedding --column embedding --type vector --metric cosine ``` diff --git a/skills/hotdata/SKILL.md b/skills/hotdata/SKILL.md index ea2a2f4..7769637 100644 --- a/skills/hotdata/SKILL.md +++ b/skills/hotdata/SKILL.md @@ -1,6 +1,6 @@ --- name: hotdata -description: Use this skill when the user wants to run core hotdata CLI commands — auth, workspaces, connections, managed databases, datasets, tables, basic SQL query, database context (context:DATAMODEL), jobs, and skill install. Activate for "run hotdata", "list workspaces", "list connections", "create a connection", "list databases", "managed database", "load parquet", "list tables", "list datasets", "create a dataset", "execute a query", "database context", "context:DATAMODEL", or general Hotdata CLI usage. For full-text/vector search and retrieval indexes use hotdata-search; for OLAP analytics, query history, stored results, and Chain materializations use hotdata-analytics; for geospatial/GIS use hotdata-geospatial. +description: Use this skill when the user wants to run core hotdata CLI commands — auth, workspaces, connections, managed databases, tables, basic SQL query, database context (context:DATAMODEL), jobs, and skill install. Activate for "run hotdata", "list workspaces", "list connections", "create a connection", "list databases", "managed database", "load parquet", "list tables", "execute a query", "database context", "context:DATAMODEL", or general Hotdata CLI usage. For full-text/vector search and retrieval indexes use hotdata-search; for OLAP analytics, query history, stored results, and Chain materializations use hotdata-analytics; for geospatial/GIS use hotdata-geospatial. version: 0.5.0 --- @@ -20,7 +20,7 @@ Install all skills with **`hotdata skills install`**. 
Load specialized skills on | Skill | Use for | |-------|---------| -| **`hotdata`** (this file) | Auth, workspaces, connections, databases, datasets, tables, basic `query`, context, jobs | +| **`hotdata`** (this file) | Auth, workspaces, connections, databases, tables, basic `query`, context, jobs | | **`hotdata-search`** | BM25, vector search, `hotdata search`, bm25/vector indexes, embedding providers | | **`hotdata-analytics`** | OLAP SQL, aggregations, query/results history, Chain materializations, sorted indexes | | **`hotdata-geospatial`** | PostGIS-style `ST_*`, WKB, spatial joins | @@ -72,7 +72,7 @@ Keep two layers separate: - **Analysis modeling (day to day)** — Understanding data *for the current task*: exploratory SQL, join checks, column semantics for one report, hypotheses, scratch notes. Often conversational or short-lived. **The conversation or local scratch notes** are the right home while you explore; keep them there until you decide they are worth promoting. -- **context:DATAMODEL (Hotdata database data model)** — A **durable, database-scoped** map stored only via the **context API**: entities and tables across connections, PK/FK relationships, how datasets tie back to sources, naming and query conventions the **whole team** should rely on. This is **higher-level shared structure**, not a transcript of one investigation. +- **context:DATAMODEL (Hotdata database data model)** — A **durable, database-scoped** map stored only via the **context API**: entities and tables across connections, PK/FK relationships, how derived tables tie back to sources, naming and query conventions the **whole team** should rely on. This is **higher-level shared structure**, not a transcript of one investigation. 
**Promotion:** When analysis findings should **outlive the current session** and **guide everyone**, merge them into **context:DATAMODEL** (`hotdata context list` → if `DATAMODEL` is listed, `hotdata context show DATAMODEL` → reconcile → `hotdata context push DATAMODEL`). You do **not** need to update **context:DATAMODEL** after every ad-hoc query—only when the database story or join graph meaningfully changes. @@ -82,15 +82,15 @@ Use [references/DATA_MODEL.template.md](references/DATA_MODEL.template.md) and [ These are **patterns** built from the commands below—not separate CLI subcommands: -- **Model (`context:DATAMODEL`)** — The **shared** Markdown semantic map of the active database (entities, keys, joins across connections). **Store and read it only via database context** (`hotdata context list`, then `hotdata context show DATAMODEL` **only when listed**, `context push DATAMODEL`); refresh using `connections`, `connections refresh`, `tables list`, and `datasets list`. For a **deep** pass (connector enrichment, indexes, per-table detail), see [references/MODEL_BUILD.md](references/MODEL_BUILD.md). Contrast **analysis modeling** in the conversation or local scratch (see [Analysis modeling vs context:DATAMODEL](#analysis-modeling-vs-contextdatamodel)). +- **Model (`context:DATAMODEL`)** — The **shared** Markdown semantic map of the active database (entities, keys, joins across connections). **Store and read it only via database context** (`hotdata context list`, then `hotdata context show DATAMODEL` **only when listed**, `context push DATAMODEL`); refresh using `connections`, `connections refresh`, and `tables list`. For a **deep** pass (connector enrichment, indexes, per-table detail), see [references/MODEL_BUILD.md](references/MODEL_BUILD.md). Contrast **analysis modeling** in the conversation or local scratch (see [Analysis modeling vs context:DATAMODEL](#analysis-modeling-vs-contextdatamodel)). 
- **History / Chain / OLAP SQL** — See **`hotdata-analytics`** and [references/WORKFLOWS.md](references/WORKFLOWS.md). - **Search / retrieval indexes** — See **`hotdata-search`**. -Catalog, skill decision tree, epic flows (onboard, chain, retrieval), and datasets vs databases: [references/WORKFLOWS.md](references/WORKFLOWS.md). +Catalog, skill decision tree, epic flows (onboard, chain, retrieval), and managed databases: [references/WORKFLOWS.md](references/WORKFLOWS.md). ## Available Commands -Top-level subcommands (each detailed below): **`auth`**, **`datasets`**, **`query`**, **`workspaces`**, **`connections`**, **`databases`**, **`tables`**, **`skills`**, **`results`**, **`jobs`**, **`indexes`**, **`embedding-providers`**, **`search`**, **`queries`**, **`context`**, **`completions`**. Search, indexes (bm25/vector), and embedding providers are documented in **`hotdata-search`**; query history, results, Chain, and OLAP patterns in **`hotdata-analytics`**. +Top-level subcommands (each detailed below): **`auth`**, **`query`**, **`workspaces`**, **`connections`**, **`databases`**, **`tables`**, **`skills`**, **`results`**, **`jobs`**, **`indexes`**, **`embedding-providers`**, **`search`**, **`queries`**, **`context`**, **`completions`**. Search, indexes (bm25/vector), and embedding providers are documented in **`hotdata-search`**; query history, results, Chain, and OLAP patterns in **`hotdata-analytics`**. Global CLI options: **`--api-key`**, **`-v` / `--version`**, **`-h` / `--help`**, **`--no-input`** (disable interactive prompts; commands that require input will error instead — useful in CI or non-TTY environments). Hidden developer flag: **`--debug`** (verbose HTTP logs). @@ -181,7 +181,7 @@ hotdata connections create \ **Managed databases** are Hotdata-owned catalogs you create and populate yourself — no remote source to sync. Query them in SQL as **`..
`**. Prefer **`hotdata databases`** for this workflow. -**Parquet vs datasets:** `databases tables load` accepts **parquet only**. For SQL-query or saved-query materializations, use **`hotdata datasets create`**. +**Parquet only:** `databases tables load` accepts **parquet** files (local `--file`, remote `--url`, or a pre-staged `--upload-id`). **Active database:** `hotdata databases set ` saves the active database to config. All `databases tables` subcommands and all `context` commands default to the active database; pass **`--database `** to override per-command. @@ -236,64 +236,6 @@ hotdata tables list [--workspace-id ] [--connection-id ] [--limit ] [--offset ] [--output table|json|yaml] -``` -- Default format is `table`. -- Returns `id`, `label`, and `created_at`; table output includes a **`FULL NAME`** column (`datasets..
`). -- Results are paginated (default 100). Use `--offset` to fetch further pages. -- `datasets list` always returns **all** datasets in the workspace. Read **`FULL NAME`** to identify the schema: the middle segment is usually **`main`** (e.g. `datasets.main.my_table`) for ordinary uploads. - -#### Get dataset details -``` -hotdata datasets [--workspace-id ] [--output table|json|yaml] -``` -- Shows dataset metadata and a full column listing with `name`, `data_type`, `nullable`. -- Use this to inspect schema before querying. -- For the **qualified SQL name**, prefer **`FULL NAME` from `datasets list`** or the **`full_name` printed by `datasets create`**—do not assume `datasets.main`. - -#### Update a dataset -``` -hotdata datasets update [--description
[-w ] ``` -For dataset-backed indexes: `hotdata indexes list --dataset-id ` (not merged into the workspace-wide connection-table list). +Managed-database tables (`--catalog`) are covered by the same `indexes list` scan; filter with `--connection-id` / `--schema` / `--table` as above. Note: @@ -110,7 +110,7 @@ This Markdown body is what you store as **context:DATAMODEL** (`hotdata context - **Overview** — Domains and what the workspace is for. - **Per connection** — Optional subsection per source; for **deep** models, **repeat** one block per `connection.schema.table` (grain, column table with name/type/nullable/PK-FK/notes, relationships, queryability, caveats)—the template’s single `####` heading is a pattern to copy for each table. -- **Datasets** — Same treatment as connection tables where relevant. +- **Managed databases** — Same treatment as connection tables where relevant. - **Cross-connection joins** — Keys, semantics, type caveats. - **Search / index summary** — Table, column, index status, intended use. diff --git a/skills/hotdata/references/WORKFLOWS.md b/skills/hotdata/references/WORKFLOWS.md index 2dfb6ed..9fdc415 100644 --- a/skills/hotdata/references/WORKFLOWS.md +++ b/skills/hotdata/references/WORKFLOWS.md @@ -11,15 +11,15 @@ Load **`hotdata`** first for auth and workspace setup. 
Add a sub-skill only when | User goal | Skill | Key commands | |-----------|--------|----------------| | Login, workspaces, connections, tables, context | **`hotdata`** | `auth`, `workspaces`, `connections`, `tables`, `context` | -| Load parquet files or materialize SQL tables | **`hotdata`** | `databases create` + `databases load`, `datasets create --sql` | -| SQL analytics, aggregations, history, Chain | **`hotdata-analytics`** | `query`, `queries`, `results`, `datasets create --sql` | +| Load parquet files into a managed database | **`hotdata`** | `databases create` + `databases load` | +| SQL analytics, aggregations, history, Chain | **`hotdata-analytics`** | `query`, `queries`, `results` | | BM25 / vector search, retrieval indexes | **`hotdata-search`** | `search`, `indexes create`, `embedding-providers` | | Geospatial / PostGIS-style SQL | **`hotdata-geospatial`** | `query` with `ST_*`, WKB columns | | Concept | Where documented | |--------|------------------| | **Model** | This file — [Model](#model) | -| **Upload path (datasets vs databases)** | This file — [Datasets vs managed databases](#datasets-vs-managed-databases) | +| **Upload path (managed databases)** | This file — [Managed databases](#managed-databases) | | **History / Chain** | **`hotdata-analytics`** — [WORKFLOWS.md](../../hotdata-analytics/references/WORKFLOWS.md) | | **Search indexes** | **`hotdata-search`** — [INDEXES.md](../../hotdata-search/references/INDEXES.md) | | **Epic flows** | This file — [Epic flows](#epic-flows) | @@ -43,18 +43,16 @@ End-to-end checklists. Use the linked sections for command detail and guardrails 7. [ ] (Optional) `hotdata context list` — if `DATAMODEL` is listed, `hotdata context show DATAMODEL`; else skip `show` 8. [ ] (Optional) Bootstrap **context:DATAMODEL** — [Model](#model), [DATA_MODEL.template.md](DATA_MODEL.template.md) -**Next:** upload data ([Datasets vs managed databases](#datasets-vs-managed-databases)) or run analytics (**Chain** below). 
+**Next:** upload data ([Managed databases](#managed-databases)) or run analytics (**Chain** below). ### Chain (materialize then query) **Skill:** **`hotdata-analytics`** (catalog via **`hotdata`**) 1. [ ] Run base SQL: `hotdata query "SELECT …"` — poll `hotdata query status ` if async -2. [ ] Materialize one way: - - [ ] **Dataset:** `hotdata datasets create --name [--description "…"] --sql "SELECT …"` - - [ ] **Managed DB:** `hotdata databases create --catalog --table ` then `hotdata databases load --catalog --table --file ./….parquet` -3. [ ] Copy **`full_name`** from create output (or `datasets list` **FULL NAME**) -4. [ ] Chain: `hotdata query "SELECT … FROM WHERE …"` +2. [ ] Materialize into a managed database: `hotdata databases create --catalog <catalog> --table <table>` then `hotdata databases load --catalog <catalog> --table <table> --file ./….parquet` +3. [ ] Note the catalog-qualified name `<catalog>.public.<table>` +4. [ ] Chain: `hotdata query "SELECT … FROM <catalog>.public.<table> WHERE …"` 5. [ ] Record stable chains in **context:DATAMODEL** when they should outlive the session **Detail:** [hotdata-analytics WORKFLOWS — Chain](../../hotdata-analytics/references/WORKFLOWS.md#chain) @@ -67,7 +65,7 @@ End-to-end checklists. Use the linked sections for command detail and guardrails 2. [ ] `hotdata indexes list` — avoid duplicate bm25/vector indexes on the same column 3. [ ] Create index: - [ ] **Managed DB:** `hotdata indexes create --catalog --table --column --type bm25|vector` - - [ ] **Connection:** `hotdata indexes create --connection-id --schema --table --column --type bm25|vector [--metric cosine|l2|dot]` + - [ ] **Connection:** `hotdata indexes create --catalog --schema --table --column --type bm25|vector [--metric cosine|l2|dot]` - [ ] Large build: add `--async`, then `hotdata jobs ` 4. [ ] Search (--type and --column inferred when one search index exists): - [ ] `hotdata search "…" --table ` (auto-infer) @@ -78,40 +76,20 @@ End-to-end checklists. 
Use the linked sections for command detail and guardrails --- -## Datasets vs managed databases +## Managed databases -Both land queryable tables in the workspace; the path depends on **format** and **how you want to name tables in SQL**. +**Managed databases** land queryable tables you own in the workspace, addressed in SQL as `<catalog>.<schema>.<table>` where the catalog is the `--catalog` alias. -| | **Datasets** | **Managed databases** | -|---|-------------|------------------------| -| **Best for** | SQL or saved-query snapshot | Parquet files you own; catalog-style `alias.schema.table` | -| **SQL prefix** | `datasets.<schema>.<table>` (often `datasets.main.*`) | `<catalog>.<schema>.<table>` where catalog = `--catalog` alias | -| **CLI** | `hotdata datasets create --sql “…”` | `hotdata databases create --catalog` + `databases load` | -| **Declare schema up front** | No | Yes — `--table` on create (auto-declared on first `databases load`) | -| **Parquet file uploads** | Not supported via CLI | `databases load --file` / `--url` / `--upload-id` | -| **Refresh** | `datasets refresh` (re-runs source query) | Replace via `databases load` again | +| | **Managed databases** | +|---|------------------------| +| **Best for** | Parquet files you own; catalog-style `alias.schema.table` | +| **SQL prefix** | `<catalog>.<schema>.<table>` where catalog = `--catalog` alias | +| **CLI** | `hotdata databases create --catalog` + `databases load` | +| **Declare schema up front** | Yes — `--table` on create (auto-declared on first `databases load`) | +| **Parquet file uploads** | `databases load --file` / `--url` / `--upload-id` | +| **Refresh** | Replace via `databases load` again | -**Rule of thumb:** SQL or saved-query materialization → **datasets**. Parquet files you control as **`mydb.public.orders`** → **databases**. - -### Workflow: dataset upload and query - -1. Authenticate and set workspace (`hotdata auth`, `hotdata workspaces set` if needed). -2. Create the dataset — `--name` is the SQL table name (required); `--description` is the display label (optional): - - ```bash - hotdata datasets create --name orders --sql "SELECT ..." - # or: --query-id - ``` - - For parquet file uploads use **managed databases** instead (see below). - -3. Note the printed **`full_name`** (e.g. `datasets.main.orders`) — do not assume `datasets.main`. -4. Inspect if needed: `hotdata datasets list`, `hotdata datasets `. -5. Query: - - ```bash - hotdata query "SELECT count(*) FROM datasets.main.orders" - ``` +**Rule of thumb:** Parquet files you control as **`mydb.public.orders`** → **managed databases**. ### Workflow: managed database (parquet) @@ -135,7 +113,7 @@ Both land queryable tables in the workspace; the path depends on **format** and hotdata query "SELECT count(*) FROM sales.public.orders" ``` -For **Chain** materializations into datasets or databases, see **`hotdata-analytics`**. +For **Chain** materializations into managed databases, see **`hotdata-analytics`**. 
--- @@ -163,8 +141,6 @@ hotdata connections list hotdata connections refresh # after DDL / stale remote metadata hotdata tables list hotdata tables list --connection-id -hotdata datasets list -hotdata datasets hotdata databases list ``` @@ -175,5 +151,5 @@ Use `hotdata tables list` for discovery; do not query `information_schema` for t ## Cross-cutting - **Workspace:** Active workspace or `--workspace-id`. **`hotdata queries`** uses the active workspace only (no `--workspace-id`). -- **Jobs:** `hotdata jobs list` / `jobs ` for async refreshes, dataset refresh, and index builds. +- **Jobs:** `hotdata jobs list` / `jobs ` for async refreshes and index builds. - **Discovery:** `hotdata tables list` — not `query` on `information_schema`. diff --git a/src/command.rs b/src/command.rs index b006607..8cf0903 100644 --- a/src/command.rs +++ b/src/command.rs @@ -8,23 +8,6 @@ pub enum Commands { command: Option, }, - /// Derived views — virtual SQL tables built from queries over your data - Datasets { - /// Dataset ID to show details - id: Option, - - /// Workspace ID (defaults to first workspace from login) - #[arg(long, short = 'w', global = true)] - workspace_id: Option, - - /// Output format (used with dataset ID) - #[arg(long = "output", short = 'o', default_value = "table", value_parser = ["table", "json", "yaml"])] - output: String, - - #[command(subcommand)] - command: Option, - }, - /// Execute a SQL query, or check status of a running query Query { /// SQL query string (omit when using a subcommand) @@ -284,33 +267,29 @@ pub enum AuthCommands { #[derive(Subcommand)] pub enum IndexesCommands { - /// List indexes (defaults to the whole workspace; narrow with filters or pass --dataset-id) + /// List indexes (defaults to the whole workspace; narrow with filters) List { /// Filter by connection ID - #[arg(long, short = 'c', conflicts_with = "dataset_id")] + #[arg(long, short = 'c')] connection_id: Option, /// Filter by schema name - #[arg(long, conflicts_with = 
"dataset_id")] + #[arg(long)] schema: Option, /// Filter by table name - #[arg(long, conflicts_with = "dataset_id")] - table: Option, - - /// List indexes for a specific dataset (alternative scope to --connection-id) #[arg(long)] - dataset_id: Option, + table: Option, /// Output format #[arg(long = "output", short = 'o', default_value = "table", value_parser = ["table", "json", "yaml"])] output: String, }, - /// Create an index on a table or dataset. + /// Create an index on a table. Create { /// SQL catalog alias of the target database (e.g. `--catalog airbnb`) - #[arg(long, conflicts_with = "dataset_id")] + #[arg(long)] catalog: Option, /// Schema name (default: public) @@ -318,17 +297,13 @@ pub enum IndexesCommands { schema: String, /// Table name to index - #[arg(long, conflicts_with = "dataset_id")] + #[arg(long)] table: Option, /// Column(s) to index (comma-separated) #[arg(long)] column: Option, - /// Dataset ID (alternative scope to --catalog/--table) - #[arg(long, conflicts_with_all = ["catalog", "table"])] - dataset_id: Option, - /// Index name (derived from table, columns, and type if omitted) #[arg(long)] name: Option, @@ -363,13 +338,12 @@ pub enum IndexesCommands { description: Option, }, - /// Delete an index from a table or dataset + /// Delete an index from a table /// - /// Pass either connection scope (--connection-id + --schema + --table) OR - /// dataset scope (--dataset-id), not both. + /// Pass connection scope: --connection-id + --schema + --table. 
Delete { /// Connection ID (use with --schema and --table) - #[arg(long, short = 'c', conflicts_with = "dataset_id", requires_all = ["schema", "table"])] + #[arg(long, short = 'c', requires_all = ["schema", "table"])] connection_id: Option, /// Schema name (requires --connection-id) @@ -380,10 +354,6 @@ pub enum IndexesCommands { #[arg(long, requires = "connection_id")] table: Option, - /// Dataset ID (alternative scope to --connection-id) - #[arg(long, conflicts_with_all = ["connection_id", "schema", "table"])] - dataset_id: Option, - /// Index name #[arg(long)] name: String, @@ -395,7 +365,7 @@ pub enum JobsCommands { /// List background jobs (shows active jobs by default) List { /// Filter by job type - #[arg(long, value_parser = ["data_refresh_table", "data_refresh_connection", "dataset_refresh", "create_index", "create_dataset_index"])] + #[arg(long, value_parser = ["data_refresh_table", "data_refresh_connection", "create_index"])] job_type: Option, /// Filter by status @@ -420,79 +390,6 @@ pub enum JobsCommands { }, } -#[derive(Subcommand)] -pub enum DatasetsCommands { - /// List all datasets in a workspace - List { - /// Maximum number of results (default: 100, max: 1000) - #[arg(long)] - limit: Option, - - /// Pagination offset - #[arg(long)] - offset: Option, - - /// Output format - #[arg(long = "output", short = 'o', default_value = "table", value_parser = ["table", "json", "yaml"])] - output: String, - }, - - /// Create a derived view from a SQL query or saved query - Create { - /// SQL table name the dataset is addressable as (e.g. 
my_view) - #[arg(long)] - name: String, - - /// Human-readable display label - #[arg(long)] - description: Option, - - /// SQL query to create the dataset from - #[arg( - long, - conflicts_with = "query_id", - required_unless_present = "query_id" - )] - sql: Option, - - /// Saved query ID to create the dataset from - #[arg(long, conflicts_with = "sql", required_unless_present = "sql")] - query_id: Option, - - /// Output format - #[arg(long = "output", short = 'o', default_value = "table", value_parser = ["table", "json", "yaml"])] - output: String, - }, - - /// Update a dataset's description and/or name - Update { - /// Dataset ID - id: String, - - /// New display label - #[arg(long)] - description: Option, - - /// New SQL table name (must be a valid identifier) - #[arg(long)] - name: Option, - - /// Output format - #[arg(long = "output", short = 'o', default_value = "table", value_parser = ["table", "json", "yaml"])] - output: String, - }, - - /// Refresh a dataset by re-running its source (URL fetch or saved query) and creating a new version - Refresh { - /// Dataset ID - id: String, - - /// Submit as a background job - #[arg(long)] - r#async: bool, - }, -} - #[derive(Subcommand)] pub enum WorkspaceCommands { /// List all workspaces diff --git a/src/databases.rs b/src/databases.rs index b4f7de1..d90d9e2 100644 --- a/src/databases.rs +++ b/src/databases.rs @@ -347,7 +347,7 @@ fn upload_parquet_file(api: &Api, path: &str) -> String { if !is_parquet_path(path) { eprintln!( "error: managed table loads require a parquet file (got '{}'). 
\ - Convert your data to parquet or use `hotdata datasets create` for CSV/JSON.", + Convert your data to parquet first.", path ); std::process::exit(1); diff --git a/src/datasets.rs b/src/datasets.rs deleted file mode 100644 index ab88ed7..0000000 --- a/src/datasets.rs +++ /dev/null @@ -1,441 +0,0 @@ -use crate::sdk::Api; -use hotdata::models::{ - CreateDatasetRequest, CreateDatasetResponse, DatasetSource, DatasetSourceOneOf1, - DatasetSourceOneOf2, GetDatasetResponse, RefreshRequest, RefreshResponse, UpdateDatasetRequest, - UpdateDatasetResponse, -}; -use serde::Serialize; - -/// Output shape for `create`, preserving the CLI's field order for json/yaml. -#[derive(Serialize)] -struct CreateView { - id: String, - label: String, - schema_name: String, - table_name: String, -} - -impl From for CreateView { - fn from(r: CreateDatasetResponse) -> Self { - CreateView { - id: r.id, - label: r.label, - schema_name: r.schema_name, - table_name: r.table_name, - } - } -} - -/// Output shape for `list` rows. -#[derive(Serialize)] -struct DatasetView { - id: String, - label: String, - schema_name: String, - table_name: String, - created_at: String, - updated_at: String, -} - -/// Output shape for `get`. -#[derive(Serialize)] -struct DatasetDetail { - id: String, - label: String, - schema_name: String, - table_name: String, - source_type: String, - created_at: String, - updated_at: String, - columns: Vec, -} - -#[derive(Serialize)] -struct Column { - name: String, - data_type: String, - nullable: bool, -} - -/// Output shape for `update`, preserving the CLI's field order and optional -/// `schema_name`. runtimedb's `UpdateDatasetResponse` does not currently send -/// `schema_name`, so we don't synthesize one — schema-scoped datasets live -/// under `datasets..
`, not `datasets.main.*`. -#[derive(Serialize)] -struct UpdateView { - id: String, - label: String, - #[serde(skip_serializing_if = "Option::is_none")] - schema_name: Option, - table_name: String, - #[serde(skip_serializing_if = "Option::is_none")] - latest_version: Option, - #[serde(skip_serializing_if = "Option::is_none")] - pinned_version: Option, - updated_at: String, -} - -impl From for UpdateView { - fn from(r: UpdateDatasetResponse) -> Self { - UpdateView { - id: r.id, - label: r.label, - // The SDK model carries no schema_name; keep None so we print the - // unqualified table_name + the "see qualified name" hint. - schema_name: None, - table_name: r.table_name, - latest_version: Some(r.latest_version), - pinned_version: r.pinned_version.flatten(), - updated_at: r.updated_at, - } - } -} - -fn create_dataset( - api: &Api, - description: Option<&str>, - name: &str, - source: DatasetSource, - format: &str, -) { - let label = description.unwrap_or(name).to_string(); - let mut request = CreateDatasetRequest::new(label, source); - request.table_name = Some(Some(name.to_string())); - - let resp = match crate::sdk::block(api.client().datasets().create(request, api.database_id())) { - Ok(r) => r, - Err(e) => e.exit(), - }; - let dataset = CreateView::from(resp); - - use crossterm::style::Stylize; - match format { - "json" => println!("{}", serde_json::to_string_pretty(&dataset).unwrap()), - "yaml" => print!("{}", serde_yaml::to_string(&dataset).unwrap()), - "table" => { - eprintln!("{}", "Dataset created".green()); - println!("id: {}", dataset.id); - println!("label: {}", dataset.label); - println!( - "full_name: datasets.{}.{}", - dataset.schema_name, dataset.table_name - ); - } - _ => unreachable!(), - } -} - -pub fn create_from_query( - workspace_id: &str, - sql: &str, - description: Option<&str>, - name: &str, - format: &str, -) { - let api = Api::new(Some(workspace_id)); - let source = DatasetSource::DatasetSourceOneOf2(Box::new(DatasetSourceOneOf2::new( - 
sql.to_string(), - hotdata::models::dataset_source_one_of_2::Type::SqlQuery, - ))); - create_dataset(&api, description, name, source, format); -} - -pub fn create_from_saved_query( - workspace_id: &str, - query_id: &str, - description: Option<&str>, - name: &str, - format: &str, -) { - let api = Api::new(Some(workspace_id)); - let source = DatasetSource::DatasetSourceOneOf1(Box::new(DatasetSourceOneOf1::new( - query_id.to_string(), - hotdata::models::dataset_source_one_of_1::Type::SavedQuery, - ))); - create_dataset(&api, description, name, source, format); -} - -pub fn list(workspace_id: &str, limit: Option, offset: Option, format: &str) { - let api = Api::new(Some(workspace_id)); - - let body = crate::sdk::block( - api.client() - .datasets() - .list(limit.map(|l| l as i32), offset.map(|o| o as i32)), - ) - .unwrap_or_else(|e| e.exit()); - - let datasets: Vec = body - .datasets - .into_iter() - .map(|d| DatasetView { - id: d.id, - label: d.label, - schema_name: d.schema_name, - table_name: d.table_name, - created_at: d.created_at, - updated_at: d.updated_at, - }) - .collect(); - - match format { - "json" => println!("{}", serde_json::to_string_pretty(&datasets).unwrap()), - "yaml" => print!("{}", serde_yaml::to_string(&datasets).unwrap()), - "table" => { - if datasets.is_empty() { - use crossterm::style::Stylize; - eprintln!("{}", "No datasets found.".dark_grey()); - } else { - let rows: Vec> = datasets - .iter() - .map(|d| { - vec![ - d.id.clone(), - d.label.clone(), - format!("datasets.{}.{}", d.schema_name, d.table_name), - crate::util::format_date(&d.created_at), - ] - }) - .collect(); - crate::table::print(&["ID", "LABEL", "FULL NAME", "CREATED AT"], &rows); - } - if body.has_more { - let next = offset.unwrap_or(0) + body.count.max(0) as u32; - use crossterm::style::Stylize; - eprintln!( - "{}", - format!( - "showing {} results — use --offset {next} for more", - body.count - ) - .dark_grey() - ); - } - } - _ => unreachable!(), - } -} - -pub fn get(dataset_id: 
&str, workspace_id: &str, format: &str) { - let api = Api::new(Some(workspace_id)); - - let resp: GetDatasetResponse = - crate::sdk::block(api.client().datasets().get(dataset_id)).unwrap_or_else(|e| e.exit()); - - let d = DatasetDetail { - id: resp.id, - label: resp.label, - schema_name: resp.schema_name, - table_name: resp.table_name, - source_type: resp.source_type, - created_at: resp.created_at, - updated_at: resp.updated_at, - columns: resp - .columns - .into_iter() - .map(|c| Column { - name: c.name, - data_type: c.data_type, - nullable: c.nullable, - }) - .collect(), - }; - - match format { - "json" => println!("{}", serde_json::to_string_pretty(&d).unwrap()), - "yaml" => print!("{}", serde_yaml::to_string(&d).unwrap()), - "table" => { - let created_at = crate::util::format_date(&d.created_at); - let updated_at = crate::util::format_date(&d.updated_at); - println!("id: {}", d.id); - println!("label: {}", d.label); - println!("full_name: datasets.main.{}", d.table_name); - println!("source_type: {}", d.source_type); - println!("created_at: {created_at}"); - println!("updated_at: {updated_at}"); - if !d.columns.is_empty() { - println!(); - let rows: Vec> = d - .columns - .iter() - .map(|col| { - vec![ - col.name.clone(), - col.data_type.clone(), - col.nullable.to_string(), - ] - }) - .collect(); - crate::table::print(&["COLUMN", "DATA TYPE", "NULLABLE"], &rows); - } - } - _ => unreachable!(), - } -} - -pub fn update( - dataset_id: &str, - workspace_id: &str, - description: Option<&str>, - name: Option<&str>, - format: &str, -) { - if description.is_none() && name.is_none() { - eprintln!("error: provide at least one of --description or --name."); - std::process::exit(1); - } - - let api = Api::new(Some(workspace_id)); - - let mut request = UpdateDatasetRequest::new(); - if let Some(d) = description { - request.label = Some(Some(d.to_string())); - } - if let Some(n) = name { - request.table_name = Some(Some(n.to_string())); - } - - let resp: UpdateDatasetResponse 
= - crate::sdk::block(api.client().datasets().update(dataset_id, request)) - .unwrap_or_else(|e| e.exit()); - let d = UpdateView::from(resp); - - use crossterm::style::Stylize; - eprintln!("{}", "Dataset updated".green()); - match format { - "json" => println!("{}", serde_json::to_string_pretty(&d).unwrap()), - "yaml" => print!("{}", serde_yaml::to_string(&d).unwrap()), - "table" => { - println!("id: {}", d.id); - println!("label: {}", d.label); - match &d.schema_name { - Some(schema) => { - println!("full_name: datasets.{}.{}", schema, d.table_name); - } - None => { - println!("table_name: {}", d.table_name); - eprintln!( - "{}", - format!( - "(run `hotdata datasets {}` to see the qualified name)", - d.id - ) - .dark_grey() - ); - } - } - println!("updated_at: {}", crate::util::format_date(&d.updated_at)); - } - _ => unreachable!(), - } -} - -pub fn refresh(workspace_id: &str, dataset_id: &str, async_mode: bool) { - use crossterm::style::Stylize; - - let api = Api::new(Some(workspace_id)); - - let mut request = RefreshRequest::new(); - request.dataset_id = Some(Some(dataset_id.to_string())); - if async_mode { - request.r#async = Some(true); - } - - let resp = - crate::sdk::block(api.client().refresh().refresh(request)).unwrap_or_else(|e| e.exit()); - - if async_mode { - let job_id = match &resp { - RefreshResponse::SubmitJobResponse(j) => j.id.clone(), - _ => "unknown".to_string(), - }; - println!("{}", "Dataset refresh submitted.".green()); - println!("job_id: {}", job_id); - println!( - "{}", - format!("Use 'hotdata jobs {}' to check status.", job_id).dark_grey() - ); - return; - } - - let (id, version, dataset_status) = match &resp { - RefreshResponse::RefreshDatasetResponse(r) => { - (r.id.clone(), r.version as i64, r.status.clone()) - } - RefreshResponse::SubmitJobResponse(j) => (j.id.clone(), 0, j.status.to_string()), - _ => ("unknown".to_string(), 0, String::new()), - }; - println!("{}", "Dataset refresh completed.".green()); - println!( - "{}", - format!(" 
id: {id}, version: {version}, status: {dataset_status}").dark_grey() - ); -} - -#[cfg(test)] -mod tests { - use super::*; - use hotdata::models::UpdateDatasetResponse; - - /// Mirrors runtimedb's `UpdateDatasetResponse` (see runtimedb/src/http/models.rs). - /// The SDK deserializes this exact shape; here we assert the CLI's `UpdateView` - /// conversion preserves the display contract: no synthesized schema_name, and - /// latest/pinned versions surfaced when present. - #[test] - fn update_view_from_runtimedb_payload() { - let body = serde_json::json!({ - "id": "ds_abc123", - "label": "url_test", - "table_name": "url_test", - "latest_version": 3, - "updated_at": "2026-04-28T18:30:00Z", - }); - let resp: UpdateDatasetResponse = serde_json::from_value(body).unwrap(); - let view = UpdateView::from(resp); - assert_eq!(view.id, "ds_abc123"); - assert_eq!(view.label, "url_test"); - assert_eq!(view.table_name, "url_test"); - // The server doesn't send schema_name and we never synthesize "main", - // so schema-scoped datasets aren't mislabeled. 
- assert!(view.schema_name.is_none()); - assert_eq!(view.latest_version, Some(3)); - assert!(view.pinned_version.is_none()); - } - - #[test] - fn update_view_handles_pinned_version() { - let body = serde_json::json!({ - "id": "ds_abc123", - "label": "x", - "table_name": "x", - "latest_version": 5, - "pinned_version": 2, - "updated_at": "2026-04-28T18:30:00Z", - }); - let resp: UpdateDatasetResponse = serde_json::from_value(body).unwrap(); - let view = UpdateView::from(resp); - assert_eq!(view.pinned_version, Some(2)); - } - - #[test] - fn create_from_query_builds_sql_source() { - let source = DatasetSource::DatasetSourceOneOf2(Box::new(DatasetSourceOneOf2::new( - "SELECT 1".to_string(), - hotdata::models::dataset_source_one_of_2::Type::SqlQuery, - ))); - let json = serde_json::to_value(&source).unwrap(); - assert_eq!(json["type"], "sql_query"); - assert_eq!(json["sql"], "SELECT 1"); - } - - #[test] - fn create_from_saved_query_builds_saved_query_source() { - let source = DatasetSource::DatasetSourceOneOf1(Box::new(DatasetSourceOneOf1::new( - "sq_123".to_string(), - hotdata::models::dataset_source_one_of_1::Type::SavedQuery, - ))); - let json = serde_json::to_value(&source).unwrap(); - assert_eq!(json["type"], "saved_query"); - assert_eq!(json["saved_query_id"], "sq_123"); - } -} diff --git a/src/indexes.rs b/src/indexes.rs index ffdac4a..001c832 100644 --- a/src/indexes.rs +++ b/src/indexes.rs @@ -216,12 +216,6 @@ fn list_one_table(api: &Api, connection_id: &str, schema: &str, table: &str) -> body.indexes } -fn list_one_dataset(api: &Api, dataset_id: &str) -> Vec { - let path = format!("/datasets/{dataset_id}/indexes"); - let body: ListResponse = api.get_json(&path, &[]).unwrap_or_else(|e| e.exit()); - body.indexes -} - fn list_one_table_scan(api: &Api, connection_id: &str, schema: &str, table: &str) -> Vec { let path = format!("/connections/{connection_id}/tables/{schema}/{table}/indexes"); match none_if_404(api.get_json::(&path, &[])).unwrap_or_else(|e| e.exit()) 
{ @@ -323,24 +317,12 @@ pub fn list( connection_id: Option<&str>, schema: Option<&str>, table: Option<&str>, - dataset_id: Option<&str>, format: &str, ) { let api = Api::new(Some(workspace_id)); - let (rows, multi_table) = match (dataset_id, connection_id, schema, table) { - (Some(did), _, _, _) => { - let indexes = list_one_dataset(&api, did); - let rows: Vec = indexes - .into_iter() - .map(|i| IndexRow { - inner: i, - table: None, - }) - .collect(); - (rows, false) - } - (None, Some(cid), Some(sch), Some(tbl)) => { + let (rows, multi_table) = match (connection_id, schema, table) { + (Some(cid), Some(sch), Some(tbl)) => { let indexes = list_one_table(&api, cid, sch, tbl); let rows: Vec = indexes .into_iter() @@ -416,9 +398,6 @@ pub enum IndexScope<'a> { schema: &'a str, table: &'a str, }, - Dataset { - dataset_id: &'a str, - }, } impl IndexScope<'_> { @@ -429,7 +408,6 @@ impl IndexScope<'_> { schema, table, } => format!("/connections/{connection_id}/tables/{schema}/{table}/indexes"), - IndexScope::Dataset { dataset_id } => format!("/datasets/{dataset_id}/indexes"), } } @@ -445,9 +423,6 @@ impl IndexScope<'_> { } => { format!("/connections/{connection_id}/tables/{schema}/{table}/indexes/{index_name}") } - IndexScope::Dataset { dataset_id } => { - format!("/datasets/{dataset_id}/indexes/{index_name}") - } } } } @@ -553,11 +528,6 @@ pub fn delete(workspace_id: &str, scope: IndexScope<'_>, index_name: &str) { .indexes() .delete_index(connection_id, schema, table, index_name), ), - IndexScope::Dataset { dataset_id } => block( - api.client() - .indexes() - .delete_dataset_index(dataset_id, index_name), - ), }; if let Err(e) = result { @@ -601,18 +571,6 @@ mod tests { ); } - #[test] - fn index_scope_dataset_paths() { - let scope = IndexScope::Dataset { - dataset_id: "data_xyz", - }; - assert_eq!(scope.create_path(), "/datasets/data_xyz/indexes"); - assert_eq!( - scope.delete_path("idx_title"), - "/datasets/data_xyz/indexes/idx_title" - ); - } - #[test] fn 
information_schema_followup_breaks_when_more_but_no_cursor() { assert!(matches!( diff --git a/src/jobs.rs b/src/jobs.rs index ce1ff11..ca1f7af 100644 --- a/src/jobs.rs +++ b/src/jobs.rs @@ -40,9 +40,7 @@ fn parse_job_type(s: &str) -> Option { "noop" => Some(JobType::Noop), "data_refresh_table" => Some(JobType::DataRefreshTable), "data_refresh_connection" => Some(JobType::DataRefreshConnection), - "dataset_refresh" => Some(JobType::DatasetRefresh), "create_index" => Some(JobType::CreateIndex), - "create_dataset_index" => Some(JobType::CreateDatasetIndex), _ => None, } } diff --git a/src/main.rs b/src/main.rs index 5b4b5be..2371eb5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,7 +6,6 @@ mod connections_new; mod context; mod database_session; mod databases; -mod datasets; mod embedding_providers; mod indexes; mod jobs; @@ -27,7 +26,7 @@ use anstyle::AnsiColor; use clap::{Parser, builder::Styles}; use command::{ AuthCommands, Commands, ConnectionsCommands, ConnectionsCreateCommands, ContextCommands, - DatabaseTablesCommands, DatabasesCommands, DatasetsCommands, EmbeddingProvidersCommands, + DatabaseTablesCommands, DatabasesCommands, EmbeddingProvidersCommands, IndexesCommands, JobsCommands, QueriesCommands, QueryCommands, ResultsCommands, SkillCommands, TablesCommands, WorkspaceCommands, }; @@ -164,76 +163,6 @@ fn main() { Some(AuthCommands::Status) => auth::status("default"), Some(AuthCommands::Logout) => auth::logout("default"), }, - Commands::Datasets { - id, - workspace_id, - output, - command, - } => { - let workspace_id = resolve_workspace(workspace_id); - if let Some(id) = id { - datasets::get(&id, &workspace_id, &output) - } else { - match command { - Some(DatasetsCommands::List { - limit, - offset, - output, - }) => datasets::list(&workspace_id, limit, offset, &output), - Some(DatasetsCommands::Create { - name, - description, - sql, - query_id, - output, - }) => { - if let Some(sql) = sql { - datasets::create_from_query( - &workspace_id, - &sql, - 
description.as_deref(), - &name, - &output, - ) - } else { - datasets::create_from_saved_query( - &workspace_id, - query_id.as_deref().unwrap_or_else(|| { - unreachable!("clap enforces --sql or --query-id") - }), - description.as_deref(), - &name, - &output, - ) - } - } - Some(DatasetsCommands::Update { - id, - description, - name, - output, - }) => datasets::update( - &id, - &workspace_id, - description.as_deref(), - name.as_deref(), - &output, - ), - Some(DatasetsCommands::Refresh { id, r#async }) => { - datasets::refresh(&workspace_id, &id, r#async) - } - None => { - use clap::CommandFactory; - let mut cmd = Cli::command(); - cmd.build(); - cmd.find_subcommand_mut("datasets") - .unwrap() - .print_help() - .unwrap(); - } - } - } - } Commands::Query { sql, workspace_id, @@ -607,14 +536,12 @@ fn main() { connection_id, schema, table, - dataset_id, output, } => indexes::list( &workspace_id, connection_id.as_deref(), schema.as_deref(), table.as_deref(), - dataset_id.as_deref(), &output, ), IndexesCommands::Create { @@ -622,7 +549,6 @@ fn main() { schema, table, column, - dataset_id, name, r#type, metric, @@ -633,74 +559,34 @@ fn main() { description, } => { let api = sdk::Api::new(Some(&workspace_id)); - let (scope, resolved_columns, auto_name) = match ( - catalog.as_deref().or(table.as_deref()), - dataset_id.as_deref(), - ) { - (Some(_), None) => { - let catalog_or_conn = catalog.as_deref().unwrap_or_else(|| { - eprintln!("error: --catalog is required"); - std::process::exit(1); - }); - let tbl = table.as_deref().unwrap_or_else(|| { - eprintln!("error: --table is required"); - std::process::exit(1); - }); - let cols = column.as_deref().unwrap_or_else(|| { - eprintln!("error: --column is required"); - std::process::exit(1); - }); - let conn_id = - connections::resolve_connection_id(&api, catalog_or_conn); - let auto = format!( - "{tbl}_{cols}_{type}", - cols = cols.replace(',', "_"), - type = r#type - ); - ((conn_id, schema, tbl.to_string()), cols.to_string(), auto) 
- } - (None, Some(did)) => { - let cols = column.as_deref().unwrap_or_else(|| { - eprintln!("error: --column is required with --dataset-id"); - std::process::exit(1); - }); - let auto = format!( - "dataset_{cols}_{type}", - cols = cols.replace(',', "_"), - type = r#type - ); - ( - (did.to_string(), String::new(), String::new()), - cols.to_string(), - auto, - ) - } - _ => { - eprintln!( - "error: provide --catalog and --table, or --dataset-id with --column" - ); - std::process::exit(1); - } - }; + let catalog_or_conn = catalog.as_deref().unwrap_or_else(|| { + eprintln!("error: --catalog is required"); + std::process::exit(1); + }); + let tbl = table.as_deref().unwrap_or_else(|| { + eprintln!("error: --table is required"); + std::process::exit(1); + }); + let cols = column.as_deref().unwrap_or_else(|| { + eprintln!("error: --column is required"); + std::process::exit(1); + }); + let conn_id = connections::resolve_connection_id(&api, catalog_or_conn); + let auto_name = format!( + "{tbl}_{cols}_{type}", + cols = cols.replace(',', "_"), + type = r#type + ); let index_name = name.unwrap_or(auto_name); - let is_dataset = dataset_id.is_some(); - let (conn_id, schema, table) = scope; - let resolved_scope = if is_dataset { - indexes::IndexScope::Dataset { - dataset_id: &conn_id, - } - } else { + indexes::create( + &workspace_id, indexes::IndexScope::Connection { connection_id: &conn_id, schema: &schema, - table: &table, - } - }; - indexes::create( - &workspace_id, - resolved_scope, + table: tbl, + }, &index_name, - &resolved_columns, + cols, &r#type, metric.as_deref(), r#async, @@ -714,28 +600,21 @@ fn main() { connection_id, schema, table, - dataset_id, name, } => { let scope = match ( - dataset_id.as_deref(), connection_id.as_deref(), schema.as_deref(), table.as_deref(), ) { - (Some(did), _, _, _) => { - indexes::IndexScope::Dataset { dataset_id: did } - } - (None, Some(cid), Some(sch), Some(tbl)) => { - indexes::IndexScope::Connection { - connection_id: cid, - schema: 
sch, - table: tbl, - } - } + (Some(cid), Some(sch), Some(tbl)) => indexes::IndexScope::Connection { + connection_id: cid, + schema: sch, + table: tbl, + }, _ => { eprintln!( - "error: provide either --dataset-id or all three of --connection-id, --schema, --table" + "error: provide all three of --connection-id, --schema, --table" ); std::process::exit(1); } diff --git a/src/sdk.rs b/src/sdk.rs index a2404d0..0fefb4b 100644 --- a/src/sdk.rs +++ b/src/sdk.rs @@ -839,21 +839,22 @@ mod tests { #[test] fn workspace_id_header_is_installed_on_scoped_calls() { - // Regression for the old api.rs:598 header assertion. `datasets().list` - // carries the X-Workspace-Id api_key; assert it reaches the wire. + // Regression for the old api.rs:598 header assertion. A workspace-scoped + // generated-client call (here `jobs().list`) carries the X-Workspace-Id + // api_key; assert it reaches the wire. The mock matches on the header, so + // `m.assert()` failing means the header never left the client. let mut server = mockito::Server::new(); let m = server - .mock("GET", "/v1/datasets") + .mock("GET", "/v1/jobs") .match_header("Authorization", "Bearer test-jwt") .match_header("X-Workspace-Id", "ws-1") .with_status(200) .with_header("content-type", "application/json") - .with_body(r#"{"count":0,"datasets":[],"has_more":false,"limit":50,"offset":0}"#) + .with_body(r#"{"jobs":[]}"#) .create(); let api = Api::test_new(&server.url(), "test-jwt", Some("ws-1")); - let resp = block(api.client.datasets().list(None, None)).expect("list datasets"); - assert_eq!(resp.count, 0); + let _ = block(api.client.jobs().list(None, None, None, None)); m.assert(); } diff --git a/tests/workspace_env.rs b/tests/workspace_env.rs index c46429f..d8971a5 100644 --- a/tests/workspace_env.rs +++ b/tests/workspace_env.rs @@ -9,12 +9,12 @@ fn hotdata() -> Command { // `resolve_workspace` refuses to let a `--workspace-id`/`-w` flag override a // workspace pinned by the HOTDATA_WORKSPACE env var. 
That check runs before // any auth or network I/O, so any workspace-scoped subcommand exercises it; -// we use `datasets list` here. +// we use `indexes list` here. #[test] fn workspace_env_blocks_conflicting_flag() { let output = hotdata() - .args(["datasets", "list", "-w", "other-ws"]) + .args(["indexes", "list", "-w", "other-ws"]) .env("HOTDATA_WORKSPACE", "locked-ws") .output() .unwrap(); @@ -32,7 +32,7 @@ fn workspace_env_allows_matching_flag() { // When the flag matches the env var, no workspace conflict error. // Will fail later on auth, but should NOT fail on the workspace lock. let output = hotdata() - .args(["datasets", "list", "-w", "ws-1"]) + .args(["indexes", "list", "-w", "ws-1"]) .env("HOTDATA_WORKSPACE", "ws-1") .output() .unwrap(); From da544c678a19a74c3f5eaebdd0ad14eba75cf897 Mon Sep 17 00:00:00 2001 From: Eddie A Tejeda <669988+eddietejeda@users.noreply.github.com> Date: Fri, 19 Jun 2026 14:38:37 -0700 Subject: [PATCH 2/2] style: rustfmt import list after datasets removal --- src/main.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 2371eb5..9f88050 100644 --- a/src/main.rs +++ b/src/main.rs @@ -26,9 +26,9 @@ use anstyle::AnsiColor; use clap::{Parser, builder::Styles}; use command::{ AuthCommands, Commands, ConnectionsCommands, ConnectionsCreateCommands, ContextCommands, - DatabaseTablesCommands, DatabasesCommands, EmbeddingProvidersCommands, - IndexesCommands, JobsCommands, QueriesCommands, QueryCommands, ResultsCommands, SkillCommands, - TablesCommands, WorkspaceCommands, + DatabaseTablesCommands, DatabasesCommands, EmbeddingProvidersCommands, IndexesCommands, + JobsCommands, QueriesCommands, QueryCommands, ResultsCommands, SkillCommands, TablesCommands, + WorkspaceCommands, }; #[derive(Parser)]