From 211474cc16b3ad0ef0d2fdcef04d3c5c460ffae7 Mon Sep 17 00:00:00 2001 From: Steve Han Date: Tue, 19 May 2026 16:48:55 -0400 Subject: [PATCH 1/4] Use DataDesigner native resume for retrieval SDG --- catalog/plugins.json | 4 +- docs/catalog-schema.md | 4 +- docs/catalogs.md | 4 +- plugins/data-designer-retrieval-sdg/README.md | 34 +++- .../pyproject.toml | 2 +- .../src/data_designer_retrieval_sdg/cli.py | 103 ++++------ .../data_designer_retrieval_sdg/convert.py | 108 +++++++--- .../tests/test_cli.py | 185 ++++++++++++++++++ .../tests/test_convert.py | 42 ++++ uv.lock | 29 +-- 10 files changed, 400 insertions(+), 115 deletions(-) create mode 100644 plugins/data-designer-retrieval-sdg/tests/test_cli.py diff --git a/catalog/plugins.json b/catalog/plugins.json index 5f3d28a..11b4708 100644 --- a/catalog/plugins.json +++ b/catalog/plugins.json @@ -45,8 +45,8 @@ "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.5.7", - "specifier": ">=0.5.7", + "requirement": "data-designer>=0.6.0", + "specifier": ">=0.6.0", "marker": null } }, diff --git a/docs/catalog-schema.md b/docs/catalog-schema.md index 0e7e7ce..2c4e6d4 100644 --- a/docs/catalog-schema.md +++ b/docs/catalog-schema.md @@ -32,8 +32,8 @@ The top-level document must contain `schema_version` and `packages`: "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.5.7", - "specifier": ">=0.5.7", + "requirement": "data-designer>=0.6.0", + "specifier": ">=0.6.0", "marker": null } }, diff --git a/docs/catalogs.md b/docs/catalogs.md index 9151372..6ee7872 100644 --- a/docs/catalogs.md +++ b/docs/catalogs.md @@ -66,8 +66,8 @@ after installation. "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.5.7", - "specifier": ">=0.5.7", + "requirement": "data-designer>=0.6.0", + "specifier": ">=0.6.0", "marker": null } }, diff --git a/plugins/data-designer-retrieval-sdg/README.md b/plugins/data-designer-retrieval-sdg/README.md index 78424aa..45c7625 100644 --- a/plugins/data-designer-retrieval-sdg/README.md +++ b/plugins/data-designer-retrieval-sdg/README.md @@ -20,19 +20,27 @@ via `[project.entry-points."data_designer.plugins"]`: Both are registered automatically through Python entry points when the package is installed (see [Installation](#installation)). -## Native async (`DATA_DESIGNER_ASYNC_ENGINE=1`) +## Native async and resumable generation `embedding-dedup` implements `agenerate()` directly on top of `model.agenerate_text_embeddings`, so the column participates in -DataDesigner's async cell-level scheduler whenever the env var is set: +DataDesigner's async cell-level scheduler. + +The `generate` command uses DataDesigner's native resumable generation. +Use a stable `--artifact-path`, `--dataset-name`, and `--buffer-size`, then +resume an interrupted run with `--resume always`: ```bash -export DATA_DESIGNER_ASYNC_ENGINE=1 -data-designer-retrieval-sdg generate ... +data-designer-retrieval-sdg generate \ + --input-dir ./my_documents \ + --output-dir ./generated_output \ + --dataset-name my_retrieval_run \ + --buffer-size 200 \ + --resume always ``` -The async engine requires Python 3.11+; without the env var the package -runs on Python 3.10+ via the framework's sync bridge. +Use `--resume if_possible` to resume only when the saved config matches and +otherwise start a fresh run. ## Installation @@ -91,16 +99,28 @@ uv run data-designer-retrieval-sdg generate --help data-designer-retrieval-sdg generate \ --input-dir ./my_documents \ --output-dir ./generated_output \ + --dataset-name my_retrieval_run \ + --buffer-size 200 \ + --resume if_possible \ --num-pairs 7 ``` +Generation writes DataDesigner artifacts under `--artifact-path` and exports a +single JSONL file to `--output-dir`. + ### Convert to training format ```bash -data-designer-retrieval-sdg convert ./generated_output \ +data-designer-retrieval-sdg convert ./generated_output/my_retrieval_run.jsonl \ --corpus-id my_corpus ``` +Legacy `generated_batch*.json` directories remain supported by `convert`, but +`generate` no longer writes per-batch JSON files. The old manual restart flags +`--batch-size`, `--start-batch-index`, and `--end-batch-index` were removed +because DataDesigner now owns checkpointing through `--buffer-size` and +`--resume`. + ### Use as a library ```python diff --git a/plugins/data-designer-retrieval-sdg/pyproject.toml b/plugins/data-designer-retrieval-sdg/pyproject.toml index bc57a50..5644b85 100644 --- a/plugins/data-designer-retrieval-sdg/pyproject.toml +++ b/plugins/data-designer-retrieval-sdg/pyproject.toml @@ -7,7 +7,7 @@ version = "0.1.0" description = "Retriever SDG toolkit: registers the embedding-dedup column generator and document-chunker seed reader, plus a multi-step QA generation pipeline, CLI, and Automodel-compatible data conversion" requires-python = ">=3.10" dependencies = [ - "data-designer>=0.5.7", + "data-designer>=0.6.0", "nltk>=3.9.2", "pyyaml>=6.0", "pyarrow>=14.0", diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py index e6ef319..87b7dd3 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py @@ -8,12 +8,9 @@ - ``generate`` -- run the full SDG pipeline on a directory of text files - ``convert`` -- convert raw SDG output to Automodel-compatible formats -The ``generate`` subcommand drives a per-batch loop so each batch's output -is checkpointed to its own JSON file (resumable across crashes). The -batching wraps DataDesigner's native ``IndexRange`` selection strategy -applied to a :class:`DocumentChunkerSeedSource`; the framework owns -discovery, chunking, and async cell scheduling (when -``DATA_DESIGNER_ASYNC_ENGINE=1`` is set). +The ``generate`` subcommand runs the full pipeline through DataDesigner's +native resumable generation support. The framework owns discovery, chunking, +checkpointing, and async cell scheduling. """ from __future__ import annotations @@ -26,6 +23,7 @@ import data_designer.config as dd from data_designer.engine.resources.seed_reader import SeedReaderError from data_designer.engine.secret_resolver import PlaintextResolver +from data_designer.engine.storage.artifact_storage import ResumeMode from data_designer.interface import DataDesigner from data_designer.logging import LoggerConfig, LoggingConfig, OutputConfig, configure_logging @@ -98,9 +96,15 @@ def _add_generate_parser(subparsers: argparse._SubParsersAction) -> None: p.add_argument("--similarity-threshold", type=float, default=0.9, help="Cosine threshold for QA-pair dedup") p.add_argument("--preview", action="store_true", help="Preview without full generation") p.add_argument("--artifact-path", type=Path, default=Path("./artifacts"), help="DD artifact path") - p.add_argument("--batch-size", type=int, default=200, help="Records per batch") - p.add_argument("--start-batch-index", type=int, default=0, help="Batch index to start from") - p.add_argument("--end-batch-index", type=int, default=-1, help="Batch index to end at (exclusive)") + p.add_argument("--dataset-name", default=None, help="Stable DD dataset name for artifacts and resume") + p.add_argument("--buffer-size", type=int, default=200, help="DataDesigner checkpoint buffer size") + p.add_argument( + "--resume", + "-r", + choices=[mode.value for mode in ResumeMode], + default=ResumeMode.NEVER.value, + help="Resume behavior for interrupted generation runs", + ) g = p.add_argument_group("multi-document bundling") g.add_argument("--multi-doc", action="store_true", help="Enable multi-doc bundling") @@ -167,13 +171,10 @@ def _run_generate(args: argparse.Namespace) -> None: ) data_designer = DataDesigner(artifact_path=args.artifact_path, model_providers=model_providers) - data_designer.set_run_config(dd.RunConfig(disable_early_shutdown=True)) + data_designer.set_run_config(dd.RunConfig(disable_early_shutdown=True, buffer_size=args.buffer_size)) args.output_dir.mkdir(parents=True, exist_ok=True) - num_batches = (total_records + args.batch_size - 1) // args.batch_size - actual_end_batch = num_batches if args.end_batch_index == -1 else min(args.end_batch_index, num_batches) - pipeline_kwargs = _pipeline_kwargs(args) _print_model_config(args, custom_providers) @@ -181,20 +182,11 @@ def _run_generate(args: argparse.Namespace) -> None: _run_preview(data_designer, seed_source, total_records, args, pipeline_kwargs) return - _run_batches( - data_designer, - seed_source, - total_records, - num_batches, - args.start_batch_index, - actual_end_batch, - args, - pipeline_kwargs, - ) + _run_create(data_designer, seed_source, total_records, args, pipeline_kwargs) def _pipeline_kwargs(args: argparse.Namespace) -> dict: - """Collect pipeline-builder keyword arguments shared between preview and batch runs.""" + """Collect pipeline-builder keyword arguments shared between preview and create runs.""" return { "max_artifacts_per_type": args.max_artifacts_per_type, "num_pairs": args.num_pairs, @@ -238,7 +230,7 @@ def _run_preview( config_builder = build_qa_generation_pipeline( seed_source=seed_source, start_index=0, - end_index=min(args.batch_size - 1, total_records - 1), + end_index=min(args.buffer_size - 1, total_records - 1), **pipeline_kwargs, ) print("\nPreviewing generation...") @@ -249,51 +241,40 @@ def _run_preview( logger.warning("Preview error: %s", e) -def _run_batches( +def _run_create( data_designer: DataDesigner, seed_source: DocumentChunkerSeedSource, total_records: int, - num_batches: int, - start_batch: int, - end_batch: int, args: argparse.Namespace, pipeline_kwargs: dict, ) -> None: - """Process the pipeline in batches, writing one JSON per batch.""" + """Run full generation once and export the resulting dataset as JSONL.""" print(f"\nTotal records: {total_records}") - print(f"Batch size: {args.batch_size}") - print(f"Total batches: {num_batches}") - print(f"Starting from batch index: {start_batch}") - print(f"Ending at batch index: {end_batch} (exclusive)") - - for batch_idx in range(start_batch, end_batch): - start_idx = batch_idx * args.batch_size - end_idx = min(start_idx + args.batch_size - 1, total_records - 1) - num_in_batch = end_idx - start_idx + 1 - - print(f"\n{'=' * 60}") - print(f"Processing batch {batch_idx}/{num_batches - 1} (records {start_idx}-{end_idx})") - print(f"{'=' * 60}") - - config_builder = build_qa_generation_pipeline( - seed_source=seed_source, - start_index=start_idx, - end_index=end_idx, - **pipeline_kwargs, - ) + print(f"Buffer size: {args.buffer_size}") + print(f"Resume mode: {args.resume}") + + config_builder = build_qa_generation_pipeline( + seed_source=seed_source, + start_index=0, + end_index=total_records - 1, + **pipeline_kwargs, + ) - input_basename = args.input_dir.name - dataset_name = f"{input_basename}_batch{batch_idx}_{start_idx}_{end_idx}" - result = data_designer.create(config_builder, num_records=num_in_batch, dataset_name=dataset_name) - generated_df = result.load_dataset() + dataset_name = args.dataset_name or args.input_dir.name or "retrieval_sdg" + print(f"Dataset name: {dataset_name}") + print("\nGenerating dataset...") + result = data_designer.create( + config_builder, + num_records=total_records, + dataset_name=dataset_name, + resume=ResumeMode(args.resume), + ) - output_filename = f"generated_batch{batch_idx}_{start_idx}_{end_idx}.json" - generated_df.to_json(args.output_dir / output_filename, orient="records", indent=2) - print(f"Saved {output_filename} ({len(generated_df)} records)") + output_path = args.output_dir / f"{result.artifact_storage.resolved_dataset_name}.jsonl" + result.export(output_path, format="jsonl") - print(f"\n{'=' * 60}") - print(f"Generation complete! All batches saved to {args.output_dir}") - print(f"Total batches processed: {end_batch - start_batch}") + print(f"\nGeneration complete! Artifacts saved to {result.artifact_storage.base_dataset_path}") + print(f"Exported JSONL to {output_path}") def _add_convert_parser(subparsers: argparse._SubParsersAction) -> None: @@ -304,7 +285,7 @@ def _add_convert_parser(subparsers: argparse._SubParsersAction) -> None: formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - p.add_argument("input_path", help="Path to JSON file or directory of batch files") + p.add_argument("input_path", help="Path to generated JSONL/JSON/parquet file or output directory") p.add_argument("--corpus-id", required=True, help="Corpus identifier") p.add_argument("--output-dir", default=None, help="Output directory") p.add_argument("--eval-only", action="store_true", help="BEIR eval only (no train/val)") diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py index 3ba7a8b..48016b5 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py @@ -19,6 +19,7 @@ import os import random from collections import defaultdict +from pathlib import Path import pandas as pd @@ -74,48 +75,101 @@ def normalize_file_name(file_name: object) -> list[str]: return [file_name] if isinstance(file_name, list): return file_name + if hasattr(file_name, "tolist"): + value = file_name.tolist() + if isinstance(value, list): + return value + if isinstance(value, str): + return [value] return [str(file_name)] +def _load_json_records(input_file: Path) -> list[dict]: + """Load records from a JSON file containing one object or a list of objects.""" + with input_file.open(encoding="utf-8") as f: + records = json.load(f) + if isinstance(records, list): + return records + return [records] + + +def _load_jsonl_records(input_file: Path) -> list[dict]: + """Load records from a JSONL file containing one JSON object per line.""" + records: list[dict] = [] + with input_file.open(encoding="utf-8") as f: + for line in f: + stripped = line.strip() + if not stripped: + continue + record = json.loads(stripped) + if isinstance(record, list): + records.extend(record) + else: + records.append(record) + return records + + +def _load_parquet_records(input_file: Path) -> list[dict]: + """Load records from a parquet file exported by DataDesigner.""" + return pd.read_parquet(input_file).to_dict(orient="records") + + +def _load_generated_records_file(input_file: Path) -> list[dict]: + """Load generated records from one supported file path.""" + suffix = input_file.suffix.lower() + if suffix == ".json": + print(f"Loading JSON file: {input_file}") + return _load_json_records(input_file) + if suffix == ".jsonl": + print(f"Loading JSONL file: {input_file}") + return _load_jsonl_records(input_file) + if suffix == ".parquet": + print(f"Loading parquet file: {input_file}") + return _load_parquet_records(input_file) + raise ValueError(f"Unsupported generated data file format: {input_file}") + + +def _discover_generated_record_files(input_dir: Path) -> list[Path]: + """Discover generated output files in a directory, preferring the newest output contract.""" + pattern_groups = [ + "*.jsonl", + "generated_batch*.json", + "*.json", + "*.parquet", + ] + for pattern in pattern_groups: + files = sorted(Path(p) for p in glob_mod.glob(str(input_dir / pattern))) + if files: + return files + return [] + + def load_generated_json_files(input_path: str) -> pd.DataFrame: - """Load generated JSON from a single file or a directory of batch files. + """Load generated records from a file or output directory. Args: - input_path: Path to a merged JSON file **or** a directory containing - ``generated_batch*.json`` files. + input_path: Path to a generated ``.jsonl``, ``.json``, or ``.parquet`` + file, or a directory containing generated output files. Returns: Combined DataFrame with all records. Raises: - ValueError: If no JSON files are found. + ValueError: If no supported generated files are found. """ all_records: list[dict] = [] + path = Path(input_path) - if os.path.isfile(input_path): - print(f"Loading single JSON file: {input_path}") - with open(input_path, encoding="utf-8") as f: - records = json.load(f) - if isinstance(records, list): - all_records.extend(records) - else: - all_records.append(records) + if path.is_file(): + all_records.extend(_load_generated_records_file(path)) else: - json_files = sorted(glob_mod.glob(os.path.join(input_path, "generated_batch*.json"))) - if not json_files: - json_files = sorted(glob_mod.glob(os.path.join(input_path, "*.json"))) - if not json_files: - raise ValueError(f"No JSON files found in {input_path}") - - print(f"Found {len(json_files)} JSON files") - for json_file in json_files: - print(f" Loading: {json_file}") - with open(json_file, encoding="utf-8") as f: - records = json.load(f) - if isinstance(records, list): - all_records.extend(records) - else: - all_records.append(records) + generated_files = _discover_generated_record_files(path) + if not generated_files: + raise ValueError(f"No generated JSONL, JSON, or parquet files found in {input_path}") + + print(f"Found {len(generated_files)} generated file(s)") + for generated_file in generated_files: + all_records.extend(_load_generated_records_file(generated_file)) print("Normalizing file_name fields...") for record in all_records: diff --git a/plugins/data-designer-retrieval-sdg/tests/test_cli.py b/plugins/data-designer-retrieval-sdg/tests/test_cli.py new file mode 100644 index 0000000..ff8eb62 --- /dev/null +++ b/plugins/data-designer-retrieval-sdg/tests/test_cli.py @@ -0,0 +1,185 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import argparse +import sys +from pathlib import Path + +import pytest +from data_designer.engine.storage.artifact_storage import ResumeMode + +from data_designer_retrieval_sdg import cli + +BUILD_CALLS: list[dict[str, object]] = [] + + +def fake_count_seed_records(seed_source: object) -> int: + """Return a deterministic seed count for CLI generation tests.""" + return 3 + + +def fake_build_model_providers(**kwargs: object) -> tuple[list[str], list[object]]: + """Return a deterministic provider tuple for CLI generation tests.""" + return ["providers"], [] + + +def fake_build_qa_generation_pipeline(**kwargs: object) -> object: + """Capture pipeline-builder kwargs and return a sentinel builder.""" + BUILD_CALLS.append(kwargs) + return {"builder": "qa"} + + +class FakeArtifactStorage: + """Minimal artifact storage surface used by the generate command.""" + + def __init__(self, base_dataset_path: Path, resolved_dataset_name: str) -> None: + self.base_dataset_path = base_dataset_path + self.resolved_dataset_name = resolved_dataset_name + + +class FakeCreateResult: + """Minimal DataDesigner result surface used by the generate command.""" + + def __init__(self, artifact_storage: FakeArtifactStorage) -> None: + self.artifact_storage = artifact_storage + self.export_calls: list[tuple[Path, str | None]] = [] + + def export(self, path: Path, *, format: str | None = None) -> Path: + self.export_calls.append((path, format)) + path.write_text("", encoding="utf-8") + return path + + +class FakeDataDesigner: + """Capture DataDesigner calls made by the generate command.""" + + instances: list[FakeDataDesigner] = [] + + def __init__(self, artifact_path: Path, model_providers: object) -> None: + self.artifact_path = artifact_path + self.model_providers = model_providers + self.run_config = None + self.create_calls: list[dict[str, object]] = [] + self.result = FakeCreateResult(FakeArtifactStorage(artifact_path / "my_run", "my_run")) + FakeDataDesigner.instances.append(self) + + def set_run_config(self, run_config: object) -> None: + self.run_config = run_config + + def create( + self, + config_builder: object, + *, + num_records: int, + dataset_name: str, + resume: ResumeMode, + ) -> FakeCreateResult: + self.create_calls.append( + { + "config_builder": config_builder, + "num_records": num_records, + "dataset_name": dataset_name, + "resume": resume, + } + ) + return self.result + + +def _generate_args(tmp_path: Path) -> argparse.Namespace: + """Build generate args with defaults that match the CLI parser.""" + input_dir = tmp_path / "docs" + input_dir.mkdir() + return argparse.Namespace( + input_dir=input_dir, + output_dir=tmp_path / "out", + file_pattern="*", + recursive=True, + file_extensions=[".txt", ".md", ".text"], + min_text_length=50, + sentences_per_chunk=5, + num_sections=1, + num_files=None, + max_artifacts_per_type=2, + num_pairs=7, + min_hops=2, + max_hops=4, + min_complexity=4, + similarity_threshold=0.9, + preview=False, + artifact_path=tmp_path / "artifacts", + dataset_name="my_run", + buffer_size=37, + resume=ResumeMode.ALWAYS.value, + multi_doc=False, + bundle_size=2, + bundle_strategy="sequential", + max_docs_per_bundle=3, + multi_doc_manifest=None, + log_level="INFO", + artifact_extraction_model="artifact-model", + artifact_extraction_provider="nvidia", + qa_generation_model="qa-model", + qa_generation_provider="nvidia", + quality_judge_model="judge-model", + quality_judge_provider="nvidia", + embed_model="embed-model", + embed_provider="nvidia", + max_parallel_requests_for_gen=None, + custom_provider_endpoint=None, + custom_provider_name="custom", + custom_provider_type="openai", + custom_provider_api_key=None, + model_providers_file=None, + ) + + +def test_generate_uses_native_resume_and_exports_jsonl(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + BUILD_CALLS.clear() + FakeDataDesigner.instances.clear() + monkeypatch.setattr(cli, "DataDesigner", FakeDataDesigner) + monkeypatch.setattr(cli, "_count_seed_records", fake_count_seed_records) + monkeypatch.setattr(cli, "build_model_providers", fake_build_model_providers) + monkeypatch.setattr(cli, "build_qa_generation_pipeline", fake_build_qa_generation_pipeline) + + cli._run_generate(_generate_args(tmp_path)) + + instance = FakeDataDesigner.instances[0] + assert instance.run_config.buffer_size == 37 + assert instance.run_config.disable_early_shutdown is True + assert instance.create_calls == [ + { + "config_builder": {"builder": "qa"}, + "num_records": 3, + "dataset_name": "my_run", + "resume": ResumeMode.ALWAYS, + } + ] + assert BUILD_CALLS[0]["start_index"] == 0 + assert BUILD_CALLS[0]["end_index"] == 2 + assert instance.result.export_calls == [(tmp_path / "out" / "my_run.jsonl", "jsonl")] + + +@pytest.mark.parametrize("removed_flag", ["--batch-size", "--start-batch-index", "--end-batch-index"]) +def test_generate_rejects_removed_batch_flags( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + removed_flag: str, +) -> None: + argv = [ + "data-designer-retrieval-sdg", + "generate", + "--input-dir", + str(tmp_path), + "--output-dir", + str(tmp_path / "out"), + removed_flag, + "1", + ] + monkeypatch.setattr(sys, "argv", argv) + + with pytest.raises(SystemExit) as exc_info: + cli.main() + + assert exc_info.value.code == 2 diff --git a/plugins/data-designer-retrieval-sdg/tests/test_convert.py b/plugins/data-designer-retrieval-sdg/tests/test_convert.py index 40bedf8..8864418 100644 --- a/plugins/data-designer-retrieval-sdg/tests/test_convert.py +++ b/plugins/data-designer-retrieval-sdg/tests/test_convert.py @@ -156,6 +156,48 @@ def test_load_from_directory(tmp_path: Path) -> None: assert len(df) == 2 +def test_load_from_jsonl_file(tmp_path: Path) -> None: + records = [ + {"file_name": "a.txt", "deduplicated_qa_pairs": [], "qa_evaluations": {"evaluations": []}}, + {"file_name": "b.txt", "deduplicated_qa_pairs": [], "qa_evaluations": {"evaluations": []}}, + ] + path = tmp_path / "generated.jsonl" + path.write_text("\n".join(json.dumps(record) for record in records), encoding="utf-8") + + df = load_generated_json_files(str(path)) + + assert len(df) == 2 + assert df.iloc[0]["file_name"] == ["a.txt"] + + +def test_load_from_jsonl_directory(tmp_path: Path) -> None: + for name in ("generated-a.jsonl", "generated-b.jsonl"): + record = {"file_name": name, "deduplicated_qa_pairs": [], "qa_evaluations": {"evaluations": []}} + (tmp_path / name).write_text(json.dumps(record) + "\n", encoding="utf-8") + + df = load_generated_json_files(str(tmp_path)) + + assert len(df) == 2 + + +def test_load_from_parquet_file(tmp_path: Path) -> None: + path = tmp_path / "generated.parquet" + pd.DataFrame( + [ + { + "file_name": ["doc.txt"], + "deduplicated_qa_pairs": [], + "qa_evaluations": {"evaluations": []}, + } + ] + ).to_parquet(path, index=False) + + df = load_generated_json_files(str(path)) + + assert len(df) == 1 + assert df.iloc[0]["file_name"] == ["doc.txt"] + + # --------------------------------------------------------------------------- # generate_training_set / generate_eval_set # --------------------------------------------------------------------------- diff --git a/uv.lock b/uv.lock index 33dff95..9227cae 100644 --- a/uv.lock +++ b/uv.lock @@ -364,22 +364,23 @@ wheels = [ [[package]] name = "data-designer" -version = "0.5.7" +version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "data-designer-config" }, { name = "data-designer-engine" }, + { name = "packaging" }, { name = "prompt-toolkit" }, { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dd/4b/00aeaaf364f1a7efbf5103954196ca351cdecc6d65203e5a7e4e33a69a2b/data_designer-0.5.7.tar.gz", hash = "sha256:374f9d15f7774fb5a79935b9e6ce989b7b5c364a8d1e0ce0e6e792258376b1a3", size = 120078, upload-time = "2026-04-17T22:03:14.088Z" } +sdist = { url = "https://files.pythonhosted.org/packages/84/d4/b4f4dec388ca1bbe5d0034815e026752853449c238cbc5201b8416bb4217/data_designer-0.6.0.tar.gz", hash = "sha256:b92862752ac7cb5a63825703cc127349f018ca1234560e15ba722de4935b744f", size = 191970, upload-time = "2026-05-13T20:35:52.291Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/18/93/ddadb9707ba8bde858bcca5cdfcaa016426b9ae9e5ce3bbfbaf3813a281f/data_designer-0.5.7-py3-none-any.whl", hash = "sha256:ec4f162b1c248c8d7fe81a8ca19c246998e0bb557f8dfbe629b8c85ac7e68182", size = 99133, upload-time = "2026-04-17T22:03:12.507Z" }, + { url = "https://files.pythonhosted.org/packages/b2/24/b13fe13c9f230386c118f1ed141c740236cfd0221dfd7ffa2831cfd301cc/data_designer-0.6.0-py3-none-any.whl", hash = "sha256:508eb97953577da0621ac5a4ceb3eaf66838f3231e1e0e50bb636c7177c88a6d", size = 141051, upload-time = "2026-05-13T20:35:50.813Z" }, ] [[package]] name = "data-designer-config" -version = "0.5.7" +version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -395,18 +396,19 @@ dependencies = [ { name = "requests" }, { name = "rich" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ac/b6/e1b29e2fc98322f9865f20a0c3baa18972bbe353b65dd52c7f9786f8b9c5/data_designer_config-0.5.7.tar.gz", hash = "sha256:248b28ad2ec446599614e4656bae443ba9a9f3805e14ab478374fb34eb89636d", size = 128660, upload-time = "2026-04-17T22:03:06.42Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b2/fa/c9bddc103362b388d31c5667d1390db4e665a827ea3fca2dca87be6a0328/data_designer_config-0.6.0.tar.gz", hash = "sha256:b623eb93309271a658c60a57cef7e304a78414b9c11bb314d32e10d6e9b39cf1", size = 143399, upload-time = "2026-05-13T20:35:44.495Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/65/fc/a85d1d9d7436e4ebccc11b1fc7c0be1287584f4bee3a9ed71d58da0d0b0a/data_designer_config-0.5.7-py3-none-any.whl", hash = "sha256:3d4d22c4d8e4b36189f62ef103122a108add1cfbbacf8afcfdd281e9458bd77d", size = 114479, upload-time = "2026-04-17T22:03:04.993Z" }, + { url = "https://files.pythonhosted.org/packages/15/e0/238c49998c71b959678d0e2080808e4f6cac8b8a3a39fc072441c41c82bb/data_designer_config-0.6.0-py3-none-any.whl", hash = "sha256:cd68eee04fd16c47ca2b56543783e8c0b90ca15f697e08af5bc5036ee601ee43", size = 123121, upload-time = "2026-05-13T20:35:43.138Z" }, ] [[package]] name = "data-designer-engine" -version = "0.5.7" +version = "0.6.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyascii" }, { name = "chardet" }, + { name = "cryptography" }, { name = "data-designer-config" }, { name = "duckdb" }, { name = "faker" }, @@ -422,15 +424,16 @@ dependencies = [ { name = "mcp" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "python-multipart" }, { name = "ruff" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "sqlfluff" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/5c/35/a8abd88c44aa603bacff33d6983959b95bdc4c0116fc03460fb4ef04f803/data_designer_engine-0.5.7.tar.gz", hash = "sha256:f1dfeaad52a12fe12bf9796ae45dddb9d1eed82bdb02979d6cdab8c723631651", size = 794680, upload-time = "2026-04-17T22:03:10.25Z" } +sdist = { url = "https://files.pythonhosted.org/packages/28/50/bb8c8952c3d21456f75dd6d2b56948ddb4ac8e85368fb9b04f1ec46e6a73/data_designer_engine-0.6.0.tar.gz", hash = "sha256:25d16a5b0b0662f469d57c7418882188b115d0faf1bba3d6367c82f9d45e7d69", size = 841765, upload-time = "2026-05-13T20:35:48.635Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/35/d4/3844529ae989be9e63b0b8f47c28492793993427dc7d54d6d2a923ad2acc/data_designer_engine-0.5.7-py3-none-any.whl", hash = "sha256:75cd7d5ad0b230ddf75950ba7f97c9ad75c54887ad1247cdf623dc008e31a418", size = 631945, upload-time = "2026-04-17T22:03:08.584Z" }, + { url = "https://files.pythonhosted.org/packages/0c/e0/ae4fc414e522506d344c2b54bc027e0ae6da405006779c620645ba5e30f5/data_designer_engine-0.6.0-py3-none-any.whl", hash = "sha256:d1ba0b541540f047d3972229aa431291d3be3272fa6ca5be8c6991da84932bbc", size = 653564, upload-time = "2026-05-13T20:35:46.676Z" }, ] [[package]] @@ -478,7 +481,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "data-designer", specifier = ">=0.5.7" }, + { name = "data-designer", specifier = ">=0.6.0" }, { name = "nltk", specifier = ">=3.9.2" }, { name = "pyarrow", specifier = ">=14.0" }, { name = "pyyaml", specifier = ">=6.0" }, @@ -2038,11 +2041,11 @@ wheels = [ [[package]] name = "python-multipart" -version = "0.0.26" +version = "0.0.29" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" } +sdist = { url = "https://files.pythonhosted.org/packages/4e/fe/70bd71a6738b09a0bdf6480ca6436b167469ca4578b2a0efbe390b4b0e70/python_multipart-0.0.29.tar.gz", hash = "sha256:643e93849196645e2dbdd81a0f8829a23123ad7f797a84a364c6fb3563f18904", size = 45678, upload-time = "2026-05-17T17:29:47.654Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" }, + { url = "https://files.pythonhosted.org/packages/8f/cb/769cfc37177252872a45a71f3fbdde9d51b471a3f3c14bfe95dde3407386/python_multipart-0.0.29-py3-none-any.whl", hash = "sha256:2ddcc971cef266225f54f552d8fa10bcfbb1f14446caec199060daac59ff2d69", size = 29640, upload-time = "2026-05-17T17:29:45.69Z" }, ] [[package]] From f52a1cbccd4127a39afc8d6c035438c376f31968 Mon Sep 17 00:00:00 2001 From: Steve Han Date: Tue, 2 Jun 2026 15:38:29 -0400 Subject: [PATCH 2/4] Update retrieval SDG to DataDesigner 0.6.1 --- catalog/plugins.json | 4 +- docs/catalog-schema.md | 4 +- docs/catalogs.md | 4 +- .../pyproject.toml | 2 +- .../src/data_designer_retrieval_sdg/dedup.py | 8 +- .../tests/test_dedup.py | 22 ++- uv.lock | 154 ++++++++---------- 7 files changed, 101 insertions(+), 97 deletions(-) diff --git a/catalog/plugins.json b/catalog/plugins.json index 11b4708..27039a8 100644 --- a/catalog/plugins.json +++ b/catalog/plugins.json @@ -45,8 +45,8 @@ "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.6.0", - "specifier": ">=0.6.0", + "requirement": "data-designer>=0.6.1", + "specifier": ">=0.6.1", "marker": null } }, diff --git a/docs/catalog-schema.md b/docs/catalog-schema.md index 2c4e6d4..164e2aa 100644 --- a/docs/catalog-schema.md +++ b/docs/catalog-schema.md @@ -32,8 +32,8 @@ The top-level document must contain `schema_version` and `packages`: "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.6.0", - "specifier": ">=0.6.0", + "requirement": "data-designer>=0.6.1", + "specifier": ">=0.6.1", "marker": null } }, diff --git a/docs/catalogs.md b/docs/catalogs.md index 6ee7872..12e9933 100644 --- a/docs/catalogs.md +++ b/docs/catalogs.md @@ -66,8 +66,8 @@ after installation. "specifier": ">=3.10" }, "data_designer": { - "requirement": "data-designer>=0.6.0", - "specifier": ">=0.6.0", + "requirement": "data-designer>=0.6.1", + "specifier": ">=0.6.1", "marker": null } }, diff --git a/plugins/data-designer-retrieval-sdg/pyproject.toml b/plugins/data-designer-retrieval-sdg/pyproject.toml index 5644b85..73d8ae4 100644 --- a/plugins/data-designer-retrieval-sdg/pyproject.toml +++ b/plugins/data-designer-retrieval-sdg/pyproject.toml @@ -7,7 +7,7 @@ version = "0.1.0" description = "Retriever SDG toolkit: registers the embedding-dedup column generator and document-chunker seed reader, plus a multi-step QA generation pipeline, CLI, and Automodel-compatible data conversion" requires-python = ">=3.10" dependencies = [ - "data-designer>=0.6.0", + "data-designer>=0.6.1", "nltk>=3.9.2", "pyyaml>=6.0", "pyarrow>=14.0", diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/dedup.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/dedup.py index eca5dc1..575cfde 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/dedup.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/dedup.py @@ -41,10 +41,10 @@ class EmbeddingDedupColumnGenerator(ColumnGeneratorWithModelRegistry[EmbeddingDe similarity exceeds ``similarity_threshold``. 5. Returns the surviving items under ``self.config.name``. - Extends :class:`ColumnGeneratorWithModelRegistry` so the column reports - ``is_llm_bound = True`` to the async scheduler. Without this, embedding - HTTP calls would bypass ``_llm_wait_semaphore`` and could fan out up to - a full row group's worth of concurrent requests at the embedding endpoint. + Extends :class:`ColumnGeneratorWithModelRegistry` so DataDesigner's scheduler + can derive model resource metadata from the configured embedding alias. + Without that model-aware metadata, embedding HTTP calls could bypass endpoint + concurrency limits and fan out up to a full row group's worth of requests. """ @staticmethod diff --git a/plugins/data-designer-retrieval-sdg/tests/test_dedup.py b/plugins/data-designer-retrieval-sdg/tests/test_dedup.py index ef9c329..50298aa 100644 --- a/plugins/data-designer-retrieval-sdg/tests/test_dedup.py +++ b/plugins/data-designer-retrieval-sdg/tests/test_dedup.py @@ -161,11 +161,23 @@ def test_config_round_trip() -> None: assert cfg.similarity_threshold == 0.9 -def test_is_llm_bound_true() -> None: - """The column issues embedding HTTP calls and must route through the - async scheduler's LLM-wait semaphore.""" +def test_scheduling_metadata_uses_embedding_model_alias() -> None: + """Embedding calls should route through DataDesigner's model scheduler.""" gen = _make_generator() - assert gen.is_llm_bound is True + gen.resource_provider.model_registry.get_model_config.return_value = ModelConfig( + alias="embed", + model="mock-embedding-model", + provider="mock-provider", + inference_parameters=EmbeddingInferenceParams(max_parallel_requests=3), + ) + gen.resource_provider.model_registry.get_model_provider.return_value.name = "mock-provider" + + metadata = gen.get_scheduling_metadata() + + assert metadata.kind == "model" + assert metadata.identity == ("model", "mock-provider", "mock-embedding-model", "embedding") + assert metadata.weight == 3 + assert metadata.diagnostics["aliases"] == ("embed",) def test_validate_accepts_embedding_model() -> None: @@ -175,6 +187,7 @@ def test_validate_accepts_embedding_model() -> None: gen.resource_provider.model_registry.get_model_config.return_value = ModelConfig( alias="embed", model="some/embedding-model", + provider="mock-provider", inference_parameters=EmbeddingInferenceParams(), ) gen._validate() @@ -187,6 +200,7 @@ def test_validate_rejects_chat_model() -> None: gen.resource_provider.model_registry.get_model_config.return_value = ModelConfig( alias="embed", model="some/chat-model", + provider="mock-provider", inference_parameters=ChatCompletionInferenceParams(), ) with pytest.raises(BuilderConfigurationError, match="embed"): diff --git a/uv.lock b/uv.lock index 9227cae..49d1a4b 100644 --- a/uv.lock +++ b/uv.lock @@ -364,7 +364,7 @@ wheels = [ [[package]] name = "data-designer" -version = "0.6.0" +version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "data-designer-config" }, @@ -373,14 +373,14 @@ dependencies = [ { name = "prompt-toolkit" }, { name = "typer" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/84/d4/b4f4dec388ca1bbe5d0034815e026752853449c238cbc5201b8416bb4217/data_designer-0.6.0.tar.gz", hash = "sha256:b92862752ac7cb5a63825703cc127349f018ca1234560e15ba722de4935b744f", size = 191970, upload-time = "2026-05-13T20:35:52.291Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/81/85bf662d1f7eff3ae182ee0569dee9865472ccc310bd887a4b160bbed147/data_designer-0.6.1.tar.gz", hash = "sha256:bee4baa4779fa1e1592270a64a0f63760de4025693b44344d97679aa1484b163", size = 206602, upload-time = "2026-06-01T22:35:49.598Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b2/24/b13fe13c9f230386c118f1ed141c740236cfd0221dfd7ffa2831cfd301cc/data_designer-0.6.0-py3-none-any.whl", hash = "sha256:508eb97953577da0621ac5a4ceb3eaf66838f3231e1e0e50bb636c7177c88a6d", size = 141051, upload-time = "2026-05-13T20:35:50.813Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9e/6da970741c65178e54d4170e3b47d2df87cf4d1ec17da03c809868f389e0/data_designer-0.6.1-py3-none-any.whl", hash = "sha256:257e58e1fb860c59c9d0cc83969c52f313f55f113f112301672382d04be78a05", size = 148383, upload-time = "2026-06-01T22:35:48.467Z" }, ] [[package]] name = "data-designer-config" -version = "0.6.0" +version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2" }, @@ -395,15 +395,16 @@ dependencies = [ { name = "pyyaml" }, { name = "requests" }, { name = "rich" }, + { name = "urllib3" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b2/fa/c9bddc103362b388d31c5667d1390db4e665a827ea3fca2dca87be6a0328/data_designer_config-0.6.0.tar.gz", hash = "sha256:b623eb93309271a658c60a57cef7e304a78414b9c11bb314d32e10d6e9b39cf1", size = 143399, upload-time = "2026-05-13T20:35:44.495Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/13/76f616fbfffe1b6fe41d6d34cee72ba02b83f4e75d07a98159b6b7e62eec/data_designer_config-0.6.1.tar.gz", hash = "sha256:16f53fc34e11915aa821e3324de8e39bc6b03e7a2cd5402bbeca731bd6eea072", size = 150882, upload-time = "2026-06-01T22:35:43.292Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/e0/238c49998c71b959678d0e2080808e4f6cac8b8a3a39fc072441c41c82bb/data_designer_config-0.6.0-py3-none-any.whl", hash = "sha256:cd68eee04fd16c47ca2b56543783e8c0b90ca15f697e08af5bc5036ee601ee43", size = 123121, upload-time = "2026-05-13T20:35:43.138Z" }, + { url = "https://files.pythonhosted.org/packages/60/2c/97fd4210d0021ead0dac3b43fbfa80db13c52a7b8d26704d4d68ef39241a/data_designer_config-0.6.1-py3-none-any.whl", hash = "sha256:69a5de862246933e16e68a8b58fde90dad5156a14a20184fd276b6bff53cf2ae", size = 127974, upload-time = "2026-06-01T22:35:42.087Z" }, ] [[package]] name = "data-designer-engine" -version = "0.6.0" +version = "0.6.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "anyascii" }, @@ -424,6 +425,8 @@ dependencies = [ { name = "mcp" }, { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "python-multipart" }, { name = "ruff" }, { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, @@ -431,9 +434,9 @@ dependencies = [ { name = "sqlfluff" }, { name = "tiktoken" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/28/50/bb8c8952c3d21456f75dd6d2b56948ddb4ac8e85368fb9b04f1ec46e6a73/data_designer_engine-0.6.0.tar.gz", hash = "sha256:25d16a5b0b0662f469d57c7418882188b115d0faf1bba3d6367c82f9d45e7d69", size = 841765, upload-time = "2026-05-13T20:35:48.635Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/76/eb44cb07e748caa76330c496035be3f6d8187ff66139bc5a133f04b35237/data_designer_engine-0.6.1.tar.gz", hash = "sha256:fc75c0cb28aa8da0a98fe9a2ee101cdf05fb41d0cac01a9a4d7d2c40e1e72077", size = 900796, upload-time = "2026-06-01T22:35:46.759Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0c/e0/ae4fc414e522506d344c2b54bc027e0ae6da405006779c620645ba5e30f5/data_designer_engine-0.6.0-py3-none-any.whl", hash = "sha256:d1ba0b541540f047d3972229aa431291d3be3272fa6ca5be8c6991da84932bbc", size = 653564, upload-time = "2026-05-13T20:35:46.676Z" }, + { url = "https://files.pythonhosted.org/packages/2b/97/8465cb2171559ccbbf60b5d0fa1040a9e354a5354f622cabc5fcf32d1f59/data_designer_engine-0.6.1-py3-none-any.whl", hash = "sha256:f0d79b7e41e034ed31e946c617637ce1242a4283eb1031a8d5739d51fe85cfd5", size = 697247, upload-time = "2026-06-01T22:35:45.282Z" }, ] [[package]] @@ -481,7 +484,7 @@ dependencies = [ [package.metadata] requires-dist = [ - { name = "data-designer", specifier = ">=0.6.0" }, + { name = "data-designer", specifier = ">=0.6.1" }, { name = "nltk", specifier = ">=3.9.2" }, { name = "pyarrow", specifier = ">=14.0" }, { name = "pyyaml", specifier = ">=6.0" }, @@ -808,15 +811,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/3d/2d244233ac4f76e38533cfcb2991c9eb4c7bf688ae0a036d30725b8faafe/importlib_metadata-9.0.0-py3-none-any.whl", hash = "sha256:2d21d1cc5a017bd0559e36150c21c830ab1dc304dedd1b7ea85d20f45ef3edd7", size = 27789, upload-time = "2026-03-20T06:42:55.665Z" }, ] -[[package]] -name = "iniconfig" -version = "2.3.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, -] - [[package]] name = "jaraco-classes" version = "3.4.0" @@ -1753,44 +1747,59 @@ wheels = [ [[package]] name = "pyarrow" -version = "19.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/7f/09/a9046344212690f0632b9c709f9bf18506522feb333c894d0de81d62341a/pyarrow-19.0.1.tar.gz", hash = "sha256:3bf266b485df66a400f282ac0b6d1b500b9d2ae73314a153dbe97d6d5cc8a99e", size = 1129437, upload-time = "2025-02-18T18:55:57.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/36/01/b23b514d86b839956238d3f8ef206fd2728eee87ff1b8ce150a5678d9721/pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:fc28912a2dc924dddc2087679cc8b7263accc71b9ff025a1362b004711661a69", size = 30688914, upload-time = "2025-02-18T18:51:37.575Z" }, - { url = "https://files.pythonhosted.org/packages/c6/68/218ff7cf4a0652a933e5f2ed11274f724dd43b9813cb18dd72c0a35226a2/pyarrow-19.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fca15aabbe9b8355800d923cc2e82c8ef514af321e18b437c3d782aa884eaeec", size = 32102866, upload-time = "2025-02-18T18:51:44.358Z" }, - { url = "https://files.pythonhosted.org/packages/98/01/c295050d183014f4a2eb796d7d2bbfa04b6cccde7258bb68aacf6f18779b/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ad76aef7f5f7e4a757fddcdcf010a8290958f09e3470ea458c80d26f4316ae89", size = 41147682, upload-time = "2025-02-18T18:51:49.481Z" }, - { url = "https://files.pythonhosted.org/packages/40/17/a6c3db0b5f3678f33bbb552d2acbc16def67f89a72955b67b0109af23eb0/pyarrow-19.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d03c9d6f2a3dffbd62671ca070f13fc527bb1867b4ec2b98c7eeed381d4f389a", size = 42179192, upload-time = "2025-02-18T18:51:56.265Z" }, - { url = "https://files.pythonhosted.org/packages/cf/75/c7c8e599300d8cebb6cb339014800e1c720c9db2a3fcb66aa64ec84bac72/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:65cf9feebab489b19cdfcfe4aa82f62147218558d8d3f0fc1e9dea0ab8e7905a", size = 40517272, upload-time = "2025-02-18T18:52:02.969Z" }, - { url = "https://files.pythonhosted.org/packages/ef/c9/68ab123ee1528699c4d5055f645ecd1dd68ff93e4699527249d02f55afeb/pyarrow-19.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:41f9706fbe505e0abc10e84bf3a906a1338905cbbcf1177b71486b03e6ea6608", size = 42069036, upload-time = "2025-02-18T18:52:10.173Z" }, - { url = "https://files.pythonhosted.org/packages/54/e3/d5cfd7654084e6c0d9c3ce949e5d9e0ccad569ae1e2d5a68a3ec03b2be89/pyarrow-19.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:c6cb2335a411b713fdf1e82a752162f72d4a7b5dbc588e32aa18383318b05866", size = 25277951, upload-time = "2025-02-18T18:52:15.459Z" }, - { url = "https://files.pythonhosted.org/packages/a0/55/f1a8d838ec07fe3ca53edbe76f782df7b9aafd4417080eebf0b42aab0c52/pyarrow-19.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc55d71898ea30dc95900297d191377caba257612f384207fe9f8293b5850f90", size = 30713987, upload-time = "2025-02-18T18:52:20.463Z" }, - { url = "https://files.pythonhosted.org/packages/13/12/428861540bb54c98a140ae858a11f71d041ef9e501e6b7eb965ca7909505/pyarrow-19.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:7a544ec12de66769612b2d6988c36adc96fb9767ecc8ee0a4d270b10b1c51e00", size = 32135613, upload-time = "2025-02-18T18:52:25.29Z" }, - { url = "https://files.pythonhosted.org/packages/2f/8a/23d7cc5ae2066c6c736bce1db8ea7bc9ac3ef97ac7e1c1667706c764d2d9/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0148bb4fc158bfbc3d6dfe5001d93ebeed253793fff4435167f6ce1dc4bddeae", size = 41149147, upload-time = "2025-02-18T18:52:30.975Z" }, - { url = "https://files.pythonhosted.org/packages/a2/7a/845d151bb81a892dfb368bf11db584cf8b216963ccce40a5cf50a2492a18/pyarrow-19.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f24faab6ed18f216a37870d8c5623f9c044566d75ec586ef884e13a02a9d62c5", size = 42178045, upload-time = "2025-02-18T18:52:36.859Z" }, - { url = "https://files.pythonhosted.org/packages/a7/31/e7282d79a70816132cf6cae7e378adfccce9ae10352d21c2fecf9d9756dd/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:4982f8e2b7afd6dae8608d70ba5bd91699077323f812a0448d8b7abdff6cb5d3", size = 40532998, upload-time = "2025-02-18T18:52:42.578Z" }, - { url = "https://files.pythonhosted.org/packages/b8/82/20f3c290d6e705e2ee9c1fa1d5a0869365ee477e1788073d8b548da8b64c/pyarrow-19.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:49a3aecb62c1be1d822f8bf629226d4a96418228a42f5b40835c1f10d42e4db6", size = 42084055, upload-time = "2025-02-18T18:52:48.749Z" }, - { url = "https://files.pythonhosted.org/packages/ff/77/e62aebd343238863f2c9f080ad2ef6ace25c919c6ab383436b5b81cbeef7/pyarrow-19.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:008a4009efdb4ea3d2e18f05cd31f9d43c388aad29c636112c2966605ba33466", size = 25283133, upload-time = "2025-02-18T18:52:54.549Z" }, - { url = "https://files.pythonhosted.org/packages/78/b4/94e828704b050e723f67d67c3535cf7076c7432cd4cf046e4bb3b96a9c9d/pyarrow-19.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:80b2ad2b193e7d19e81008a96e313fbd53157945c7be9ac65f44f8937a55427b", size = 30670749, upload-time = "2025-02-18T18:53:00.062Z" }, - { url = "https://files.pythonhosted.org/packages/7e/3b/4692965e04bb1df55e2c314c4296f1eb12b4f3052d4cf43d29e076aedf66/pyarrow-19.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:ee8dec072569f43835932a3b10c55973593abc00936c202707a4ad06af7cb294", size = 32128007, upload-time = "2025-02-18T18:53:06.581Z" }, - { url = "https://files.pythonhosted.org/packages/22/f7/2239af706252c6582a5635c35caa17cb4d401cd74a87821ef702e3888957/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d5d1ec7ec5324b98887bdc006f4d2ce534e10e60f7ad995e7875ffa0ff9cb14", size = 41144566, upload-time = "2025-02-18T18:53:11.958Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e3/c9661b2b2849cfefddd9fd65b64e093594b231b472de08ff658f76c732b2/pyarrow-19.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3ad4c0eb4e2a9aeb990af6c09e6fa0b195c8c0e7b272ecc8d4d2b6574809d34", size = 42202991, upload-time = "2025-02-18T18:53:17.678Z" }, - { url = "https://files.pythonhosted.org/packages/fe/4f/a2c0ed309167ef436674782dfee4a124570ba64299c551e38d3fdaf0a17b/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:d383591f3dcbe545f6cc62daaef9c7cdfe0dff0fb9e1c8121101cabe9098cfa6", size = 40507986, upload-time = "2025-02-18T18:53:26.263Z" }, - { url = "https://files.pythonhosted.org/packages/27/2e/29bb28a7102a6f71026a9d70d1d61df926887e36ec797f2e6acfd2dd3867/pyarrow-19.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b4c4156a625f1e35d6c0b2132635a237708944eb41df5fbe7d50f20d20c17832", size = 42087026, upload-time = "2025-02-18T18:53:33.063Z" }, - { url = "https://files.pythonhosted.org/packages/16/33/2a67c0f783251106aeeee516f4806161e7b481f7d744d0d643d2f30230a5/pyarrow-19.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:5bd1618ae5e5476b7654c7b55a6364ae87686d4724538c24185bbb2952679960", size = 25250108, upload-time = "2025-02-18T18:53:38.462Z" }, - { url = "https://files.pythonhosted.org/packages/2b/8d/275c58d4b00781bd36579501a259eacc5c6dfb369be4ddeb672ceb551d2d/pyarrow-19.0.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e45274b20e524ae5c39d7fc1ca2aa923aab494776d2d4b316b49ec7572ca324c", size = 30653552, upload-time = "2025-02-18T18:53:44.357Z" }, - { url = "https://files.pythonhosted.org/packages/a0/9e/e6aca5cc4ef0c7aec5f8db93feb0bde08dbad8c56b9014216205d271101b/pyarrow-19.0.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:d9dedeaf19097a143ed6da37f04f4051aba353c95ef507764d344229b2b740ae", size = 32103413, upload-time = "2025-02-18T18:53:52.971Z" }, - { url = "https://files.pythonhosted.org/packages/6a/fa/a7033f66e5d4f1308c7eb0dfcd2ccd70f881724eb6fd1776657fdf65458f/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ebfb5171bb5f4a52319344ebbbecc731af3f021e49318c74f33d520d31ae0c4", size = 41134869, upload-time = "2025-02-18T18:53:59.471Z" }, - { url = "https://files.pythonhosted.org/packages/2d/92/34d2569be8e7abdc9d145c98dc410db0071ac579b92ebc30da35f500d630/pyarrow-19.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a21d39fbdb948857f67eacb5bbaaf36802de044ec36fbef7a1c8f0dd3a4ab2", size = 42192626, upload-time = "2025-02-18T18:54:06.062Z" }, - { url = "https://files.pythonhosted.org/packages/0a/1f/80c617b1084fc833804dc3309aa9d8daacd46f9ec8d736df733f15aebe2c/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:99bc1bec6d234359743b01e70d4310d0ab240c3d6b0da7e2a93663b0158616f6", size = 40496708, upload-time = "2025-02-18T18:54:12.347Z" }, - { url = "https://files.pythonhosted.org/packages/e6/90/83698fcecf939a611c8d9a78e38e7fed7792dcc4317e29e72cf8135526fb/pyarrow-19.0.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:1b93ef2c93e77c442c979b0d596af45e4665d8b96da598db145b0fec014b9136", size = 42075728, upload-time = "2025-02-18T18:54:19.364Z" }, - { url = "https://files.pythonhosted.org/packages/40/49/2325f5c9e7a1c125c01ba0c509d400b152c972a47958768e4e35e04d13d8/pyarrow-19.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:d9d46e06846a41ba906ab25302cf0fd522f81aa2a85a71021826f34639ad31ef", size = 25242568, upload-time = "2025-02-18T18:54:25.846Z" }, - { url = "https://files.pythonhosted.org/packages/3f/72/135088d995a759d4d916ec4824cb19e066585b4909ebad4ab196177aa825/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c0fe3dbbf054a00d1f162fda94ce236a899ca01123a798c561ba307ca38af5f0", size = 30702371, upload-time = "2025-02-18T18:54:30.665Z" }, - { url = "https://files.pythonhosted.org/packages/2e/01/00beeebd33d6bac701f20816a29d2018eba463616bbc07397fdf99ac4ce3/pyarrow-19.0.1-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:96606c3ba57944d128e8a8399da4812f56c7f61de8c647e3470b417f795d0ef9", size = 32116046, upload-time = "2025-02-18T18:54:35.995Z" }, - { url = "https://files.pythonhosted.org/packages/1f/c9/23b1ea718dfe967cbd986d16cf2a31fe59d015874258baae16d7ea0ccabc/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f04d49a6b64cf24719c080b3c2029a3a5b16417fd5fd7c4041f94233af732f3", size = 41091183, upload-time = "2025-02-18T18:54:42.662Z" }, - { url = "https://files.pythonhosted.org/packages/3a/d4/b4a3aa781a2c715520aa8ab4fe2e7fa49d33a1d4e71c8fc6ab7b5de7a3f8/pyarrow-19.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a9137cf7e1640dce4c190551ee69d478f7121b5c6f323553b319cac936395f6", size = 42171896, upload-time = "2025-02-18T18:54:49.808Z" }, - { url = "https://files.pythonhosted.org/packages/23/1b/716d4cd5a3cbc387c6e6745d2704c4b46654ba2668260d25c402626c5ddb/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:7c1bca1897c28013db5e4c83944a2ab53231f541b9e0c3f4791206d0c0de389a", size = 40464851, upload-time = "2025-02-18T18:54:57.073Z" }, - { url = "https://files.pythonhosted.org/packages/ed/bd/54907846383dcc7ee28772d7e646f6c34276a17da740002a5cefe90f04f7/pyarrow-19.0.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:58d9397b2e273ef76264b45531e9d552d8ec8a6688b7390b5be44c02a37aade8", size = 42085744, upload-time = "2025-02-18T18:55:08.562Z" }, +version = "22.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d9/9b/cb3f7e0a345353def531ca879053e9ef6b9f38ed91aebcf68b09ba54dec0/pyarrow-22.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:77718810bd3066158db1e95a63c160ad7ce08c6b0710bc656055033e39cdad88", size = 34223968, upload-time = "2025-10-24T10:03:31.21Z" }, + { url = "https://files.pythonhosted.org/packages/6c/41/3184b8192a120306270c5307f105b70320fdaa592c99843c5ef78aaefdcf/pyarrow-22.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:44d2d26cda26d18f7af7db71453b7b783788322d756e81730acb98f24eb90ace", size = 35942085, upload-time = "2025-10-24T10:03:38.146Z" }, + { url = "https://files.pythonhosted.org/packages/d9/3d/a1eab2f6f08001f9fb714b8ed5cfb045e2fe3e3e3c0c221f2c9ed1e6d67d/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:b9d71701ce97c95480fecb0039ec5bb889e75f110da72005743451339262f4ce", size = 44964613, upload-time = "2025-10-24T10:03:46.516Z" }, + { url = "https://files.pythonhosted.org/packages/46/46/a1d9c24baf21cfd9ce994ac820a24608decf2710521b29223d4334985127/pyarrow-22.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:710624ab925dc2b05a6229d47f6f0dac1c1155e6ed559be7109f684eba048a48", size = 47627059, upload-time = "2025-10-24T10:03:55.353Z" }, + { url = "https://files.pythonhosted.org/packages/3a/4c/f711acb13075c1391fd54bc17e078587672c575f8de2a6e62509af026dcf/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f963ba8c3b0199f9d6b794c90ec77545e05eadc83973897a4523c9e8d84e9340", size = 47947043, upload-time = "2025-10-24T10:04:05.408Z" }, + { url = "https://files.pythonhosted.org/packages/4e/70/1f3180dd7c2eab35c2aca2b29ace6c519f827dcd4cfeb8e0dca41612cf7a/pyarrow-22.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bd0d42297ace400d8febe55f13fdf46e86754842b860c978dfec16f081e5c653", size = 50206505, upload-time = "2025-10-24T10:04:15.786Z" }, + { url = "https://files.pythonhosted.org/packages/80/07/fea6578112c8c60ffde55883a571e4c4c6bc7049f119d6b09333b5cc6f73/pyarrow-22.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:00626d9dc0f5ef3a75fe63fd68b9c7c8302d2b5bbc7f74ecaedba83447a24f84", size = 28101641, upload-time = "2025-10-24T10:04:22.57Z" }, + { url = "https://files.pythonhosted.org/packages/2e/b7/18f611a8cdc43417f9394a3ccd3eace2f32183c08b9eddc3d17681819f37/pyarrow-22.0.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:3e294c5eadfb93d78b0763e859a0c16d4051fc1c5231ae8956d61cb0b5666f5a", size = 34272022, upload-time = "2025-10-24T10:04:28.973Z" }, + { url = "https://files.pythonhosted.org/packages/26/5c/f259e2526c67eb4b9e511741b19870a02363a47a35edbebc55c3178db22d/pyarrow-22.0.0-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:69763ab2445f632d90b504a815a2a033f74332997052b721002298ed6de40f2e", size = 35995834, upload-time = "2025-10-24T10:04:35.467Z" }, + { url = "https://files.pythonhosted.org/packages/50/8d/281f0f9b9376d4b7f146913b26fac0aa2829cd1ee7e997f53a27411bbb92/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:b41f37cabfe2463232684de44bad753d6be08a7a072f6a83447eeaf0e4d2a215", size = 45030348, upload-time = "2025-10-24T10:04:43.366Z" }, + { url = "https://files.pythonhosted.org/packages/f5/e5/53c0a1c428f0976bf22f513d79c73000926cb00b9c138d8e02daf2102e18/pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:35ad0f0378c9359b3f297299c3309778bb03b8612f987399a0333a560b43862d", size = 47699480, upload-time = "2025-10-24T10:04:51.486Z" }, + { url = "https://files.pythonhosted.org/packages/95/e1/9dbe4c465c3365959d183e6345d0a8d1dc5b02ca3f8db4760b3bc834cf25/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:8382ad21458075c2e66a82a29d650f963ce51c7708c7c0ff313a8c206c4fd5e8", size = 48011148, upload-time = "2025-10-24T10:04:59.585Z" }, + { url = "https://files.pythonhosted.org/packages/c5/b4/7caf5d21930061444c3cf4fa7535c82faf5263e22ce43af7c2759ceb5b8b/pyarrow-22.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1a812a5b727bc09c3d7ea072c4eebf657c2f7066155506ba31ebf4792f88f016", size = 50276964, upload-time = "2025-10-24T10:05:08.175Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f3/cec89bd99fa3abf826f14d4e53d3d11340ce6f6af4d14bdcd54cd83b6576/pyarrow-22.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:ec5d40dd494882704fb876c16fa7261a69791e784ae34e6b5992e977bd2e238c", size = 28106517, upload-time = "2025-10-24T10:05:14.314Z" }, + { url = "https://files.pythonhosted.org/packages/af/63/ba23862d69652f85b615ca14ad14f3bcfc5bf1b99ef3f0cd04ff93fdad5a/pyarrow-22.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:bea79263d55c24a32b0d79c00a1c58bb2ee5f0757ed95656b01c0fb310c5af3d", size = 34211578, upload-time = "2025-10-24T10:05:21.583Z" }, + { url = "https://files.pythonhosted.org/packages/b1/d0/f9ad86fe809efd2bcc8be32032fa72e8b0d112b01ae56a053006376c5930/pyarrow-22.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:12fe549c9b10ac98c91cf791d2945e878875d95508e1a5d14091a7aaa66d9cf8", size = 35989906, upload-time = "2025-10-24T10:05:29.485Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a8/f910afcb14630e64d673f15904ec27dd31f1e009b77033c365c84e8c1e1d/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:334f900ff08ce0423407af97e6c26ad5d4e3b0763645559ece6fbf3747d6a8f5", size = 45021677, upload-time = "2025-10-24T10:05:38.274Z" }, + { url = "https://files.pythonhosted.org/packages/13/95/aec81f781c75cd10554dc17a25849c720d54feafb6f7847690478dcf5ef8/pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:c6c791b09c57ed76a18b03f2631753a4960eefbbca80f846da8baefc6491fcfe", size = 47726315, upload-time = "2025-10-24T10:05:47.314Z" }, + { url = "https://files.pythonhosted.org/packages/bb/d4/74ac9f7a54cfde12ee42734ea25d5a3c9a45db78f9def949307a92720d37/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c3200cb41cdbc65156e5f8c908d739b0dfed57e890329413da2748d1a2cd1a4e", size = 47990906, upload-time = "2025-10-24T10:05:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/2e/71/fedf2499bf7a95062eafc989ace56572f3343432570e1c54e6599d5b88da/pyarrow-22.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ac93252226cf288753d8b46280f4edf3433bf9508b6977f8dd8526b521a1bbb9", size = 50306783, upload-time = "2025-10-24T10:06:08.08Z" }, + { url = "https://files.pythonhosted.org/packages/68/ed/b202abd5a5b78f519722f3d29063dda03c114711093c1995a33b8e2e0f4b/pyarrow-22.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:44729980b6c50a5f2bfcc2668d36c569ce17f8b17bccaf470c4313dcbbf13c9d", size = 27972883, upload-time = "2025-10-24T10:06:14.204Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d6/d0fac16a2963002fc22c8fa75180a838737203d558f0ed3b564c4a54eef5/pyarrow-22.0.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:e6e95176209257803a8b3d0394f21604e796dadb643d2f7ca21b66c9c0b30c9a", size = 34204629, upload-time = "2025-10-24T10:06:20.274Z" }, + { url = "https://files.pythonhosted.org/packages/c6/9c/1d6357347fbae062ad3f17082f9ebc29cc733321e892c0d2085f42a2212b/pyarrow-22.0.0-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:001ea83a58024818826a9e3f89bf9310a114f7e26dfe404a4c32686f97bd7901", size = 35985783, upload-time = "2025-10-24T10:06:27.301Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/782344c2ce58afbea010150df07e3a2f5fdad299cd631697ae7bd3bac6e3/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:ce20fe000754f477c8a9125543f1936ea5b8867c5406757c224d745ed033e691", size = 45020999, upload-time = "2025-10-24T10:06:35.387Z" }, + { url = "https://files.pythonhosted.org/packages/1b/8b/5362443737a5307a7b67c1017c42cd104213189b4970bf607e05faf9c525/pyarrow-22.0.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e0a15757fccb38c410947df156f9749ae4a3c89b2393741a50521f39a8cf202a", size = 47724601, upload-time = "2025-10-24T10:06:43.551Z" }, + { url = "https://files.pythonhosted.org/packages/69/4d/76e567a4fc2e190ee6072967cb4672b7d9249ac59ae65af2d7e3047afa3b/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cedb9dd9358e4ea1d9bce3665ce0797f6adf97ff142c8e25b46ba9cdd508e9b6", size = 48001050, upload-time = "2025-10-24T10:06:52.284Z" }, + { url = "https://files.pythonhosted.org/packages/01/5e/5653f0535d2a1aef8223cee9d92944cb6bccfee5cf1cd3f462d7cb022790/pyarrow-22.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:252be4a05f9d9185bb8c18e83764ebcfea7185076c07a7a662253af3a8c07941", size = 50307877, upload-time = "2025-10-24T10:07:02.405Z" }, + { url = "https://files.pythonhosted.org/packages/2d/f8/1d0bd75bf9328a3b826e24a16e5517cd7f9fbf8d34a3184a4566ef5a7f29/pyarrow-22.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:a4893d31e5ef780b6edcaf63122df0f8d321088bb0dee4c8c06eccb1ca28d145", size = 27977099, upload-time = "2025-10-24T10:08:07.259Z" }, + { url = "https://files.pythonhosted.org/packages/90/81/db56870c997805bf2b0f6eeeb2d68458bf4654652dccdcf1bf7a42d80903/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:f7fe3dbe871294ba70d789be16b6e7e52b418311e166e0e3cba9522f0f437fb1", size = 34336685, upload-time = "2025-10-24T10:07:11.47Z" }, + { url = "https://files.pythonhosted.org/packages/1c/98/0727947f199aba8a120f47dfc229eeb05df15bcd7a6f1b669e9f882afc58/pyarrow-22.0.0-cp313-cp313t-macosx_12_0_x86_64.whl", hash = "sha256:ba95112d15fd4f1105fb2402c4eab9068f0554435e9b7085924bcfaac2cc306f", size = 36032158, upload-time = "2025-10-24T10:07:18.626Z" }, + { url = "https://files.pythonhosted.org/packages/96/b4/9babdef9c01720a0785945c7cf550e4acd0ebcd7bdd2e6f0aa7981fa85e2/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:c064e28361c05d72eed8e744c9605cbd6d2bb7481a511c74071fd9b24bc65d7d", size = 44892060, upload-time = "2025-10-24T10:07:26.002Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ca/2f8804edd6279f78a37062d813de3f16f29183874447ef6d1aadbb4efa0f/pyarrow-22.0.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:6f9762274496c244d951c819348afbcf212714902742225f649cf02823a6a10f", size = 47504395, upload-time = "2025-10-24T10:07:34.09Z" }, + { url = "https://files.pythonhosted.org/packages/b9/f0/77aa5198fd3943682b2e4faaf179a674f0edea0d55d326d83cb2277d9363/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a9d9ffdc2ab696f6b15b4d1f7cec6658e1d788124418cb30030afbae31c64746", size = 48066216, upload-time = "2025-10-24T10:07:43.528Z" }, + { url = "https://files.pythonhosted.org/packages/79/87/a1937b6e78b2aff18b706d738c9e46ade5bfcf11b294e39c87706a0089ac/pyarrow-22.0.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ec1a15968a9d80da01e1d30349b2b0d7cc91e96588ee324ce1b5228175043e95", size = 50288552, upload-time = "2025-10-24T10:07:53.519Z" }, + { url = "https://files.pythonhosted.org/packages/60/ae/b5a5811e11f25788ccfdaa8f26b6791c9807119dffcf80514505527c384c/pyarrow-22.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:bba208d9c7decf9961998edf5c65e3ea4355d5818dd6cd0f6809bec1afb951cc", size = 28262504, upload-time = "2025-10-24T10:08:00.932Z" }, + { url = "https://files.pythonhosted.org/packages/bd/b0/0fa4d28a8edb42b0a7144edd20befd04173ac79819547216f8a9f36f9e50/pyarrow-22.0.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:9bddc2cade6561f6820d4cd73f99a0243532ad506bc510a75a5a65a522b2d74d", size = 34224062, upload-time = "2025-10-24T10:08:14.101Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/7a719076b3c1be0acef56a07220c586f25cd24de0e3f3102b438d18ae5df/pyarrow-22.0.0-cp314-cp314-macosx_12_0_x86_64.whl", hash = "sha256:e70ff90c64419709d38c8932ea9fe1cc98415c4f87ea8da81719e43f02534bc9", size = 35990057, upload-time = "2025-10-24T10:08:21.842Z" }, + { url = "https://files.pythonhosted.org/packages/89/3c/359ed54c93b47fb6fe30ed16cdf50e3f0e8b9ccfb11b86218c3619ae50a8/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:92843c305330aa94a36e706c16209cd4df274693e777ca47112617db7d0ef3d7", size = 45068002, upload-time = "2025-10-24T10:08:29.034Z" }, + { url = "https://files.pythonhosted.org/packages/55/fc/4945896cc8638536ee787a3bd6ce7cec8ec9acf452d78ec39ab328efa0a1/pyarrow-22.0.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:6dda1ddac033d27421c20d7a7943eec60be44e0db4e079f33cc5af3b8280ccde", size = 47737765, upload-time = "2025-10-24T10:08:38.559Z" }, + { url = "https://files.pythonhosted.org/packages/cd/5e/7cb7edeb2abfaa1f79b5d5eb89432356155c8426f75d3753cbcb9592c0fd/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:84378110dd9a6c06323b41b56e129c504d157d1a983ce8f5443761eb5256bafc", size = 48048139, upload-time = "2025-10-24T10:08:46.784Z" }, + { url = "https://files.pythonhosted.org/packages/88/c6/546baa7c48185f5e9d6e59277c4b19f30f48c94d9dd938c2a80d4d6b067c/pyarrow-22.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:854794239111d2b88b40b6ef92aa478024d1e5074f364033e73e21e3f76b25e0", size = 50314244, upload-time = "2025-10-24T10:08:55.771Z" }, + { url = "https://files.pythonhosted.org/packages/3c/79/755ff2d145aafec8d347bf18f95e4e81c00127f06d080135dfc86aea417c/pyarrow-22.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:b883fe6fd85adad7932b3271c38ac289c65b7337c2c132e9569f9d3940620730", size = 28757501, upload-time = "2025-10-24T10:09:59.891Z" }, + { url = "https://files.pythonhosted.org/packages/0e/d2/237d75ac28ced3147912954e3c1a174df43a95f4f88e467809118a8165e0/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:7a820d8ae11facf32585507c11f04e3f38343c1e784c9b5a8b1da5c930547fe2", size = 34355506, upload-time = "2025-10-24T10:09:02.953Z" }, + { url = "https://files.pythonhosted.org/packages/1e/2c/733dfffe6d3069740f98e57ff81007809067d68626c5faef293434d11bd6/pyarrow-22.0.0-cp314-cp314t-macosx_12_0_x86_64.whl", hash = "sha256:c6ec3675d98915bf1ec8b3c7986422682f7232ea76cad276f4c8abd5b7319b70", size = 36047312, upload-time = "2025-10-24T10:09:10.334Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2b/29d6e3782dc1f299727462c1543af357a0f2c1d3c160ce199950d9ca51eb/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3e739edd001b04f654b166204fc7a9de896cf6007eaff33409ee9e50ceaff754", size = 45081609, upload-time = "2025-10-24T10:09:18.61Z" }, + { url = "https://files.pythonhosted.org/packages/8d/42/aa9355ecc05997915af1b7b947a7f66c02dcaa927f3203b87871c114ba10/pyarrow-22.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7388ac685cab5b279a41dfe0a6ccd99e4dbf322edfb63e02fc0443bf24134e91", size = 47703663, upload-time = "2025-10-24T10:09:27.369Z" }, + { url = "https://files.pythonhosted.org/packages/ee/62/45abedde480168e83a1de005b7b7043fd553321c1e8c5a9a114425f64842/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:f633074f36dbc33d5c05b5dc75371e5660f1dbf9c8b1d95669def05e5425989c", size = 48066543, upload-time = "2025-10-24T10:09:34.908Z" }, + { url = "https://files.pythonhosted.org/packages/84/e9/7878940a5b072e4f3bf998770acafeae13b267f9893af5f6d4ab3904b67e/pyarrow-22.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4c19236ae2402a8663a2c8f21f1870a03cc57f0bef7e4b6eb3238cc82944de80", size = 50288838, upload-time = "2025-10-24T10:09:44.394Z" }, + { url = "https://files.pythonhosted.org/packages/7b/03/f335d6c52b4a4761bcc83499789a1e2e16d9d201a58c327a9b5cc9a41bd9/pyarrow-22.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0c34fe18094686194f204a3b1787a27456897d8a2d62caf84b61e8dfbc0252ae", size = 29185594, upload-time = "2025-10-24T10:09:53.111Z" }, ] [[package]] @@ -1991,24 +2000,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/27/a2fc51a4a122dfd1015e921ae9d22fee3d20b0b8080d9a704578bf9deece/pymdown_extensions-10.21.2-py3-none-any.whl", hash = "sha256:5c0fd2a2bea14eb39af8ff284f1066d898ab2187d81b889b75d46d4348c01638", size = 268901, upload-time = "2026-03-29T15:01:53.244Z" }, ] -[[package]] -name = "pytest" -version = "9.0.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "colorama", marker = "sys_platform == 'win32'" }, - { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "iniconfig" }, - { name = "packaging" }, - { name = "pluggy" }, - { name = "pygments" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, -] - [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2664,7 +2655,7 @@ wheels = [ [[package]] name = "sqlfluff" -version = "3.5.0" +version = "4.2.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "chardet" }, @@ -2674,16 +2665,15 @@ dependencies = [ { name = "jinja2" }, { name = "pathspec" }, { name = "platformdirs" }, - { name = "pytest" }, { name = "pyyaml" }, { name = "regex" }, { name = "tblib" }, { name = "tomli", marker = "python_full_version < '3.11'" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/4c/a8/d3dc6c510cc3bba9abbf7a3052a96d5ce6771b71dda141846003fa37277a/sqlfluff-3.5.0.tar.gz", hash = "sha256:2d0a546078ffb021de7021b9a6c2a50e5eef590daa820d5f1b082d24a1d5e1d4", size = 921199, upload-time = "2025-10-18T19:33:07.778Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b4/a1/3430aebc4fae35d7e466e793b5da2f36c2245af092311520dc1d3d3146d6/sqlfluff-4.2.1.tar.gz", hash = "sha256:32f43fbf6721e57f1a5a87d71df0d94b84ecba6ed65727266c7fa60991110fb9", size = 1013384, upload-time = "2026-05-14T21:15:37.161Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/47/d5/83c3eacdd6c3249fb5f8a0b5612ab10b661862e0df869951f45fd837448d/sqlfluff-3.5.0-py3-none-any.whl", hash = "sha256:6e5fb7a0c491676ded68912245fc0627e88f8b0e6290bd4b54a65ce735f69716", size = 921597, upload-time = "2025-10-18T19:33:05.839Z" }, + { url = "https://files.pythonhosted.org/packages/5f/55/8830f3204939cc965c72680bfae99b0f3fd6c16bffdaf79372ad0a3d1ca6/sqlfluff-4.2.1-py3-none-any.whl", hash = "sha256:ea84f196c41f45df40a851b0881cb3fbb660570e07acbbfd304ff4e9b893424d", size = 1002493, upload-time = "2026-05-14T21:15:35.582Z" }, ] [[package]] @@ -2915,11 +2905,11 @@ wheels = [ [[package]] name = "urllib3" -version = "2.6.3" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]] From 007bc9721106c8223e7f7dcf1380779847d64e20 Mon Sep 17 00:00:00 2001 From: Steve Han Date: Tue, 2 Jun 2026 15:46:09 -0400 Subject: [PATCH 3/4] Declare pytest workspace test dependency --- pyproject.toml | 1 + uv.lock | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6b02cd3..9dc88b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ testpaths = ["plugins"] [dependency-groups] dev = [ "dumb-pypi>=1.15.0", + "pytest>=8.0.0", "twine>=6.0.0", "zensical>=0.0.40", ] diff --git a/uv.lock b/uv.lock index 49d1a4b..c6f9d16 100644 --- a/uv.lock +++ b/uv.lock @@ -458,6 +458,7 @@ source = { virtual = "." } [package.dev-dependencies] dev = [ { name = "dumb-pypi" }, + { name = "pytest" }, { name = "twine" }, { name = "zensical" }, ] @@ -467,6 +468,7 @@ dev = [ [package.metadata.requires-dev] dev = [ { name = "dumb-pypi", specifier = ">=1.15.0" }, + { name = "pytest", specifier = ">=8.0.0" }, { name = "twine", specifier = ">=6.0.0" }, { name = "zensical", specifier = ">=0.0.40" }, ] @@ -811,6 +813,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/38/3d/2d244233ac4f76e38533cfcb2991c9eb4c7bf688ae0a036d30725b8faafe/importlib_metadata-9.0.0-py3-none-any.whl", hash = "sha256:2d21d1cc5a017bd0559e36150c21c830ab1dc304dedd1b7ea85d20f45ef3edd7", size = 27789, upload-time = "2026-03-20T06:42:55.665Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "jaraco-classes" version = "3.4.0" @@ -2000,6 +2011,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f7/27/a2fc51a4a122dfd1015e921ae9d22fee3d20b0b8080d9a704578bf9deece/pymdown_extensions-10.21.2-py3-none-any.whl", hash = "sha256:5c0fd2a2bea14eb39af8ff284f1066d898ab2187d81b889b75d46d4348c01638", size = 268901, upload-time = "2026-03-29T15:01:53.244Z" }, ] +[[package]] +name = "pytest" +version = "9.0.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, + { name = "pygments" }, + { name = "tomli", marker = "python_full_version < '3.11'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165, upload-time = "2026-04-07T17:16:18.027Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d4/24/a372aaf5c9b7208e7112038812994107bc65a84cd00e0354a88c2c77a617/pytest-9.0.3-py3-none-any.whl", hash = "sha256:2c5efc453d45394fdd706ade797c0a81091eccd1d6e4bccfcd476e2b8e0ab5d9", size = 375249, upload-time = "2026-04-07T17:16:16.13Z" }, +] + [[package]] name = "python-dateutil" version = "2.9.0.post0" From da9938da35a5c65bebb6971de556369b95cd5681 Mon Sep 17 00:00:00 2001 From: Steve Han Date: Wed, 3 Jun 2026 12:40:36 -0400 Subject: [PATCH 4/4] Address retrieval SDG resume review feedback --- plugins/data-designer-retrieval-sdg/README.md | 9 +- .../src/data_designer_retrieval_sdg/cli.py | 57 ++++++++- .../data_designer_retrieval_sdg/convert.py | 35 ++++- .../tests/test_cli.py | 120 +++++++++++------- .../tests/test_convert.py | 22 ++++ pyproject.toml | 2 + uv.lock | 6 +- 7 files changed, 192 insertions(+), 59 deletions(-) diff --git a/plugins/data-designer-retrieval-sdg/README.md b/plugins/data-designer-retrieval-sdg/README.md index 45c7625..9b7aa66 100644 --- a/plugins/data-designer-retrieval-sdg/README.md +++ b/plugins/data-designer-retrieval-sdg/README.md @@ -42,6 +42,11 @@ data-designer-retrieval-sdg generate \ Use `--resume if_possible` to resume only when the saved config matches and otherwise start a fresh run. +`--buffer-size` controls DataDesigner's checkpoint/write granularity and must +match across resumed runs. In DataDesigner 0.6.1, `create()` still profiles the +completed dataset before returning, so `--buffer-size` is not a hard cap on +final peak memory for very large runs. + ## Installation The package is distributed from the NVIDIA-NeMo plugin index (hosted on @@ -119,7 +124,9 @@ Legacy `generated_batch*.json` directories remain supported by `convert`, but `generate` no longer writes per-batch JSON files. The old manual restart flags `--batch-size`, `--start-batch-index`, and `--end-batch-index` were removed because DataDesigner now owns checkpointing through `--buffer-size` and -`--resume`. +`--resume`. For very large corpora, keep input partitions sized for +DataDesigner's final profiling step until DataDesigner exposes a no-materialize +create/export path. ### Use as a library diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py index 87b7dd3..191a0d4 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/cli.py @@ -65,6 +65,55 @@ def _count_seed_records(seed_source: DocumentChunkerSeedSource) -> int: return reader.get_seed_dataset_size() +def _path_is_relative_to(path: Path, root: Path) -> bool: + """Return whether *path* is contained by *root* after resolution.""" + try: + path.relative_to(root) + except ValueError: + return False + return True + + +def _validate_dataset_name(dataset_name: str, artifact_path: Path) -> str: + """Validate a DataDesigner dataset name before it is used as an artifact path segment. + + Args: + dataset_name: Requested dataset name. + artifact_path: DataDesigner artifact root. + + Returns: + The validated dataset name. + + Raises: + ValueError: If the dataset name is empty, unsafe, or escapes the artifact root. + """ + if not dataset_name: + raise ValueError("--dataset-name must not be empty") + if dataset_name in {".", ".."}: + raise ValueError("--dataset-name must be a real path segment, not '.' or '..'") + if any(ord(char) < 32 or ord(char) == 127 for char in dataset_name): + raise ValueError("--dataset-name must not contain control characters") + if any(separator in dataset_name for separator in ("/", "\\")): + raise ValueError("--dataset-name must be a single path segment without path separators") + + dataset_path = Path(dataset_name) + if dataset_path.is_absolute() or len(dataset_path.parts) != 1: + raise ValueError("--dataset-name must be a single relative path segment") + + artifact_root = artifact_path.resolve() + resolved_dataset_path = (artifact_root / dataset_name).resolve() + if resolved_dataset_path == artifact_root or not _path_is_relative_to(resolved_dataset_path, artifact_root): + raise ValueError("--dataset-name must resolve under --artifact-path") + + return dataset_name + + +def _resolve_dataset_name(input_dir: Path, artifact_path: Path, dataset_name: str | None) -> str: + """Return the explicit or default dataset name after safety validation.""" + resolved_name = dataset_name if dataset_name is not None else input_dir.name or "retrieval_sdg" + return _validate_dataset_name(resolved_name, artifact_path) + + def _add_generate_parser(subparsers: argparse._SubParsersAction) -> None: """Register the ``generate`` subcommand.""" p = subparsers.add_parser( @@ -162,6 +211,12 @@ def _run_generate(args: argparse.Namespace) -> None: row_type = "bundles" if args.multi_doc else "text files" print(f"Discovered {total_records} {row_type} under {args.input_dir}") + try: + args.dataset_name = _resolve_dataset_name(args.input_dir, args.artifact_path, args.dataset_name) + except ValueError as exc: + print(f"Error: {exc}", file=sys.stderr) + sys.exit(2) + model_providers, custom_providers = build_model_providers( custom_provider_endpoint=args.custom_provider_endpoint, custom_provider_name=args.custom_provider_name, @@ -260,7 +315,7 @@ def _run_create( **pipeline_kwargs, ) - dataset_name = args.dataset_name or args.input_dir.name or "retrieval_sdg" + dataset_name = _resolve_dataset_name(args.input_dir, args.artifact_path, args.dataset_name) print(f"Dataset name: {dataset_name}") print("\nGenerating dataset...") result = data_designer.create( diff --git a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py index 48016b5..bc7627e 100644 --- a/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py +++ b/plugins/data-designer-retrieval-sdg/src/data_designer_retrieval_sdg/convert.py @@ -59,6 +59,27 @@ def filter_mismatched_records(records: list[dict]) -> tuple[list[dict], int]: return filtered, dropped_count +def _to_plain_python(value: object) -> object: + """Recursively convert array-like values from parquet into plain Python containers.""" + if isinstance(value, dict): + return {key: _to_plain_python(nested_value) for key, nested_value in value.items()} + if isinstance(value, list): + return [_to_plain_python(nested_value) for nested_value in value] + if isinstance(value, tuple): + return [_to_plain_python(nested_value) for nested_value in value] + if not isinstance(value, str) and hasattr(value, "tolist"): + return _to_plain_python(value.tolist()) + return value + + +def _normalize_generated_record(record: object) -> dict: + """Convert one loaded record to a plain dict.""" + normalized = _to_plain_python(record) + if not isinstance(normalized, dict): + raise ValueError(f"Generated record must be a JSON object, got {type(normalized).__name__}") + return normalized + + def normalize_file_name(file_name: object) -> list[str]: """Normalise *file_name* to a list of strings. @@ -111,7 +132,8 @@ def _load_jsonl_records(input_file: Path) -> list[dict]: def _load_parquet_records(input_file: Path) -> list[dict]: """Load records from a parquet file exported by DataDesigner.""" - return pd.read_parquet(input_file).to_dict(orient="records") + records = pd.read_parquet(input_file).to_dict(orient="records") + return [_normalize_generated_record(record) for record in records] def _load_generated_records_file(input_file: Path) -> list[dict]: @@ -171,6 +193,8 @@ def load_generated_json_files(input_path: str) -> pd.DataFrame: for generated_file in generated_files: all_records.extend(_load_generated_records_file(generated_file)) + all_records = [_normalize_generated_record(record) for record in all_records] + print("Normalizing file_name fields...") for record in all_records: if "file_name" in record: @@ -253,17 +277,14 @@ def build_corpus_and_mappings( print("Building corpus and chunk mappings...") for _, row in generated_df.iterrows(): - file_name_list = row.get("file_name", []) - chunks = row.get("chunks", []) + file_name_list = normalize_file_name(_to_plain_python(row.get("file_name", []))) + chunks = _to_plain_python(row.get("chunks", [])) - if not chunks or not file_name_list: + if not isinstance(chunks, list) or len(chunks) == 0 or len(file_name_list) == 0: continue file_identifier = get_file_identifier(file_name_list) - if hasattr(chunks, "tolist"): - chunks = chunks.tolist() - for chunk in chunks: if isinstance(chunk, dict): chunk_id = chunk.get("chunk_id") diff --git a/plugins/data-designer-retrieval-sdg/tests/test_cli.py b/plugins/data-designer-retrieval-sdg/tests/test_cli.py index ff8eb62..e85a50f 100644 --- a/plugins/data-designer-retrieval-sdg/tests/test_cli.py +++ b/plugins/data-designer-retrieval-sdg/tests/test_cli.py @@ -3,7 +3,6 @@ from __future__ import annotations -import argparse import sys from pathlib import Path @@ -62,7 +61,7 @@ def __init__(self, artifact_path: Path, model_providers: object) -> None: self.model_providers = model_providers self.run_config = None self.create_calls: list[dict[str, object]] = [] - self.result = FakeCreateResult(FakeArtifactStorage(artifact_path / "my_run", "my_run")) + self.result = FakeCreateResult(FakeArtifactStorage(artifact_path / "my_run_resolved", "my_run_resolved")) FakeDataDesigner.instances.append(self) def set_run_config(self, run_config: object) -> None: @@ -87,52 +86,35 @@ def create( return self.result -def _generate_args(tmp_path: Path) -> argparse.Namespace: - """Build generate args with defaults that match the CLI parser.""" +def generate_argv( + tmp_path: Path, + *, + dataset_name: str = "my_run", + artifact_path: Path | None = None, + extra_args: list[str] | None = None, +) -> list[str]: + """Build generate CLI arguments for parser-level tests.""" input_dir = tmp_path / "docs" - input_dir.mkdir() - return argparse.Namespace( - input_dir=input_dir, - output_dir=tmp_path / "out", - file_pattern="*", - recursive=True, - file_extensions=[".txt", ".md", ".text"], - min_text_length=50, - sentences_per_chunk=5, - num_sections=1, - num_files=None, - max_artifacts_per_type=2, - num_pairs=7, - min_hops=2, - max_hops=4, - min_complexity=4, - similarity_threshold=0.9, - preview=False, - artifact_path=tmp_path / "artifacts", - dataset_name="my_run", - buffer_size=37, - resume=ResumeMode.ALWAYS.value, - multi_doc=False, - bundle_size=2, - bundle_strategy="sequential", - max_docs_per_bundle=3, - multi_doc_manifest=None, - log_level="INFO", - artifact_extraction_model="artifact-model", - artifact_extraction_provider="nvidia", - qa_generation_model="qa-model", - qa_generation_provider="nvidia", - quality_judge_model="judge-model", - quality_judge_provider="nvidia", - embed_model="embed-model", - embed_provider="nvidia", - max_parallel_requests_for_gen=None, - custom_provider_endpoint=None, - custom_provider_name="custom", - custom_provider_type="openai", - custom_provider_api_key=None, - model_providers_file=None, - ) + input_dir.mkdir(exist_ok=True) + argv = [ + "data-designer-retrieval-sdg", + "generate", + "--input-dir", + str(input_dir), + "--output-dir", + str(tmp_path / "out"), + "--artifact-path", + str(artifact_path or tmp_path / "artifacts"), + "--dataset-name", + dataset_name, + "--buffer-size", + "37", + "--resume", + "always", + ] + if extra_args: + argv.extend(extra_args) + return argv def test_generate_uses_native_resume_and_exports_jsonl(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: @@ -142,8 +124,9 @@ def test_generate_uses_native_resume_and_exports_jsonl(monkeypatch: pytest.Monke monkeypatch.setattr(cli, "_count_seed_records", fake_count_seed_records) monkeypatch.setattr(cli, "build_model_providers", fake_build_model_providers) monkeypatch.setattr(cli, "build_qa_generation_pipeline", fake_build_qa_generation_pipeline) + monkeypatch.setattr(sys, "argv", generate_argv(tmp_path)) - cli._run_generate(_generate_args(tmp_path)) + cli.main() instance = FakeDataDesigner.instances[0] assert instance.run_config.buffer_size == 37 @@ -158,7 +141,46 @@ def test_generate_uses_native_resume_and_exports_jsonl(monkeypatch: pytest.Monke ] assert BUILD_CALLS[0]["start_index"] == 0 assert BUILD_CALLS[0]["end_index"] == 2 - assert instance.result.export_calls == [(tmp_path / "out" / "my_run.jsonl", "jsonl")] + assert instance.result.export_calls == [(tmp_path / "out" / "my_run_resolved.jsonl", "jsonl")] + + +@pytest.mark.parametrize("dataset_name", ["", ".", "..", "nested/name", "nested\\name", "bad\nname"]) +def test_generate_rejects_unsafe_dataset_names( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, + dataset_name: str, +) -> None: + FakeDataDesigner.instances.clear() + monkeypatch.setattr(cli, "DataDesigner", FakeDataDesigner) + monkeypatch.setattr(cli, "_count_seed_records", fake_count_seed_records) + monkeypatch.setattr(sys, "argv", generate_argv(tmp_path, dataset_name=dataset_name)) + + with pytest.raises(SystemExit) as exc_info: + cli.main() + + assert exc_info.value.code == 2 + assert FakeDataDesigner.instances == [] + + +def test_generate_rejects_dataset_name_that_resolves_outside_artifact_path( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + artifact_path = tmp_path / "artifacts" + artifact_path.mkdir() + outside_path = tmp_path / "outside" + outside_path.mkdir() + (artifact_path / "linked").symlink_to(outside_path, target_is_directory=True) + FakeDataDesigner.instances.clear() + monkeypatch.setattr(cli, "DataDesigner", FakeDataDesigner) + monkeypatch.setattr(cli, "_count_seed_records", fake_count_seed_records) + monkeypatch.setattr(sys, "argv", generate_argv(tmp_path, dataset_name="linked", artifact_path=artifact_path)) + + with pytest.raises(SystemExit) as exc_info: + cli.main() + + assert exc_info.value.code == 2 + assert FakeDataDesigner.instances == [] @pytest.mark.parametrize("removed_flag", ["--batch-size", "--start-batch-index", "--end-batch-index"]) diff --git a/plugins/data-designer-retrieval-sdg/tests/test_convert.py b/plugins/data-designer-retrieval-sdg/tests/test_convert.py index 8864418..d50a4ee 100644 --- a/plugins/data-designer-retrieval-sdg/tests/test_convert.py +++ b/plugins/data-designer-retrieval-sdg/tests/test_convert.py @@ -198,6 +198,28 @@ def test_load_from_parquet_file(tmp_path: Path) -> None: assert df.iloc[0]["file_name"] == ["doc.txt"] +def test_load_from_parquet_normalizes_nested_arrays_for_chunk_mapping(tmp_path: Path) -> None: + path = tmp_path / "generated.parquet" + pd.DataFrame( + [ + { + "file_name": ["doc.txt"], + "chunks": [{"chunk_id": 1, "text": "hello"}, {"chunk_id": 2, "text": "world"}], + "deduplicated_qa_pairs": [], + "qa_evaluations": {"evaluations": []}, + } + ] + ).to_parquet(path, index=False) + + df = load_generated_json_files(str(path)) + corpus, mapping = build_corpus_and_mappings(df) + + assert isinstance(df.iloc[0]["chunks"], list) + assert len(corpus) == 2 + assert mapping[("doc", 1)] == "hello" + assert mapping[("doc", 2)] == "world" + + # --------------------------------------------------------------------------- # generate_training_set / generate_eval_set # --------------------------------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 9dc88b4..d2347e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,8 @@ package = false required-version = ">=0.7.10" constraint-dependencies = [ "idna>=3.12", + "python-multipart>=0.0.29", + "urllib3>=2.7.0", ] [tool.ddp.catalog] diff --git a/uv.lock b/uv.lock index c6f9d16..e96efec 100644 --- a/uv.lock +++ b/uv.lock @@ -15,7 +15,11 @@ members = [ "data-designer-template", "ddp", ] -constraints = [{ name = "idna", specifier = ">=3.12" }] +constraints = [ + { name = "idna", specifier = ">=3.12" }, + { name = "python-multipart", specifier = ">=0.0.29" }, + { name = "urllib3", specifier = ">=2.7.0" }, +] [[package]] name = "annotated-doc"