From 9a1267313fbbb9795bcb71ff8ab52ca84c5a2d4d Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Fri, 20 Feb 2026 17:04:06 -0500 Subject: [PATCH 01/12] Refactor output path computation into shared helper Extract output file path logic into a single source of truth so that naming convention changes need only be made in one place. No behavior change. Co-Authored-By: Claude Sonnet 4.6 --- src/taxonopy/output_manager.py | 104 +++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 25 deletions(-) diff --git a/src/taxonopy/output_manager.py b/src/taxonopy/output_manager.py index f67d83b..6317e12 100644 --- a/src/taxonopy/output_manager.py +++ b/src/taxonopy/output_manager.py @@ -177,6 +177,75 @@ def map_resolution_results_to_entries( logger.info(f"Mapped final resolution results for {processed_groups} entry groups to {mapped_uuids} individual entries.") return uuid_to_final_attempt + +def _resolve_output_paths_for_input( + input_file: str, + ref_dir: str, + output_dir: str, + output_format: str, + force_input: bool = False, +) -> Tuple[str, ...]: + """Return absolute output file path(s) for a single input file. + + This is the single source of truth for TaxonoPy output file naming. + Both the generate functions and compute_output_paths use it so that + naming convention changes need only be made here. + + Args: + input_file: Absolute path to the input file. + ref_dir: Root directory for preserving subdirectory structure. + output_dir: Output directory. + output_format: 'csv' or 'parquet'. + force_input: True when --force-input is set (produces .forced.* files). + + Returns: + Tuple of absolute output paths: (resolved, unsolved) or (forced,). 
+ """ + rel_path = os.path.relpath(input_file, ref_dir) + base_name = os.path.splitext(os.path.basename(input_file))[0] + rel_dir = os.path.dirname(rel_path) + file_dir = os.path.join(output_dir, rel_dir) if rel_dir else output_dir + if force_input: + return (os.path.join(file_dir, f"{base_name}.forced.{output_format}"),) + return ( + os.path.join(file_dir, f"{base_name}.resolved.{output_format}"), + os.path.join(file_dir, f"{base_name}.unsolved.{output_format}"), + ) + + +def compute_output_paths( + input_path: str, + input_files: List[str], + output_dir: str, + output_format: str, + force_input: bool = False, +) -> List[str]: + """Return intended output file paths (relative to output_dir) for a resolve run. + + Used by the manifest system to record files before they are written. + Does not include fixed outputs such as resolution_stats.json or the + manifest file itself — callers are responsible for appending those. + + Args: + input_path: The --input argument (file or directory). + input_files: Expanded list of input file paths from find_input_files. + output_dir: The --output-dir argument. + output_format: 'csv' or 'parquet'. + force_input: True when --force-input is set. + + Returns: + List of relative file paths (relative to output_dir). 
+ """ + ref_dir = input_path if os.path.isdir(input_path) else os.path.dirname(input_path) + files = [] + for input_file in input_files: + for abs_path in _resolve_output_paths_for_input( + input_file, ref_dir, output_dir, output_format, force_input + ): + files.append(os.path.relpath(abs_path, output_dir)) + return files + + def generate_resolution_output( input_path: str, output_dir: str, @@ -209,25 +278,16 @@ def generate_resolution_output( resolved_files = [] unsolved_files = [] + ref_dir = input_path if os.path.isdir(input_path) else os.path.dirname(input_path) for input_file in input_files: logger.info(f"Generating resolution output for: {input_file}") input_file_name = os.path.basename(input_file) - # Preserve directory structure - rel_path = os.path.relpath(input_file, input_path if os.path.isdir(input_path) else os.path.dirname(input_path)) - base_name = os.path.splitext(os.path.basename(input_file))[0] - rel_dir = os.path.dirname(rel_path) - - resolved_dir = os.path.join(output_dir, rel_dir) - unsolved_dir = os.path.join(output_dir, rel_dir) - os.makedirs(resolved_dir, exist_ok=True) - os.makedirs(unsolved_dir, exist_ok=True) - - resolved_file_name = f"{base_name}.resolved.{output_format}" - unsolved_file_name = f"{base_name}.unsolved.{output_format}" - resolved_file_path = os.path.join(resolved_dir, resolved_file_name) - unsolved_file_path = os.path.join(unsolved_dir, unsolved_file_name) + resolved_file_path, unsolved_file_path = _resolve_output_paths_for_input( + input_file, ref_dir, output_dir, output_format, force_input=False + ) + os.makedirs(os.path.dirname(resolved_file_path), exist_ok=True) try: if input_file.endswith(".parquet"): @@ -330,6 +390,7 @@ def generate_forced_output( output_dir_path = Path(output_dir) output_dir_path.mkdir(parents=True, exist_ok=True) generated_files = [] + ref_dir = input_path if os.path.isdir(input_path) else os.path.dirname(input_path) for input_file in input_files: try: @@ -381,19 +442,12 @@ def 
generate_forced_output( output_rows.append(output_row) if output_rows: + (output_file_path,) = _resolve_output_paths_for_input( + input_file, ref_dir, output_dir, output_format, force_input=True + ) try: output_df = pl.DataFrame(output_rows) - - # Preserve directory structure - rel_path = os.path.relpath(input_file, input_path if os.path.isdir(input_path) else os.path.dirname(input_path)) - base_name = os.path.splitext(os.path.basename(input_file))[0] - rel_dir = os.path.dirname(rel_path) - - output_dir_for_file = os.path.join(output_dir, rel_dir) - os.makedirs(output_dir_for_file, exist_ok=True) - - output_file_name = f"{base_name}.forced.{output_format}" - output_file_path = os.path.join(output_dir_for_file, output_file_name) + os.makedirs(os.path.dirname(output_file_path), exist_ok=True) if output_format == "parquet": output_df.write_parquet(output_file_path) From 55236b1047a5abbc59d7b44088e99adbc01f8cab Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Fri, 20 Feb 2026 17:04:43 -0500 Subject: [PATCH 02/12] Fix #28: target --full-rerun to TaxonoPy-specific output files via manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before writing any output, each command writes a command-scoped manifest listing every file it intends to produce. On --full-rerun, only the files listed in that manifest are deleted — unrelated files in the output directory are never touched. Interrupted runs are handled cleanly since the manifest is written before any output files exist. The existing-output guard now checks for the manifest first, with a legacy glob fallback for output produced before this change. If no manifest is found on --full-rerun, a warning is logged and no files are removed. 
Co-Authored-By: Claude Sonnet 4.6 --- src/taxonopy/cli.py | 34 +++++- src/taxonopy/manifest.py | 160 +++++++++++++++++++++++++++ src/taxonopy/resolve_common_names.py | 12 ++ 3 files changed, 200 insertions(+), 6 deletions(-) create mode 100644 src/taxonopy/manifest.py diff --git a/src/taxonopy/cli.py b/src/taxonopy/cli.py index cd6cd86..f582a8c 100644 --- a/src/taxonopy/cli.py +++ b/src/taxonopy/cli.py @@ -11,7 +11,12 @@ from pathlib import Path from typing import List, Optional import json -import shutil +from taxonopy.manifest import ( + MANIFEST_FILENAMES, + delete_from_manifest, + get_intended_files_for_resolve, + write_manifest, +) from taxonopy import __version__ from taxonopy.config import config @@ -182,7 +187,10 @@ def run_resolve(args: argparse.Namespace) -> int: namespace_stats = get_cache_stats() existing_namespace = namespace_stats["entry_count"] > 0 and not cache_cleared_via_flag - existing_output = any(output_dir.glob("*.resolved.*")) + existing_output = ( + (output_dir / MANIFEST_FILENAMES["resolve"]).exists() + or any(output_dir.glob("*.resolved.*")) + ) if (existing_namespace or existing_output) and not args.full_rerun: logging.warning( "Existing cache (%s) and/or output (%s) detected for this input. 
Rerun with --full-rerun to replace them.", @@ -191,11 +199,13 @@ def run_resolve(args: argparse.Namespace) -> int: ) return 0 if args.full_rerun: - logging.info("--full-rerun set: clearing cache and output directory before proceeding.") + logging.info("--full-rerun set: clearing cache and TaxonoPy-specific output files before proceeding.") clear_cache() - if output_dir.exists(): - shutil.rmtree(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) + if not delete_from_manifest(str(output_dir), "resolve"): + logging.warning( + "--full-rerun: no manifest found in %s; no output files were removed.", + output_dir, + ) try: start_time = time.time() @@ -204,6 +214,12 @@ def run_resolve(args: argparse.Namespace) -> int: if args.force_input: logging.info("Skipping resolution due to --force-input flag") + write_manifest( + str(output_dir), "resolve", __version__, args.input, str(cache_path), + get_intended_files_for_resolve( + args.input, input_files, str(output_dir), args.output_format, force_input=True + ), + ) generated_files = generate_forced_output(args.input, args.output_dir, args.output_format) elapsed_time = time.time() - start_time logging.info(f"Forced output completed in {elapsed_time:.2f} seconds. Files: {generated_files}") @@ -259,6 +275,12 @@ def run_resolve(args: argparse.Namespace) -> int: # 4. Generate output + write_manifest( + str(output_dir), "resolve", __version__, args.input, str(cache_path), + get_intended_files_for_resolve( + args.input, input_files, str(output_dir), args.output_format, force_input=False + ), + ) logging.info("Generating output files...") # Pass only the manager and entry group map resolved_files, unsolved_files = generate_resolution_output( diff --git a/src/taxonopy/manifest.py b/src/taxonopy/manifest.py new file mode 100644 index 0000000..4d4d69f --- /dev/null +++ b/src/taxonopy/manifest.py @@ -0,0 +1,160 @@ +"""Manifest tracking for TaxonoPy output files. 
+ +Each TaxonoPy command writes a manifest file to its output directory listing +every file it intends to produce. The manifest is written before any output +files are created, so interrupted runs leave a complete record of what should +be cleaned up on the next --full-rerun. + +Manifest files are command-scoped to avoid collisions when multiple commands +share an output directory. +""" + +import json +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import List, Optional + +from taxonopy.output_manager import compute_output_paths + +logger = logging.getLogger(__name__) + +MANIFEST_FILENAMES = { + "resolve": "taxonopy_resolve_manifest.json", + "common-names": "taxonopy_common_names_manifest.json", +} + +RESOLUTION_STATS_FILENAME = "resolution_stats.json" + + +def get_intended_files_for_resolve( + input_path: str, + input_files: List[str], + output_dir: str, + output_format: str, + force_input: bool = False, +) -> List[str]: + """Return the full list of files a resolve run intends to write. + + Delegates output path naming to compute_output_paths (single source of + truth in output_manager), then appends the fixed outputs. + + Args: + input_path: The --input argument (file or directory). + input_files: Expanded list of input file paths from find_input_files. + output_dir: The --output-dir argument. + output_format: 'csv' or 'parquet'. + force_input: True when --force-input is set. + + Returns: + List of relative file paths (relative to output_dir). + """ + files = compute_output_paths(input_path, input_files, output_dir, output_format, force_input) + if not force_input: + files.append(RESOLUTION_STATS_FILENAME) + files.append(MANIFEST_FILENAMES["resolve"]) + return files + + +def get_intended_files_for_common_names( + annotation_dir: str, + annotation_paths: List[str], +) -> List[str]: + """Return the full list of files a common-names run intends to write. 
+ + Output files preserve the input directory structure, so paths are simply + the relative paths of the annotation files. No naming convention is + encoded here. + + Args: + annotation_dir: The --resolved-dir argument. + annotation_paths: Expanded list of resolved parquet paths. + + Returns: + List of relative file paths (relative to output_dir). + """ + files = [os.path.relpath(p, annotation_dir) for p in annotation_paths] + files.append(MANIFEST_FILENAMES["common-names"]) + return files + + +def write_manifest( + output_dir: str, + command: str, + version: str, + input_path: str, + cache_namespace: Optional[str], + files: List[str], +) -> Path: + """Write a manifest file to output_dir before any output files are created. + + Args: + output_dir: Directory where the manifest will be written. + command: TaxonoPy command name ('resolve' or 'common-names'). + version: TaxonoPy version string. + input_path: Value of the --input or --resolved-dir argument. + cache_namespace: Active cache namespace path, or None. + files: Relative paths (relative to output_dir) of all intended outputs. + + Returns: + Path to the written manifest file. + """ + manifest = { + "taxonopy_version": version, + "command": command, + "created_at": datetime.now().isoformat(), + "input": input_path, + "cache_namespace": cache_namespace, + "files": files, + } + manifest_path = Path(output_dir) / MANIFEST_FILENAMES[command] + manifest_path.write_text(json.dumps(manifest, indent=4)) + logger.info("Manifest written to %s", manifest_path) + return manifest_path + + +def read_manifest(output_dir: str, command: str) -> Optional[dict]: + """Read the manifest for a given command from output_dir. + + Args: + output_dir: Directory to look for the manifest. + command: TaxonoPy command name ('resolve' or 'common-names'). + + Returns: + Parsed manifest dict, or None if no manifest file is present. 
+ """ + manifest_path = Path(output_dir) / MANIFEST_FILENAMES[command] + if not manifest_path.exists(): + return None + return json.loads(manifest_path.read_text()) + + +def delete_from_manifest(output_dir: str, command: str) -> bool: + """Delete all files listed in the manifest, then delete the manifest itself. + + Only deletes files that actually exist; missing files are silently skipped + so that interrupted runs can be cleaned up without error. + + Args: + output_dir: Directory containing the manifest. + command: TaxonoPy command name ('resolve' or 'common-names'). + + Returns: + True if a manifest was found and cleanup was performed, False otherwise. + """ + manifest = read_manifest(output_dir, command) + if manifest is None: + return False + output_dir_path = Path(output_dir) + removed = 0 + for rel_path in manifest.get("files", []): + f = output_dir_path / rel_path + if f.exists(): + f.unlink() + removed += 1 + manifest_path = output_dir_path / MANIFEST_FILENAMES[command] + if manifest_path.exists(): + manifest_path.unlink() + logger.info("Removed %d file(s) listed in manifest for command '%s'.", removed, command) + return True diff --git a/src/taxonopy/resolve_common_names.py b/src/taxonopy/resolve_common_names.py index 60875ee..38e5e6d 100644 --- a/src/taxonopy/resolve_common_names.py +++ b/src/taxonopy/resolve_common_names.py @@ -7,7 +7,12 @@ from pathlib import Path import shutil +from taxonopy import __version__ from taxonopy.constants import TAXONOMIC_RANKS_BY_SPECIFICITY, INVALID_VALUES, TAXONOMIC_RANKS +from taxonopy.manifest import ( + get_intended_files_for_common_names, + write_manifest, +) # Module-level constant for join columns to avoid duplication PARENT_RANKS = TAXONOMIC_RANKS[:-1] @@ -464,6 +469,13 @@ def main(annotation_dir=None, output_dir=None): recursive=True ) + # Write manifest before producing any output + os.makedirs(output_dir, exist_ok=True) + write_manifest( + output_dir, "common-names", __version__, annotation_dir, None, + 
get_intended_files_for_common_names(annotation_dir, annotation_paths), + ) + # Process one-by-one, preserving subdirs for idx, annotation_path in enumerate(annotation_paths, start=1): print(f"[{idx}/{len(annotation_paths)}] {annotation_path}") From 504de678953aaf0586f3f3b0cd3d68897c051560 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Fri, 20 Feb 2026 17:05:15 -0500 Subject: [PATCH 03/12] Add tests for manifest-based output tracking Cover manifest filename constants, intended-file computation for both commands, write/read/delete lifecycle, tolerance of missing files, non-TaxonoPy file safety, command scoping, and the write-before-output ordering guarantee. Co-Authored-By: Claude Sonnet 4.6 --- tests/test_full_rerun.py | 219 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 tests/test_full_rerun.py diff --git a/tests/test_full_rerun.py b/tests/test_full_rerun.py new file mode 100644 index 0000000..9018452 --- /dev/null +++ b/tests/test_full_rerun.py @@ -0,0 +1,219 @@ +import json + + +from taxonopy.manifest import ( + MANIFEST_FILENAMES, + RESOLUTION_STATS_FILENAME, + delete_from_manifest, + get_intended_files_for_common_names, + get_intended_files_for_resolve, + read_manifest, + write_manifest, +) + + +class TestManifestFilenames: + def test_resolve_filename(self): + assert MANIFEST_FILENAMES["resolve"] == "taxonopy_resolve_manifest.json" + + def test_common_names_filename(self): + assert MANIFEST_FILENAMES["common-names"] == "taxonopy_common_names_manifest.json" + + def test_filenames_are_distinct(self): + assert MANIFEST_FILENAMES["resolve"] != MANIFEST_FILENAMES["common-names"] + + +class TestGetIntendedFilesForResolve: + def test_single_file_normal(self, tmp_path): + input_file = tmp_path / "sample.csv" + input_file.write_text("uuid,kingdom\n1,Animalia\n") + out = str(tmp_path / "out") + + files = get_intended_files_for_resolve(str(tmp_path), [str(input_file)], out, "csv") + + assert "sample.resolved.csv" in 
files + assert "sample.unsolved.csv" in files + assert RESOLUTION_STATS_FILENAME in files + assert MANIFEST_FILENAMES["resolve"] in files + + def test_single_file_force_input(self, tmp_path): + input_file = tmp_path / "sample.csv" + input_file.write_text("uuid,kingdom\n1,Animalia\n") + out = str(tmp_path / "out") + + files = get_intended_files_for_resolve( + str(tmp_path), [str(input_file)], out, "parquet", force_input=True + ) + + assert "sample.forced.parquet" in files + assert "sample.resolved.parquet" not in files + assert "sample.unsolved.parquet" not in files + assert RESOLUTION_STATS_FILENAME not in files + assert MANIFEST_FILENAMES["resolve"] in files + + def test_subdirectory_structure_preserved(self, tmp_path): + subdir = tmp_path / "sub" + subdir.mkdir() + input_file = subdir / "sample.csv" + input_file.write_text("uuid,kingdom\n1,Animalia\n") + out = str(tmp_path / "out") + + files = get_intended_files_for_resolve(str(tmp_path), [str(input_file)], out, "csv") + + assert any("sub" in f and "sample.resolved.csv" in f for f in files) + assert any("sub" in f and "sample.unsolved.csv" in f for f in files) + + def test_parquet_format(self, tmp_path): + input_file = tmp_path / "sample.csv" + input_file.write_text("uuid,kingdom\n1,Animalia\n") + out = str(tmp_path / "out") + + files = get_intended_files_for_resolve(str(tmp_path), [str(input_file)], out, "parquet") + + assert "sample.resolved.parquet" in files + assert "sample.unsolved.parquet" in files + + +class TestGetIntendedFilesForCommonNames: + def test_lists_output_files_and_manifest(self, tmp_path): + annotation_dir = tmp_path / "resolved" + annotation_dir.mkdir() + p = annotation_dir / "sample.resolved.parquet" + p.write_text("") + + files = get_intended_files_for_common_names(str(annotation_dir), [str(p)]) + + assert "sample.resolved.parquet" in files + assert MANIFEST_FILENAMES["common-names"] in files + + def test_subdirectory_structure_preserved(self, tmp_path): + annotation_dir = tmp_path / 
"resolved" + sub = annotation_dir / "sub" + sub.mkdir(parents=True) + p = sub / "sample.resolved.parquet" + p.write_text("") + + files = get_intended_files_for_common_names(str(annotation_dir), [str(p)]) + + import os + assert os.path.join("sub", "sample.resolved.parquet") in files + + +class TestWriteManifest: + def test_creates_file_with_correct_content(self, tmp_path): + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", "cache/ns", ["a.csv"]) + + manifest_path = tmp_path / MANIFEST_FILENAMES["resolve"] + assert manifest_path.exists() + data = json.loads(manifest_path.read_text()) + assert data["command"] == "resolve" + assert data["taxonopy_version"] == "0.2.0" + assert data["input"] == "input/" + assert data["cache_namespace"] == "cache/ns" + assert "a.csv" in data["files"] + assert "created_at" in data + + def test_common_names_uses_correct_filename(self, tmp_path): + write_manifest(str(tmp_path), "common-names", "0.2.0", "input/", None, []) + + assert (tmp_path / MANIFEST_FILENAMES["common-names"]).exists() + assert not (tmp_path / MANIFEST_FILENAMES["resolve"]).exists() + + def test_cache_namespace_can_be_none(self, tmp_path): + write_manifest(str(tmp_path), "common-names", "0.2.0", "input/", None, []) + + data = json.loads((tmp_path / MANIFEST_FILENAMES["common-names"]).read_text()) + assert data["cache_namespace"] is None + + +class TestReadManifest: + def test_returns_none_when_missing(self, tmp_path): + assert read_manifest(str(tmp_path), "resolve") is None + + def test_returns_none_for_wrong_command(self, tmp_path): + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, []) + assert read_manifest(str(tmp_path), "common-names") is None + + def test_reads_existing_manifest(self, tmp_path): + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, ["a.csv"]) + + data = read_manifest(str(tmp_path), "resolve") + assert data is not None + assert data["command"] == "resolve" + assert "a.csv" in data["files"] + + +class 
TestDeleteFromManifest: + def test_deletes_listed_files_and_manifest(self, tmp_path): + (tmp_path / "sample.resolved.csv").write_text("data") + (tmp_path / "sample.unsolved.csv").write_text("data") + (tmp_path / RESOLUTION_STATS_FILENAME).write_text("{}") + files = [ + "sample.resolved.csv", + "sample.unsolved.csv", + RESOLUTION_STATS_FILENAME, + MANIFEST_FILENAMES["resolve"], + ] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + result = delete_from_manifest(str(tmp_path), "resolve") + + assert result is True + assert not (tmp_path / "sample.resolved.csv").exists() + assert not (tmp_path / "sample.unsolved.csv").exists() + assert not (tmp_path / RESOLUTION_STATS_FILENAME).exists() + assert not (tmp_path / MANIFEST_FILENAMES["resolve"]).exists() + + def test_returns_false_when_no_manifest(self, tmp_path): + assert delete_from_manifest(str(tmp_path), "resolve") is False + + def test_tolerates_missing_listed_files(self, tmp_path): + files = ["missing.resolved.csv", MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + result = delete_from_manifest(str(tmp_path), "resolve") + + assert result is True + assert not (tmp_path / MANIFEST_FILENAMES["resolve"]).exists() + + def test_does_not_delete_unlisted_files(self, tmp_path): + (tmp_path / "user_file.txt").write_text("keep me") + (tmp_path / "sample.resolved.csv").write_text("data") + files = ["sample.resolved.csv", MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + delete_from_manifest(str(tmp_path), "resolve") + + assert (tmp_path / "user_file.txt").exists() + + def test_scoped_to_command(self, tmp_path): + (tmp_path / "sample.resolved.csv").write_text("data") + write_manifest( + str(tmp_path), "resolve", "0.2.0", "input/", None, + ["sample.resolved.csv", MANIFEST_FILENAMES["resolve"]], + ) + write_manifest( + str(tmp_path), "common-names", "0.2.0", "input/", None, + 
[MANIFEST_FILENAMES["common-names"]], + ) + + delete_from_manifest(str(tmp_path), "resolve") + + assert not (tmp_path / MANIFEST_FILENAMES["resolve"]).exists() + assert (tmp_path / MANIFEST_FILENAMES["common-names"]).exists() + + def test_manifest_written_before_output_files(self, tmp_path): + """Manifest must exist before any output file is created.""" + manifest_path = tmp_path / MANIFEST_FILENAMES["resolve"] + output_file = tmp_path / "sample.resolved.csv" + + write_manifest( + str(tmp_path), "resolve", "0.2.0", "input/", None, + ["sample.resolved.csv", MANIFEST_FILENAMES["resolve"]], + ) + assert manifest_path.exists() + assert not output_file.exists() + + output_file.write_text("data") + assert manifest_path.exists() + assert output_file.exists() From eb7b5bbefb9eeae5abe4d4d5cd8b778fb8595406 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Fri, 20 Feb 2026 17:06:15 -0500 Subject: [PATCH 04/12] Document --full-rerun behavior, manifest, and rerun lifecycle Add a dedicated Reruns page under the IO section covering the existing-output guard, what --full-rerun touches and does not touch, the manifest schema, and no-manifest behavior. Update surrounding pages and the quick reference guide to link to it. Update AGENTS.md to reflect the new module and --full-rerun semantics. Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 3 +- docs/user-guide/io/cache.md | 4 +- docs/user-guide/io/index.md | 1 + docs/user-guide/io/output.md | 10 ++++ docs/user-guide/io/reruns.md | 94 ++++++++++++++++++++++++++++++ docs/user-guide/quick-reference.md | 4 +- mkdocs.yml | 1 + 7 files changed, 114 insertions(+), 3 deletions(-) create mode 100644 docs/user-guide/io/reruns.md diff --git a/AGENTS.md b/AGENTS.md index 907099b..7709665 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -12,7 +12,7 @@ Use this file primarily when operating as a coding agent. 
Its intent is to captu - CLI-first tool for normalizing taxonomy: ingest (Parquet/CSV) → parse/group (`TaxonomicEntry`/`EntryGroupRef`) → plan + run GNVerifier queries → classify via strategy profiles → write resolved & unsolved outputs → optional common-name enrichment. - Source layout: CLI entry (`src/taxonopy/cli.py`), parsing/grouping/cache (`input_parser`, `entry_grouper`, `cache_manager`), query stack (`query/planner|executor|gnverifier_client`), -resolution logic (`resolution/attempt_manager` + profiles), outputs (`output_manager`), tracing (`trace/entry.py`). +resolution logic (`resolution/attempt_manager` + profiles), outputs (`output_manager`), manifest tracking (`manifest.py`), tracing (`trace/entry.py`). - Dependencies (see `pyproject.toml`): Python ≥ 3.10, Polars, Pandas/PyArrow, Pydantic v2, tqdm, requests; dev extras provide Ruff, pytest scaffolding, datamodel-code-generator, pre-commit. ## Environment Setup @@ -73,6 +73,7 @@ taxonopy common-names \ - `--clear-cache` - `--refresh-cache` (per run) to ignore stale grouping/parsing caches. - Don’t delete cache files manually unless instructed; prefer the flags above. +- `--full-rerun` clears the input-scoped cache namespace and deletes only the files listed in `taxonopy_resolve_manifest.json` (written before any output on every run). Non-TaxonoPy files in the output directory are never touched. If no manifest is found (pre-v0.3.0 output), a warning is logged and no files are removed. ## Validation & QA - Run `ruff check .` after modifying Python files (requires the `dev` extra). diff --git a/docs/user-guide/io/cache.md b/docs/user-guide/io/cache.md index 9f42947..0ea527e 100644 --- a/docs/user-guide/io/cache.md +++ b/docs/user-guide/io/cache.md @@ -40,7 +40,9 @@ This keeps caches isolated across datasets and releases. - `--cache-stats` — show cache statistics and exit. - `--clear-cache` — remove cached objects. - `--refresh-cache` (resolve only) — ignore cached parse/group results. 
-- `--full-rerun` (resolve only) — clear cache for the input and overwrite outputs. +- `--full-rerun` (resolve only) — clear the input-scoped cache and remove + TaxonoPy-specific output files before rerunning. See [Reruns](reruns.md) for + full details. If you change input files or want to force a clean run, use `--refresh-cache` or `--full-rerun`. diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md index 85ede67..9a922f6 100644 --- a/docs/user-guide/io/index.md +++ b/docs/user-guide/io/index.md @@ -7,3 +7,4 @@ the cache supports provenance and transparency throughout the resolution process - [Input](input.md) - [Output](output.md) - [Cache](cache.md) +- [Reruns](reruns.md) diff --git a/docs/user-guide/io/output.md b/docs/user-guide/io/output.md index 76cc165..c74e56e 100644 --- a/docs/user-guide/io/output.md +++ b/docs/user-guide/io/output.md @@ -8,6 +8,13 @@ When you run `taxonopy resolve`, TaxonoPy writes two outputs for each input file The output directory mirrors the input directory structure. Output format is controlled by the `--output-format` flag (`csv` or `parquet`). +TaxonoPy also writes a manifest file to the output directory before creating +any other files. This manifest lists every file the run intends to produce and +is used by `--full-rerun` to clean up precisely. Each command writes its own +manifest (`taxonopy_resolve_manifest.json` and +`taxonopy_common_names_manifest.json` respectively) so they coexist safely if +both commands share an output directory. See [Reruns](reruns.md) for details. + ## What’s Inside Each output row corresponds to one input record. 
Resolved entries contain the @@ -20,5 +27,8 @@ Running through the sample resolution results in the following core files: - `taxonopy resolve`: - `examples/resolved/sample.resolved.parquet` - `examples/resolved/sample.unsolved.parquet` + - `examples/resolved/resolution_stats.json` + - `examples/resolved/taxonopy_resolve_manifest.json` - `taxonopy common-names`: - `examples/resolved/common/sample.resolved.parquet` + - `examples/resolved/common/taxonopy_common_names_manifest.json` diff --git a/docs/user-guide/io/reruns.md b/docs/user-guide/io/reruns.md new file mode 100644 index 0000000..9920040 --- /dev/null +++ b/docs/user-guide/io/reruns.md @@ -0,0 +1,94 @@ +# Reruns + +## The Guard + +TaxonoPy checks for existing output before processing. If a prior run is +detected for the current input, it exits with a warning rather than silently +overwriting: + +``` +Existing cache (...) and/or output (...) detected for this input. +Rerun with --full-rerun to replace them. +``` + +Detection uses two signals: + +- the presence of a `taxonopy_resolve_manifest.json` in the output directory + (written by any run using TaxonoPy v0.3.0 or later), or +- `.resolved.*` files in the output directory root (legacy fallback for output + produced by earlier versions). + +## `--full-rerun` + +`--full-rerun` is the explicit escape hatch through the guard. It clears the +input-scoped cache namespace and removes all TaxonoPy-specific files from the +output directory before proceeding. + +```console +taxonopy resolve \ + --input examples/input \ + --output-dir examples/resolved \ + --full-rerun +``` + +### What it touches + +- **Cache**: the namespace scoped to the current command, TaxonoPy version, and + input fingerprint. Other namespaces (different inputs, different versions) are + not affected. +- **Output files**: only the files listed in `taxonopy_resolve_manifest.json`. + Any other files in the output directory are left untouched. 
+ +### What it does not touch + +- Files not listed in the manifest — including any non-TaxonoPy files you have + placed in the output directory. +- Cache namespaces from other runs. + +### No manifest found + +If `--full-rerun` is used but no manifest is present (e.g. output from a +pre-v0.3.0 run, or a manually populated directory), TaxonoPy logs a warning +and proceeds without removing any files: + +``` +--full-rerun: no manifest found in examples/resolved; no output files were removed. +``` + +The run then writes fresh output and a new manifest. + +## The Manifest + +Every TaxonoPy run writes a manifest file to the output directory **before** +creating any output. This means interrupted runs leave a complete record of +what should be cleaned up — `--full-rerun` deletes exactly those files and +nothing else. + +Manifest files are command-scoped so they coexist safely if multiple commands +share an output directory: + +| Command | Manifest file | +|---|---| +| `resolve` | `taxonopy_resolve_manifest.json` | +| `common-names` | `taxonopy_common_names_manifest.json` | + +### Schema + +```json +{ + "taxonopy_version": "0.3.0", + "command": "resolve", + "created_at": "2025-07-19T10:38:04.123456", + "input": "examples/input", + "cache_namespace": "~/.cache/taxonopy/resolve_v0.3.0_a3f9b2c1d4e5f678", + "files": [ + "sample.resolved.parquet", + "sample.unsolved.parquet", + "resolution_stats.json", + "taxonopy_resolve_manifest.json" + ] +} +``` + +All paths in `files` are relative to the output directory. `cache_namespace` +is `null` for `common-names`, which does not use an input-scoped cache. 
diff --git a/docs/user-guide/quick-reference.md b/docs/user-guide/quick-reference.md index 540ab8a..a3f86c6 100644 --- a/docs/user-guide/quick-reference.md +++ b/docs/user-guide/quick-reference.md @@ -88,7 +88,7 @@ taxonopy common-names \ --output-dir examples/resolved/common ``` -This command uses GBIF Backbone data only and applies deterministic fallback: species to kingdom, with English names preferred at each rank. +This command uses GBIF Backbone data only and applies deterministic fallback: species to kingdom, with English names preferred at each rank. It also writes a `taxonopy_common_names_manifest.json` to the output directory. _**Sample common-name output (`examples/resolved/common/sample.resolved.parquet`)**; the last two rows (both Laelia rosea) fall back to family-level common names—none available at species or genus rank._
@@ -120,6 +120,8 @@ The `resolution_stats.json` file summarizes counts of how many entries from the TaxonoPy also writes cache data to disk (default: `~/.cache/taxonopy`) so it can trace provenance and avoid reprocessing. Use `--show-cache-path`, `--cache-stats`, or `--clear-cache` if you want to inspect or manage it, or see the [Cache](io/cache.md) guide for details. +If TaxonoPy detects existing output for your input, it exits with a warning rather than silently overwriting it. Use `--full-rerun` to clear the cache and remove previous outputs before rerunning. TaxonoPy records the files each run produces in a manifest, so only the files listed there are removed — nothing else in your output directory is touched. See [Reruns](io/reruns.md) for details. + ## Trace an Entry You can trace how a single UUID was resolved. For example, let's trace one of the _Laelia rosea_ entries: diff --git a/mkdocs.yml b/mkdocs.yml index a05a945..677a245 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -15,6 +15,7 @@ nav: - Input: user-guide/io/input.md - Output: user-guide/io/output.md - Cache: user-guide/io/cache.md + - Reruns: user-guide/io/reruns.md - Development: - Contributing: - development/contributing/index.md From efae52f566e374972d024af9751e17f0977c3b1e Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Fri, 20 Feb 2026 17:08:50 -0500 Subject: [PATCH 05/12] Update contribution convention to use Co-Authored-By for AI attribution Replace the [AI-assisted session] footer with a Co-Authored-By trailer identifying the model and provider, consistent with standard git co-authorship convention and applicable across any AI assistant. 
Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 7709665..873c0ec 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -101,7 +101,12 @@ taxonopy common-names \ - Follow best version control practices including, but not limited to, the following: - At the start of a session, ensure that work is done on a relevant branch (not `main`), and pull the latest changes from `main` before starting. - Make commit messages imperative, one line, and descriptive of the change's "what" and "why" (not "how"). Any needed description beyond this can go in the extended body. -- For every commit you produce, append "[AI-assisted session]" as a final line in the extended commit message body. +- For every commit you produce, add a `Co-Authored-By` trailer in the extended commit message body identifying the model and provider, e.g.: + ``` + Co-Authored-By: Claude Sonnet 4.6 + Co-Authored-By: GPT-4o + Co-Authored-By: Gemini 2.0 Flash + ``` - Do not use Git or the GitHub CLI for any destructive actions like `git reset --hard`, `git rebase`, `git push --force`, `git branch -D`, `gh repo delete`, `gh issue delete`, and so on, nor commands like `rm -rf` that delete files or directories. If you consider a destructive command to be necessary, stop and discuss the situation with a maintainer. - When modifying CLI behavior, resolution strategies, or caching semantics, update this AGENTS file so future agents follow the latest contract. - Run `ruff check .`, `pytest`, and the sample `taxonopy resolve` workflow before handing off changes or opening discussions with maintainers. From b0b3c62a3917480bc75b588a722db17c491b0315 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 14:32:30 -0500 Subject: [PATCH 06/12] Prevent symlink and traversal sequences from escaping the output directory Canonicalize each path before the containment check so that symlinks and .. 
sequences cannot be used to target files outside the output directory. Also replace the exists()+unlink() pair with unlink(missing_ok=True). Co-Authored-By: Claude Sonnet 4.6 --- src/taxonopy/manifest.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/src/taxonopy/manifest.py b/src/taxonopy/manifest.py index 4d4d69f..5a97877 100644 --- a/src/taxonopy/manifest.py +++ b/src/taxonopy/manifest.py @@ -147,12 +147,39 @@ def delete_from_manifest(output_dir: str, command: str) -> bool: if manifest is None: return False output_dir_path = Path(output_dir) + resolved_output_dir = output_dir_path.resolve() removed = 0 for rel_path in manifest.get("files", []): + if not isinstance(rel_path, (str, os.PathLike)): + logger.warning( + "Skipping non-string entry in manifest for command '%s': %r", + command, + rel_path, + ) + continue f = output_dir_path / rel_path - if f.exists(): - f.unlink() - removed += 1 + if not f.exists(): + continue + try: + resolved_f = f.resolve(strict=True) + except OSError: + logger.warning( + "Skipping file with invalid path in manifest for command '%s': %r", + command, + rel_path, + ) + continue + try: + resolved_f.relative_to(resolved_output_dir) + except ValueError: + logger.warning( + "Skipping file outside output directory in manifest for command '%s': %r", + command, + rel_path, + ) + continue + resolved_f.unlink(missing_ok=True) + removed += 1 manifest_path = output_dir_path / MANIFEST_FILENAMES[command] if manifest_path.exists(): manifest_path.unlink() From 98b2e0ac0141be9be57d0b4887b65a7aab4917f8 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 14:32:59 -0500 Subject: [PATCH 07/12] Add security tests for manifest deletion path containment Cover non-string entries, path traversal, absolute paths outside the output directory, and symlink escape. Symlink test skips gracefully when symlink creation is not permitted by the OS. 
Co-Authored-By: Claude Sonnet 4.6 --- tests/test_full_rerun.py | 48 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/test_full_rerun.py b/tests/test_full_rerun.py index 9018452..85d957d 100644 --- a/tests/test_full_rerun.py +++ b/tests/test_full_rerun.py @@ -1,5 +1,7 @@ import json +import os +import pytest from taxonopy.manifest import ( MANIFEST_FILENAMES, @@ -217,3 +219,49 @@ def test_manifest_written_before_output_files(self, tmp_path): output_file.write_text("data") assert manifest_path.exists() assert output_file.exists() + + def test_skips_non_string_entries_in_manifest(self, tmp_path): + (tmp_path / "valid.csv").write_text("data") + files = [42, None, "valid.csv", MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + result = delete_from_manifest(str(tmp_path), "resolve") + + assert result is True + assert not (tmp_path / "valid.csv").exists() + assert not (tmp_path / MANIFEST_FILENAMES["resolve"]).exists() + + def test_rejects_path_traversal_in_manifest(self, tmp_path): + outside = tmp_path.parent / "outside.txt" + outside.write_text("keep me") + files = ["../outside.txt", MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + delete_from_manifest(str(tmp_path), "resolve") + + assert outside.exists() + + def test_rejects_absolute_path_in_manifest(self, tmp_path): + outside = tmp_path.parent / "outside_abs.txt" + outside.write_text("keep me") + files = [str(outside.resolve()), MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + delete_from_manifest(str(tmp_path), "resolve") + + assert outside.exists() + + def test_rejects_symlink_escape_in_manifest(self, tmp_path): + outside = tmp_path.parent / "outside_sym.txt" + outside.write_text("keep me") + link = tmp_path / "link.txt" + try: + link.symlink_to(outside) + except OSError: + pytest.skip("symlink 
creation not permitted on this platform") + files = ["link.txt", MANIFEST_FILENAMES["resolve"]] + write_manifest(str(tmp_path), "resolve", "0.2.0", "input/", None, files) + + delete_from_manifest(str(tmp_path), "resolve") + + assert outside.exists() From d392ba19f1fab956bc61724b65beb95e59b5c9a1 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 14:50:08 -0500 Subject: [PATCH 08/12] Fix linting errors --- tests/test_full_rerun.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_full_rerun.py b/tests/test_full_rerun.py index 85d957d..2bc78cc 100644 --- a/tests/test_full_rerun.py +++ b/tests/test_full_rerun.py @@ -97,7 +97,6 @@ def test_subdirectory_structure_preserved(self, tmp_path): files = get_intended_files_for_common_names(str(annotation_dir), [str(p)]) - import os assert os.path.join("sub", "sample.resolved.parquet") in files From 661bfed9dfc5c97bd87bb6b546b945f9907d2c82 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 15:45:04 -0500 Subject: [PATCH 09/12] Raise on corrupt or unreadable manifest with an actionable error message A JSONDecodeError or OSError from read_manifest previously surfaced as a generic 'unexpected error' with no guidance. Now the error names the file, explains that automated cleanup cannot proceed, and tells the user to fix or delete the manifest or use a new output directory. 
Co-Authored-By: Claude Sonnet 4.6 --- src/taxonopy/manifest.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/taxonopy/manifest.py b/src/taxonopy/manifest.py index 5a97877..3eb872a 100644 --- a/src/taxonopy/manifest.py +++ b/src/taxonopy/manifest.py @@ -127,7 +127,17 @@ def read_manifest(output_dir: str, command: str) -> Optional[dict]: manifest_path = Path(output_dir) / MANIFEST_FILENAMES[command] if not manifest_path.exists(): return None - return json.loads(manifest_path.read_text()) + try: + return json.loads(manifest_path.read_text()) + except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc: + logger.error( + "Cannot read manifest at '%s': %s -- automated rerun cleanup is not possible. " + "To proceed: fix or delete this file and remove previous TaxonoPy output files " + "from this output directory manually, or specify a new output directory with --output-dir.", + manifest_path, + exc, + ) + raise def delete_from_manifest(output_dir: str, command: str) -> bool: From 73019c00093385d0c793a94ad7d17d8938a9b8bb Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 15:45:36 -0500 Subject: [PATCH 10/12] Test that read_manifest raises on corrupt JSON Co-Authored-By: Claude Sonnet 4.6 --- tests/test_full_rerun.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_full_rerun.py b/tests/test_full_rerun.py index 2bc78cc..542f11f 100644 --- a/tests/test_full_rerun.py +++ b/tests/test_full_rerun.py @@ -143,6 +143,12 @@ def test_reads_existing_manifest(self, tmp_path): assert data["command"] == "resolve" assert "a.csv" in data["files"] + def test_raises_on_corrupt_json(self, tmp_path): + (tmp_path / MANIFEST_FILENAMES["resolve"]).write_text("not valid json {{{") + + with pytest.raises(json.JSONDecodeError): + read_manifest(str(tmp_path), "resolve") + class TestDeleteFromManifest: def test_deletes_listed_files_and_manifest(self, tmp_path): From 75691a81e3e99ba569142f2b5fe735e93c1110fe Mon Sep 17 
00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 18:32:22 -0500 Subject: [PATCH 11/12] Document no-hard-wrap convention and fix existing wraps in AGENTS.md Adds three style rules to Coding Conventions covering comments, string literals, and markdown prose. Also joins the two wrapped lines in the Project Snapshot section to match the new convention. Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 873c0ec..2fae568 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -9,10 +9,8 @@ Use this file primarily when operating as a coding agent. Its intent is to captu - When instructions here conflict with new information, trust the current codebase and update AGENTS.md alongside your change. If critical context is still missing, pause and ask the maintainer rather than guessing. ## Project Snapshot -- CLI-first tool for normalizing taxonomy: ingest (Parquet/CSV) → parse/group (`TaxonomicEntry`/`EntryGroupRef`) → plan + run GNVerifier queries → classify via strategy profiles → write -resolved & unsolved outputs → optional common-name enrichment. -- Source layout: CLI entry (`src/taxonopy/cli.py`), parsing/grouping/cache (`input_parser`, `entry_grouper`, `cache_manager`), query stack (`query/planner|executor|gnverifier_client`), -resolution logic (`resolution/attempt_manager` + profiles), outputs (`output_manager`), manifest tracking (`manifest.py`), tracing (`trace/entry.py`). +- CLI-first tool for normalizing taxonomy: ingest (Parquet/CSV) → parse/group (`TaxonomicEntry`/`EntryGroupRef`) → plan + run GNVerifier queries → classify via strategy profiles → write resolved & unsolved outputs → optional common-name enrichment. 
+- Source layout: CLI entry (`src/taxonopy/cli.py`), parsing/grouping/cache (`input_parser`, `entry_grouper`, `cache_manager`), query stack (`query/planner|executor|gnverifier_client`), resolution logic (`resolution/attempt_manager` + profiles), outputs (`output_manager`), manifest tracking (`manifest.py`), tracing (`trace/entry.py`). - Dependencies (see `pyproject.toml`): Python ≥ 3.10, Polars, Pandas/PyArrow, Pydantic v2, tqdm, requests; dev extras provide Ruff, pytest scaffolding, datamodel-code-generator, pre-commit. ## Environment Setup @@ -81,6 +79,9 @@ taxonopy common-names \ - Validate functional changes by running `taxonopy resolve` against `examples/input` (or issue-specific datasets) and reviewing outputs/logs, plus `taxonopy trace entry ...` when touching parsing/grouping logic. ## Coding Conventions +- Don't hard-wrap comments. Only use line breaks for new paragraphs. Let the editor soft-wrap content. +- Don't hard-wrap string literals. Keep each log or user-facing message in a single source line and rely on soft wrapping when reading it. +- Don't hard-wrap markdown prose in documentation. Let the renderer wrap lines as needed. - Prefer frozen dataclasses (`types/data_classes.py`) for shared structures; mutate via new objects rather than in-place edits. - Rely on strong typing + Pydantic models for external data (`types/gnverifier.py`); regenerate via the helper script instead of editing generated files. - Log through the standard logging config (`logging_config.setup_logging`) and keep tqdm progress bars for long-running loops. 
From 273edcb86841ffd72cc4f1c77919658e57a1c860 Mon Sep 17 00:00:00 2001 From: Matthew Thompson Date: Tue, 24 Feb 2026 18:32:35 -0500 Subject: [PATCH 12/12] Remove hard wrapping from all files touched by this PR Co-Authored-By: Claude Sonnet 4.6 --- docs/user-guide/io/cache.md | 10 +++------ docs/user-guide/io/index.md | 4 +--- docs/user-guide/io/output.md | 23 ++++++-------------- docs/user-guide/io/reruns.md | 39 ++++++++++------------------------ src/taxonopy/manifest.py | 23 +++++--------------- src/taxonopy/output_manager.py | 8 ++----- 6 files changed, 29 insertions(+), 78 deletions(-) diff --git a/docs/user-guide/io/cache.md b/docs/user-guide/io/cache.md index 0ea527e..36d68c7 100644 --- a/docs/user-guide/io/cache.md +++ b/docs/user-guide/io/cache.md @@ -1,7 +1,6 @@ # Cache -TaxonoPy caches intermediate results (like parsed inputs and grouped entries) to -speed up repeated runs on the same dataset. +TaxonoPy caches intermediate results (like parsed inputs and grouped entries) to speed up repeated runs on the same dataset. ## Location @@ -40,9 +39,6 @@ This keeps caches isolated across datasets and releases. - `--cache-stats` — show cache statistics and exit. - `--clear-cache` — remove cached objects. - `--refresh-cache` (resolve only) — ignore cached parse/group results. -- `--full-rerun` (resolve only) — clear the input-scoped cache and remove - TaxonoPy-specific output files before rerunning. See [Reruns](reruns.md) for - full details. +- `--full-rerun` (resolve only) — clear the input-scoped cache and remove TaxonoPy-specific output files before rerunning. See [Reruns](reruns.md) for full details. -If you change input files or want to force a clean run, use `--refresh-cache` or -`--full-rerun`. +If you change input files or want to force a clean run, use `--refresh-cache` or `--full-rerun`. 
diff --git a/docs/user-guide/io/index.md b/docs/user-guide/io/index.md index 9a922f6..f328fc4 100644 --- a/docs/user-guide/io/index.md +++ b/docs/user-guide/io/index.md @@ -1,8 +1,6 @@ # IO -TaxonoPy accepts CSV or Parquet inputs with the same schema. Use the pages below -for the exact input columns, the structure of resolved/unsolved outputs, and how -the cache supports provenance and transparency throughout the resolution process. +TaxonoPy accepts CSV or Parquet inputs with the same schema. Use the pages below for the exact input columns, the structure of resolved/unsolved outputs, and how the cache supports provenance and transparency throughout the resolution process. - [Input](input.md) - [Output](output.md) diff --git a/docs/user-guide/io/output.md b/docs/user-guide/io/output.md index c74e56e..8dbe720 100644 --- a/docs/user-guide/io/output.md +++ b/docs/user-guide/io/output.md @@ -5,22 +5,13 @@ When you run `taxonopy resolve`, TaxonoPy writes two outputs for each input file - **Resolved**: `.resolved.` - **Unsolved**: `.unsolved.` -The output directory mirrors the input directory structure. Output format is -controlled by the `--output-format` flag (`csv` or `parquet`). - -TaxonoPy also writes a manifest file to the output directory before creating -any other files. This manifest lists every file the run intends to produce and -is used by `--full-rerun` to clean up precisely. Each command writes its own -manifest (`taxonopy_resolve_manifest.json` and -`taxonopy_common_names_manifest.json` respectively) so they coexist safely if -both commands share an output directory. See [Reruns](reruns.md) for details. - -## What’s Inside - -Each output row corresponds to one input record. Resolved entries contain the -standardized taxonomy where available, while unsolved entries preserve the -original input ranks. Both outputs include resolution metadata such as status -and strategy information. +The output directory mirrors the input directory structure. 
Output format is controlled by the `--output-format` flag (`csv` or `parquet`). + +TaxonoPy also writes a manifest file to the output directory before creating any other files. This manifest lists every file the run intends to produce and is used by `--full-rerun` to clean up precisely. Each command writes its own manifest (`taxonopy_resolve_manifest.json` and `taxonopy_common_names_manifest.json` respectively) so they coexist safely if both commands share an output directory. See [Reruns](reruns.md) for details. + +## What's Inside + +Each output row corresponds to one input record. Resolved entries contain the standardized taxonomy where available, while unsolved entries preserve the original input ranks. Both outputs include resolution metadata such as status and strategy information. Running through the sample resolution results in the following core files: diff --git a/docs/user-guide/io/reruns.md b/docs/user-guide/io/reruns.md index 9920040..b77b1dc 100644 --- a/docs/user-guide/io/reruns.md +++ b/docs/user-guide/io/reruns.md @@ -2,9 +2,7 @@ ## The Guard -TaxonoPy checks for existing output before processing. If a prior run is -detected for the current input, it exits with a warning rather than silently -overwriting: +TaxonoPy checks for existing output before processing. If a prior run is detected for the current input, it exits with a warning rather than silently overwriting: ``` Existing cache (...) and/or output (...) detected for this input. @@ -13,16 +11,12 @@ Rerun with --full-rerun to replace them. Detection uses two signals: -- the presence of a `taxonopy_resolve_manifest.json` in the output directory - (written by any run using TaxonoPy v0.3.0 or later), or -- `.resolved.*` files in the output directory root (legacy fallback for output - produced by earlier versions). 
+- the presence of a `taxonopy_resolve_manifest.json` in the output directory (written by any run using TaxonoPy v0.3.0 or later), or +- `.resolved.*` files in the output directory root (legacy fallback for output produced by earlier versions). ## `--full-rerun` -`--full-rerun` is the explicit escape hatch through the guard. It clears the -input-scoped cache namespace and removes all TaxonoPy-specific files from the -output directory before proceeding. +`--full-rerun` is the explicit escape hatch through the guard. It clears the input-scoped cache namespace and removes all TaxonoPy-specific files from the output directory before proceeding. ```console taxonopy resolve \ @@ -33,23 +27,17 @@ taxonopy resolve \ ### What it touches -- **Cache**: the namespace scoped to the current command, TaxonoPy version, and - input fingerprint. Other namespaces (different inputs, different versions) are - not affected. -- **Output files**: only the files listed in `taxonopy_resolve_manifest.json`. - Any other files in the output directory are left untouched. +- **Cache**: the namespace scoped to the current command, TaxonoPy version, and input fingerprint. Other namespaces (different inputs, different versions) are not affected. +- **Output files**: only the files listed in `taxonopy_resolve_manifest.json`. Any other files in the output directory are left untouched. ### What it does not touch -- Files not listed in the manifest — including any non-TaxonoPy files you have - placed in the output directory. +- Files not listed in the manifest — including any non-TaxonoPy files you have placed in the output directory. - Cache namespaces from other runs. ### No manifest found -If `--full-rerun` is used but no manifest is present (e.g. output from a -pre-v0.3.0 run, or a manually populated directory), TaxonoPy logs a warning -and proceeds without removing any files: +If `--full-rerun` is used but no manifest is present (e.g. 
output from a pre-v0.3.0 run, or a manually populated directory), TaxonoPy logs a warning and proceeds without removing any files: ``` --full-rerun: no manifest found in ; no output files were removed. @@ -59,13 +47,9 @@ The run then writes fresh output and a new manifest. ## The Manifest -Every TaxonoPy run writes a manifest file to the output directory **before** -creating any output. This means interrupted runs leave a complete record of -what should be cleaned up — `--full-rerun` deletes exactly those files and -nothing else. +Every TaxonoPy run writes a manifest file to the output directory **before** creating any output. This means interrupted runs leave a complete record of what should be cleaned up — `--full-rerun` deletes exactly those files and nothing else. -Manifest files are command-scoped so they coexist safely if multiple commands -share an output directory: +Manifest files are command-scoped so they coexist safely if multiple commands share an output directory: | Command | Manifest file | |---|---| @@ -90,5 +74,4 @@ share an output directory: } ``` -All paths in `files` are relative to the output directory. `cache_namespace` -is `null` for `common-names`, which does not use an input-scoped cache. +All paths in `files` are relative to the output directory. `cache_namespace` is `null` for `common-names`, which does not use an input-scoped cache. diff --git a/src/taxonopy/manifest.py b/src/taxonopy/manifest.py index 3eb872a..566d341 100644 --- a/src/taxonopy/manifest.py +++ b/src/taxonopy/manifest.py @@ -1,12 +1,8 @@ """Manifest tracking for TaxonoPy output files. -Each TaxonoPy command writes a manifest file to its output directory listing -every file it intends to produce. The manifest is written before any output -files are created, so interrupted runs leave a complete record of what should -be cleaned up on the next --full-rerun. +Each TaxonoPy command writes a manifest file to its output directory listing every file it intends to produce. 
The manifest is written before any output files are created, so interrupted runs leave a complete record of what should be cleaned up on the next --full-rerun. -Manifest files are command-scoped to avoid collisions when multiple commands -share an output directory. +Manifest files are command-scoped to avoid collisions when multiple commands share an output directory. """ import json @@ -37,8 +33,7 @@ def get_intended_files_for_resolve( ) -> List[str]: """Return the full list of files a resolve run intends to write. - Delegates output path naming to compute_output_paths (single source of - truth in output_manager), then appends the fixed outputs. + Delegates output path naming to compute_output_paths (single source of truth in output_manager), then appends the fixed outputs. Args: input_path: The --input argument (file or directory). @@ -63,9 +58,7 @@ def get_intended_files_for_common_names( ) -> List[str]: """Return the full list of files a common-names run intends to write. - Output files preserve the input directory structure, so paths are simply - the relative paths of the annotation files. No naming convention is - encoded here. + Output files preserve the input directory structure, so paths are simply the relative paths of the annotation files. No naming convention is encoded here. Args: annotation_dir: The --resolved-dir argument. @@ -130,13 +123,7 @@ def read_manifest(output_dir: str, command: str) -> Optional[dict]: try: return json.loads(manifest_path.read_text()) except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc: - logger.error( - "Cannot read manifest at '%s': %s -- automated rerun cleanup is not possible. " - "To proceed: fix or delete this file and remove previous TaxonoPy output files " - "from this output directory manually, or specify a new output directory with --output-dir.", - manifest_path, - exc, - ) + logger.error("Cannot read manifest at '%s': %s -- automated rerun cleanup is not possible. 
To proceed: fix or delete this file and remove previous TaxonoPy output files from this output directory manually, or specify a new output directory with --output-dir.", manifest_path, exc) raise diff --git a/src/taxonopy/output_manager.py b/src/taxonopy/output_manager.py index 6317e12..f44aac5 100644 --- a/src/taxonopy/output_manager.py +++ b/src/taxonopy/output_manager.py @@ -187,9 +187,7 @@ def _resolve_output_paths_for_input( ) -> Tuple[str, ...]: """Return absolute output file path(s) for a single input file. - This is the single source of truth for TaxonoPy output file naming. - Both the generate functions and compute_output_paths use it so that - naming convention changes need only be made here. + This is the single source of truth for TaxonoPy output file naming. Both the generate functions and compute_output_paths use it so that naming convention changes need only be made here. Args: input_file: Absolute path to the input file. @@ -222,9 +220,7 @@ def compute_output_paths( ) -> List[str]: """Return intended output file paths (relative to output_dir) for a resolve run. - Used by the manifest system to record files before they are written. - Does not include fixed outputs such as resolution_stats.json or the - manifest file itself — callers are responsible for appending those. + Used by the manifest system to record files before they are written. Does not include fixed outputs such as resolution_stats.json or the manifest file itself — callers are responsible for appending those. Args: input_path: The --input argument (file or directory).