From fda43f15a423f6b3c681bdbab22bcb7480e2aa85 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 20:59:15 +0000 Subject: [PATCH 1/2] Initial plan From b9f91d3b9dd7288ca805e26db6e862af43a593f0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Feb 2026 21:04:42 +0000 Subject: [PATCH 2/2] Target --full-rerun to TaxonoPy-specific output files via manifest Previously --full-rerun called shutil.rmtree(output_dir), deleting the entire output directory including any non-TaxonoPy files. Now each run writes a taxonopy_manifest.json listing only the files it produced. --full-rerun reads that manifest and removes only the listed files (plus the manifest itself), leaving other content untouched. The existing_output detection also checks for the manifest file so that a previous run is reliably detected even if output filenames change. [AI-assisted session] Co-authored-by: thompsonmj <31709066+thompsonmj@users.noreply.github.com> --- src/taxonopy/cli.py | 27 ++++++++++---- src/taxonopy/output_manager.py | 53 ++++++++++++++++++++++++++ tests/test_output_manager.py | 68 ++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 7 deletions(-) create mode 100644 tests/test_output_manager.py diff --git a/src/taxonopy/cli.py b/src/taxonopy/cli.py index cd6cd86..c712154 100644 --- a/src/taxonopy/cli.py +++ b/src/taxonopy/cli.py @@ -11,7 +11,6 @@ from pathlib import Path from typing import List, Optional import json -import shutil from taxonopy import __version__ from taxonopy.config import config @@ -28,7 +27,7 @@ from taxonopy.query.gnverifier_client import GNVerifierClient from taxonopy.resolution.attempt_manager import ResolutionAttemptManager -from taxonopy.output_manager import generate_forced_output, generate_resolution_output +from taxonopy.output_manager import generate_forced_output, generate_resolution_output, write_output_manifest, read_output_manifest, MANIFEST_FILENAME from taxonopy.trace import entry as trace_entry @@ -182,7 +181,7 @@ def run_resolve(args: argparse.Namespace) -> int: namespace_stats = get_cache_stats() existing_namespace = namespace_stats["entry_count"] > 0 and not cache_cleared_via_flag - existing_output = any(output_dir.glob("*.resolved.*")) + existing_output = (output_dir / MANIFEST_FILENAME).exists() or any(output_dir.glob("*.resolved.*")) if (existing_namespace or existing_output) and not args.full_rerun: logging.warning( "Existing cache (%s) and/or output (%s) detected for this input. Rerun with --full-rerun to replace them.", @@ -191,11 +190,21 @@ def run_resolve(args: argparse.Namespace) -> int: ) return 0 if args.full_rerun: - logging.info("--full-rerun set: clearing cache and output directory before proceeding.") + logging.info("--full-rerun set: clearing cache and removing previous TaxonoPy outputs before proceeding.") clear_cache() - if output_dir.exists(): - shutil.rmtree(output_dir) - output_dir.mkdir(parents=True, exist_ok=True) + manifest_files = read_output_manifest(str(output_dir)) + manifest_path = output_dir / MANIFEST_FILENAME + for f in manifest_files: + try: + Path(f).unlink(missing_ok=True) + except OSError as e: + logging.warning("Could not remove output file %s: %s", f, e) + manifest_path.unlink(missing_ok=True) + if manifest_files: + logging.info("Removed %d previously generated TaxonoPy output files.", len(manifest_files)) + else: + logging.info("No manifest found; no previous TaxonoPy outputs removed.") + output_dir.mkdir(parents=True, exist_ok=True) try: start_time = time.time() @@ -205,6 +214,7 @@ def run_resolve(args: argparse.Namespace) -> int: if args.force_input: logging.info("Skipping resolution due to --force-input flag") generated_files = generate_forced_output(args.input, args.output_dir, args.output_format) + write_output_manifest(args.output_dir, generated_files) elapsed_time = time.time() - start_time logging.info(f"Forced output completed in {elapsed_time:.2f} seconds. Files: {generated_files}") return 0 @@ -280,6 +290,9 @@ def run_resolve(args: argparse.Namespace) -> int: stats_file_path = output_dir / "resolution_stats.json" stats_file_path.write_text(json.dumps(final_stats, indent=4)) logging.info(f"Statistics saved to {stats_file_path}") + # Write manifest of all generated output files + all_output_files = resolved_files + unsolved_files + [str(stats_file_path)] + write_output_manifest(str(output_dir), all_output_files) elapsed_time = time.time() - start_time logging.info(f"Processing completed in {elapsed_time:.2f} seconds.") return 0 diff --git a/src/taxonopy/output_manager.py b/src/taxonopy/output_manager.py index f67d83b..15f3d48 100644 --- a/src/taxonopy/output_manager.py +++ b/src/taxonopy/output_manager.py @@ -1,3 +1,4 @@ +import json import os import logging from pathlib import Path @@ -17,6 +18,58 @@ logger = logging.getLogger(__name__) +MANIFEST_FILENAME = "taxonopy_manifest.json" + + +def write_output_manifest(output_dir: str, files: List[str]) -> str: + """Write a manifest of TaxonoPy-generated files to the output directory. + + Args: + output_dir: Output directory path. + files: List of file paths generated by TaxonoPy. + + Returns: + Path to the manifest file. + """ + output_dir_path = Path(output_dir) + manifest_path = output_dir_path / MANIFEST_FILENAME + relative_files = [] + for f in files: + try: + relative_files.append(str(Path(f).relative_to(output_dir_path))) + except ValueError: + logger.warning( + "File %s is not under output_dir %s; storing absolute path in manifest.", f, output_dir_path + ) + relative_files.append(str(f)) + manifest = {"files": relative_files} + manifest_path.write_text(json.dumps(manifest, indent=2)) + logger.info(f"Wrote output manifest to {manifest_path}") + return str(manifest_path) + + +def read_output_manifest(output_dir: str) -> List[str]: + """Read the TaxonoPy output manifest and return absolute file paths. + + Args: + output_dir: Output directory path. + + Returns: + List of absolute file paths listed in the manifest, or empty list if no + manifest exists or it cannot be parsed. + """ + output_dir_path = Path(output_dir) + manifest_path = output_dir_path / MANIFEST_FILENAME + if not manifest_path.exists(): + return [] + try: + manifest = json.loads(manifest_path.read_text()) + return [str(output_dir_path / f) for f in manifest.get("files", [])] + except (json.JSONDecodeError, KeyError): + logger.warning(f"Could not parse output manifest at {manifest_path}.") + return [] + + def map_entry_to_output_format( entry: TaxonomicEntry, final_attempt: Optional[ResolutionAttempt] = None diff --git a/tests/test_output_manager.py b/tests/test_output_manager.py new file mode 100644 index 0000000..3b10957 --- /dev/null +++ b/tests/test_output_manager.py @@ -0,0 +1,68 @@ +import json +from pathlib import Path + +from taxonopy.output_manager import ( + MANIFEST_FILENAME, + read_output_manifest, + write_output_manifest, +) + + +def test_write_output_manifest_creates_file(tmp_path): + files = [str(tmp_path / "a.resolved.parquet"), str(tmp_path / "b.unsolved.parquet")] + manifest_path = write_output_manifest(str(tmp_path), files) + + assert Path(manifest_path).exists() + assert Path(manifest_path).name == MANIFEST_FILENAME + + +def test_write_output_manifest_stores_relative_paths(tmp_path): + files = [str(tmp_path / "a.resolved.parquet"), str(tmp_path / "sub" / "b.unsolved.csv")] + write_output_manifest(str(tmp_path), files) + + manifest = json.loads((tmp_path / MANIFEST_FILENAME).read_text()) + assert "a.resolved.parquet" in manifest["files"] + assert str(Path("sub") / "b.unsolved.csv") in manifest["files"] + # No absolute paths stored + for f in manifest["files"]: + assert not Path(f).is_absolute() + + +def test_read_output_manifest_returns_absolute_paths(tmp_path): + files = [str(tmp_path / "a.resolved.parquet"), str(tmp_path / "resolution_stats.json")] + write_output_manifest(str(tmp_path), files) + + result = read_output_manifest(str(tmp_path)) + assert sorted(result) == sorted(files) + + +def test_read_output_manifest_returns_empty_when_missing(tmp_path): + result = read_output_manifest(str(tmp_path)) + assert result == [] + + +def test_read_output_manifest_returns_empty_on_corrupt_json(tmp_path): + (tmp_path / MANIFEST_FILENAME).write_text("not valid json{{{") + result = read_output_manifest(str(tmp_path)) + assert result == [] + + +def test_full_rerun_deletes_only_manifest_files(tmp_path): + """Simulate the --full-rerun logic: only files listed in the manifest are removed.""" + taxonopy_file = tmp_path / "data.resolved.parquet" + other_file = tmp_path / "important_user_data.csv" + taxonopy_file.write_text("taxonopy output") + other_file.write_text("user data") + + write_output_manifest(str(tmp_path), [str(taxonopy_file)]) + + # Simulate --full-rerun behaviour + manifest_files = read_output_manifest(str(tmp_path)) + manifest_path = tmp_path / MANIFEST_FILENAME + for f in manifest_files: + Path(f).unlink(missing_ok=True) + manifest_path.unlink(missing_ok=True) + + assert not taxonopy_file.exists(), "TaxonoPy output file should have been removed" + assert other_file.exists(), "Non-TaxonoPy file should NOT have been removed" + assert not manifest_path.exists(), "Manifest should have been removed"