Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 20 additions & 7 deletions src/taxonopy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from pathlib import Path
from typing import List, Optional
import json
import shutil

from taxonopy import __version__
from taxonopy.config import config
Expand All @@ -28,7 +27,7 @@
from taxonopy.query.gnverifier_client import GNVerifierClient
from taxonopy.resolution.attempt_manager import ResolutionAttemptManager

from taxonopy.output_manager import generate_forced_output, generate_resolution_output
from taxonopy.output_manager import generate_forced_output, generate_resolution_output, write_output_manifest, read_output_manifest, MANIFEST_FILENAME

from taxonopy.trace import entry as trace_entry

Expand Down Expand Up @@ -182,7 +181,7 @@ def run_resolve(args: argparse.Namespace) -> int:

namespace_stats = get_cache_stats()
existing_namespace = namespace_stats["entry_count"] > 0 and not cache_cleared_via_flag
existing_output = any(output_dir.glob("*.resolved.*"))
existing_output = (output_dir / MANIFEST_FILENAME).exists() or any(output_dir.glob("*.resolved.*"))
if (existing_namespace or existing_output) and not args.full_rerun:
logging.warning(
"Existing cache (%s) and/or output (%s) detected for this input. Rerun with --full-rerun to replace them.",
Expand All @@ -191,11 +190,21 @@ def run_resolve(args: argparse.Namespace) -> int:
)
return 0
if args.full_rerun:
logging.info("--full-rerun set: clearing cache and output directory before proceeding.")
logging.info("--full-rerun set: clearing cache and removing previous TaxonoPy outputs before proceeding.")
clear_cache()
if output_dir.exists():
shutil.rmtree(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
manifest_files = read_output_manifest(str(output_dir))
manifest_path = output_dir / MANIFEST_FILENAME
for f in manifest_files:
try:
Path(f).unlink(missing_ok=True)
except OSError as e:
logging.warning("Could not remove output file %s: %s", f, e)
manifest_path.unlink(missing_ok=True)
if manifest_files:
logging.info("Removed %d previously generated TaxonoPy output files.", len(manifest_files))
else:
logging.info("No manifest found; no previous TaxonoPy outputs removed.")
output_dir.mkdir(parents=True, exist_ok=True)

try:
start_time = time.time()
Expand All @@ -205,6 +214,7 @@ def run_resolve(args: argparse.Namespace) -> int:
if args.force_input:
logging.info("Skipping resolution due to --force-input flag")
generated_files = generate_forced_output(args.input, args.output_dir, args.output_format)
write_output_manifest(args.output_dir, generated_files)
elapsed_time = time.time() - start_time
logging.info(f"Forced output completed in {elapsed_time:.2f} seconds. Files: {generated_files}")
return 0
Expand Down Expand Up @@ -280,6 +290,9 @@ def run_resolve(args: argparse.Namespace) -> int:
stats_file_path = output_dir / "resolution_stats.json"
stats_file_path.write_text(json.dumps(final_stats, indent=4))
logging.info(f"Statistics saved to {stats_file_path}")
# Write manifest of all generated output files
all_output_files = resolved_files + unsolved_files + [str(stats_file_path)]
write_output_manifest(str(output_dir), all_output_files)
elapsed_time = time.time() - start_time
logging.info(f"Processing completed in {elapsed_time:.2f} seconds.")
return 0
Expand Down
53 changes: 53 additions & 0 deletions src/taxonopy/output_manager.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import json
import os
import logging
from pathlib import Path
Expand All @@ -17,6 +18,58 @@

logger = logging.getLogger(__name__)

MANIFEST_FILENAME = "taxonopy_manifest.json"


def write_output_manifest(output_dir: str, files: List[str]) -> str:
    """Write a manifest of TaxonoPy-generated files to the output directory.

    Files located under ``output_dir`` are recorded relative to it; any other
    file is recorded by its absolute path so that the entry still identifies
    the same file when the manifest is read back.

    Args:
        output_dir: Output directory path. Created if it does not exist yet.
        files: List of file paths generated by TaxonoPy.

    Returns:
        Path to the manifest file.

    Raises:
        OSError: If the directory or the manifest file cannot be written.
    """
    output_dir_path = Path(output_dir)
    manifest_path = output_dir_path / MANIFEST_FILENAME

    # Compare absolute, lexically normalised paths: ``relative_to`` is purely
    # textual, so mixing a relative output_dir with absolute file paths (or
    # the reverse) would otherwise never match. ``abspath`` does not follow
    # symlinks, so a symlinked output file is still recorded as itself.
    base_dir = Path(os.path.abspath(output_dir_path))

    manifest_entries = []
    for f in files:
        absolute_file = Path(os.path.abspath(f))
        try:
            manifest_entries.append(str(absolute_file.relative_to(base_dir)))
        except ValueError:
            logger.warning(
                "File %s is not under output_dir %s; storing absolute path in manifest.", f, output_dir_path
            )
            # Store a genuinely absolute path: a relative one would be
            # re-anchored under output_dir when the manifest is read.
            manifest_entries.append(str(absolute_file))

    manifest = {"files": manifest_entries}
    output_dir_path.mkdir(parents=True, exist_ok=True)
    manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    logger.info("Wrote output manifest to %s", manifest_path)
    return str(manifest_path)


def read_output_manifest(output_dir: str) -> List[str]:
    """Read the TaxonoPy output manifest and return the listed file paths.

    Each manifest entry is joined onto ``output_dir``; an entry that is
    already absolute is returned unchanged (joining an absolute path onto a
    ``Path`` yields that absolute path).

    Args:
        output_dir: Output directory path.

    Returns:
        List of file paths listed in the manifest, or an empty list if no
        manifest exists, it cannot be read or parsed, or it does not have the
        expected ``{"files": [<str>, ...]}`` structure.
    """
    output_dir_path = Path(output_dir)
    manifest_path = output_dir_path / MANIFEST_FILENAME

    try:
        manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No manifest is the normal "nothing generated yet" case; stay quiet.
        return []
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Could not parse output manifest at %s.", manifest_path)
        return []

    # Valid JSON is not necessarily a valid manifest: reject anything that is
    # not an object whose "files" value is a list of strings instead of
    # letting AttributeError/TypeError escape to the caller.
    entries = manifest.get("files", []) if isinstance(manifest, dict) else None
    if not isinstance(entries, list) or not all(isinstance(entry, str) for entry in entries):
        logger.warning("Could not parse output manifest at %s.", manifest_path)
        return []

    return [str(output_dir_path / entry) for entry in entries]


def map_entry_to_output_format(
entry: TaxonomicEntry,
final_attempt: Optional[ResolutionAttempt] = None
Expand Down
68 changes: 68 additions & 0 deletions tests/test_output_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
from pathlib import Path

from taxonopy.output_manager import (
MANIFEST_FILENAME,
read_output_manifest,
write_output_manifest,
)


def test_write_output_manifest_creates_file(tmp_path):
    """Writing a manifest produces a file with the expected name."""
    generated = [
        str(tmp_path / name)
        for name in ("a.resolved.parquet", "b.unsolved.parquet")
    ]

    written = Path(write_output_manifest(str(tmp_path), generated))

    assert written.exists()
    assert written.name == MANIFEST_FILENAME


def test_write_output_manifest_stores_relative_paths(tmp_path):
    """Entries for files under the output directory are stored relative to it."""
    top_level = tmp_path / "a.resolved.parquet"
    nested = tmp_path / "sub" / "b.unsolved.csv"
    write_output_manifest(str(tmp_path), [str(top_level), str(nested)])

    payload = json.loads((tmp_path / MANIFEST_FILENAME).read_text())
    recorded = payload["files"]

    assert "a.resolved.parquet" in recorded
    assert str(Path("sub") / "b.unsolved.csv") in recorded
    # No absolute paths stored
    assert not any(Path(entry).is_absolute() for entry in recorded)


def test_read_output_manifest_returns_absolute_paths(tmp_path):
    """Reading a manifest back yields the same full paths that were written."""
    expected = [
        str(tmp_path / name)
        for name in ("a.resolved.parquet", "resolution_stats.json")
    ]
    write_output_manifest(str(tmp_path), expected)

    loaded = read_output_manifest(str(tmp_path))

    assert sorted(loaded) == sorted(expected)


def test_read_output_manifest_returns_empty_when_missing(tmp_path):
    """A directory without a manifest yields an empty list."""
    assert read_output_manifest(str(tmp_path)) == []


def test_read_output_manifest_returns_empty_on_corrupt_json(tmp_path):
    """A manifest containing invalid JSON yields an empty list."""
    corrupt_manifest = tmp_path / MANIFEST_FILENAME
    corrupt_manifest.write_text("not valid json{{{")

    assert read_output_manifest(str(tmp_path)) == []


def test_full_rerun_deletes_only_manifest_files(tmp_path):
    """Replay the --full-rerun cleanup steps and check unlisted files survive."""
    manifest_path = tmp_path / MANIFEST_FILENAME
    taxonopy_file = tmp_path / "data.resolved.parquet"
    other_file = tmp_path / "important_user_data.csv"
    for target, content in (
        (taxonopy_file, "taxonopy output"),
        (other_file, "user data"),
    ):
        target.write_text(content)

    # Only the TaxonoPy output is listed; the user file is deliberately left out.
    write_output_manifest(str(tmp_path), [str(taxonopy_file)])

    # Same steps as the cleanup: unlink every listed file, then the manifest.
    for listed in map(Path, read_output_manifest(str(tmp_path))):
        listed.unlink(missing_ok=True)
    manifest_path.unlink(missing_ok=True)

    assert not taxonopy_file.exists(), "TaxonoPy output file should have been removed"
    assert other_file.exists(), "Non-TaxonoPy file should NOT have been removed"
    assert not manifest_path.exists(), "Manifest should have been removed"