Skip to content

Commit 7bde1c3

Browse files
committed
RAW rdf cleaning safeguard
1 parent 6b44365 commit 7bde1c3

2 files changed

Lines changed: 74 additions & 1 deletion

File tree

test/test_vcf_rdfizer_unit.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,6 +1033,63 @@ def fake_run(cmd, cwd=None, env=None):
10331033
self.assertTrue((out_dir / "sample" / "sample.nt").exists())
10341034
self.assertTrue((out_dir / "sample" / "sample.hdt").exists())
10351035

1036+
def test_main_full_mode_refuses_rdf_cleanup_until_all_methods_succeed(self):
    """Raw RDF survives and main exits 1 when a requested method has no result.

    The run requests ``gzip,brotli`` while the patched compression helper
    reports a result for gzip only, so brotli is missing from the
    bookkeeping. The test expects exit code 1 and the ``sample.nt`` file
    to still be present in the output directory.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workspace = Path(scratch)
        input_dir, rules_path = prepare_inputs(workspace)
        out_dir = workspace / "out"

        def fake_run(cmd, cwd=None, env=None):
            # Only the conversion step has a side effect: it leaves an
            # N-Triples file behind. Every command reports success.
            if "/opt/vcf-rdfizer/run_conversion.sh" not in cmd:
                return 0
            stem = output_name_from_command(cmd) or "sample"
            sample_dir = out_dir / stem
            sample_dir.mkdir(parents=True, exist_ok=True)
            nt_file = sample_dir / f"{stem}.nt"
            nt_file.write_text("<s> <p> <o> .\n")
            return 0

        # Incomplete bookkeeping on purpose: a gzip entry is returned,
        # and no brotli entry exists at all.
        def fake_compress(*, rdf_path, out_dir, target_out_dir, image_ref, methods, wrapper_log_path, status_indent):
            destination = target_out_dir or out_dir
            gzip_record = {
                "exit_code": 0,
                "wall_seconds": 0.01,
                "output_path": str(destination / f"{rdf_path.name}.gz"),
                "output_size_bytes": 12,
            }
            return True, {"gzip": gzip_record}

        cli_args = [
            "--input",
            str(input_dir),
            "--rules",
            str(rules_path),
            "--out",
            str(out_dir),
            "--compression",
            "gzip,brotli",
            "--keep-tsv",
        ]

        # main is invoked from inside the temporary workspace; the
        # previous working directory is restored even if it raises.
        original_cwd = os.getcwd()
        os.chdir(workspace)
        try:
            with mock.patch.object(vcf_rdfizer, "run", side_effect=fake_run), mock.patch.object(
                vcf_rdfizer, "check_docker", return_value=True
            ), mock.patch.object(
                vcf_rdfizer, "docker_image_exists", return_value=True
            ), mock.patch.object(
                vcf_rdfizer, "discover_tsv_triplets", return_value=mocked_triplets()
            ), mock.patch.object(
                vcf_rdfizer, "run_compression_methods_for_rdf", side_effect=fake_compress
            ):
                exit_code = invoke_main(cli_args)
        finally:
            os.chdir(original_cwd)

        self.assertEqual(exit_code, 1)
        surviving_rdf = out_dir / "sample" / "sample.nt"
        self.assertTrue(surviving_rdf.exists())
1092+
10361093
def test_main_full_mode_writes_compression_metrics_artifacts(self):
10371094
"""Full mode writes compression metrics JSON/time artifacts and updates metrics.csv row."""
10381095
with tempfile.TemporaryDirectory() as td:

vcf_rdfizer.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1560,8 +1560,24 @@ def run_full_mode(
15601560
return 1
15611561

15621562
if not keep_rdf and selected_methods:
1563-
# Cleanup raw RDF when compression is enabled and keep-rdf is not set.
1563+
# Cleanup raw RDF only after every selected compression method has
1564+
# completed successfully for that specific RDF artifact.
15641565
for raw_rdf_path in raw_rdf_files:
1566+
method_results = method_results_by_file.get(raw_rdf_path.name, {})
1567+
missing_or_failed = []
1568+
for method in selected_methods:
1569+
result = method_results.get(method)
1570+
if result is None or int(result.get("exit_code", 1)) != 0:
1571+
missing_or_failed.append(method)
1572+
if missing_or_failed:
1573+
eprint(
1574+
"Error: refusing to remove raw RDF before all selected compression "
1575+
f"methods completed successfully for '{raw_rdf_path.name}'. "
1576+
f"Pending/failed: {', '.join(missing_or_failed)}"
1577+
)
1578+
eprint(f"See log for details: {wrapper_log_path}")
1579+
return 1
1580+
15651581
if raw_rdf_path.exists():
15661582
if not remove_file_with_docker_fallback(
15671583
path=raw_rdf_path,

0 commit comments

Comments
 (0)