Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/scope-pipeline-artifacts-by-run-id.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Scope pipeline artifact directory by run ID to prevent concurrent runs from clobbering each other's H5 files, calibration packages, and weights.
3 changes: 3 additions & 0 deletions modal_app/data_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,7 @@ def build_datasets(
clear_checkpoints: bool = False,
skip_tests: bool = False,
skip_enhanced_cps: bool = False,
run_id: str = "",
):
"""Build all datasets with preemption-resilient checkpointing.

Expand Down Expand Up @@ -593,6 +594,8 @@ def build_datasets(
# failure does not block downstream calibration steps.
print("Copying pipeline artifacts to shared volume...")
artifacts_dir = Path(PIPELINE_MOUNT) / "artifacts"
if run_id:
artifacts_dir = artifacts_dir / run_id
artifacts_dir.mkdir(parents=True, exist_ok=True)

# Copy all intermediate H5 datasets for lineage tracing
Expand Down
14 changes: 9 additions & 5 deletions modal_app/local_area.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ def run_phase(
},
memory=16384,
cpu=4.0,
timeout=14400,
timeout=28800,
nonpreemptible=True,
)
def build_areas_worker(
Expand Down Expand Up @@ -438,7 +438,7 @@ def validate_staging(branch: str, version: str, run_id: str = "") -> Dict:
secrets=[hf_secret],
volumes={VOLUME_MOUNT: staging_volume},
memory=8192,
timeout=14400,
timeout=28800,
nonpreemptible=True,
)
def upload_to_staging(
Expand Down Expand Up @@ -646,7 +646,9 @@ def coordinate_publish(
version_dir = staging_dir / version

pipeline_volume.reload()
artifacts = Path("/pipeline/artifacts")
artifacts = (
Path(f"/pipeline/artifacts/{run_id}") if run_id else Path("/pipeline/artifacts")
)
weights_path = artifacts / "calibration_weights.npy"
db_path = artifacts / "policy_data.db"
dataset_path = artifacts / "source_imputed_stratified_extended_cps.h5"
Expand Down Expand Up @@ -900,7 +902,7 @@ def main(
"/pipeline": pipeline_volume,
},
memory=16384,
timeout=14400,
timeout=28800,
nonpreemptible=True,
)
def coordinate_national_publish(
Expand Down Expand Up @@ -929,7 +931,9 @@ def coordinate_national_publish(
staging_dir = Path(VOLUME_MOUNT)

pipeline_volume.reload()
artifacts = Path("/pipeline/artifacts")
artifacts = (
Path(f"/pipeline/artifacts/{run_id}") if run_id else Path("/pipeline/artifacts")
)
weights_path = artifacts / "national_calibration_weights.npy"
db_path = artifacts / "policy_data.db"
dataset_path = artifacts / "source_imputed_stratified_extended_cps.h5"
Expand Down
34 changes: 24 additions & 10 deletions modal_app/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,21 @@
REPO_URL = "https://github.com/PolicyEngine/policyengine-us-data.git"
PIPELINE_MOUNT = "/pipeline"
STAGING_MOUNT = "/staging"
ARTIFACTS_DIR = f"{PIPELINE_MOUNT}/artifacts"
ARTIFACTS_BASE = f"{PIPELINE_MOUNT}/artifacts"
RUNS_DIR = f"{PIPELINE_MOUNT}/runs"


def artifacts_dir_for_run(run_id: str) -> str:
    """Return the artifacts directory for a given pipeline run.

    A non-empty run_id produces a run-scoped subdirectory under the
    artifacts base; an empty run_id yields the flat base path, keeping
    standalone invocations (which predate run scoping) working unchanged.
    """
    return f"{ARTIFACTS_BASE}/{run_id}" if run_id else ARTIFACTS_BASE


# ── Run metadata ─────────────────────────────────────────────────


Expand Down Expand Up @@ -302,7 +313,7 @@ def stage_base_datasets(
version: Package version string for the commit.
branch: Git branch for repo clone.
"""
artifacts = Path(ARTIFACTS_DIR)
artifacts = Path(artifacts_dir_for_run(run_id))

files_with_paths = []

Expand Down Expand Up @@ -666,8 +677,8 @@ def run_pipeline(
run_dir.mkdir(parents=True, exist_ok=True)
(run_dir / "diagnostics").mkdir(exist_ok=True)

# Create artifacts directory
Path(ARTIFACTS_DIR).mkdir(parents=True, exist_ok=True)
# Create run-scoped artifacts directory
Path(artifacts_dir_for_run(run_id)).mkdir(parents=True, exist_ok=True)

write_run_meta(meta, pipeline_volume)

Expand Down Expand Up @@ -704,6 +715,7 @@ def run_pipeline(
clear_checkpoints=clear_checkpoints,
skip_tests=True,
skip_enhanced_cps=False,
run_id=run_id,
)

# The build_datasets step produces files in its
Expand Down Expand Up @@ -732,6 +744,7 @@ def run_pipeline(
branch=branch,
workers=num_workers,
n_clones=n_clones,
run_id=run_id,
)
print(f" Package at: {pkg_path}")

Expand All @@ -750,7 +763,7 @@ def run_pipeline(
print("\n[Step 3/5] Fitting calibration weights...")
step_start = time.time()

vol_path = "/pipeline/artifacts/calibration_package.pkl"
vol_path = f"{artifacts_dir_for_run(run_id)}/calibration_package.pkl"
target_cfg = "policyengine_us_data/calibration/target_config.yaml"

# Spawn regional fit
Expand Down Expand Up @@ -794,16 +807,17 @@ def run_pipeline(
regional_result = regional_handle.get()
print(" Regional fit complete. Writing to volume...")

# Write regional results to pipeline volume
# Write regional results to pipeline volume (run-scoped)
artifacts_rel = f"artifacts/{run_id}" if run_id else "artifacts"
with pipeline_volume.batch_upload(force=True) as batch:
batch.put_file(
BytesIO(regional_result["weights"]),
"artifacts/calibration_weights.npy",
f"{artifacts_rel}/calibration_weights.npy",
)
if regional_result.get("config"):
batch.put_file(
BytesIO(regional_result["config"]),
"artifacts/unified_run_config.json",
f"{artifacts_rel}/unified_run_config.json",
)

archive_diagnostics(
Expand All @@ -822,12 +836,12 @@ def run_pipeline(
with pipeline_volume.batch_upload(force=True) as batch:
batch.put_file(
BytesIO(national_result["weights"]),
"artifacts/national_calibration_weights.npy",
f"{artifacts_rel}/national_calibration_weights.npy",
)
if national_result.get("config"):
batch.put_file(
BytesIO(national_result["config"]),
"artifacts/national_unified_run_config.json",
f"{artifacts_rel}/national_unified_run_config.json",
)

archive_diagnostics(
Expand Down
Loading
Loading