diff --git a/.github/scripts/clean-build.sh b/.github/scripts/clean-build.sh
new file mode 100644
index 0000000000..2bed13ea0c
--- /dev/null
+++ b/.github/scripts/clean-build.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Provides clean_build(): renames build/ aside and deletes it in the background.
+# mv is a metadata-only operation that succeeds even with stale NFS file handles,
+# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
+# scoped to this job's PID to avoid races with concurrent matrix jobs.
+#
+# Usage: source .github/scripts/clean-build.sh
+#        clean_build
+
+clean_build() {
+    # Clean up leftover stale directories from previous runs before adding a new one.
+    rm -rf build.stale.* 2>/dev/null || true
+    mv build "build.stale.$$" 2>/dev/null || true
+    rm -rf "build.stale.$$" 2>/dev/null & disown
+}
diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 581630f742..938ce2f438 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -22,7 +22,8 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-rm -rf build
+source .github/scripts/clean-build.sh
+clean_build
 
 . ./mfc.sh load -c "$flag" -m g
 
diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
new file mode 100644
index 0000000000..ad3925b920
--- /dev/null
+++ b/.github/scripts/retry-sbatch.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Provides retry_sbatch(): submits a job script string via sbatch with retries.
+# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
+# connection failures). Hard failures (bad account, invalid partition, QOS
+# violations) are not retried.
+#
+# Usage: source .github/scripts/retry-sbatch.sh
+#        job_id=$(retry_sbatch "$script_contents")
+
+retry_sbatch() {
+    local script_contents="$1"
+    local max_attempts=3
+    local attempt=1
+    local submit_output job_id last_output=""
+
+    while [ $attempt -le $max_attempts ]; do
+        echo "sbatch attempt $attempt of $max_attempts..." >&2
+        submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true
+        job_id=$(echo "$submit_output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$')
+        if [ -n "$job_id" ]; then
+            echo "$job_id"
+            return 0
+        fi
+        last_output="$submit_output"
+        echo "sbatch failed: $submit_output" >&2
+        if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|temporarily unavailable"; then
+            echo "Non-transient sbatch failure — not retrying." >&2
+            return 1
+        fi
+        if [ $attempt -lt $max_attempts ]; then
+            echo "Transient error — retrying in 30s..." >&2
+            sleep 30
+        fi
+        attempt=$((attempt + 1))
+    done
+
+    echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2
+    return 1
+}
+
diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index eb6702cfbe..78dd1ee9a2 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -85,9 +85,9 @@ if [ "$device" = "cpu" ]; then
     case "$cluster" in
         phoenix)
             sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-#SBATCH --mem-per-cpu=2G"
+#SBATCH -p cpu-small,cpu-medium,cpu-large
+#SBATCH --ntasks-per-node=12
+#SBATCH --mem-per-cpu=8G"
         ;;
         frontier|frontier_amd)
             sbatch_device_opts="\
@@ -161,8 +161,9 @@ rm -f "$output_file"
 
 # --- Module load mode (short form) ---
 module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")
 
-# --- Submit ---
-submit_output=$(sbatch < "$id_file")
+# --- Submit (with retry on transient SLURM errors) ---
+source .github/scripts/retry-sbatch.sh
+job_id=$(retry_sbatch "$(cat "$id_file")")
 
 
diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 3251f7baca..66d77cfd99 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -25,7 +25,8 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 746c54f5d1..141d2e72d3 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -8,12 +8,26 @@ set -euo pipefail
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
+# --- Phoenix TMPDIR setup ---
+# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
+# spawning MPI processes, it fills up and ORTE session dir creation fails.
+# Redirect TMPDIR to project storage, same as bench.sh.
+if [ "$job_cluster" = "phoenix" ]; then
+    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
+    currentdir=$tmpbuild/run-$$-$(( RANDOM % 9000 ))
+    mkdir -p "$tmpbuild"
+    mkdir -p "$currentdir"
+    export TMPDIR="$currentdir"
+    trap 'rm -rf "$currentdir" || true' EXIT
+fi
+
 # --- Build (if not pre-built on login node) ---
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
 # to avoid SIGILL from stale binaries compiled on a different microarchitecture.
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index d21b1ddac4..5bd40999d7 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,7 +20,8 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-rm -rf build
+source .github/scripts/clean-build.sh
+clean_build
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then