From 18ff9017298c53fd7feb5686326525058a3802ea Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:30:42 -0400 Subject: [PATCH 01/15] ci: harden Phoenix CPU job memory and add sbatch retry on transient errors - Increase Phoenix cpu-small mem-per-cpu from 2G to 8G, reduce ntasks from 24 to 12 to match actual -j 8 build parallelism and prevent OOM during case-optimized nvfortran --gpu mp compilation - Add --exclusive to prevent memory contention from co-scheduled jobs - Broaden partition to cpu-small,cpu-medium,cpu-large for availability - Add retry-sbatch.sh: retries sbatch up to 3x on transient SLURM errors (socket timeout, connection failures) but not on hard config failures - Use retry_sbatch() in submit-slurm-job.sh Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-sbatch.sh | 38 +++++++++++++++++++++++++++++ .github/scripts/submit-slurm-job.sh | 19 +++++++++------ 2 files changed, 49 insertions(+), 8 deletions(-) create mode 100644 .github/scripts/retry-sbatch.sh diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh new file mode 100644 index 0000000000..28af5c6e26 --- /dev/null +++ b/.github/scripts/retry-sbatch.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Provides retry_sbatch(): submits a job script string via sbatch with retries. +# Only retries on known transient SLURM/infrastructure errors (socket timeouts, +# connection failures). Hard failures (bad account, invalid partition, QOS +# violations) are not retried. +# +# Usage: source .github/scripts/retry-sbatch.sh +# job_id=$(retry_sbatch "$script_contents") + +retry_sbatch() { + local script_contents="$1" + local max_attempts=3 + local attempt=1 + local submit_output job_id + + while [ $attempt -le $max_attempts ]; do + echo "sbatch attempt $attempt of $max_attempts..." + submit_output=$(echo "$script_contents" | sbatch 2>&1) || true + job_id=$(echo "$submit_output" | grep -oE '[0-9]+') + if [ -n "$job_id" ]; then + echo "$job_id" + return 0 + fi + echo "sbatch failed: $submit_output" + if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then + echo "Non-transient sbatch failure — not retrying." + return 1 + fi + if [ $attempt -lt $max_attempts ]; then + echo "Transient error — retrying in 30s..." + sleep 30 + fi + attempt=$((attempt + 1)) + done + + echo "sbatch failed after $max_attempts attempts." + return 1 +} diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index eb6702cfbe..5987822f9b 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -85,9 +85,10 @@ if [ "$device" = "cpu" ]; then case "$cluster" in phoenix) sbatch_device_opts="\ -#SBATCH -p cpu-small -#SBATCH --ntasks-per-node=24 -#SBATCH --mem-per-cpu=2G" +#SBATCH -p cpu-small,cpu-medium,cpu-large +#SBATCH --ntasks-per-node=12 +#SBATCH --mem-per-cpu=8G +#SBATCH --exclusive" ;; frontier|frontier_amd) sbatch_device_opts="\ @@ -161,8 +162,9 @@ rm -f "$output_file" # --- Module load mode (short form) --- module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c") -# --- Submit --- -submit_output=$(sbatch < Date: Thu, 12 Mar 2026 19:38:10 -0400 Subject: [PATCH 02/15] ci: tolerate stale NFS file handles on rm -rf build Suppress errors from stale NFS file handles when wiping the build directory before a fresh build. Stale-handle files are inaccessible and cannot cause SIGILL, so ignoring rm failures is safe. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/prebuild-case-optimization.sh | 2 +- .github/workflows/common/bench.sh | 2 +- .github/workflows/common/test.sh | 3 ++- .github/workflows/frontier/build.sh | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 581630f742..66444efa9b 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -22,7 +22,7 @@ case "$cluster" in *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;; esac -rm -rf build +rm -rf build 2>/dev/null || true . ./mfc.sh load -c "$flag" -m g diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 3251f7baca..2573658700 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -25,7 +25,7 @@ fi # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then - rm -rf build + rm -rf build 2>/dev/null || true fi if [ ! -d "build" ]; then diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 746c54f5d1..4355891aaf 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -13,7 +13,8 @@ build_opts="$gpu_opts" # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh # to avoid SIGILL from stale binaries compiled on a different microarchitecture. if [ "$job_cluster" = "phoenix" ]; then - rm -rf build + # Suppress stale NFS file handle errors — those files are inaccessible anyway. + rm -rf build 2>/dev/null || true fi if [ ! -d "build" ]; then diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index d21b1ddac4..6d20842ea6 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,7 +20,7 @@ build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -rm -rf build +rm -rf build 2>/dev/null || true source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then From ab106161fa4abce13a5df73259e8973dd9cff27c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:42:57 -0400 Subject: [PATCH 03/15] ci: fix retry-sbatch stdout/stderr and job-ID extraction - Redirect all diagnostic output to stderr so it isn't captured into job_id when called via command substitution - Match 'Submitted batch job NNN' exactly instead of any digit sequence, preventing error messages containing numbers from being mistaken for a valid job ID Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-sbatch.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh index 28af5c6e26..ddfab3b216 100644 --- a/.github/scripts/retry-sbatch.sh +++ b/.github/scripts/retry-sbatch.sh @@ -14,25 +14,25 @@ retry_sbatch() { local submit_output job_id while [ $attempt -le $max_attempts ]; do - echo "sbatch attempt $attempt of $max_attempts..." + echo "sbatch attempt $attempt of $max_attempts..." >&2 submit_output=$(echo "$script_contents" | sbatch 2>&1) || true - job_id=$(echo "$submit_output" | grep -oE '[0-9]+') + job_id=$(echo "$submit_output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$') if [ -n "$job_id" ]; then echo "$job_id" return 0 fi - echo "sbatch failed: $submit_output" + echo "sbatch failed: $submit_output" >&2 if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then - echo "Non-transient sbatch failure — not retrying." + echo "Non-transient sbatch failure — not retrying." >&2 return 1 fi if [ $attempt -lt $max_attempts ]; then - echo "Transient error — retrying in 30s..." + echo "Transient error — retrying in 30s..." >&2 sleep 30 fi attempt=$((attempt + 1)) done - echo "sbatch failed after $max_attempts attempts." + echo "sbatch failed after $max_attempts attempts." >&2 return 1 } From 381dc7e210ea11a5e68398124b7a556777dfe8e3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:46:22 -0400 Subject: [PATCH 04/15] ci: use printf over echo for sbatch script piping More portable and defensive against script content starting with - or containing escape sequences. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-sbatch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh index ddfab3b216..fe3f151be7 100644 --- a/.github/scripts/retry-sbatch.sh +++ b/.github/scripts/retry-sbatch.sh @@ -15,7 +15,7 @@ retry_sbatch() { while [ $attempt -le $max_attempts ]; do echo "sbatch attempt $attempt of $max_attempts..." >&2 - submit_output=$(echo "$script_contents" | sbatch 2>&1) || true + submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true job_id=$(echo "$submit_output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$') if [ -n "$job_id" ]; then echo "$job_id" From 7f467b187ea26a5485299c462a582ed008f1dcd4 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:46:42 -0400 Subject: [PATCH 05/15] ci: add explanatory comment to rm -rf build suppression in build.sh and bench.sh Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/common/bench.sh | 1 + .github/workflows/frontier/build.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 2573658700..29bf78b3ff 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -25,6 +25,7 @@ fi # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then + # Suppress stale NFS file handle errors — those files are inaccessible anyway. rm -rf build 2>/dev/null || true fi diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 6d20842ea6..a121d9447f 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,6 +20,7 @@ build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") +# Suppress stale NFS file handle errors — those files are inaccessible anyway. rm -rf build 2>/dev/null || true source .github/scripts/retry-build.sh From e864803a2ddc1a980ff70d4f45464db5f5eca729 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:48:22 -0400 Subject: [PATCH 06/15] ci: always rebuild on Phoenix even if rm -rf build leaves partial directory If stale NFS handles prevent full deletion of build/, the old 'if [ ! -d build ]' guard would skip the rebuild entirely, leaving stale binaries from a different compute node that could cause SIGILL. Force a rebuild on Phoenix unconditionally. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/common/bench.sh | 2 +- .github/workflows/common/test.sh | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 29bf78b3ff..186ce3ba96 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -29,7 +29,7 @@ if [ "$job_cluster" = "phoenix" ]; then rm -rf build 2>/dev/null || true fi -if [ ! -d "build" ]; then +if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then source .github/scripts/retry-build.sh retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 fi diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 4355891aaf..09b996536d 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -17,7 +17,9 @@ if [ "$job_cluster" = "phoenix" ]; then rm -rf build 2>/dev/null || true fi -if [ ! -d "build" ]; then +# Phoenix must always rebuild (heterogeneous compute nodes → ISA mismatch risk), +# even if rm above left a partial build/ directory behind. +if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then source .github/scripts/retry-build.sh # Phoenix: smoke-test the syscheck binary to catch architecture mismatches From ff18b55e5366672f2b60ef6298cb49fb1da29e6b Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:54:21 -0400 Subject: [PATCH 07/15] ci: use rename trick to handle stale NFS file handles on build cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 'rm -rf build || true' with 'mv build build.stale.$$' followed by a background best-effort delete. mv is a metadata-only operation that succeeds even when files have stale NFS handles, guaranteeing build/ is gone before the fresh build starts. Old stale trees are cleaned up opportunistically in the background. This also simplifies test.sh and bench.sh: since mv reliably removes build/, the Phoenix-specific override in the build condition is no longer needed — the plain '[ ! -d build ]' check is sufficient again. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/prebuild-case-optimization.sh | 3 ++- .github/workflows/common/bench.sh | 8 +++++--- .github/workflows/common/test.sh | 10 +++++----- .github/workflows/frontier/build.sh | 4 ++-- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 66444efa9b..345c144563 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -22,7 +22,8 @@ case "$cluster" in *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;; esac -rm -rf build 2>/dev/null || true +mv build build.stale.$$ 2>/dev/null || true +rm -rf build.stale.* 2>/dev/null & disown . ./mfc.sh load -c "$flag" -m g diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 186ce3ba96..4cd45ac7ab 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -25,11 +25,13 @@ fi # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then - # Suppress stale NFS file handle errors — those files are inaccessible anyway. - rm -rf build 2>/dev/null || true + # Rename instead of rm: mv is a metadata-only op that succeeds even with stale + # NFS file handles. Delete the old tree in the background (best-effort). + mv build build.stale.$$ 2>/dev/null || true + rm -rf build.stale.* 2>/dev/null & disown fi -if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then +if [ ! -d "build" ]; then source .github/scripts/retry-build.sh retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1 fi diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 09b996536d..58fb48cd33 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -13,13 +13,13 @@ build_opts="$gpu_opts" # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh # to avoid SIGILL from stale binaries compiled on a different microarchitecture. if [ "$job_cluster" = "phoenix" ]; then - # Suppress stale NFS file handle errors — those files are inaccessible anyway. - rm -rf build 2>/dev/null || true + # Rename instead of rm: mv is a metadata-only op that succeeds even with stale + # NFS file handles. Delete the old tree in the background (best-effort). + mv build build.stale.$$ 2>/dev/null || true + rm -rf build.stale.* 2>/dev/null & disown fi -# Phoenix must always rebuild (heterogeneous compute nodes → ISA mismatch risk), -# even if rm above left a partial build/ directory behind. -if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then +if [ ! -d "build" ]; then source .github/scripts/retry-build.sh # Phoenix: smoke-test the syscheck binary to catch architecture mismatches diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index a121d9447f..4d0afc20ac 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,8 +20,8 @@ build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -# Suppress stale NFS file handle errors — those files are inaccessible anyway. -rm -rf build 2>/dev/null || true +mv build build.stale.$$ 2>/dev/null || true +rm -rf build.stale.* 2>/dev/null & disown source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then From b55a21e2881a75d9c1b35a7086a53229fad7541e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 19:58:26 -0400 Subject: [PATCH 08/15] ci: log last sbatch error output on retry exhaustion Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-sbatch.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh index fe3f151be7..78c45fa821 100644 --- a/.github/scripts/retry-sbatch.sh +++ b/.github/scripts/retry-sbatch.sh @@ -11,7 +11,7 @@ retry_sbatch() { local script_contents="$1" local max_attempts=3 local attempt=1 - local submit_output job_id + local submit_output job_id last_output="" while [ $attempt -le $max_attempts ]; do echo "sbatch attempt $attempt of $max_attempts..." >&2 @@ -21,6 +21,7 @@ retry_sbatch() { echo "$job_id" return 0 fi + last_output="$submit_output" echo "sbatch failed: $submit_output" >&2 if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then echo "Non-transient sbatch failure — not retrying." >&2 @@ -33,6 +34,6 @@ retry_sbatch() { attempt=$((attempt + 1)) done - echo "sbatch failed after $max_attempts attempts." >&2 + echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2 return 1 } From 38fca240c1506c11fea64434f4098b6ccc1faf3e Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:02:34 -0400 Subject: [PATCH 09/15] ci: scope stale build cleanup to current job's PID and fix missing EOF newline Replace 'rm -rf build.stale.*' with 'rm -rf build.stale.$$' so each job only cleans up its own renamed directory, avoiding a race if concurrent matrix jobs share a workspace. Also add trailing newline to retry-sbatch.sh. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/prebuild-case-optimization.sh | 2 +- .github/scripts/retry-sbatch.sh | 1 + .github/workflows/common/bench.sh | 2 +- .github/workflows/common/test.sh | 2 +- .github/workflows/frontier/build.sh | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 345c144563..2b1637c8c2 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -23,7 +23,7 @@ case "$cluster" in esac mv build build.stale.$$ 2>/dev/null || true -rm -rf build.stale.* 2>/dev/null & disown +rm -rf "build.stale.$$" 2>/dev/null & disown . ./mfc.sh load -c "$flag" -m g diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh index 78c45fa821..a46dbbcc63 100644 --- a/.github/scripts/retry-sbatch.sh +++ b/.github/scripts/retry-sbatch.sh @@ -37,3 +37,4 @@ retry_sbatch() { echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2 return 1 } + diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 4cd45ac7ab..9d4611309c 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -28,7 +28,7 @@ if [ "$job_cluster" = "phoenix" ]; then # Rename instead of rm: mv is a metadata-only op that succeeds even with stale # NFS file handles. Delete the old tree in the background (best-effort). mv build build.stale.$$ 2>/dev/null || true - rm -rf build.stale.* 2>/dev/null & disown + rm -rf "build.stale.$$" 2>/dev/null & disown fi if [ ! -d "build" ]; then diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 58fb48cd33..7bd2235771 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -16,7 +16,7 @@ if [ "$job_cluster" = "phoenix" ]; then # Rename instead of rm: mv is a metadata-only op that succeeds even with stale # NFS file handles. Delete the old tree in the background (best-effort). mv build build.stale.$$ 2>/dev/null || true - rm -rf build.stale.* 2>/dev/null & disown + rm -rf "build.stale.$$" 2>/dev/null & disown fi if [ ! -d "build" ]; then diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index 4d0afc20ac..b664b53747 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -21,7 +21,7 @@ build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") mv build build.stale.$$ 2>/dev/null || true -rm -rf build.stale.* 2>/dev/null & disown +rm -rf "build.stale.$$" 2>/dev/null & disown source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then From 084f31a3df1d9a49d10833a0fa4265468b3e6da3 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:04:11 -0400 Subject: [PATCH 10/15] ci: extract clean_build() helper to eliminate duplication The rename-then-background-delete pattern was copy-pasted across four scripts. Move it into .github/scripts/clean-build.sh and source it at each call site. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/clean-build.sh | 13 +++++++++++++ .github/scripts/prebuild-case-optimization.sh | 4 ++-- .github/workflows/common/bench.sh | 6 ++---- .github/workflows/common/test.sh | 6 ++---- .github/workflows/frontier/build.sh | 4 ++-- 5 files changed, 21 insertions(+), 12 deletions(-) create mode 100644 .github/scripts/clean-build.sh diff --git a/.github/scripts/clean-build.sh b/.github/scripts/clean-build.sh new file mode 100644 index 0000000000..4c859910d5 --- /dev/null +++ b/.github/scripts/clean-build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Provides clean_build(): renames build/ aside and deletes it in the background. +# mv is a metadata-only operation that succeeds even with stale NFS file handles, +# unlike rm -rf which fails on ESTALE. The background delete is best-effort and +# scoped to this job's PID to avoid races with concurrent matrix jobs. +# +# Usage: source .github/scripts/clean-build.sh +# clean_build + +clean_build() { + mv build "build.stale.$$" 2>/dev/null || true + rm -rf "build.stale.$$" 2>/dev/null & disown +} diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh index 2b1637c8c2..938ce2f438 100755 --- a/.github/scripts/prebuild-case-optimization.sh +++ b/.github/scripts/prebuild-case-optimization.sh @@ -22,8 +22,8 @@ case "$cluster" in *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;; esac -mv build build.stale.$$ 2>/dev/null || true -rm -rf "build.stale.$$" 2>/dev/null & disown +source .github/scripts/clean-build.sh +clean_build . ./mfc.sh load -c "$flag" -m g diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh index 9d4611309c..66d77cfd99 100644 --- a/.github/workflows/common/bench.sh +++ b/.github/workflows/common/bench.sh @@ -25,10 +25,8 @@ fi # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk). if [ "$job_cluster" = "phoenix" ]; then - # Rename instead of rm: mv is a metadata-only op that succeeds even with stale - # NFS file handles. Delete the old tree in the background (best-effort). - mv build build.stale.$$ 2>/dev/null || true - rm -rf "build.stale.$$" 2>/dev/null & disown + source .github/scripts/clean-build.sh + clean_build fi if [ ! -d "build" ]; then diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 7bd2235771..8badab2699 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -13,10 +13,8 @@ build_opts="$gpu_opts" # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh # to avoid SIGILL from stale binaries compiled on a different microarchitecture. if [ "$job_cluster" = "phoenix" ]; then - # Rename instead of rm: mv is a metadata-only op that succeeds even with stale - # NFS file handles. Delete the old tree in the background (best-effort). - mv build build.stale.$$ 2>/dev/null || true - rm -rf "build.stale.$$" 2>/dev/null & disown + source .github/scripts/clean-build.sh + clean_build fi if [ ! -d "build" ]; then diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh index b664b53747..5bd40999d7 100644 --- a/.github/workflows/frontier/build.sh +++ b/.github/workflows/frontier/build.sh @@ -20,8 +20,8 @@ build_opts="$gpu_opts" . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c") -mv build build.stale.$$ 2>/dev/null || true -rm -rf "build.stale.$$" 2>/dev/null & disown +source .github/scripts/clean-build.sh +clean_build source .github/scripts/retry-build.sh if [ "$run_bench" == "bench" ]; then From ecd2b228806a7d7011f495fa9dae6f909cd7c74c Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:05:48 -0400 Subject: [PATCH 11/15] ci: clean up leftover stale build dirs at job start to bound accumulation Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/clean-build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/scripts/clean-build.sh b/.github/scripts/clean-build.sh index 4c859910d5..2bed13ea0c 100644 --- a/.github/scripts/clean-build.sh +++ b/.github/scripts/clean-build.sh @@ -8,6 +8,8 @@ # clean_build clean_build() { + # Clean up leftover stale directories from previous runs before adding a new one. + rm -rf build.stale.* 2>/dev/null || true mv build "build.stale.$$" 2>/dev/null || true rm -rf "build.stale.$$" 2>/dev/null & disown } From 5e67dec22f25c053b6166a638d095e85574c7431 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:06:35 -0400 Subject: [PATCH 12/15] ci: remove unreachable job_id guard after retry_sbatch Under set -e, retry_sbatch returning 1 exits the script immediately. The if [ -z "$job_id" ] block was never reachable. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit-slurm-job.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index 5987822f9b..abead3c497 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -197,11 +197,6 @@ EOT job_id=$(retry_sbatch "$_sbatch_script") unset _sbatch_script -if [ -z "$job_id" ]; then - echo "ERROR: Failed to submit job." - exit 1 -fi - echo "Submitted batch job $job_id" echo "$job_id" > "$id_file" echo "Job ID written to $id_file" From 7ea9da530c0ca59ef536ad6c8a96ba8f1d8d7e49 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:07:18 -0400 Subject: [PATCH 13/15] ci: remove 'try again' from transient sbatch error pattern 'try again' is too broad and could match hard SLURM policy errors like 'QOS violates policy, try again later', causing unintended retries. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/retry-sbatch.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh index a46dbbcc63..ad3925b920 100644 --- a/.github/scripts/retry-sbatch.sh +++ b/.github/scripts/retry-sbatch.sh @@ -23,7 +23,7 @@ retry_sbatch() { fi last_output="$submit_output" echo "sbatch failed: $submit_output" >&2 - if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then + if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|temporarily unavailable"; then echo "Non-transient sbatch failure — not retrying." >&2 return 1 fi From defdd7c984ba6224b5b32068c20e4a6f2e0593ec Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 20:16:23 -0400 Subject: [PATCH 14/15] ci: remove --exclusive from Phoenix CPU jobs --exclusive caused 'Requested node configuration is not available' on cpu-small,cpu-medium,cpu-large partitions. The --mem-per-cpu=8G reservation already prevents memory contention from co-scheduled jobs. Co-Authored-By: Claude Sonnet 4.6 --- .github/scripts/submit-slurm-job.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh index abead3c497..78dd1ee9a2 100755 --- a/.github/scripts/submit-slurm-job.sh +++ b/.github/scripts/submit-slurm-job.sh @@ -87,8 +87,7 @@ if [ "$device" = "cpu" ]; then sbatch_device_opts="\ #SBATCH -p cpu-small,cpu-medium,cpu-large #SBATCH --ntasks-per-node=12 -#SBATCH --mem-per-cpu=8G -#SBATCH --exclusive" +#SBATCH --mem-per-cpu=8G" ;; frontier|frontier_amd) sbatch_device_opts="\ From 65e2642b10250b8d6df4162da9ca3383fdbb4862 Mon Sep 17 00:00:00 2001 From: Spencer Bryngelson Date: Thu, 12 Mar 2026 23:42:38 -0400 Subject: [PATCH 15/15] ci: redirect Phoenix TMPDIR in test.sh to avoid MPI session dir failures Phoenix compute nodes have a small /tmp. With 8 parallel test threads each spawning MPI processes over ~96 minutes, it fills up and ORTE fails to create its session directory, causing the last batch of tests to fail with 'No such file or directory'. Apply the same TMPDIR redirect to project storage that bench.sh already uses. Co-Authored-By: Claude Sonnet 4.6 --- .github/workflows/common/test.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh index 8badab2699..141d2e72d3 100644 --- a/.github/workflows/common/test.sh +++ b/.github/workflows/common/test.sh @@ -8,6 +8,19 @@ set -euo pipefail source .github/scripts/gpu-opts.sh build_opts="$gpu_opts" +# --- Phoenix TMPDIR setup --- +# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each +# spawning MPI processes, it fills up and ORTE session dir creation fails. +# Redirect TMPDIR to project storage, same as bench.sh. +if [ "$job_cluster" = "phoenix" ]; then + tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build + currentdir=$tmpbuild/run-$(( RANDOM % 9000 )) + mkdir -p $tmpbuild + mkdir -p $currentdir + export TMPDIR=$currentdir + trap 'rm -rf "$currentdir" || true' EXIT +fi + # --- Build (if not pre-built on login node) --- # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node. # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh