From 18ff9017298c53fd7feb5686326525058a3802ea Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:30:42 -0400
Subject: [PATCH 01/15] ci: harden Phoenix CPU job memory and add sbatch retry
 on transient errors

- Increase Phoenix cpu-small mem-per-cpu from 2G to 8G, reduce ntasks
  from 24 to 12 to match actual -j 8 build parallelism and prevent OOM
  during case-optimized nvfortran --gpu mp compilation
- Add --exclusive to prevent memory contention from co-scheduled jobs
- Broaden partition to cpu-small,cpu-medium,cpu-large for availability
- Add retry-sbatch.sh: attempts sbatch up to 3 times, retrying only on
  transient SLURM errors (socket timeout, connection failures) and not
  on hard config failures
- Use retry_sbatch() in submit-slurm-job.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-sbatch.sh     | 38 +++++++++++++++++++++++++++++
 .github/scripts/submit-slurm-job.sh | 19 +++++++++------
 2 files changed, 49 insertions(+), 8 deletions(-)
 create mode 100644 .github/scripts/retry-sbatch.sh

diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
new file mode 100644
index 0000000000..28af5c6e26
--- /dev/null
+++ b/.github/scripts/retry-sbatch.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Provides retry_sbatch(): submits a job script string via sbatch with retries.
+# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
+# connection failures). Hard failures (bad account, invalid partition, QOS
+# violations) are not retried.
+#
+# Usage: source .github/scripts/retry-sbatch.sh
+#        job_id=$(retry_sbatch "$script_contents")
+
+retry_sbatch() {
+    local script_contents="$1"
+    local max_attempts=3
+    local attempt=1
+    local submit_output job_id
+
+    while [ $attempt -le $max_attempts ]; do
+        echo "sbatch attempt $attempt of $max_attempts..."
+        submit_output=$(echo "$script_contents" | sbatch 2>&1) || true
+        job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
+        if [ -n "$job_id" ]; then
+            echo "$job_id"
+            return 0
+        fi
+        echo "sbatch failed: $submit_output"
+        if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then
+            echo "Non-transient sbatch failure — not retrying."
+            return 1
+        fi
+        if [ $attempt -lt $max_attempts ]; then
+            echo "Transient error — retrying in 30s..."
+            sleep 30
+        fi
+        attempt=$((attempt + 1))
+    done
+
+    echo "sbatch failed after $max_attempts attempts."
+    return 1
+}
diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index eb6702cfbe..5987822f9b 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -85,9 +85,10 @@ if [ "$device" = "cpu" ]; then
     case "$cluster" in
         phoenix)
             sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-#SBATCH --mem-per-cpu=2G"
+#SBATCH -p cpu-small,cpu-medium,cpu-large
+#SBATCH --ntasks-per-node=12
+#SBATCH --mem-per-cpu=8G
+#SBATCH --exclusive"
             ;;
         frontier|frontier_amd)
             sbatch_device_opts="\
@@ -161,8 +162,9 @@ rm -f "$output_file"
 # --- Module load mode (short form) ---
 module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")
 
-# --- Submit ---
-submit_output=$(sbatch <<EOT
+# --- Submit (with retries for transient SLURM errors) ---
+source "${SCRIPT_DIR}/retry-sbatch.sh"
+_sbatch_script=$(cat <<EOT
 #!/bin/bash
 #SBATCH -J ${job_prefix}-${job_slug}
 #SBATCH --account=${account}
@@ -192,10 +194,11 @@ $sbatch_script_contents
 EOT
 )
 
-job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
+job_id=$(retry_sbatch "$_sbatch_script")
+unset _sbatch_script
+
 if [ -z "$job_id" ]; then
-    echo "ERROR: Failed to submit job. sbatch output:"
-    echo "$submit_output"
+    echo "ERROR: Failed to submit job."
     exit 1
 fi
 

From 9acb3757b24f0c8787a8018c9f08cbf1a3fc8562 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:38:10 -0400
Subject: [PATCH 02/15] ci: tolerate stale NFS file handles on rm -rf build

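Suppress errors from stale NFS file handles when wiping the build
directory before a fresh build. Stale-handle files are inaccessible
and cannot themselves cause SIGILL, so ignoring rm failures on them is
safe. Caveat: if rm fails partway, build/ still exists, and the
'[ ! -d build ]' guard in bench.sh and test.sh will then skip the
rebuild.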

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/prebuild-case-optimization.sh | 2 +-
 .github/workflows/common/bench.sh             | 2 +-
 .github/workflows/common/test.sh              | 3 ++-
 .github/workflows/frontier/build.sh           | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 581630f742..66444efa9b 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -22,7 +22,7 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-rm -rf build
+rm -rf build 2>/dev/null || true
 
 . ./mfc.sh load -c "$flag" -m g
 
diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 3251f7baca..2573658700 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -25,7 +25,7 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    rm -rf build 2>/dev/null || true
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 746c54f5d1..4355891aaf 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -13,7 +13,8 @@ build_opts="$gpu_opts"
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
 # to avoid SIGILL from stale binaries compiled on a different microarchitecture.
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    # Suppress stale NFS file handle errors — those files are inaccessible anyway.
+    rm -rf build 2>/dev/null || true
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index d21b1ddac4..6d20842ea6 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,7 +20,7 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-rm -rf build
+rm -rf build 2>/dev/null || true
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then

From ab106161fa4abce13a5df73259e8973dd9cff27c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:42:57 -0400
Subject: [PATCH 03/15] ci: fix retry-sbatch stdout/stderr and job-ID
 extraction

- Redirect all diagnostic output to stderr so it isn't captured into
  job_id when called via command substitution
- Match 'Submitted batch job NNN' exactly instead of any digit sequence,
  preventing error messages containing numbers from being mistaken for
  a valid job ID

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-sbatch.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
index 28af5c6e26..ddfab3b216 100644
--- a/.github/scripts/retry-sbatch.sh
+++ b/.github/scripts/retry-sbatch.sh
@@ -14,25 +14,25 @@ retry_sbatch() {
     local submit_output job_id
 
     while [ $attempt -le $max_attempts ]; do
-        echo "sbatch attempt $attempt of $max_attempts..."
+        echo "sbatch attempt $attempt of $max_attempts..." >&2
         submit_output=$(echo "$script_contents" | sbatch 2>&1) || true
-        job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
+        job_id=$(echo "$submit_output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$')
         if [ -n "$job_id" ]; then
             echo "$job_id"
             return 0
         fi
-        echo "sbatch failed: $submit_output"
+        echo "sbatch failed: $submit_output" >&2
         if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then
-            echo "Non-transient sbatch failure — not retrying."
+            echo "Non-transient sbatch failure — not retrying." >&2
             return 1
         fi
         if [ $attempt -lt $max_attempts ]; then
-            echo "Transient error — retrying in 30s..."
+            echo "Transient error — retrying in 30s..." >&2
             sleep 30
         fi
         attempt=$((attempt + 1))
     done
 
-    echo "sbatch failed after $max_attempts attempts."
+    echo "sbatch failed after $max_attempts attempts." >&2
     return 1
 }

From 381dc7e210ea11a5e68398124b7a556777dfe8e3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:46:22 -0400
Subject: [PATCH 04/15] ci: use printf over echo for sbatch script piping

More portable and defensive against script content starting with -
or containing escape sequences.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-sbatch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
index ddfab3b216..fe3f151be7 100644
--- a/.github/scripts/retry-sbatch.sh
+++ b/.github/scripts/retry-sbatch.sh
@@ -15,7 +15,7 @@ retry_sbatch() {
 
     while [ $attempt -le $max_attempts ]; do
         echo "sbatch attempt $attempt of $max_attempts..." >&2
-        submit_output=$(echo "$script_contents" | sbatch 2>&1) || true
+        submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true
         job_id=$(echo "$submit_output" | grep -oE 'Submitted batch job ([0-9]+)' | grep -oE '[0-9]+$')
         if [ -n "$job_id" ]; then
             echo "$job_id"

From 7f467b187ea26a5485299c462a582ed008f1dcd4 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:46:42 -0400
Subject: [PATCH 05/15] ci: add explanatory comment to rm -rf build suppression
 in build.sh and bench.sh

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/common/bench.sh   | 1 +
 .github/workflows/frontier/build.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 2573658700..29bf78b3ff 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -25,6 +25,7 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
+    # Suppress stale NFS file handle errors — those files are inaccessible anyway.
     rm -rf build 2>/dev/null || true
 fi
 
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 6d20842ea6..a121d9447f 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,6 +20,7 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
+# Suppress stale NFS file handle errors — those files are inaccessible anyway.
 rm -rf build 2>/dev/null || true
 
 source .github/scripts/retry-build.sh

From e864803a2ddc1a980ff70d4f45464db5f5eca729 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:48:22 -0400
Subject: [PATCH 06/15] ci: always rebuild on Phoenix even if rm -rf build
 leaves partial directory

If stale NFS handles prevent full deletion of build/, the old
'if [ ! -d build ]' guard would skip the rebuild entirely, leaving
stale binaries from a different compute node that could cause SIGILL.
Force a rebuild on Phoenix unconditionally.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/common/bench.sh | 2 +-
 .github/workflows/common/test.sh  | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 29bf78b3ff..186ce3ba96 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -29,7 +29,7 @@ if [ "$job_cluster" = "phoenix" ]; then
     rm -rf build 2>/dev/null || true
 fi
 
-if [ ! -d "build" ]; then
+if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then
     source .github/scripts/retry-build.sh
     retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 fi
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 4355891aaf..09b996536d 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -17,7 +17,9 @@ if [ "$job_cluster" = "phoenix" ]; then
     rm -rf build 2>/dev/null || true
 fi
 
-if [ ! -d "build" ]; then
+# Phoenix must always rebuild (heterogeneous compute nodes → ISA mismatch risk),
+# even if rm above left a partial build/ directory behind.
+if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then
     source .github/scripts/retry-build.sh
 
     # Phoenix: smoke-test the syscheck binary to catch architecture mismatches

From ff18b55e5366672f2b60ef6298cb49fb1da29e6b Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:54:21 -0400
Subject: [PATCH 07/15] ci: use rename trick to handle stale NFS file handles
 on build cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace 'rm -rf build || true' with 'mv build build.stale.$$' followed
by a background best-effort delete. Renaming a directory is a
metadata-only operation, so it is expected to succeed even when files
inside it have stale NFS handles, leaving no build/ behind before the
fresh build starts. The mv itself is still guarded with '|| true', so a
failed rename is silently ignored. Old stale trees are cleaned up
opportunistically in the background.

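This also simplifies test.sh and bench.sh: as long as the rename
succeeds, build/ no longer exists, so the Phoenix-specific override in
the build condition is dropped and the plain '[ ! -d build ]' check is
used again. If the rename fails (it is followed by '|| true'), build/
stays in place and the rebuild is skipped.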

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/prebuild-case-optimization.sh |  3 ++-
 .github/workflows/common/bench.sh             |  8 +++++---
 .github/workflows/common/test.sh              | 10 +++++-----
 .github/workflows/frontier/build.sh           |  4 ++--
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 66444efa9b..345c144563 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -22,7 +22,8 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-rm -rf build 2>/dev/null || true
+mv build build.stale.$$ 2>/dev/null || true
+rm -rf build.stale.* 2>/dev/null & disown
 
 . ./mfc.sh load -c "$flag" -m g
 
diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 186ce3ba96..4cd45ac7ab 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -25,11 +25,13 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
-    # Suppress stale NFS file handle errors — those files are inaccessible anyway.
-    rm -rf build 2>/dev/null || true
+    # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
+    # NFS file handles. Delete the old tree in the background (best-effort).
+    mv build build.stale.$$ 2>/dev/null || true
+    rm -rf build.stale.* 2>/dev/null & disown
 fi
 
-if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then
+if [ ! -d "build" ]; then
     source .github/scripts/retry-build.sh
     retry_build ./mfc.sh build -j $n_jobs $build_opts || exit 1
 fi
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 09b996536d..58fb48cd33 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -13,13 +13,13 @@ build_opts="$gpu_opts"
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
 # to avoid SIGILL from stale binaries compiled on a different microarchitecture.
 if [ "$job_cluster" = "phoenix" ]; then
-    # Suppress stale NFS file handle errors — those files are inaccessible anyway.
-    rm -rf build 2>/dev/null || true
+    # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
+    # NFS file handles. Delete the old tree in the background (best-effort).
+    mv build build.stale.$$ 2>/dev/null || true
+    rm -rf build.stale.* 2>/dev/null & disown
 fi
 
-# Phoenix must always rebuild (heterogeneous compute nodes → ISA mismatch risk),
-# even if rm above left a partial build/ directory behind.
-if [ "$job_cluster" = "phoenix" ] || [ ! -d "build" ]; then
+if [ ! -d "build" ]; then
     source .github/scripts/retry-build.sh
 
     # Phoenix: smoke-test the syscheck binary to catch architecture mismatches
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index a121d9447f..4d0afc20ac 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,8 +20,8 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-# Suppress stale NFS file handle errors — those files are inaccessible anyway.
-rm -rf build 2>/dev/null || true
+mv build build.stale.$$ 2>/dev/null || true
+rm -rf build.stale.* 2>/dev/null & disown
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then

From b55a21e2881a75d9c1b35a7086a53229fad7541e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 19:58:26 -0400
Subject: [PATCH 08/15] ci: log last sbatch error output on retry exhaustion

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-sbatch.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
index fe3f151be7..78c45fa821 100644
--- a/.github/scripts/retry-sbatch.sh
+++ b/.github/scripts/retry-sbatch.sh
@@ -11,7 +11,7 @@ retry_sbatch() {
     local script_contents="$1"
     local max_attempts=3
     local attempt=1
-    local submit_output job_id
+    local submit_output job_id last_output=""
 
     while [ $attempt -le $max_attempts ]; do
         echo "sbatch attempt $attempt of $max_attempts..." >&2
@@ -21,6 +21,7 @@ retry_sbatch() {
             echo "$job_id"
             return 0
         fi
+        last_output="$submit_output"
         echo "sbatch failed: $submit_output" >&2
         if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then
             echo "Non-transient sbatch failure — not retrying." >&2
@@ -33,6 +34,6 @@ retry_sbatch() {
         attempt=$((attempt + 1))
     done
 
-    echo "sbatch failed after $max_attempts attempts." >&2
+    echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2
     return 1
 }

From 38fca240c1506c11fea64434f4098b6ccc1faf3e Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:02:34 -0400
Subject: [PATCH 09/15] ci: scope stale build cleanup to current job's PID and
 fix missing EOF newline

Replace 'rm -rf build.stale.*' with 'rm -rf build.stale.$$' so each job
only cleans up its own renamed directory, avoiding a race if concurrent
matrix jobs share a workspace. Also append an empty line to the end of
retry-sbatch.sh.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/prebuild-case-optimization.sh | 2 +-
 .github/scripts/retry-sbatch.sh               | 1 +
 .github/workflows/common/bench.sh             | 2 +-
 .github/workflows/common/test.sh              | 2 +-
 .github/workflows/frontier/build.sh           | 2 +-
 5 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 345c144563..2b1637c8c2 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -23,7 +23,7 @@ case "$cluster" in
 esac
 
 mv build build.stale.$$ 2>/dev/null || true
-rm -rf build.stale.* 2>/dev/null & disown
+rm -rf "build.stale.$$" 2>/dev/null & disown
 
 . ./mfc.sh load -c "$flag" -m g
 
diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
index 78c45fa821..a46dbbcc63 100644
--- a/.github/scripts/retry-sbatch.sh
+++ b/.github/scripts/retry-sbatch.sh
@@ -37,3 +37,4 @@ retry_sbatch() {
     echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2
     return 1
 }
+
diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 4cd45ac7ab..9d4611309c 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -28,7 +28,7 @@ if [ "$job_cluster" = "phoenix" ]; then
     # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
     # NFS file handles. Delete the old tree in the background (best-effort).
     mv build build.stale.$$ 2>/dev/null || true
-    rm -rf build.stale.* 2>/dev/null & disown
+    rm -rf "build.stale.$$" 2>/dev/null & disown
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 58fb48cd33..7bd2235771 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -16,7 +16,7 @@ if [ "$job_cluster" = "phoenix" ]; then
     # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
     # NFS file handles. Delete the old tree in the background (best-effort).
     mv build build.stale.$$ 2>/dev/null || true
-    rm -rf build.stale.* 2>/dev/null & disown
+    rm -rf "build.stale.$$" 2>/dev/null & disown
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index 4d0afc20ac..b664b53747 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -21,7 +21,7 @@ build_opts="$gpu_opts"
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
 mv build build.stale.$$ 2>/dev/null || true
-rm -rf build.stale.* 2>/dev/null & disown
+rm -rf "build.stale.$$" 2>/dev/null & disown
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then

From 084f31a3df1d9a49d10833a0fa4265468b3e6da3 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:04:11 -0400
Subject: [PATCH 10/15] ci: extract clean_build() helper to eliminate
 duplication

The rename-then-background-delete pattern was copy-pasted across four
scripts. Move it into .github/scripts/clean-build.sh and source it at
each call site.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/clean-build.sh                | 13 +++++++++++++
 .github/scripts/prebuild-case-optimization.sh |  4 ++--
 .github/workflows/common/bench.sh             |  6 ++----
 .github/workflows/common/test.sh              |  6 ++----
 .github/workflows/frontier/build.sh           |  4 ++--
 5 files changed, 21 insertions(+), 12 deletions(-)
 create mode 100644 .github/scripts/clean-build.sh

diff --git a/.github/scripts/clean-build.sh b/.github/scripts/clean-build.sh
new file mode 100644
index 0000000000..4c859910d5
--- /dev/null
+++ b/.github/scripts/clean-build.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Provides clean_build(): renames build/ aside and deletes it in the background.
+# mv is a metadata-only operation that succeeds even with stale NFS file handles,
+# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
+# scoped to this job's PID to avoid races with concurrent matrix jobs.
+#
+# Usage: source .github/scripts/clean-build.sh
+#        clean_build
+
+clean_build() {
+    mv build "build.stale.$$" 2>/dev/null || true
+    rm -rf "build.stale.$$" 2>/dev/null & disown
+}
diff --git a/.github/scripts/prebuild-case-optimization.sh b/.github/scripts/prebuild-case-optimization.sh
index 2b1637c8c2..938ce2f438 100755
--- a/.github/scripts/prebuild-case-optimization.sh
+++ b/.github/scripts/prebuild-case-optimization.sh
@@ -22,8 +22,8 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-mv build build.stale.$$ 2>/dev/null || true
-rm -rf "build.stale.$$" 2>/dev/null & disown
+source .github/scripts/clean-build.sh
+clean_build
 
 . ./mfc.sh load -c "$flag" -m g
 
diff --git a/.github/workflows/common/bench.sh b/.github/workflows/common/bench.sh
index 9d4611309c..66d77cfd99 100644
--- a/.github/workflows/common/bench.sh
+++ b/.github/workflows/common/bench.sh
@@ -25,10 +25,8 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
-    # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
-    # NFS file handles. Delete the old tree in the background (best-effort).
-    mv build build.stale.$$ 2>/dev/null || true
-    rm -rf "build.stale.$$" 2>/dev/null & disown
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 7bd2235771..8badab2699 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -13,10 +13,8 @@ build_opts="$gpu_opts"
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
 # to avoid SIGILL from stale binaries compiled on a different microarchitecture.
 if [ "$job_cluster" = "phoenix" ]; then
-    # Rename instead of rm: mv is a metadata-only op that succeeds even with stale
-    # NFS file handles. Delete the old tree in the background (best-effort).
-    mv build build.stale.$$ 2>/dev/null || true
-    rm -rf "build.stale.$$" 2>/dev/null & disown
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then
diff --git a/.github/workflows/frontier/build.sh b/.github/workflows/frontier/build.sh
index b664b53747..5bd40999d7 100644
--- a/.github/workflows/frontier/build.sh
+++ b/.github/workflows/frontier/build.sh
@@ -20,8 +20,8 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-mv build build.stale.$$ 2>/dev/null || true
-rm -rf "build.stale.$$" 2>/dev/null & disown
+source .github/scripts/clean-build.sh
+clean_build
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then

From ecd2b228806a7d7011f495fa9dae6f909cd7c74c Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:05:48 -0400
Subject: [PATCH 11/15] ci: clean up leftover stale build dirs at job start to
 bound accumulation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/clean-build.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/scripts/clean-build.sh b/.github/scripts/clean-build.sh
index 4c859910d5..2bed13ea0c 100644
--- a/.github/scripts/clean-build.sh
+++ b/.github/scripts/clean-build.sh
@@ -8,6 +8,8 @@
 #        clean_build
 
 clean_build() {
+    # Clean up leftover stale directories from previous runs before adding a new one.
+    rm -rf build.stale.* 2>/dev/null || true
     mv build "build.stale.$$" 2>/dev/null || true
     rm -rf "build.stale.$$" 2>/dev/null & disown
 }

From 5e67dec22f25c053b6166a638d095e85574c7431 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:06:35 -0400
Subject: [PATCH 12/15] ci: remove unreachable job_id guard after retry_sbatch

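retry_sbatch returns 0 only after printing a non-empty job ID; on every
failure path it returns 1. Under set -e, that non-zero status from the
'job_id=$(retry_sbatch ...)' assignment exits the script immediately,
so the if [ -z "$job_id" ] block was never reachable.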

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/submit-slurm-job.sh | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index 5987822f9b..abead3c497 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -197,11 +197,6 @@ EOT
 job_id=$(retry_sbatch "$_sbatch_script")
 unset _sbatch_script
 
-if [ -z "$job_id" ]; then
-    echo "ERROR: Failed to submit job."
-    exit 1
-fi
-
 echo "Submitted batch job $job_id"
 echo "$job_id" > "$id_file"
 echo "Job ID written to $id_file"

From 7ea9da530c0ca59ef536ad6c8a96ba8f1d8d7e49 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:07:18 -0400
Subject: [PATCH 13/15] ci: remove 'try again' from transient sbatch error
 pattern

'try again' is too broad and could match hard SLURM policy errors like
'QOS violates policy, try again later', causing unintended retries.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/retry-sbatch.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/scripts/retry-sbatch.sh b/.github/scripts/retry-sbatch.sh
index a46dbbcc63..ad3925b920 100644
--- a/.github/scripts/retry-sbatch.sh
+++ b/.github/scripts/retry-sbatch.sh
@@ -23,7 +23,7 @@ retry_sbatch() {
         fi
         last_output="$submit_output"
         echo "sbatch failed: $submit_output" >&2
-        if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|try again|temporarily unavailable"; then
+        if ! echo "$submit_output" | grep -qiE "timed out|connection refused|connection reset|temporarily unavailable"; then
             echo "Non-transient sbatch failure — not retrying." >&2
             return 1
         fi

From defdd7c984ba6224b5b32068c20e4a6f2e0593ec Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 20:16:23 -0400
Subject: [PATCH 14/15] ci: remove --exclusive from Phoenix CPU jobs

--exclusive caused 'Requested node configuration is not available' on
cpu-small,cpu-medium,cpu-large partitions. The --mem-per-cpu=8G
reservation already prevents memory contention from co-scheduled jobs.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/scripts/submit-slurm-job.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/scripts/submit-slurm-job.sh b/.github/scripts/submit-slurm-job.sh
index abead3c497..78dd1ee9a2 100755
--- a/.github/scripts/submit-slurm-job.sh
+++ b/.github/scripts/submit-slurm-job.sh
@@ -87,8 +87,7 @@ if [ "$device" = "cpu" ]; then
             sbatch_device_opts="\
 #SBATCH -p cpu-small,cpu-medium,cpu-large
 #SBATCH --ntasks-per-node=12
-#SBATCH --mem-per-cpu=8G
-#SBATCH --exclusive"
+#SBATCH --mem-per-cpu=8G"
             ;;
         frontier|frontier_amd)
             sbatch_device_opts="\

From 65e2642b10250b8d6df4162da9ca3383fdbb4862 Mon Sep 17 00:00:00 2001
From: Spencer Bryngelson <sbryngelson@gmail.com>
Date: Thu, 12 Mar 2026 23:42:38 -0400
Subject: [PATCH 15/15] ci: redirect Phoenix TMPDIR in test.sh to avoid MPI
 session dir failures

Phoenix compute nodes have a small /tmp. With 8 parallel test
threads each spawning MPI processes over ~96 minutes, it fills up
and ORTE fails to create its session directory, causing the last
batch of tests to fail with 'No such file or directory'. Apply the
same TMPDIR redirect to project storage that bench.sh already uses.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .github/workflows/common/test.sh | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.github/workflows/common/test.sh b/.github/workflows/common/test.sh
index 8badab2699..141d2e72d3 100644
--- a/.github/workflows/common/test.sh
+++ b/.github/workflows/common/test.sh
@@ -8,6 +8,19 @@ set -euo pipefail
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
+# --- Phoenix TMPDIR setup ---
+# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
+# spawning MPI processes, it fills up and ORTE session dir creation fails.
+# Redirect TMPDIR to project storage, same as bench.sh.
+if [ "$job_cluster" = "phoenix" ]; then
+    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
+    currentdir=$tmpbuild/run-$(( RANDOM % 9000 ))
+    mkdir -p $tmpbuild
+    mkdir -p $currentdir
+    export TMPDIR=$currentdir
+    trap 'rm -rf "$currentdir" || true' EXIT
+fi
+
 # --- Build (if not pre-built on login node) ---
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh