MFlowCode · sbryngelson · Mar 13, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Provides clean_build(): renames build/ aside and deletes it in the background.
+# mv is a metadata-only operation that succeeds even with stale NFS file handles,
+# unlike rm -rf which fails on ESTALE. The background delete is best-effort and
+# scoped to this job's PID to avoid races with concurrent matrix jobs.
+#
+# Usage: source .github/scripts/clean-build.sh
+#        clean_build
+
+clean_build() {
+    # Remove ./build (relative to the current working directory) without ever
+    # failing the caller: every step is best-effort and the function returns 0.
+    # Takes no arguments.
+    local stale="build.stale.$$"
+
+    # Clean up leftover stale directories from previous runs before adding a new one.
+    rm -rf -- build.stale.* 2>/dev/null || true
+
+    # Rename first, then delete the renamed tree in the background so the caller
+    # does not wait on it. Both output streams are detached so a caller that
+    # captures output is not held open by the background rm.
+    mv -- build "$stale" 2>/dev/null || true
+    rm -rf -- "$stale" >/dev/null 2>&1 &
+    disown 2>/dev/null || true
+
+    # If the rename did not happen, build/ is still in place and callers that
+    # test for its existence would silently reuse it. Try a direct delete and,
+    # if that fails too, say so instead of staying silent.
+    if [ -e build ]; then
+        rm -rf -- build 2>/dev/null || true
+        if [ -e build ]; then
+            echo "WARNING: clean_build could not remove build/; a stale build may be reused." >&2
+        fi
+    fi
+    return 0
+}
@@ -22,7 +22,8 @@ case "$cluster" in
     *) echo "ERROR: Unknown cluster '$cluster'"; exit 1 ;;
 esac
 
-rm -rf build
+source .github/scripts/clean-build.sh
+clean_build
 
 . ./mfc.sh load -c "$flag" -m g
 

@@ -0,0 +1,40 @@
+#!/bin/bash
+# Provides retry_sbatch(): submits a job script string via sbatch with retries.
+# Only retries on known transient SLURM/infrastructure errors (socket timeouts,
+# connection failures). Hard failures (bad account, invalid partition, QOS
+# violations) are not retried.
+#
+# Usage: source .github/scripts/retry-sbatch.sh
+#        job_id=$(retry_sbatch "$script_contents")
+
+retry_sbatch() {
+    # Submit a batch script to SLURM through sbatch, retrying transient failures.
+    #
+    # Arguments:
+    #   $1 - full text of the job script; it is piped to sbatch on stdin.
+    #   $2 - optional maximum number of attempts (default: 3).
+    #   $3 - optional delay in seconds between attempts (default: 30).
+    # Outputs:
+    #   stdout - the numeric job ID parsed from "Submitted batch job <id>".
+    #   stderr - progress and error messages.
+    # Returns:
+    #   0 once a job ID is obtained; 1 on a non-transient failure or when every
+    #   attempt has failed.
+    local script_contents="$1"
+    local max_attempts="${2:-3}"
+    local retry_delay="${3:-30}"
+    local attempt=1
+    local submit_output last_output=""
+    local submitted_re='Submitted batch job ([0-9]+)'
+    local transient_re='timed out|connection refused|connection reset|temporarily unavailable'
+
+    while [ "$attempt" -le "$max_attempts" ]; do
+        echo "sbatch attempt $attempt of $max_attempts..." >&2
+        # "|| true": a failed submission must not abort callers running with set -e.
+        submit_output=$(printf '%s\n' "$script_contents" | sbatch 2>&1) || true
+
+        # Match in-shell rather than through a grep pipeline, so a non-match
+        # cannot trip set -e / pipefail in the calling script.
+        if [[ "$submit_output" =~ $submitted_re ]]; then
+            echo "${BASH_REMATCH[1]}"
+            return 0
+        fi
+
+        last_output="$submit_output"
+        echo "sbatch failed: $submit_output" >&2
+
+        # Here-string instead of "echo | grep -q": with pipefail, grep -q exiting
+        # early can make the pipeline report failure even though it matched.
+        if ! grep -qiE "$transient_re" <<<"$submit_output"; then
+            echo "Non-transient sbatch failure — not retrying." >&2
+            return 1
+        fi
+
+        if [ "$attempt" -lt "$max_attempts" ]; then
+            echo "Transient error — retrying in ${retry_delay}s..." >&2
+            sleep "$retry_delay"
+        fi
+        attempt=$((attempt + 1))
+    done
+
+    echo "sbatch failed after $max_attempts attempts. Last error: $last_output" >&2
+    return 1
+}
+
@@ -85,9 +85,9 @@ if [ "$device" = "cpu" ]; then
     case "$cluster" in
         phoenix)
             sbatch_device_opts="\
-#SBATCH -p cpu-small
-#SBATCH --ntasks-per-node=24
-#SBATCH --mem-per-cpu=2G"
+#SBATCH -p cpu-small,cpu-medium,cpu-large
+#SBATCH --ntasks-per-node=12
+#SBATCH --mem-per-cpu=8G"
             ;;
         frontier|frontier_amd)
             sbatch_device_opts="\
@@ -161,8 +161,9 @@ rm -f "$output_file"
 # --- Module load mode (short form) ---
 module_mode=$([ "$device" = "gpu" ] && echo "g" || echo "c")
 
-# --- Submit ---
-submit_output=$(sbatch <<EOT
+# --- Submit (with retries for transient SLURM errors) ---
+source "${SCRIPT_DIR}/retry-sbatch.sh"
+_sbatch_script=$(cat <<EOT
 #!/bin/bash
 #SBATCH -J ${job_prefix}-${job_slug}
 #SBATCH --account=${account}
@@ -192,12 +193,8 @@ $sbatch_script_contents
 EOT
 )
 
-job_id=$(echo "$submit_output" | grep -oE '[0-9]+')
-if [ -z "$job_id" ]; then
-    echo "ERROR: Failed to submit job. sbatch output:"
-    echo "$submit_output"
-    exit 1
-fi
+# retry_sbatch prints the job ID on stdout and returns non-zero when submission
+# fails. Check the status explicitly so a failed submission can never fall
+# through and record an empty job ID, whether or not errexit is active here.
+if ! job_id=$(retry_sbatch "$_sbatch_script"); then
+    echo "ERROR: Failed to submit job (see sbatch output above)." >&2
+    exit 1
+fi
+unset _sbatch_script
 
 echo "Submitted batch job $job_id"
 echo "$job_id" > "$id_file"

@@ -25,7 +25,8 @@ fi
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix: always nuke stale builds (heterogeneous compute nodes → ISA mismatch risk).
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then

@@ -8,12 +8,26 @@ set -euo pipefail
 source .github/scripts/gpu-opts.sh
 build_opts="$gpu_opts"
 
+# --- Phoenix TMPDIR setup ---
+# Phoenix compute nodes have a small /tmp. With 8 parallel test threads each
+# spawning MPI processes, it fills up and ORTE session dir creation fails.
+# Redirect TMPDIR to project storage, same as bench.sh.
+if [ "$job_cluster" = "phoenix" ]; then
+    tmpbuild=/storage/project/r-sbryngelson3-0/sbryngelson3/mytmp_build
+    mkdir -p "$tmpbuild"
+    # mktemp -d creates a uniquely named directory, so two concurrent jobs can
+    # never end up sharing one (a random suffix plus mkdir -p could collide,
+    # and the first job to exit would then delete the other job's live TMPDIR).
+    currentdir=$(mktemp -d "$tmpbuild/run-XXXXXX")
+    export TMPDIR="$currentdir"
+    # Remove this job's private temp directory on any exit path (best-effort).
+    trap 'rm -rf "$currentdir" || true' EXIT
+fi
+
 # --- Build (if not pre-built on login node) ---
 # Phoenix builds inside SLURM; Frontier pre-builds via build.sh on the login node.
 # Phoenix builds inside SLURM on heterogeneous compute nodes — always start fresh
 # to avoid SIGILL from stale binaries compiled on a different microarchitecture.
 if [ "$job_cluster" = "phoenix" ]; then
-    rm -rf build
+    source .github/scripts/clean-build.sh
+    clean_build
 fi
 
 if [ ! -d "build" ]; then

@@ -20,7 +20,8 @@ build_opts="$gpu_opts"
 
 . ./mfc.sh load -c $compiler_flag -m $([ "$job_device" = "gpu" ] && echo "g" || echo "c")
 
-rm -rf build
+source .github/scripts/clean-build.sh
+clean_build
 
 source .github/scripts/retry-build.sh
 if [ "$run_bench" == "bench" ]; then