Skip to content

Commit d5569b0

Browse files
sbryngelson and claude committed
Merge upstream/master into ci-bench
Resolve conflicts with MFlowCode#1148 (build caching):
- frontier/build.sh, frontier_amd/build.sh: take upstream's cache + retry logic (proactive clean would defeat caching)
- bench.yml: keep our pull_request trigger model (upstream's workflow_run Get PR Info step doesn't apply)
- phoenix/bench.sh: remove proactive clean (unnecessary overhead for fresh checkouts, and would break caching)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 parents 74e11ab + 356b61f commit d5569b0

116 files changed

Lines changed: 1689 additions & 518 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/scripts/run_parallel_benchmarks.sh

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,16 @@ else
5252
echo "Master job completed successfully"
5353
fi
5454

55-
# Check if either job failed
55+
# Warn if either job failed (partial results may still be usable)
5656
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
57-
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
58-
exit 1
57+
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
58+
echo "Checking for partial results..."
59+
else
60+
echo "=========================================="
61+
echo "Both benchmark jobs completed successfully!"
62+
echo "=========================================="
5963
fi
6064

61-
echo "=========================================="
62-
echo "Both benchmark jobs completed successfully!"
63-
echo "=========================================="
64-
6565
# Final verification that output files exist before proceeding
6666
pr_yaml="pr/bench-${device}-${interface}.yaml"
6767
master_yaml="master/bench-${device}-${interface}.yaml"

.github/scripts/setup-build-cache.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/bin/bash
2+
# Sets up a persistent build cache for self-hosted CI runners.
3+
# Creates a symlink: ./build -> /storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/<key>/build
4+
#
5+
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
6+
# This avoids cross-runner path issues entirely — CMake's absolute paths are
7+
# always correct because the same runner always uses the same workspace path.
8+
#
9+
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
10+
11+
_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
12+
_cache_device="${2:?}"
13+
_cache_interface="${3:-none}"
14+
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"
15+
16+
_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
17+
_cache_base="/storage/coda1/d-coc/0/sbryngelson3/.mfc-ci-cache/${_cache_key}/build"
18+
19+
mkdir -p "$_cache_base"
20+
_cache_dir="$(cd "$_cache_base" && pwd -P)"
21+
22+
echo "=== Build Cache Setup ==="
23+
echo " Cache key: $_cache_key"
24+
echo " Cache dir: $_cache_dir"
25+
26+
# Replace any existing build/ (real dir or stale symlink) with a symlink
27+
# to our runner-specific cache directory.
28+
# Use unlink for symlinks to avoid rm -rf following the link and deleting
29+
# the shared cache contents (which another runner may be using).
30+
if [ -L "build" ]; then
31+
unlink "build"
32+
elif [ -e "build" ]; then
33+
rm -rf "build"
34+
fi
35+
36+
ln -s "$_cache_dir" "build"
37+
38+
echo " Symlink: build -> $_cache_dir"
39+
echo "========================="

.github/scripts/submit_and_monitor_bench.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,13 @@ fi
3737
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"
3838

3939
# Use the monitoring script from PR (where this script lives)
40-
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"
41-
42-
echo "[$dir] Monitoring complete for job $job_id"
40+
monitor_exit=0
41+
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
42+
if [ "$monitor_exit" -ne 0 ]; then
43+
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
44+
else
45+
echo "[$dir] Monitoring complete for job $job_id"
46+
fi
4347

4448
# Verify the YAML output file was created
4549
yaml_file="${job_slug}.yaml"

.github/workflows/bench.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ jobs:
123123
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}
124124

125125
- name: Generate & Post Comment
126+
if: always()
126127
run: |
127128
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
128129
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)

.github/workflows/coverage.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@ jobs:
3535
- name: Checkouts
3636
uses: actions/checkout@v4
3737

38+
- name: Restore Build Cache
39+
uses: actions/cache@v4
40+
with:
41+
path: build
42+
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}
43+
3844
- name: Setup Ubuntu
3945
run: |
4046
sudo apt update -y

.github/workflows/docs.yml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,12 @@ jobs:
6767
echo "url-count = ${{ steps.sitemap.outputs.url-count }}"
6868
echo "excluded-count = ${{ steps.sitemap.outputs.excluded-count }}"
6969
70+
- name: Linkcheck - Lychee
71+
uses: lycheeverse/lychee-action@v2
72+
with:
73+
args: -c .lychee.toml build/install/docs/mfc/
74+
fail: true
75+
7076
- name: Publish Documentation
7177
if: github.repository == 'MFlowCode/MFC' && github.ref == 'refs/heads/master' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' )
7278
run: |
@@ -83,11 +89,5 @@ jobs:
8389
git -C ../www commit -m "Docs @ ${GITHUB_SHA::7}" || true
8490
git -C ../www push
8591
86-
- name: Linkcheck - Lychee
87-
uses: lycheeverse/lychee-action@v2
88-
with:
89-
args: -c .lychee.toml build/install/docs/mfc/
90-
fail: true
91-
9292
# DOC_PUSH_URL should be of the format:
9393
# --> https://<username>:<token>@github.com/<username>/<repository>

.github/workflows/frontier/build.sh

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#!/bin/bash
22

3-
set -e
4-
53
# Ignore SIGHUP to survive login node session drops
64
trap '' HUP
75

@@ -20,13 +18,44 @@ fi
2018

2119
. ./mfc.sh load -c f -m g
2220

23-
# Clean stale build artifacts from previous CI runs
24-
./mfc.sh clean
25-
26-
if [ "$run_bench" == "bench" ]; then
27-
for dir in benchmarks/*/; do
28-
./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
29-
done
30-
else
31-
./mfc.sh test -v -a --dry-run --rdma-mpi -j 4 $build_opts
21+
# Only set up build cache for test suite, not benchmarks
22+
if [ "$run_bench" != "bench" ]; then
23+
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
3224
fi
25+
26+
max_attempts=3
27+
attempt=1
28+
while [ $attempt -le $max_attempts ]; do
29+
echo "Build attempt $attempt of $max_attempts..."
30+
if [ "$run_bench" == "bench" ]; then
31+
build_cmd_ok=true
32+
for dir in benchmarks/*/; do
33+
dirname=$(basename "$dir")
34+
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
35+
build_cmd_ok=false
36+
break
37+
fi
38+
done
39+
else
40+
if ./mfc.sh test -v -a --dry-run --rdma-mpi -j 8 $build_opts; then
41+
build_cmd_ok=true
42+
else
43+
build_cmd_ok=false
44+
fi
45+
fi
46+
47+
if [ "$build_cmd_ok" = true ]; then
48+
echo "Build succeeded on attempt $attempt."
49+
exit 0
50+
fi
51+
52+
if [ $attempt -lt $max_attempts ]; then
53+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
54+
rm -rf build/staging build/install build/lock.yaml
55+
sleep 30
56+
fi
57+
attempt=$((attempt + 1))
58+
done
59+
60+
echo "Build failed after $max_attempts attempts."
61+
exit 1

.github/workflows/frontier_amd/build.sh

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
#!/bin/bash
22

3-
set -e
4-
53
# Ignore SIGHUP to survive login node session drops
64
trap '' HUP
75

@@ -20,13 +18,44 @@ fi
2018

2119
. ./mfc.sh load -c famd -m g
2220

23-
# Clean stale build artifacts from previous CI runs
24-
./mfc.sh clean
25-
26-
if [ "$run_bench" == "bench" ]; then
27-
for dir in benchmarks/*/; do
28-
./mfc.sh run -v "$dir/case.py" --case-optimization -j 4 --dry-run $build_opts
29-
done
30-
else
31-
./mfc.sh test -v -a --dry-run -j 4 $build_opts
21+
# Only set up build cache for test suite, not benchmarks
22+
if [ "$run_bench" != "bench" ]; then
23+
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
3224
fi
25+
26+
max_attempts=3
27+
attempt=1
28+
while [ $attempt -le $max_attempts ]; do
29+
echo "Build attempt $attempt of $max_attempts..."
30+
if [ "$run_bench" == "bench" ]; then
31+
build_cmd_ok=true
32+
for dir in benchmarks/*/; do
33+
dirname=$(basename "$dir")
34+
if ! ./mfc.sh run -v "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts; then
35+
build_cmd_ok=false
36+
break
37+
fi
38+
done
39+
else
40+
if ./mfc.sh test -v -a --dry-run -j 8 $build_opts; then
41+
build_cmd_ok=true
42+
else
43+
build_cmd_ok=false
44+
fi
45+
fi
46+
47+
if [ "$build_cmd_ok" = true ]; then
48+
echo "Build succeeded on attempt $attempt."
49+
exit 0
50+
fi
51+
52+
if [ $attempt -lt $max_attempts ]; then
53+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
54+
rm -rf build/staging build/install build/lock.yaml
55+
sleep 30
56+
fi
57+
attempt=$((attempt + 1))
58+
done
59+
60+
echo "Build failed after $max_attempts attempts."
61+
exit 1

.github/workflows/phoenix/bench.sh

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,5 @@
11
#!/bin/bash
22

3-
# Clean stale build artifacts from previous CI runs to prevent
4-
# cross-compiler contamination (e.g. gfortran LAPACK linked by NVHPC)
5-
./mfc.sh clean
6-
73
n_ranks=12
84

95
echo "My interface is:" $job_interface

.github/workflows/phoenix/test.sh

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,39 @@ if [ "$job_device" = "gpu" ]; then
1010
fi
1111
fi
1212

13+
# Set up persistent build cache
14+
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"
15+
1316
max_attempts=3
1417
attempt=1
1518
while [ $attempt -le $max_attempts ]; do
1619
echo "Build attempt $attempt of $max_attempts..."
1720
if ./mfc.sh test -v --dry-run -j 8 $build_opts; then
1821
echo "Build succeeded on attempt $attempt."
22+
23+
# Smoke-test the cached binaries to catch architecture mismatches
24+
# (SIGILL from binaries compiled on a different compute node).
25+
syscheck_bin=$(find build/install -name syscheck -type f 2>/dev/null | head -1)
26+
if [ -n "$syscheck_bin" ] && ! "$syscheck_bin" > /dev/null 2>&1; then
27+
echo "WARNING: syscheck binary crashed — cached install is stale."
28+
if [ $attempt -lt $max_attempts ]; then
29+
echo "Clearing cache and rebuilding..."
30+
rm -rf build/staging build/install build/lock.yaml
31+
sleep 5
32+
attempt=$((attempt + 1))
33+
continue
34+
else
35+
echo "ERROR: syscheck still failing after $max_attempts attempts."
36+
exit 1
37+
fi
38+
fi
39+
1940
break
2041
fi
2142

2243
if [ $attempt -lt $max_attempts ]; then
23-
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
24-
./mfc.sh clean
44+
echo "Build failed on attempt $attempt. Clearing cache and retrying in 30s..."
45+
rm -rf build/staging build/install build/lock.yaml
2546
sleep 30
2647
else
2748
echo "Build failed after $max_attempts attempts."
@@ -40,4 +61,3 @@ if [ "$job_device" = "gpu" ]; then
4061
fi
4162

4263
./mfc.sh test -v --max-attempts 3 -a -j $n_test_threads $device_opts -- -c phoenix
43-

0 commit comments

Comments
 (0)