Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions .github/scripts/run_parallel_benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,16 +52,16 @@ else
echo "Master job completed successfully"
fi

# Check if either job failed
# Warn if either job failed (partial results may still be usable)
if [ "${pr_exit}" -ne 0 ] || [ "${master_exit}" -ne 0 ]; then
echo "ERROR: One or both benchmark jobs failed: pr_exit=${pr_exit}, master_exit=${master_exit}"
exit 1
echo "WARNING: Benchmark jobs had failures: pr=${pr_exit}, master=${master_exit}"
Comment thread
sbryngelson marked this conversation as resolved.
echo "Checking for partial results..."
else
echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="
fi
Comment thread
sbryngelson marked this conversation as resolved.

echo "=========================================="
echo "Both benchmark jobs completed successfully!"
echo "=========================================="

# Final verification that output files exist before proceeding
pr_yaml="pr/bench-${device}-${interface}.yaml"
master_yaml="master/bench-${device}-${interface}.yaml"
Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/setup-build-cache.sh
Comment thread
sbryngelson marked this conversation as resolved.
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Sets up a persistent build cache for self-hosted CI runners.
# Creates a symlink: ./build -> <scratch>/.mfc-ci-cache/<key>/build
#
# Each runner gets its own cache keyed by (cluster, device, interface, runner).
# This avoids cross-runner path issues entirely — CMake's absolute paths are
# always correct because the same runner always uses the same workspace path.
#
# Usage: source .github/scripts/setup-build-cache.sh <cluster> <device> <interface>
#
# Arguments:
#   $1  cluster    (required)
#   $2  device     (required)
#   $3  interface  (optional, defaults to "none")
# Environment:
#   RUNNER_NAME    (required) becomes part of the cache key
#   HOME           the cache lives under $HOME/scratch/.mfc-ci-cache
# Result:
#   ./build is a symlink to the runner-specific cache directory.
#   Returns non-zero (exits non-zero when executed instead of sourced) if the
#   cache directory cannot be created/resolved or the symlink cannot be made.
#
# This file is meant to be sourced, so it deliberately does not touch shell
# options (`set -e`, `set -u`, ...): doing so would change the caller's shell.

_cache_cluster="${1:?Usage: setup-build-cache.sh <cluster> <device> <interface>}"
_cache_device="${2:?}"
_cache_interface="${3:-none}"
_cache_runner="${RUNNER_NAME:?RUNNER_NAME not set}"

_cache_key="${_cache_cluster}-${_cache_device}-${_cache_interface}-${_cache_runner}"
_cache_base="$HOME/scratch/.mfc-ci-cache/${_cache_key}/build"

# Create and resolve the cache directory BEFORE touching ./build, so that a
# missing or unwritable scratch area cannot leave the workspace with neither a
# build directory nor a symlink.
if ! mkdir -p "$_cache_base"; then
    echo "ERROR: could not create build cache directory: $_cache_base" >&2
    # `return` works when sourced; fall back to `exit` when executed directly.
    return 1 2>/dev/null || exit 1
fi

# Resolve to a physical path so the symlink target contains no symlinks itself.
_cache_dir="$(cd "$_cache_base" && pwd -P)" || _cache_dir=""
if [ -z "$_cache_dir" ]; then
    echo "ERROR: could not resolve build cache directory: $_cache_base" >&2
    return 1 2>/dev/null || exit 1
fi

echo "=== Build Cache Setup ==="
echo " Cache key: $_cache_key"
echo " Cache dir: $_cache_dir"

# Replace any existing build/ (real dir or stale symlink) with a symlink
# to our runner-specific cache directory. The -L test catches dangling
# symlinks, for which -e is false.
if [ -e "build" ] || [ -L "build" ]; then
    rm -rf -- "build"
fi

if ! ln -s "$_cache_dir" "build"; then
    echo "ERROR: could not create symlink: build -> $_cache_dir" >&2
    return 1 2>/dev/null || exit 1
fi

echo " Symlink: build -> $_cache_dir"
echo "========================="
10 changes: 7 additions & 3 deletions .github/scripts/submit_and_monitor_bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,13 @@ fi
echo "[$dir] Job ID: $job_id, monitoring output file: $output_file"

# Use the monitoring script from PR (where this script lives)
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file"

echo "[$dir] Monitoring complete for job $job_id"
monitor_exit=0
bash "${SCRIPT_DIR}/monitor_slurm_job.sh" "$job_id" "$output_file" || monitor_exit=$?
if [ "$monitor_exit" -ne 0 ]; then
echo "[$dir] WARNING: SLURM job exited with code $monitor_exit"
Comment thread
sbryngelson marked this conversation as resolved.
else
echo "[$dir] Monitoring complete for job $job_id"
fi

# Verify the YAML output file was created
yaml_file="${job_slug}.yaml"
Expand Down
33 changes: 23 additions & 10 deletions .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,21 +46,34 @@ jobs:
else
# Get PR number from workflow_run
PR_NUMBER="${{ github.event.workflow_run.pull_requests[0].number }}"
if [ -z "$PR_NUMBER" ]; then
# Cross-repo PRs don't populate pull_requests[]. Search by head SHA.
HEAD_SHA="${{ github.event.workflow_run.head_sha }}"
PR_NUMBER=$(gh api "repos/${{ github.repository }}/pulls?state=open&sort=updated&direction=desc&per_page=30" \
--jq ".[] | select(.head.sha == \"$HEAD_SHA\") | .number" | head -1)
fi
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.

if [ -n "$PR_NUMBER" ]; then
echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT

# Fetch actual PR author from API (workflow_run.actor is the re-runner, not PR author)
PR_AUTHOR=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER --jq '.user.login')
echo "author=$PR_AUTHOR" >> $GITHUB_OUTPUT

# Check if PR is approved
APPROVED=$(gh api repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews \
--jq '[.[] | select(.state == "APPROVED")] | length')
if [ "$APPROVED" -gt 0 ]; then
echo "approved=true" >> $GITHUB_OUTPUT
else
echo "approved=false" >> $GITHUB_OUTPUT
fi
# Check if PR is approved by a maintainer/admin (ignore AI bot approvals)
APPROVERS=$(gh api "repos/${{ github.repository }}/pulls/$PR_NUMBER/reviews" \
--jq '[.[] | select(.state == "APPROVED") | .user.login] | unique | .[]')
APPROVED="false"
for approver in $APPROVERS; do
PERM=$(gh api "repos/${{ github.repository }}/collaborators/$approver/permission" \
--jq '.permission' 2>/dev/null || echo "none")
if [ "$PERM" = "admin" ] || [ "$PERM" = "maintain" ] || [ "$PERM" = "write" ]; then
echo " Approved by $approver (permission: $PERM)"
APPROVED="true"
break
fi
done
Comment thread
sbryngelson marked this conversation as resolved.
echo "approved=$APPROVED" >> $GITHUB_OUTPUT
else
echo "pr_number=" >> $GITHUB_OUTPUT
echo "approved=false" >> $GITHUB_OUTPUT
Expand All @@ -76,8 +89,7 @@ jobs:
(
github.event_name == 'workflow_dispatch' ||
needs.file-changes.outputs.pr_approved == 'true' ||
needs.file-changes.outputs.pr_author == 'sbryngelson' ||
needs.file-changes.outputs.pr_author == 'wilfonba'
needs.file-changes.outputs.pr_author == 'sbryngelson'
)
needs: [file-changes]
strategy:
Expand Down Expand Up @@ -164,6 +176,7 @@ jobs:
run: bash pr/.github/scripts/run_parallel_benchmarks.sh ${{ matrix.device }} ${{ matrix.interface }} ${{ matrix.cluster }}

- name: Generate & Post Comment
if: always()
run: |
(cd pr && . ./mfc.sh load -c ${{ matrix.flag }} -m g)
(cd pr && ./mfc.sh bench_diff ../master/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml ../pr/bench-${{ matrix.device }}-${{ matrix.interface }}.yaml)
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@ jobs:
- name: Checkouts
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
Comment thread
sbryngelson marked this conversation as resolved.
key: mfc-coverage-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}

- name: Setup Ubuntu
run: |
sudo apt update -y
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/frontier/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ fi

. ./mfc.sh load -c f -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
fi
Comment thread
sbryngelson marked this conversation as resolved.

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging and retrying in 30s..."
rm -rf build/staging build/lock.yaml
Comment thread
cubic-dev-ai[bot] marked this conversation as resolved.
Outdated
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
9 changes: 7 additions & 2 deletions .github/workflows/frontier_amd/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ fi

. ./mfc.sh load -c famd -m g

# Only set up build cache for test suite, not benchmarks
if [ "$run_bench" != "bench" ]; then
source .github/scripts/setup-build-cache.sh frontier_amd "$job_device" "$job_interface"
fi

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand Down Expand Up @@ -45,8 +50,8 @@ while [ $attempt -le $max_attempts ]; do
fi
Comment thread
sbryngelson marked this conversation as resolved.

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging and retrying in 30s..."
rm -rf build/staging build/lock.yaml
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
Comment thread
sbryngelson marked this conversation as resolved.
Outdated
sleep 30
fi
attempt=$((attempt + 1))
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/phoenix/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ if [ "$job_device" = "gpu" ]; then
fi
fi

# Set up persistent build cache
source .github/scripts/setup-build-cache.sh phoenix "$job_device" "$job_interface"

max_attempts=3
attempt=1
while [ $attempt -le $max_attempts ]; do
Expand All @@ -20,8 +23,8 @@ while [ $attempt -le $max_attempts ]; do
fi

if [ $attempt -lt $max_attempts ]; then
echo "Build failed on attempt $attempt. Cleaning and retrying in 30s..."
./mfc.sh clean
echo "Build failed on attempt $attempt. Clearing staging and retrying in 30s..."
rm -rf build/staging build/lock.yaml
sleep 30
else
echo "Build failed after $max_attempts attempts."
Expand Down
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,12 @@ jobs:
- name: Clone
uses: actions/checkout@v4

- name: Restore Build Cache
uses: actions/cache@v4
with:
path: build
key: mfc-build-${{ matrix.os }}-${{ matrix.mpi }}-${{ matrix.debug }}-${{ matrix.precision }}-${{ matrix.intel }}-${{ hashFiles('CMakeLists.txt', 'toolchain/dependencies/**', 'toolchain/cmake/**', 'src/**/*.fpp', 'src/**/*.f90') }}

- name: Setup MacOS
if: matrix.os == 'macos'
run: |
Expand Down Expand Up @@ -202,6 +208,8 @@ jobs:
steps:
- name: Clone
uses: actions/checkout@v4
with:
clean: false
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.

- name: Build
if: matrix.cluster != 'phoenix'
Expand Down
Loading
Loading