Skip to content
Closed
6 changes: 3 additions & 3 deletions .github/scripts/monitor_slurm_job.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,10 @@ squeue_failures=0
last_heartbeat=$(date +%s)

while true; do
# Try to read from tail output (non-blocking via timeout)
# Try to read from tail output with a 1s timeout (polling-based)
# Read multiple lines if available to avoid falling behind
lines_read=0
while IFS= read -r -t 0.1 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
lines_read=$((lines_read + 1))
last_heartbeat=$(date +%s)
Expand Down Expand Up @@ -115,7 +115,7 @@ done
# Drain any remaining output from tail after job completes
echo "Draining remaining output..."
drain_count=0
while IFS= read -r -t 0.5 line <&3 2>/dev/null; do
while IFS= read -r -t 1 line <&3 2>/dev/null; do
echo "$line"
drain_count=$((drain_count + 1))
# Safety limit to avoid infinite loop
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -CL40S
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\
"
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.

if [ "$2" = "cpu" ]; then
Expand Down
5 changes: 2 additions & 3 deletions .github/workflows/phoenix/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ sbatch_cpu_opts="\
"

sbatch_gpu_opts="\
#SBATCH -p gpu-v100,gpu-a100,gpu-h100,gpu-l40s
#SBATCH --ntasks-per-node=4 # Number of cores per node required
#SBATCH -G2\
#SBATCH --gres=gpu:H200:2
#SBATCH --ntasks-per-node=8 # Number of tasks (MPI ranks) per node\
"
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.

if [ "$2" = "cpu" ]; then
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,16 @@ jobs:
group: phoenix
labels: ${{ matrix.runner }}
env:
NODE_OPTIONS: ${{ matrix.cluster == 'phoenix' && '--max-old-space-size=2048' || '' }}
ACTIONS_RUNNER_FORCE_ACTIONS_NODE_VERSION: node16
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
steps:
- name: Clean workspace
run: rm -rf "${GITHUB_WORKSPACE:?}"/* "${GITHUB_WORKSPACE:?}"/.[!.]* 2>/dev/null || true
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.
Comment thread
sbryngelson marked this conversation as resolved.

- name: Clone
uses: actions/checkout@v4
with:
clean: false
Comment thread
sbryngelson marked this conversation as resolved.

- name: Build
if: matrix.cluster != 'phoenix'
Expand Down
Loading