
Commit 57b329f

update the OLCF workflow with best practices

1 parent 2b7c682 commit 57b329f

2 files changed: 68 additions & 7 deletions

job_scripts/frontier/frontier.slurm (30 additions & 6 deletions)
@@ -10,6 +10,7 @@
 #SBATCH --cpus-per-task=7
 #SBATCH --gpus-per-task=1
 #SBATCH --gpu-bind=closest
+#SBATCH --signal=B:URG@300
 
 EXEC=./Castro3d.hip.x86-trento.MPI.HIP.SMPLSDC.ex
 INPUTS=inputs_3d.N14.coarse
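As an aside, the exit-status check later in this script compares against `128 + 23`; 23 is SIGURG's number on Linux x86-64 nodes like Frontier's (signal numbers vary on some other architectures). You can confirm the mapping from the shell:

```shell
# look up which signal has number 23 (URG on Linux x86-64);
# bash's builtin kill -l maps signal numbers to names
kill -l 23
```

This prints `URG`, confirming that a `wait` interrupted by the batch-script warning signal returns status 151 (128 + 23).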
@@ -22,18 +23,13 @@ module load rocm/6.3.1
 
 export LD_LIBRARY_PATH=$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH
 
-# libfabric workaround
-export FI_MR_CACHE_MONITOR=memhooks
-
 # set the file system striping
 
 echo $SLURM_SUBMIT_DIR
 
 module load lfs-wrapper
 lfs setstripe -c 32 -S 10M $SLURM_SUBMIT_DIR
 
-module list
-
 function find_chk_file {
     # find_chk_file takes a single argument -- the wildcard pattern
     # for checkpoint files to look through
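The body of `find_chk_file` is truncated in this diff, but the idea is to scan the checkpoint directories matching a wildcard in sorted order and remember the last (most recent) one. A simplified standalone sketch of that idea follows, with a small demo; the checkpoint names here are made up, and the real function in `frontier.slurm` may handle partially written checkpoints and differ in detail:

```shell
# simplified find_chk_file-style helper: walk the sorted matches and
# keep the last one, leaving it in $restartFile
function find_chk_file {
    local chk
    for chk in $(ls -d $1 2> /dev/null | sort); do
        restartFile=$chk
    done
}

# demo with invented checkpoint directories in a scratch location
tmp=$(mktemp -d)
cd "$tmp"
mkdir chk00050 chk00075 chk00100

restartFile=""
find_chk_file "chk?????"

# build the restart argument the way the job script does
if [ -n "$restartFile" ]; then
    restartString="amr.restart=${restartFile}"
else
    restartString=""
fi
echo "$restartString"
```

With the three demo directories above, this selects `chk00100` and prints `amr.restart=chk00100`.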
@@ -76,6 +72,21 @@ else
     restartString="amr.restart=${restartFile}"
 fi
 
+
+# clean up any run management files left over from previous runs
+rm -f dump_and_stop
+
+# The `--signal=B:URG@<n>` option tells slurm to send SIGURG to this batch
+# script n seconds before the runtime limit, so we can exit gracefully.
+function sig_handler {
+    touch dump_and_stop
+    # disable this signal handler
+    trap - URG
+    echo "BATCH: allocation ending soon; telling Castro to dump a checkpoint and stop"
+}
+trap sig_handler URG
+
+
 export OMP_NUM_THREADS=1
 export NMPI_PER_NODE=8
 export TOTAL_NMPI=$(( ${SLURM_JOB_NUM_NODES} * ${NMPI_PER_NODE} ))
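The trap/wait pattern added here is worth seeing in isolation. The sketch below reproduces it as a standalone demo, substituting a plain `sleep` for `srun` and sending SIGURG to ourselves instead of relying on Slurm's `--signal` option (timings are shortened for the demo):

```shell
# standalone demo of the trap/wait pattern from the job script above
rm -f dump_and_stop

function sig_handler {
    touch dump_and_stop
    # disable this signal handler
    trap - URG
    echo "BATCH: allocation ending soon; telling the app to dump a checkpoint and stop"
}
trap sig_handler URG

# stand-in for the long-running srun; run in the background so the
# shell is free to handle the signal
sleep 5 &
pid=$!

# simulate Slurm delivering the warning signal one second in
(sleep 1; kill -URG $$) &

wait $pid
ret=$?

# a wait interrupted by a trapped signal returns 128 + signum
if (( ret > 128 )); then
    # received the signal; resume waiting for the app to finish
    wait $pid
    ret=$?
fi

echo "exit status: $ret"
```

Running this, the handler fires after one second, creates `dump_and_stop`, and the second `wait` then collects the stand-in application's normal exit status of 0.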
@@ -107,5 +118,18 @@ echo appending parameters: ${FILE_IO_PARAMS}
 
 (sleep 300; check_restart ) &
 
-srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString} ${FILE_IO_PARAMS}
+# execute srun in the background then use the builtin wait so the shell can
+# handle the signal
+srun -n${TOTAL_NMPI} -N${SLURM_JOB_NUM_NODES} --ntasks-per-node=8 --gpus-per-task=1 ./$EXEC $INPUTS ${restartString} ${FILE_IO_PARAMS} &
+pid=$!
+wait $pid
+ret=$?
+
+if (( ret == 128 + 23 )); then
+    # received SIGURG, keep waiting
+    wait $pid
+    ret=$?
+fi
+
+exit $ret
 
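The `(sleep 300; check_restart ) &` line above arms a watchdog; the body of `check_restart` is not shown in this diff. The standalone demo below illustrates the general idea with short timings, an invented progress-marker file, and `kill` standing in for whatever cancellation the real `check_restart` performs (which presumably inspects the run's output and cancels the allocation):

```shell
# watchdog demo: kill a background job if it has not produced a
# progress marker within a grace period (names and timings invented)
rm -f progress_marker

# stand-in for srun: a job that never writes the marker,
# simulating a hung restart
sleep 30 &
job=$!

function check_restart {
    if [ ! -f progress_marker ]; then
        echo "watchdog: no progress detected; killing job"
        kill $job 2> /dev/null
    fi
}

# arm the watchdog in the background (1 s grace period here;
# the job script waits 300 s)
(sleep 1; check_restart) &

wait $job
ret=$?
echo "job exit status: $ret"
```

Since the stand-in job never writes `progress_marker`, the watchdog kills it with SIGTERM after one second and `wait` reports status 143 (128 + 15).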

sphinx_docs/source/olcf-workflow.rst (38 additions & 1 deletion)
@@ -38,7 +38,30 @@ Submitting jobs
 
 Frontier uses SLURM.
 
-Here's a script that runs on GPUs and has the I/O fixes described above.
+Here's a script that uses our best practices on Frontier. It uses 64 nodes (512 GPUs)
+and does the following:
+
+* Sets the filesystem striping (see https://docs.olcf.ornl.gov/data/index.html#lfs-setstripe-wrapper).
+
+* Includes logic for automatically restarting from the last checkpoint file
+  (useful for job-chaining). This is done via the ``find_chk_file`` function.
+
+* Installs a signal handler to create a ``dump_and_stop`` file shortly before
+  the queue window ends. This ensures that we get a checkpoint at the very
+  end of the queue window.
+
+* Can do a special check on restart to ensure that we don't hang on
+  reading the initial checkpoint file (uncomment the line):
+
+  ::
+
+     (sleep 300; check_restart ) &
+
+  This uses the ``check_restart`` function and will kill the job if it doesn't
+  detect a successful restart within 5 minutes.
+
+* Adds special I/O parameters to the job to work around filesystem issues
+  (these are defined in ``FILE_IO_PARAMS``).
 
 .. literalinclude:: ../../job_scripts/frontier/frontier.slurm
    :language: bash
@@ -51,6 +74,20 @@ The job is submitted as:
 
 where ``frontier.slurm`` is the name of the submission script.
 
+.. note::
+
+   If the job times out before writing a checkpoint (leaving a
+   ``dump_and_stop`` file behind), you can give it more time between the
+   warning signal and the end of the allocation by adjusting the
+   ``#SBATCH --signal=B:URG@<n>`` line at the top of the script.
+
+   Also, by default, AMReX will output a plotfile at the same time as a
+   checkpoint file, which means you'll get one from the ``dump_and_stop``,
+   and it may not fall at the time intervals set by ``amr.plot_per``.
+   To suppress this, set:
+
+   ::
+
+      amr.write_plotfile_with_checkpoint = 0
 
 Also see the WarpX docs: https://warpx.readthedocs.io/en/latest/install/hpc/frontier.html