Skip to content

Commit 8e214ec

Browse files
committed
Improvements to grid_submit
* changes to reflect novel Alien behaviour to get sub-job ids * fewer calls to alien.py * some cleanup
1 parent c36a755 commit 8e214ec

File tree

1 file changed

+40
-37
lines changed

1 file changed

+40
-37
lines changed

GRID/utils/grid_submit.sh

Lines changed: 40 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -411,37 +411,46 @@ EOF
411411
echo -ne "\b\b\b${spin[$((counter%4))]} ${JOBSTATUS}"
412412
let counter=counter+1
413413
if [ ! "${counter}" == "100" ]; then
414+
# ensures that we see spinner ... but only check for new job
415+
# status every 100 * 0.5 = 50s?
414416
continue
415417
fi
416-
let counter=0
418+
let counter=0 # reset counter
417419
JOBSTATUS=$(alien.py ps -j ${MY_JOBID} | awk '//{print $4}')
418420
# echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
419-
if [ "$JOBSTATUS" == "D" ]; then
421+
422+
if [ "${JOBSTATUS}" == "D" ]; then
420423
echo "Job done"
421-
WAITFORALIEN=""
424+
WAITFORALIEN="" # guarantees to go out of outer while loop
422425

423426
if [ "${FETCHOUTPUT}" ]; then
424-
SUBJOBIDS=""
425-
while [ ! ${SUBJOBIDS} ]; do
426-
SUBJOBIDS=($(alien.py ps --trace ${MY_JOBID} | awk '/Subjob submitted/' | sed 's/.*submitted: //' | tr '\n' ' '))
427-
sleep 1
428-
done
429-
# TODO: make this happen in a single alien.py session and with parallel copying
430-
echo "Fetching results"
431-
for splitcounter in `seq 1 ${PRODSPLIT}`; do
432-
# we still need to check if this particular subjob was successful
433-
SUBJOBSTATUS=$(alien.py ps -j ${SUBJOBIDS[splitcounter-1]} | awk '//{print $4}')
434-
if [ "$SUBJOBSTATUS" == "D" ]; then
435-
SPLITOUTDIR=$(printf "%03d" ${splitcounter})
436-
[ ! -f ${SPLITOUTDIR} ] && mkdir ${SPLITOUTDIR}
437-
echo "Fetching result files for subjob ${splitcounter} into ${PWD}"
438-
CPCMD="alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR}"
439-
eval "${CPCMD}" 2> /dev/null
440-
else
441-
echo "Not fetching files for subjob ${splitcounter} since job code is ${SUBJOBSTATUS}"
442-
fi
443-
done
444-
wait
427+
SUBJOBIDS=()
428+
SUBJOBSTATUSES=()
429+
echo "Fetching subjob info"
430+
while [ "${#SUBJOBIDS[@]}" == "0" ]; do
431+
QUERYRESULT=$(ALIENPY_JSON=true alien.py ps -a -m ${MY_JOBID})
432+
SUBJOBIDS=($(echo ${QUERYRESULT} | jq -r '.results[].id' | tr '\n' ' '))
433+
SUBJOBSTATUSES=($(echo ${QUERYRESULT} | jq -r '.results[].status' | tr '\n' ' '))
434+
# echo "LENGTH SUBJOBS ${#SUBJOBIDS[@]}"
435+
sleep 1
436+
done
437+
# TODO: make this happen with parallel copying
438+
echo "Fetching results for ${PRODSPLIT} sub-jobs"
439+
for splitcounter in `seq 1 ${PRODSPLIT}`; do
440+
let jobindex=splitcounter-1
441+
THIS_STATUS=${SUBJOBSTATUSES[jobindex]}
442+
THIS_JOB=${SUBJOBIDS[jobindex]}
443+
echo "Fetching for job ${THIS_JOB}"
444+
if [ "${THIS_STATUS}" == "DONE" ]; then
445+
SPLITOUTDIR=$(printf "%03d" ${splitcounter})
446+
[ ! -f ${SPLITOUTDIR} ] && mkdir ${SPLITOUTDIR}
447+
echo "Fetching result files for subjob ${splitcounter} into ${PWD}"
448+
CPCMD="alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR}"
449+
eval "${CPCMD}" 2> /dev/null
450+
else
451+
echo "Not fetching files for subjob ${splitcounter} since job code is ${THIS_STATUS}"
452+
fi
453+
done
445454
fi
446455
fi
447456
if [[ "${FOO:0:1}" == [EK] ]]; then
@@ -541,13 +550,13 @@ if [ "${ONGRID}" = "1" ]; then
541550
fi
542551

543552
# ----------- DOWNLOAD ADDITIONAL HELPERS ----------------------------
544-
curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
545-
chmod +x analyse_CPU.py
553+
# curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
554+
# chmod +x analyse_CPU.py
546555
export PATH=$PATH:$PWD
547-
export JOBUTILS_MONITORCPU=ON
548-
export JOBUTILS_WRAPPER_SLEEP=5
549-
#export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
550-
export JOBUTILS_MONITORMEM=ON
556+
# export JOBUTILS_MONITORCPU=ON
557+
# export JOBUTILS_WRAPPER_SLEEP=5
558+
# export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
559+
# export JOBUTILS_MONITORMEM=ON
551560

552561
# ----------- EXECUTE ACTUAL JOB ------------------------------------
553562
# source the actual job script from the work dir
@@ -558,13 +567,7 @@ chmod +x ./alien_jobscript.sh
558567
cp alien_log_${ALIEN_PROC_ID:-0}.txt logtmp_${ALIEN_PROC_ID:-0}.txt
559568
[ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien logtmp_${ALIEN_PROC_ID:-0}.txt ${ALIEN_JOB_OUTPUTDIR}/
560569

561-
# MOMENTARILY WE ZIP ALL LOG FILES
562-
ziparchive=logs_PROCID${ALIEN_PROC_ID:-0}.zip
563-
find ./ -name "*.log*" -exec zip ${ziparchive} {} ';'
564-
find ./ -name "*mergerlog*" -exec zip ${ziparchive} {} ';'
565-
find ./ -name "*serverlog*" -exec zip ${ziparchive} {} ';'
566-
find ./ -name "*workerlog*" -exec zip ${ziparchive} {} ';'
567-
find ./ -name "alien_log*.txt" -exec zip ${ziparchive} {} ';'
570+
echo "Job done"
568571

569572
# We need to exit for the ALIEN JOB HANDLER!
570573
exit 0

0 commit comments

Comments
 (0)