@@ -411,37 +411,46 @@ EOF
# Spinner tick: redraw the spinner glyph and the last known job status.
echo -ne "\b\b\b${spin[$((counter%4))]} ${JOBSTATUS} "
counter=$((counter + 1))
if [ "${counter}" != "100" ]; then
  # Keep the spinner animated on every iteration of the enclosing wait
  # loop, but only query AliEn for a fresh job status once every 100
  # iterations (avoids hammering the server).
  continue
fi
counter=0  # reset counter for the next polling window
# NOTE(review): assumes the status is column 4 of 'alien.py ps' output --
# verify against the alien.py version in use.
JOBSTATUS=$(alien.py ps -j "${MY_JOBID}" | awk '//{print $4}')
# echo -ne "Waiting for jobs to return; Last status ${JOBSTATUS}"
if [ "${JOBSTATUS}" == "D" ]; then
  echo "Job done"
  WAITFORALIEN=""  # empty value terminates the enclosing wait loop

  if [ "${FETCHOUTPUT}" ]; then
    # Retrieve all subjob ids and statuses with a single masterjob query
    # (JSON output parsed with jq); retry until the list is non-empty,
    # since subjob info may not be available immediately.
    SUBJOBIDS=()
    SUBJOBSTATUSES=()
    echo "Fetching subjob info"
    while [ "${#SUBJOBIDS[@]}" == "0" ]; do
      QUERYRESULT=$(ALIENPY_JSON=true alien.py ps -a -m "${MY_JOBID}")
      SUBJOBIDS=($(echo "${QUERYRESULT}" | jq -r '.results[].id' | tr '\n' ' '))
      SUBJOBSTATUSES=($(echo "${QUERYRESULT}" | jq -r '.results[].status' | tr '\n' ' '))
      # echo "LENGTH SUBJOBS ${#SUBJOBIDS[@]}"
      sleep 1
    done
    # TODO: make this happen with parallel copying
    echo "Fetching results for ${PRODSPLIT} sub-jobs"
    for splitcounter in $(seq 1 "${PRODSPLIT}"); do
      jobindex=$((splitcounter - 1))
      THIS_STATUS=${SUBJOBSTATUSES[jobindex]}
      THIS_JOB=${SUBJOBIDS[jobindex]}
      echo "Fetching for job ${THIS_JOB}"
      # Only completed subjobs ("DONE" in the JSON status field) have
      # output worth copying.
      if [ "${THIS_STATUS}" == "DONE" ]; then
        SPLITOUTDIR=$(printf "%03d" "${splitcounter}")
        # BUGFIX: test with -d (directory), not -f (regular file); the
        # old -f test was always true-negated for a directory, so mkdir
        # failed noisily whenever the output dir already existed.
        [ ! -d "${SPLITOUTDIR}" ] && mkdir "${SPLITOUTDIR}"
        echo "Fetching result files for subjob ${splitcounter} into ${PWD}"
        # The remote glob must be expanded by alien.py, hence eval on a
        # command string rather than direct quoting of the pattern.
        CPCMD="alien.py cp ${MY_JOBWORKDIR}/${SPLITOUTDIR}/* file:./${SPLITOUTDIR}"
        eval "${CPCMD}" 2> /dev/null
      else
        echo "Not fetching files for subjob ${splitcounter} since job code is ${THIS_STATUS}"
      fi
    done
  fi
fi
447456 if [[ " ${FOO: 0: 1} " == [EK] ]]; then
@@ -541,13 +550,13 @@ if [ "${ONGRID}" = "1" ]; then
541550fi
542551
# ----------- DOWNLOAD ADDITIONAL HELPERS ----------------------------
# curl -o analyse_CPU.py https://raw.githubusercontent.com/sawenzel/AliceO2/swenzel/cpuana/Utilities/Tools/analyse_CPU.py &> /dev/null
# chmod +x analyse_CPU.py
# Make helpers in the working directory reachable; quoted so PATH/PWD
# containing spaces do not word-split the assignment.
export PATH="$PATH:$PWD"
# export JOBUTILS_MONITORCPU=ON
# export JOBUTILS_WRAPPER_SLEEP=5
# export JOBUTILS_JOB_KILLINACTIVE=180 # kill inactive jobs after 3 minutes --> will be the task of pipeline runner? (or make it optional)
# export JOBUTILS_MONITORMEM=ON
551560
552561# ----------- EXECUTE ACTUAL JOB ------------------------------------
553562# source the actual job script from the work dir
@@ -558,13 +567,7 @@ chmod +x ./alien_jobscript.sh
# Preserve the AliEn job log under a stable name and, when an output
# directory is configured, upload it there via the project helper.
cp "alien_log_${ALIEN_PROC_ID:-0}.txt" "logtmp_${ALIEN_PROC_ID:-0}.txt"
[ "${ALIEN_JOB_OUTPUTDIR}" ] && upload_to_Alien "logtmp_${ALIEN_PROC_ID:-0}.txt" "${ALIEN_JOB_OUTPUTDIR}/"

echo "Job done"

# We need to exit for the ALIEN JOB HANDLER!
exit 0
0 commit comments