From 372a76dc7d40b4267b892a0659f898541c537c72 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:14:03 +0100 Subject: [PATCH 01/47] feat: BundleCE implementation, first approach --- .../Computing/BundleComputingElement.py | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 src/DIRAC/Resources/Computing/BundleComputingElement.py diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py new file mode 100644 index 00000000000..2fd720ef9c9 --- /dev/null +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -0,0 +1,251 @@ +import uuid + +from DIRAC import S_ERROR, S_OK +from DIRAC.Resources.Computing.ComputingElement import ComputingElement + +# Strategies are not used yet, just an idea +SENDING_STRATEGIES = { + "NO_MORE_JOBS_FIT", + "MAX_TIME_SINCE_FIRST", + "MAX_TIME_BETWEEN_SUBMISSIONS", +} + +STORING_STRATEGIES = { + "NO_STRATEGY", + "SAME_JOB_TYPE", +} + +# SHELL code that bundles all wrappers +BUNDLE_STRING = """\ +#!/bin/bash +set -e + +BASEDIR=${{PWD}} +INPUT={inputs} + +get_id() {{ + basename ${{1}} .json +}} + +run_task() {{ + local input=$1 + local task_id=$(get_id ${{input}}) + + >&2 echo "Executing task ${{task_id}}" + >&2 {command} ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & + local task_pid=$! + + >&2 echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." + wait ${{task_pid}} ; local task_status=$? + + # report status + echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee task_${{task_id}}.status +}} + +# execute tasks +for input in ${{INPUT}}; do + [ -f "$input" ] || break + taskdir="task_$(get_id ${{input}})" + mkdir ${{taskdir}} && cd "$_" && + run_task ${{input}} >> ${{BASEDIR}}/tasks_status.log & + cd ${{BASEDIR}} +done + +# wait for all tasks +wait +""" + +class BundleComputingElement(ComputingElement): + def __init__(self, ceUniqueID): + """Standard constructor.""" + super().__init__(ceUniqueID) + + self.jobToBundle = {} + self.bundles = {} + self.bundleReady = {} + + # These are just ideas, could be interesting to take into account + self.timeout = -1 + self.max_time_between_submissions = -1 + + self.storeStrategies = [] + self.sendStrategies = [] + + # Currently this has to be hard-coded. + # It must either be generated dynamically through the ceDict or use + # another Inner one like PoolCE does. + self.ce = None + + self.log.setLevel("DEBUG") + + def _storeOnBundle(self, bundleId, job, n_processors): + self.bundles[bundleId]["Jobs"].append(job) + self.bundles[bundleId]["ProcessorSum"] += n_processors + + def _storeJob(self, jobID, executable, ceDict, n_processors, proxy=None, inputs=None): + bundle_id = f"{ceDict['Site']}:{ceDict['GridCE']}:{ceDict['Queue']}" + + if bundle_id not in self.bundles: + self._initBundle(bundle_id, ceDict) + + bundle = self.bundles[bundle_id] + job = { + "ID": jobID, + "Executable": executable, + "Inputs": inputs, + "Proxy": proxy + } + + # Is the bundle ready for execution in this CE? + if bundle["ProcessorSum"] + n_processors >= bundle["MaxProcessors"]: + # Clear bundle related to the CE + self._initBundle(bundle_id, ceDict) + + # Add the job to the bundle ready if it fits. + # Otherwise, add it to the storage + if bundle["ProcessorSum"] + n_processors == bundle["MaxProcessors"]: + bundle["Jobs"].append(job) + else: + self._storeOnBundle(bundle_id, job, n_processors) + + # Make the bundle ready for execution + self.bundleReady = bundle + + else: + # Just store it + self._storeOnBundle(bundle_id, job, n_processors) + + self.log.debug("Current bundle status: ", self.bundles) + + return bundle_id + + def _initBundle(self, bundleId, ceDict, startingProcessors=0): + self.bundles[bundleId] = {} + self.bundles[bundleId]["Jobs"] = [] + self.bundles[bundleId]["ProcessorSum"] = startingProcessors + self.bundles[bundleId]["MaxProcessors"] = ceDict["NumberOfProcessors"] + self.bundles[bundleId]["LastAddedJobTimestamp"] = 0 + self.bundles[bundleId]["CEDict"] = ceDict + + def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, jobDesc=None, inputs=None): + jobID = jobDesc["jobID"] + resourceParams = jobDesc["resourceParams"] + + bundleID = self._storeJob(jobID, executableFiles, resourceParams, numberOfProcessors, proxy=proxy, inputs=inputs) + + if not self.bundleReady: + self.log.info(f"Job {jobID} stored successfully in bundle: ", bundleID) + return S_OK() + + executablePath, proxy, grouped_inputs = self._wrap_bundle(command="bash") + + self.log.info("Submitting job to CE: ", self.ce.ceName) + + # result = self.ce.submitJob(executablePath, proxy, inputs=grouped_inputs) + result = {} + + self.bundleReady = None + + return S_OK(result) + + def _wrap_bundle(self, command): + wrap_string = BUNDLE_STRING + bundle_inputs_string = "(" + inputs = [] + + filepath = f"/tmp/BundledJobs_{uuid.uuid4()}" + for job in self.bundleReady["Jobs"]: + self.jobToBundle[job["ID"]] = filepath + bundle_inputs_string += job["Executable"].replace(" ", "\ ") + " " + + # Add the original executable as an input, as well as the original inputs + inputs.append(job["Executable"]) + inputs += job["Inputs"] + + bundle_inputs_string = bundle_inputs_string[:-1] + ")" + + wrap_string = wrap_string.format(inputs=bundle_inputs_string, command=command) + + with open(filepath, "x") as fd: + fd.write(wrap_string) + + self.log.debug("Bundle created:\n", wrap_string) + self.log.debug("Inputs used:", inputs) + + return filepath, self.bundleReady["Jobs"][0]["Proxy"], inputs + + # + # BIG ISSUE HERE + # ---------------- + # If we accept job bundling from multiple CEs, there is no way of obtaining the status of + # the CE, because it's different depending of the job bundle you are asking about + # + # A way of circumvent this is enforcing the usage of just a singular Inner CE + # + def getDescription(self): + pass + + def getCEStatus(self): + pass + +if __name__ == "__main__": + from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement + + bundleCE = BundleComputingElement("BundleCE") + innerCE = InProcessComputingElement("InnerCE") + bundleCE.ce = innerCE + + max_processors = 3 + + CE_DICT = { + 'NumberOfProcessors': max_processors, + 'CPUTime': 3456, + 'FileCatalog': 'FileCatalog', + 'CPUTimeLeft': 10000, + 'WaitingToRunningRatio': 0.5, + 'MaxWaitingJobs': 1, + 'MaxTotalJobs': 366, + 'CEType': 'AREX', + 'architecture': 'x86_64', + 'VO': 'lhcb', + 'VirtualOrganization': 'lhcb', + 'MaxRAM': 16000, + 'SubmissionMode': 'Direct', + 'Preamble': 'source /cvmfs/lhcb.cern.ch/lhcbdirac/diracosrc', + 'XRSLExtraString': '(runtimeEnvironment="ENV/SINGULARITY" "/gpfs/projects/sall73/cvmfs/lhcb.cern.ch/containers/os-base/alma9-devel/prod/amd64/" "" "/apps/GPP/SINGULARITY/3.11.5/bin/singularity")', + 'Port': 8443, + 'Platform': 'skylake-any', + 'Timeout': 300, + 'ARCLogLevel': 'DEBUG', + 'MaxCPUTime': 3456, + 'CPUNormalizationFactor': 30, + 'Tag': ['MultiProcessor'], + 'Queue': 'nordugrid-slurm-gp_resa', + 'GridCE': 'lhcbvs02.ific.uv.es', + 'Site': 'DIRAC.MareNostrum.es', + 'GridEnv': '', + 'Setup': 'MyDIRAC-Production', + 'RequiredTag': [], + 'DIRACVersion': 'v8.0.55', + 'ReleaseVersion': 'v8.0.55', + 'RemoteExecution': True + } + + dummy_proxy = """\ +-----BEGIN CERTIFICATE----- +BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 +-----END CERTIFICATE----- +-----BEGIN PRIVATE KEY----- +BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 +-----END PRIVATE KEY----- +-----BEGIN CERTIFICATE----- +BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 +-----END CERTIFICATE----- +""" + + for i in range(max_processors*2): + executable_file = f"./test/job_{i}.py" + inputs = [f"./test/wrapper_{i}.py", f"./test/wrapper_{i}.json"] + + jobDesc = {"jobID": i, "resourceParams": CE_DICT} + bundleCE.submitJob(f"./test/job_{i}.py", dummy_proxy, numberOfProcessors=1, jobDesc=jobDesc, inputs=inputs) \ No newline at end of file From 83568fe3f59f6aef4a1ceaf8208233bc50f53b32 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 11 Apr 2025 16:57:57 +0200 Subject: [PATCH 02/47] feat: add BundleDB and BundleService --- .../Client/BundlerClient.py | 24 ++ .../WorkloadManagementSystem/DB/BundleDB.py | 243 ++++++++++++++++++ .../WorkloadManagementSystem/DB/BundleDB.sql | 41 +++ .../Service/BundlerHandler.py | 154 +++++++++++ 4 files changed, 462 insertions(+) create mode 100644 src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py create mode 100755 src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py create mode 100644 src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql create mode 100644 src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py diff --git a/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py new file mode 100644 index 00000000000..5535d74d541 --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py @@ -0,0 +1,24 @@ +""" Module that contains simple client access to Bundler service +""" + +from DIRAC.Core.Base.Client import Client, createClient +from DIRAC.Core.Utilities.DEncode import ignoreEncodeWarning + + +@createClient("WorkloadManagement/Bundler") +class BundlerClient(Client): + """Exposes the functionality available in the WorkloadManagement/BundlerHandler + + This inherits the DIRAC base Client for direct execution of server functionality. + The following methods are available (although not visible here). + + """ + + def __init__(self, url=None, **kwargs): + super().__init__(**kwargs) + + if not url: + self.serverURL = "WorkloadManagement/Bundler" + + else: + self.serverURL = url \ No newline at end of file diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py new file mode 100755 index 00000000000..0ded683ceac --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -0,0 +1,243 @@ +""" BundleDB class is a front-end to the bundle db +""" +from DIRAC import S_ERROR, S_OK +from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations +from DIRAC.Core.Base.DB import DB +from DIRAC.FrameworkSystem.Client.Logger import contextLogger + +# NOTE: +# THIS BLOCK SHOULD BE ITS OWN FUNCTION: +# +# result = self._query(cmd) +# if not result["OK"]: +# return result +# return S_OK(result["Value"][0]) + +BUNDLE_STATUS = ('Storing', 'Full', 'Sent','Finalized') + +def formatSelectOutput(listOfResults): + retVal = [] + + for kvTuple in listOfResults: + inner = {} + for k, v in kvTuple: + inner[k] = v + retVal.append(inner) + + return retVal + +class BundleDB(DB): + """BundleDB MySQL Database Manager""" + + def __init__(self, parentLogger=None): + DB.__init__(self, "BundleDB", "WorkloadManagement/BundleDB", parentLogger=parentLogger) + self._defaultLogger = self.log + self.__opsHelper = Operations() + + @property + def log(self): + return contextLogger.get() or self._defaultLogger + + @log.setter + def log(self, value): + self._defaultLogger = value + + def getBundleIdFromJobId(self, jobID): + result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobID}) + + if not result["OK"]: + return result + + return S_OK(result["Value"][0]) + + def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): + result = self.__getBundlesFromCEDict(ceDict) + + if not result["OK"]: + return result + + bundles = result["Value"] + + # No bundles matching ceDict, so create a new one + if not bundles: + bundleId = self.__createNewBundle(ceDict) + return S_OK(bundleId) + + # Check the best possible bundle to insert the job + bundleId = self.__selectBestBundle(bundles, processors) + + # If it does not fit in an already created bundle, create a new one + if not bundleId: + bundleId = self.__createNewBundle(ceDict) + + # Insert it and obtain if it is ready to be submitted + readyForSubmission = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + + return S_OK({"BundleId": bundleId, "Ready": readyForSubmission}) + + def getBundle(self, bundleId): + result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + retVal = formatSelectOutput(result["Value"]) + return S_OK(retVal[0]) + + def getJobsOfBundle(self, bundleId): + result = self.getFields("JobToBundle", ["JobID", "ExecutablePath", "Inputs"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + retVal = formatSelectOutput(result["Value"]) + return S_OK(retVal) + + def setTaskId(self, bundleId, taskId): + result = self.updateFields("BundlesInfo", ["TaskID"], [taskId], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK() + + def __createNewBundle(self, ceDict): + insertInfo = { + "ProcessorSum": 0, + "MaxProcessors": ceDict["NumberOfProcessors"], + "ExecTemplate": ceDict["ExecTemplate"], + "Site": ceDict['Site'], + "CE": ceDict['GridCE'], + "Queue": ceDict['Queue'], + "CEDict": str(ceDict) + } + + result = self.insertFields( + "BundlesInfo", + list(insertInfo.keys()), + list(insertInfo.values()) + ) + + if not result["OK"]: + return result + + #! WILL THIS WORK?? + result = self.getFields("BundlesInfo", ["BundleID"], {"lastRowId": result["lastRowId"]}) + retVal = formatSelectOutput(result["Value"]) + + return S_OK(retVal[0]) #! IT SHOULD RETURN THE ID OF THE BUNDLE + + def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): + # Insert the job into the bundle + insertInfo = { + "JobID": jobId, + "BundleID": bundleId, + "ExecutablePath": executable, + "Inputs": inputs + } + + result = self.insertFields( + "JobToBundle", + list(insertInfo.keys()), + list(insertInfo.values()) + ) + + if not result["OK"]: + return result + + # Modify the number of processors that will be used by the bundle + cmd = "UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {} WHERE BundleID = {};".format( + nProcessors, bundleId + ) + result = self._query(cmd) + + if not result["OK"]: + return result + + # Obtain the current Sum and the Max available + result = self.getFields("BundlesInfo", ["ProcessorSum", "MaxProcessors"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + retVal = formatSelectOutput(result["Value"]) + selection = retVal[0] + + # TODO: Change this to a strategy based selection and remove self.__selectBestBundle(...) + return S_OK(selection["ProcessorSum"] == selection["MaxProcessors"]) + + def __getBundlesFromCEDict(self, ceDict): + conditions = { + "Site": ceDict['Site'], + "CE": ceDict['GridCE'], + "Queue": ceDict['Queue'], + } + + result = self.getFields("BundlesInfo", [], conditions) + + if not result["OK"]: + return result + + if not result["Value"]: + return S_OK() + + retVal = formatSelectOutput(result["Value"]) + return S_OK(retVal) + + def __updateBundleStatus(self, bundleId, newStatus): + if newStatus not in BUNDLE_STATUS: + msg = "The new status '{}' does not correspond with the possible statuses:".format(newStatus) + return S_ERROR(msg, BUNDLE_STATUS) + + cmd = "UPDATE BundlesInfo SET Status = {} WHERE BundleID = {};".format( + newStatus, bundleId + ) + result = self._query(cmd) + + if not result["OK"]: + return result + + return S_OK() + + # This is function quite dumb, and should not work like this, but for a fist + # aproximation is fine (I guess). + # + # The best way (in my opinion) of approching this is by taking advantage of + # dynamic programming. + # We could approach this by considering the bundles as sacks and selecting + # the bundle to insert the same way it is done in the Knapsack Problem. + # + # REF: https://en.wikipedia.org/wiki/Knapsack_problem + # + # Each bundle that relates to the same CE would be a Knapsack and each item + # would be a different job. The job would have its 'weight' and 'price' set + # to the number of processors it needs, and the algorithm would optimize + # how they are distributed around the bundles. + # + # By having multiple bundles, this would relate more to the Bin Packing Problem, + # which is an abstaction of the Knapsack Problem. + # + # REF: https://en.wikipedia.org/wiki/Bin_packing_problem + # + def __selectBestBundle(self, bundles, nProcessors): + bestBundleId = None + currentBestProcs = 0 + + for bundle in bundles: + bundleId = bundle["BundleID"] + procs = bundle["ProcessorSum"] + maxProcs = bundle["MaxProcessors"] + + newProcSum = procs + nProcessors + + if newProcSum == maxProcs: + return bundleId + + elif newProcSum > maxProcs: + continue + + elif newProcSum > currentBestProcs: + bestBundleId = bundleId + + return bestBundleId + diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql new file mode 100644 index 00000000000..9bc41b388bd --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -0,0 +1,41 @@ +-- When installing via dirac tools, the following is not needed (still here for reference) +-- +-- DROP DATABASE IF EXISTS BundleDB; +-- CREATE DATABASE BundleDB; +-- ------------------------------------------------------------------------------ +-- Database owner definition +-- USE mysql; +-- +-- Must set passwords for database user by replacing "must_be_set". +-- +-- GRANT SELECT,INSERT,LOCK TABLES,UPDATE,DELETE,CREATE,DROP,ALTER,REFERENCES ON BundleDB.* TO Dirac@'%' IDENTIFIED BY 'must_be_set'; +-- FLUSH PRIVILEGES; + +USE BundleDB; + +-- ------------------------------------------------------------------------------ +DROP TABLE IF EXISTS `BundlesInfo`; +CREATE TABLE `BundlesInfo` ( + `BundleID` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT, + `ProcessorSum` INT(5) UNSIGNED NOT NULL DEFAULT 0, + `MaxProcessors` INT(5) UNSIGNED NOT NULL, + `Site` TEXT NOT NULL, + `CE` TEXT NOT NULL, + `Queue` TEXT NOT NULL, + `CEDict` TEXT NOT NULL, + `ExecTemplate` TEXT NOT NULL, + `TaskID` INTEGER(11) UNSIGNED, + `Status` ENUM('Storing', 'Full', 'Sent','Finalized') NOT NULL DEFAULT 'Storing', + PRIMARY KEY (BundleID), +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + +-- ------------------------------------------------------------------------------ +DROP TABLE IF EXISTS `JobToBundle`; +CREATE TABLE `JobToBundle` ( + `JobID` INTEGER(11) UNSIGNED NOT NULL, + `BundleID` INTEGER(11) UNSIGNED NOT NULL, + `ExecutablePath` TEXT NOT NULL, + `Inputs` TEXT NOT NULL, + PRIMARY KEY (`JobID`), + FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`), +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; \ No newline at end of file diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py new file mode 100644 index 00000000000..7424bad7b53 --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -0,0 +1,154 @@ +""" The Bundler service provides an interface for bundling jobs into a a big job + + It connects to a BundleDB to store and retrive bundles. +""" +from ast import literal_eval + +from DIRAC import S_ERROR, S_OK +from DIRAC.Core.DISET.RequestHandler import RequestHandler +from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader +from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory + + +class BundlerHandler(RequestHandler): + @classmethod + def initializeHandler(cls, serviceInfoDict): + try: + result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.BundleDB", "BundleDB") + if not result["OK"]: + return result + cls.bundleDB = result["Value"](parentLogger=cls.log) + cls.jobToCE = {} + cls.ceFactory = ComputingElementFactory() + + except RuntimeError as excp: + return S_ERROR(f"Can't connect to DB: {excp}") + + return S_OK() + + types_storeInBundle = [int, str, list, str, int, dict] + + def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceDict): + ce = self.ceFactory.getCE(ceParametersDict=ceDict) + self.jobToCE[jobId] = ce + + result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, processors, ceDict) + if not result["OK"]: + return S_ERROR() + + bundleID = result["Value"]["BundleId"] + readyForSubmission = result["Value"]["Ready"] + + if readyForSubmission: + bundle_exe, bundle_inputs = self.__wrapBundle(bundleID) + result = ce.submitJob(bundle_exe, inputs=bundle_inputs, proxy=proxy) + + if not result["OK"]: + return result + + taskID = result["Value"] + result = self.bundleDB.setTaskId(bundleID, taskID) + + if not bundleID["OK"]: + return result + + return S_OK({"BundleID": bundleID, "Executing": readyForSubmission}) + + + types_getOutput = [int] + + def export_getOutput(self, jobID): + result = self.bundleDB.getBundleIdFromJobId(jobID) + + if not result["OK"]: + return result + bundleID = result["Value"] + + ce = self.__getJobCE(jobID) + result = ce.getJobOutput(bundleID) + + if not result["OK"]: + return result + + return result["Value"] + + def __getJobBundle(self, jobID): + result = self.bundleDB.getBundleIdFromJobId(jobID) + + if not result["OK"]: + return result + + bundleId = result["Value"] + + result = self.bundleDB.getBundle(bundleId) + + if not result["OK"]: + return S_ERROR() + + return S_OK(result["Value"]) + + def __getJobCE(self, jobID): + if jobID not in self.jobToCE: + # Look for it in the DB + result = self.__getJobBundle(jobID) + + if not result["OK"]: + return S_ERROR("Job not in a bundle") + + # Convert the CEDict from string to a dictionary + ceDict = literal_eval(result["Value"]["CEDict"]) + # Build the ce obtained from the DB + result = self.ceFactory.getCE(ceParametersDict=ceDict) + + if not result["OK"]: + return result + + self.jobToCE[jobID] = result["Value"] + + return self.jobToCE[jobID] + + def __getJobTask(self, jobId): + result = self.bundleDB.getBundleIdFromJobId(jobId) + + if not result["OK"]: + return result + + bundleId = result["Value"] + + result = self.bundleDB.getBundle(bundleId) + + if not result["OK"]: + return result + + return result["Value"]["TaskID"] + + def __wrapBundle(self, bundleId): + result = self.bundleDB.getBundle(bundleId) + + if not result["OK"]: + return result + + bundle = result["Value"] + + result = self.bundleDB.getJobsOfBundle(bundleId) + + if not result["OK"]: + return result + + jobs = result["Value"] + + wrapper = bundle["ExecTemplate"] + inputs = [] + execs = [] + + for job in jobs: + execs.append(job["ExecutablePath"]) + inputs.append(job["Inputs"]) + + wrappedBundle = wrapper.format(inputs=','.join(execs)) + wrapperPath = f"/tmp/bundle_wrapper_{bundleId}" + + with open(wrapperPath, "x") as f: + f.write(wrappedBundle) + + return wrapperPath, inputs \ No newline at end of file From 2f376d38548f37718cddc0ab49c7eb85b0fc4ff2 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 25 Apr 2025 11:10:00 +0200 Subject: [PATCH 03/47] fix: SQL syntax error --- src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 9bc41b388bd..26155bdbaeb 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -25,8 +25,8 @@ CREATE TABLE `BundlesInfo` ( `CEDict` TEXT NOT NULL, `ExecTemplate` TEXT NOT NULL, `TaskID` INTEGER(11) UNSIGNED, - `Status` ENUM('Storing', 'Full', 'Sent','Finalized') NOT NULL DEFAULT 'Storing', - PRIMARY KEY (BundleID), + `Status` ENUM('Storing', 'Full', 'Sent', 'Finalized') NOT NULL DEFAULT 'Storing', + PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ @@ -37,5 +37,5 @@ CREATE TABLE `JobToBundle` ( `ExecutablePath` TEXT NOT NULL, `Inputs` TEXT NOT NULL, PRIMARY KEY (`JobID`), - FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`), + FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; \ No newline at end of file From 763b1416b1ff100ee7506d5d50be4982b561bcd9 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:11:09 +0200 Subject: [PATCH 04/47] feat(test): Add BundleDB integration tests --- .../WorkloadManagementSystem/Test_BundleDB.py | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 tests/Integration/WorkloadManagementSystem/Test_BundleDB.py diff --git a/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py b/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py new file mode 100644 index 00000000000..b6cfc7a76ac --- /dev/null +++ b/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py @@ -0,0 +1,147 @@ +# pylint: disable=invalid-name, missing-docstring +import pytest + +import DIRAC + +DIRAC.initialize() # Initialize configuration + +from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB # noqa: E402 + + +@pytest.fixture(name="jobInfos") +def fixtureJobInfo(): + return [ + { + "Executable": "./executable1.sh", + "Inputs": ["./input1.py", "./input1.json"], + "Proxy": "FAKE-PROXY", + "Processors": 2, + "CEDict": { + "NumberOfProcessors": 3, + "ExecTemplate": "bash {inputs}", + "Site": "DIRAC.Site1.fake", + "GridCE": "FakeCE", + "Queue": "FakeQueue", + } + }, + { + "Executable": "./executable2.sh", + "Inputs": ["./input2.py", "./input2.json"], + "Proxy": "FAKE-PROXY", + "Processors": 2, + "CEDict": { + "NumberOfProcessors": 3, + "ExecTemplate": "bash {inputs}", + "Site": "DIRAC.Site1.fake", + "GridCE": "FakeCE", + "Queue": "FakeQueue", + } + }, + { + "Executable": "./executable3.sh", + "Inputs": ["./input3.py", "./input3.json"], + "Proxy": "FAKE-PROXY", + "Processors": 2, + "CEDict": { + "NumberOfProcessors": 2, + "ExecTemplate": "bash {inputs}", + "Site": "DIRAC.Site2.fake", + "GridCE": "FakeCE", + "Queue": "FakeQueue", + } + }, + { + "Executable": "./executable4.sh", + "Inputs": ["./input4.py", "./input4.json"], + "Proxy": "FAKE-PROXY", + "Processors": 1, + "CEDict": { + "NumberOfProcessors": 3, + "ExecTemplate": "bash {inputs}", + "Site": "DIRAC.Site1.fake", + "GridCE": "FakeCE", + "Queue": "FakeQueue", + } + }, + ] + +@pytest.fixture(name="bundleDB") +def fixtureBundleDB(): + db = BundleDB() + yield db + db._query("DELETE FROM JobToBundle") + db._query("DELETE FROM BundlesInfo") + + +def test_AddToBundle(bundleDB: BundleDB, jobInfos): + jobId = 0 + + # + # Should return error + result = bundleDB.getBundleIdFromJobId(jobId) + assert not result["OK"] + + # + # Should create a new bundle + job = jobInfos[0] + result = bundleDB.insertJobToBundle(jobId, job["Executable"], job["Inputs"], job["Processors"], job["CEDict"]) + assert result["OK"] + assert result["Value"] + assert not result["Value"]["Ready"] + + # Save the bundle and job ids for later use + bundleId1 = result["Value"]["BundleId"] + jobId1 = jobId + + # + # Should return the same bundle + result = bundleDB.getBundleIdFromJobId(jobId) + assert result["OK"] + assert result["Value"] + assert result["Value"] == bundleId1 + + jobId += 1 + + # + # Should create a new bundle because it does not fit + job = jobInfos[1] + result = bundleDB.insertJobToBundle(jobId, job["Executable"], job["Inputs"], job["Processors"], job["CEDict"]) + assert result["OK"] + assert result["Value"] + assert not result["Value"]["Ready"] + bundleId2 = result["Value"]["BundleId"] + assert bundleId2 != bundleId1 + + jobId += 1 + + # + # Should create a new bundle because a different CE + job = jobInfos[2] + result = bundleDB.insertJobToBundle(jobId, job["Executable"], job["Inputs"], job["Processors"], job["CEDict"]) + assert result["OK"] + assert result["Value"] + assert result["Value"]["Ready"] + bundleId3 = result["Value"]["BundleId"] + assert bundleId3 != bundleId2 and bundleId3 != bundleId1 + + jobId += 1 + + # + # Should add it to the very first bundle because it fits + job = jobInfos[3] + result = bundleDB.insertJobToBundle(jobId, job["Executable"], job["Inputs"], job["Processors"], job["CEDict"]) + assert result["OK"] + assert result["Value"] + assert result["Value"]["Ready"] + bundleId4 = result["Value"]["BundleId"] + assert bundleId4 == bundleId1 + jobId4 = jobId + + # + # Should contain the 2 added jobs + result = bundleDB.getJobsOfBundle(bundleId4) + assert result["OK"] + assert result["Value"] + jobIds = [job["JobID"] for job in result["Value"]] + assert jobId1 in jobIds and jobId4 in jobIds + \ No newline at end of file From 08ac0d479e3e1aa498dfb1a86d566f58cfbeab0c Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 25 Apr 2025 16:11:49 +0200 Subject: [PATCH 05/47] fix(test): Bug during BundleDB integration tests --- .../WorkloadManagementSystem/DB/BundleDB.py | 73 +++++++++++-------- 1 file changed, 42 insertions(+), 31 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 0ded683ceac..1fe8ce1f4c7 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -1,26 +1,18 @@ """ BundleDB class is a front-end to the bundle db """ from DIRAC import S_ERROR, S_OK -from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations from DIRAC.Core.Base.DB import DB from DIRAC.FrameworkSystem.Client.Logger import contextLogger -# NOTE: -# THIS BLOCK SHOULD BE ITS OWN FUNCTION: -# -# result = self._query(cmd) -# if not result["OK"]: -# return result -# return S_OK(result["Value"][0]) +# This might not be necessary +BUNDLE_STATUS = ('Storing', 'Full', 'Sent', 'Finalized') -BUNDLE_STATUS = ('Storing', 'Full', 'Sent','Finalized') - -def formatSelectOutput(listOfResults): +def formatSelectOutput(listOfResults, keys): retVal = [] for kvTuple in listOfResults: inner = {} - for k, v in kvTuple: + for k, v in zip(keys, list(kvTuple)): inner[k] = v retVal.append(inner) @@ -30,9 +22,8 @@ class BundleDB(DB): """BundleDB MySQL Database Manager""" def __init__(self, parentLogger=None): - DB.__init__(self, "BundleDB", "WorkloadManagement/BundleDB", parentLogger=parentLogger) + super().__init__("BundleDB", "WorkloadManagement/BundleDB", parentLogger=parentLogger) self._defaultLogger = self.log - self.__opsHelper = Operations() @property def log(self): @@ -48,7 +39,10 @@ def getBundleIdFromJobId(self, jobID): if not result["OK"]: return result - return S_OK(result["Value"][0]) + if not result["Value"]: + return S_ERROR("JobId not present in any bundle") + + return S_OK(result["Value"][0][0]) def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): result = self.__getBundlesFromCEDict(ceDict) @@ -60,20 +54,38 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): # No bundles matching ceDict, so create a new one if not bundles: - bundleId = self.__createNewBundle(ceDict) - return S_OK(bundleId) + result = self.__createNewBundle(ceDict) + + if not result["OK"]: + return result + + bundleId = result["Value"] + result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + + if not result["OK"]: + return result + + return S_OK({"BundleId": bundleId, "Ready": result["Value"]}) # Check the best possible bundle to insert the job bundleId = self.__selectBestBundle(bundles, processors) # If it does not fit in an already created bundle, create a new one if not bundleId: - bundleId = self.__createNewBundle(ceDict) + result = self.__createNewBundle(ceDict) + if not result["OK"]: + return result + + bundleId = result["Value"] + # Insert it and obtain if it is ready to be submitted - readyForSubmission = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + + if not result["OK"]: + return result - return S_OK({"BundleId": bundleId, "Ready": readyForSubmission}) + return S_OK({"BundleId": bundleId, "Ready": result["Value"]}) def getBundle(self, bundleId): result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) @@ -81,7 +93,7 @@ def getBundle(self, bundleId): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"]) + retVal = formatSelectOutput(result["Value"], []) return S_OK(retVal[0]) def getJobsOfBundle(self, bundleId): @@ -90,7 +102,7 @@ def getJobsOfBundle(self, bundleId): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"]) + retVal = formatSelectOutput(result["Value"], ["JobID", "ExecutablePath", "Inputs"]) return S_OK(retVal) def setTaskId(self, bundleId, taskId): @@ -121,11 +133,8 @@ def __createNewBundle(self, ceDict): if not result["OK"]: return result - #! WILL THIS WORK?? - result = self.getFields("BundlesInfo", ["BundleID"], {"lastRowId": result["lastRowId"]}) - retVal = formatSelectOutput(result["Value"]) - - return S_OK(retVal[0]) #! IT SHOULD RETURN THE ID OF THE BUNDLE + # Returns the ID of the Bundle (which is automatically incremented) + return S_OK(result["lastRowId"]) def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): # Insert the job into the bundle @@ -133,7 +142,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): "JobID": jobId, "BundleID": bundleId, "ExecutablePath": executable, - "Inputs": inputs + "Inputs": ' '.join(inputs) } result = self.insertFields( @@ -160,7 +169,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"]) + retVal = formatSelectOutput(result["Value"], ["ProcessorSum", "MaxProcessors"]) selection = retVal[0] # TODO: Change this to a strategy based selection and remove self.__selectBestBundle(...) @@ -180,8 +189,9 @@ def __getBundlesFromCEDict(self, ceDict): if not result["Value"]: return S_OK() - - retVal = formatSelectOutput(result["Value"]) + + # TODO: This line is awful, should change to something easier to scale + retVal = formatSelectOutput(result["Value"], ["BundleID", "ProcessorSum", "MaxProcessors", "Site", "CE", "Queue", "CEDict", "ExecTemplate", "TaskID", "Status"]) return S_OK(retVal) def __updateBundleStatus(self, bundleId, newStatus): @@ -237,6 +247,7 @@ def __selectBestBundle(self, bundles, nProcessors): continue elif newProcSum > currentBestProcs: + currentBestProcs = newProcSum bestBundleId = bundleId return bestBundleId From dd70f9d2e450c72481d83a7bfe20a1a1b1622cad Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 8 May 2025 15:13:29 +0200 Subject: [PATCH 06/47] feat(BundleDB): Change TEXT datatype to VARCHAR --- .../WorkloadManagementSystem/DB/BundleDB.sql | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 26155bdbaeb..ac44194e97a 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -14,27 +14,28 @@ USE BundleDB; -- ------------------------------------------------------------------------------ +DROP TABLE IF EXISTS `JobToBundle`; DROP TABLE IF EXISTS `BundlesInfo`; + CREATE TABLE `BundlesInfo` ( `BundleID` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT, `ProcessorSum` INT(5) UNSIGNED NOT NULL DEFAULT 0, `MaxProcessors` INT(5) UNSIGNED NOT NULL, - `Site` TEXT NOT NULL, - `CE` TEXT NOT NULL, - `Queue` TEXT NOT NULL, + `Site` VARCHAR(128) NOT NULL, + `CE` VARCHAR(128) NOT NULL, + `Queue` VARCHAR(128) NOT NULL, `CEDict` TEXT NOT NULL, - `ExecTemplate` TEXT NOT NULL, + `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` INTEGER(11) UNSIGNED, `Status` ENUM('Storing', 'Full', 'Sent', 'Finalized') NOT NULL DEFAULT 'Storing', PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ -DROP TABLE IF EXISTS `JobToBundle`; CREATE TABLE `JobToBundle` ( - `JobID` INTEGER(11) UNSIGNED NOT NULL, + `JobID` VARCHAR(255) NOT NULL, `BundleID` INTEGER(11) UNSIGNED NOT NULL, - `ExecutablePath` TEXT NOT NULL, + `ExecutablePath` VARCHAR(255) NOT NULL, `Inputs` TEXT NOT NULL, PRIMARY KEY (`JobID`), FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) From da323d44523052fb77f8d4dc2f12b3ecd8d9bd67 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 8 May 2025 15:29:01 +0200 Subject: [PATCH 07/47] chore: Improve Bundle template and logging --- .../Service/BundlerHandler.py | 95 +++++++++++-------- .../Utilities/BundlerTemplates.py | 58 +++++++++++ 2 files changed, 115 insertions(+), 38 deletions(-) create mode 100644 src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 7424bad7b53..7ae6c8a2ee5 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -8,9 +8,11 @@ from DIRAC.Core.DISET.RequestHandler import RequestHandler from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory +from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import generate_template class BundlerHandler(RequestHandler): + @classmethod def initializeHandler(cls, serviceInfoDict): try: @@ -26,56 +28,71 @@ def initializeHandler(cls, serviceInfoDict): return S_OK() - types_storeInBundle = [int, str, list, str, int, dict] + types_storeInBundle = [str, str, list, str, int, dict] def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceDict): - ce = self.ceFactory.getCE(ceParametersDict=ceDict) + result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceParametersDict=ceDict) + + if not result["OK"]: + self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) + return result + + ce = result["Value"] self.jobToCE[jobId] = ce result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, processors, ceDict) if not result["OK"]: - return S_ERROR() + self.log.error("Failed to insert into a bundle the job with id ", str(jobId)) + return result - bundleID = result["Value"]["BundleId"] + bundleId = result["Value"]["BundleId"] readyForSubmission = result["Value"]["Ready"] + self.log.info("Job inserted in bundle successfully") if readyForSubmission: - bundle_exe, bundle_inputs = self.__wrapBundle(bundleID) + self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") + + bundle_exe, bundle_inputs = self.__wrapBundle(bundleId) result = ce.submitJob(bundle_exe, inputs=bundle_inputs, proxy=proxy) if not result["OK"]: + self.log.error("Failed to submit job to with id ", str(jobId)) return result taskID = result["Value"] - result = self.bundleDB.setTaskId(bundleID, taskID) + result = self.bundleDB.setTaskId(bundleId, taskID) - if not bundleID["OK"]: + if not result["OK"]: + self.log.error("Failed to set task id of JobId ", str(jobId)) return result - return S_OK({"BundleID": bundleID, "Executing": readyForSubmission}) - + return S_OK({"BundleID": bundleId, "Executing": readyForSubmission}) - types_getOutput = [int] + types_getOutput = [str] - def export_getOutput(self, jobID): - result = self.bundleDB.getBundleIdFromJobId(jobID) + def export_getOutput(self, jobId): + result = self.bundleDB.getBundleIdFromJobId(jobId) if not result["OK"]: + self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) return result + bundleID = result["Value"] - ce = self.__getJobCE(jobID) + # TODO: THIS CAN BE CACHED + ce = self.__getJobCE(jobId) result = ce.getJobOutput(bundleID) if not result["OK"]: - return result + self.log.error("Failed to obtain Job Output of JobId ", str(jobId)) - return result["Value"] + return result - def __getJobBundle(self, jobID): - result = self.bundleDB.getBundleIdFromJobId(jobID) + def __getJobBundle(self, jobId): + result = self.bundleDB.getBundleIdFromJobId(jobId) if not result["OK"]: + self.log.error("Failed to obtain BundleId of JobId ", str(jobId)) return result bundleId = result["Value"] @@ -83,17 +100,18 @@ def __getJobBundle(self, jobID): result = self.bundleDB.getBundle(bundleId) if not result["OK"]: - return S_ERROR() - - return S_OK(result["Value"]) + self.log.error - def __getJobCE(self, jobID): - if jobID not in self.jobToCE: + return result + + def __getJobCE(self, jobId): + if jobId not in self.jobToCE: # Look for it in the DB - result = self.__getJobBundle(jobID) + result = self.__getJobBundle(jobId) if not result["OK"]: - return S_ERROR("Job not in a bundle") + self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) + return result # Convert the CEDict from string to a dictionary ceDict = literal_eval(result["Value"]["CEDict"]) @@ -101,23 +119,18 @@ def __getJobCE(self, jobID): result = self.ceFactory.getCE(ceParametersDict=ceDict) if not result["OK"]: + self.log.error("Failed to CE of JobId ", str(jobId)) return result - self.jobToCE[jobID] = result["Value"] + self.jobToCE[jobId] = result["Value"] - return self.jobToCE[jobID] + return self.jobToCE[jobId] def __getJobTask(self, jobId): - result = self.bundleDB.getBundleIdFromJobId(jobId) - - if not result["OK"]: - return result - - bundleId = result["Value"] - - result = self.bundleDB.getBundle(bundleId) + result = self.__getJobBundle(jobId) if not result["OK"]: + self.log.error("Failed to obtain task id of Job ", str(jobId)) return result return result["Value"]["TaskID"] @@ -126,6 +139,7 @@ def __wrapBundle(self, bundleId): result = self.bundleDB.getBundle(bundleId) if not result["OK"]: + self.log.error("Failed to obtain bundle while wrapping. BundleID=", str(bundleId)) return result bundle = result["Value"] @@ -133,19 +147,24 @@ def __wrapBundle(self, bundleId): result = self.bundleDB.getJobsOfBundle(bundleId) if not result["OK"]: + self.log.error("Failed to obtain bundled job while wrapping. BundleID=", str(bundleId)) return result jobs = result["Value"] - wrapper = bundle["ExecTemplate"] + template = bundle["ExecTemplate"] inputs = [] - execs = [] for job in jobs: - execs.append(job["ExecutablePath"]) + inputs.append(job["ExecutablePath"]) inputs.append(job["Inputs"]) - wrappedBundle = wrapper.format(inputs=','.join(execs)) + result = generate_template(template, inputs) + + if not result["OK"]: + return result + + wrappedBundle = result["Value"] wrapperPath = f"/tmp/bundle_wrapper_{bundleId}" with open(wrapperPath, "x") as f: diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py new file mode 100644 index 00000000000..0e333d2f9dd --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -0,0 +1,58 @@ +from DIRAC import S_ERROR, S_OK + + +def generate_template(template: str, inputs: list[str]): + template_lower = template.lower() + func_name = "_generate_" + template_lower + generator = globals()[func_name] + + if not generator: + return S_ERROR("Template not found") + + template, formatted_inputs = generator(inputs) + + return S_OK(template.format(inputs=formatted_inputs)) + +def _generate_bash(inputs: list[str]): + template = """\ +#!/bin/bash +set -e + +BASEDIR=${{PWD}} +INPUT={inputs} + +get_id() {{ + basename ${{1}} .json +}} + +run_task() {{ + local input=$1 + local task_id=$(get_id ${{input}}) + + >&2 echo "Executing task ${{task_id}}" + >&2 bash ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & + local task_pid=$! + + >&2 echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." + wait ${{task_pid}} ; local task_status=$? + + # report status + echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee task_${{task_id}}.status +}} + +# execute tasks +for input in ${{INPUT}}; do + [ -f "$input" ] || break + taskdir="task_$(get_id ${{input}})" + mkdir ${{taskdir}} && cd "$_" && + run_task ${{input}} >> ${{BASEDIR}}/tasks_status.log & + cd ${{BASEDIR}} +done + +# wait for all tasks +wait +""" + + formatted_inputs = '(' + ', '.join(inputs) + ')' + + return template, formatted_inputs From 81abf8fb63c5d6512940cb682f8c37631b3d9bcb Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 8 May 2025 15:29:50 +0200 Subject: [PATCH 08/47] chore(BundleDB): Return input files as list --- src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 1fe8ce1f4c7..2288e6d9ddf 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -93,7 +93,7 @@ def getBundle(self, bundleId): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], []) + retVal = formatSelectOutput(result["Value"], ["BundleID", "ProcessorSum", "MaxProcessors", "Site", "CE", "Queue", "CEDict", "ExecTemplate", "TaskID", "Status"]) return S_OK(retVal[0]) def getJobsOfBundle(self, bundleId): @@ -103,6 +103,7 @@ def getJobsOfBundle(self, bundleId): return result retVal = formatSelectOutput(result["Value"], ["JobID", "ExecutablePath", "Inputs"]) + retVal["Inputs"] = retVal["Inputs"].split(" ") return S_OK(retVal) def setTaskId(self, bundleId, taskId): @@ -114,6 +115,9 @@ def setTaskId(self, bundleId, taskId): return S_OK() def __createNewBundle(self, ceDict): + if "ExecTemplate" not in ceDict: + return S_ERROR("CE must have a properly formatted ExecTemplate") + insertInfo = { "ProcessorSum": 0, "MaxProcessors": ceDict["NumberOfProcessors"], From d9e11e7195e6976fa3e9e42b7651709c364c0519 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 8 May 2025 15:30:18 +0200 Subject: [PATCH 09/47] feat: Add BundlerService in ConfigTemplate --- src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg index f31d304d764..7b212e3c5c9 100644 --- a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +++ b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg @@ -158,6 +158,12 @@ Services { Port = 9175 } + ##BEGIN Bundler + Bundler + { + Port = 9176 + } + ##END } Agents { From f3b72f9288abb5e72b6f73243f9f26fcd08b5185 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 8 May 2025 16:30:41 +0200 Subject: [PATCH 10/47] chore(BundleCE): Adapt communication to the Service (untested) fix: pre-commit --- .../Computing/BundleComputingElement.py | 359 +++++++----------- .../Client/BundlerClient.py | 2 +- .../WorkloadManagementSystem/DB/BundleDB.py | 115 +++--- .../WorkloadManagementSystem/DB/BundleDB.sql | 2 +- .../Service/BundlerHandler.py | 33 +- .../Utilities/BundlerTemplates.py | 9 +- 6 files changed, 222 insertions(+), 298 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 2fd720ef9c9..ff63c698c3a 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -1,251 +1,162 @@ +"""Bundle Computing Elemenet + +Allows grouping jobs in a single big job prior to their submission in an actual CE. + +**Configuration Parameters** + +Configuration for the BundleComputingElemenet submission can be done via the configuration system. +Below, you can find a list of parameters specific to the BundleCE. + +ExecTemplate: + Name of the execution template to be used to bundle the jobs. + This template will the one that be passed to the CE to be executed alongside + each jobExecutable file and input as the inputs of the template. + +InnerCEType: + Type of the CE that will end up executing the templated wrapper. + +**CE Configuration** + +This CE must be configure in the same way as the one that will execute the jobs, the only +difference is that the CEType will become InnerCEType and it must have configured the template +to be used. + +For example: + +CEs +{ + host + { + CEType = SSH + SSHHost = host + SSHUser = user + SSHPassword = password + ... + Queues + { + dirac + { + ... + } + } + } +} + +Will become: + +CEs +{ + host + { + CEType = BUNDLE + InnerCEType = SSH + ExecTemplate = BASH + + SSHHost = host + SSHUser = user + SSHPassword = password + ... + Queues + { + dirac + { + ... + } + } + } +} + +**Code Documentation** +""" + import uuid from DIRAC import S_ERROR, S_OK from DIRAC.Resources.Computing.ComputingElement import ComputingElement +from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory +from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient -# Strategies are not used yet, just an idea -SENDING_STRATEGIES = { - "NO_MORE_JOBS_FIT", - "MAX_TIME_SINCE_FIRST", - "MAX_TIME_BETWEEN_SUBMISSIONS", -} -STORING_STRATEGIES = { - "NO_STRATEGY", - "SAME_JOB_TYPE", -} +class BundleComputingElement(ComputingElement): + def __init__(self, ceUniqueID): + """Standard constructor.""" + super().__init__(ceUniqueID) -# SHELL code that bundles all wrappers -BUNDLE_STRING = """\ -#!/bin/bash -set -e + self.mandatoryParameters = ["ExecTemplate", "InnerCEType"] -BASEDIR=${{PWD}} -INPUT={inputs} + self.innerCE = None -get_id() {{ - basename ${{1}} .json -}} + self.bundler = BundlerClient() + self.ceFactory = ComputingElementFactory() -run_task() {{ - local input=$1 - local task_id=$(get_id ${{input}}) + def _reset(self): + # Force the CE to make the job submissions asynchronous + self.ceParameters["AsyncSubmission"] = True - >&2 echo "Executing task ${{task_id}}" - >&2 {command} ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & - local task_pid=$! + # Create the InnerCE from the config obtained from the BundleCE + innerCEParams = self.ceParameters.copy() + innerCEType = innerCEParams.pop("InnerCEType") + innerCEParams["CEType"] = innerCEType - >&2 echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." - wait ${{task_pid}} ; local task_status=$? + # Building of the InnerCE + self.innerCE = self.ceFactory.getCE(ceType=innerCEType, ceParametersDict=innerCEParams) - # report status - echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee task_${{task_id}}.status -}} + def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=None): + # Create a unique ID that cannot clash with other BundleCEs and Jobs in the database + jobId = f"BUNDLE_{self.ceUniqueID}_{uuid.uuid4()}" -# execute tasks -for input in ${{INPUT}}; do - [ -f "$input" ] || break - taskdir="task_$(get_id ${{input}})" - mkdir ${{taskdir}} && cd "$_" && - run_task ${{input}} >> ${{BASEDIR}}/tasks_status.log & - cd ${{BASEDIR}} -done + # Store the job in a bundle using the ceDict of the InnerCE (containing the template) + ceDict = self.innerCE.getDescription() + result = self.bundler.storeInBundle(jobId, executableFiles, inputs, proxy, numberOfProcessors, ceDict) -# wait for all tasks -wait -""" + if not result["OK"]: + self.log.error("Failure while storing in the Bundle") + return result -class BundleComputingElement(ComputingElement): - def __init__(self, ceUniqueID): - """Standard constructor.""" - super().__init__(ceUniqueID) + bundleId = result["Value"]["BundleID"] + submitted = result["Value"]["Executing"] - self.jobToBundle = {} - self.bundles = {} - self.bundleReady = {} - - # These are just ideas, could be interesting to take into account - self.timeout = -1 - self.max_time_between_submissions = -1 - - self.storeStrategies = [] - self.sendStrategies = [] - - # Currently this has to be hard-coded. - # It must either be generated dynamically through the ceDict or use - # another Inner one like PoolCE does. - self.ce = None - - self.log.setLevel("DEBUG") - - def _storeOnBundle(self, bundleId, job, n_processors): - self.bundles[bundleId]["Jobs"].append(job) - self.bundles[bundleId]["ProcessorSum"] += n_processors - - def _storeJob(self, jobID, executable, ceDict, n_processors, proxy=None, inputs=None): - bundle_id = f"{ceDict['Site']}:{ceDict['GridCE']}:{ceDict['Queue']}" - - if bundle_id not in self.bundles: - self._initBundle(bundle_id, ceDict) - - bundle = self.bundles[bundle_id] - job = { - "ID": jobID, - "Executable": executable, - "Inputs": inputs, - "Proxy": proxy - } + # The bundle is not being executed in the InnerCE + if not submitted: + self.log.info(f"Job {jobId} stored successfully in bundle: ", bundleId) + # Return the bundle id as if it was the task id of the asynchronous executing job + return S_OK([jobId]) - # Is the bundle ready for execution in this CE? - if bundle["ProcessorSum"] + n_processors >= bundle["MaxProcessors"]: - # Clear bundle related to the CE - self._initBundle(bundle_id, ceDict) + else: + self.log.info("Submitting job to CE: ", self.ce.ceName) - # Add the job to the bundle ready if it fits. - # Otherwise, add it to the storage - if bundle["ProcessorSum"] + n_processors == bundle["MaxProcessors"]: - bundle["Jobs"].append(job) - else: - self._storeOnBundle(bundle_id, job, n_processors) + # Return the id of the job (NOT THE BUNDLE) + return S_OK([jobId]) - # Make the bundle ready for execution - self.bundleReady = bundle + def getJobOutput(self, jobIDList): + resultDict = {} - else: - # Just store it - self._storeOnBundle(bundle_id, job, n_processors) - - self.log.debug("Current bundle status: ", self.bundles) - - return bundle_id - - def _initBundle(self, bundleId, ceDict, startingProcessors=0): - self.bundles[bundleId] = {} - self.bundles[bundleId]["Jobs"] = [] - self.bundles[bundleId]["ProcessorSum"] = startingProcessors - self.bundles[bundleId]["MaxProcessors"] = ceDict["NumberOfProcessors"] - self.bundles[bundleId]["LastAddedJobTimestamp"] = 0 - self.bundles[bundleId]["CEDict"] = ceDict - - def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, jobDesc=None, inputs=None): - jobID = jobDesc["jobID"] - resourceParams = jobDesc["resourceParams"] - - bundleID = self._storeJob(jobID, executableFiles, resourceParams, numberOfProcessors, proxy=proxy, inputs=inputs) - - if not self.bundleReady: - self.log.info(f"Job {jobID} stored successfully in bundle: ", bundleID) - return S_OK() - - executablePath, proxy, grouped_inputs = self._wrap_bundle(command="bash") - - self.log.info("Submitting job to CE: ", self.ce.ceName) - - # result = self.ce.submitJob(executablePath, proxy, inputs=grouped_inputs) - result = {} - - self.bundleReady = None - - return S_OK(result) - - def _wrap_bundle(self, command): - wrap_string = BUNDLE_STRING - bundle_inputs_string = "(" - inputs = [] - - filepath = f"/tmp/BundledJobs_{uuid.uuid4()}" - for job in self.bundleReady["Jobs"]: - self.jobToBundle[job["ID"]] = filepath - bundle_inputs_string += job["Executable"].replace(" ", "\ ") + " " - - # Add the original executable as an input, as well as the original inputs - inputs.append(job["Executable"]) - inputs += job["Inputs"] - - bundle_inputs_string = bundle_inputs_string[:-1] + ")" - - wrap_string = wrap_string.format(inputs=bundle_inputs_string, command=command) - - with open(filepath, "x") as fd: - fd.write(wrap_string) - - self.log.debug("Bundle created:\n", wrap_string) - self.log.debug("Inputs used:", inputs) - - return filepath, self.bundleReady["Jobs"][0]["Proxy"], inputs + for jobId in jobIDList: + result = self.bundler.getJobOutput(jobId) + + if not result["OK"]: + return result + + resultDict[jobId] = result["Value"] + + return resultDict + + # def getJobStatus(self, jobIDList): + # pass # - # BIG ISSUE HERE - # ---------------- - # If we accept job bundling from multiple CEs, there is no way of obtaining the status of - # the CE, because it's different depending of the job bundle you are asking about - # - # A way of circumvent this is enforcing the usage of just a singular Inner CE + # CAN THIS BE IMPLEMENETED ?? # + def killJob(self, jobIDList): + resultDict = {} + + for jobId in jobIDList: + resultDict[jobId] = S_ERROR("Bundled jobs cannot be killed at the moment") + + return resultDict + def getDescription(self): - pass + return self.innerCE.getDescription() def getCEStatus(self): - pass - -if __name__ == "__main__": - from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement - - bundleCE = BundleComputingElement("BundleCE") - innerCE = InProcessComputingElement("InnerCE") - bundleCE.ce = innerCE - - max_processors = 3 - - CE_DICT = { - 'NumberOfProcessors': max_processors, - 'CPUTime': 3456, - 'FileCatalog': 'FileCatalog', - 'CPUTimeLeft': 10000, - 'WaitingToRunningRatio': 0.5, - 'MaxWaitingJobs': 1, - 'MaxTotalJobs': 366, - 'CEType': 'AREX', - 'architecture': 'x86_64', - 'VO': 'lhcb', - 'VirtualOrganization': 'lhcb', - 'MaxRAM': 16000, - 'SubmissionMode': 'Direct', - 'Preamble': 'source /cvmfs/lhcb.cern.ch/lhcbdirac/diracosrc', - 'XRSLExtraString': '(runtimeEnvironment="ENV/SINGULARITY" "/gpfs/projects/sall73/cvmfs/lhcb.cern.ch/containers/os-base/alma9-devel/prod/amd64/" "" "/apps/GPP/SINGULARITY/3.11.5/bin/singularity")', - 'Port': 8443, - 'Platform': 'skylake-any', - 'Timeout': 300, - 'ARCLogLevel': 'DEBUG', - 'MaxCPUTime': 3456, - 'CPUNormalizationFactor': 30, - 'Tag': ['MultiProcessor'], - 'Queue': 'nordugrid-slurm-gp_resa', - 'GridCE': 'lhcbvs02.ific.uv.es', - 'Site': 'DIRAC.MareNostrum.es', - 'GridEnv': '', - 'Setup': 'MyDIRAC-Production', - 'RequiredTag': [], - 'DIRACVersion': 'v8.0.55', - 'ReleaseVersion': 'v8.0.55', - 'RemoteExecution': True - } - - dummy_proxy = """\ ------BEGIN CERTIFICATE----- -BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 ------END CERTIFICATE----- ------BEGIN PRIVATE KEY----- -BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 ------END PRIVATE KEY----- ------BEGIN CERTIFICATE----- -BJvXrEn9x5zGWgEN4rbiFt6CVBKiKrCDw7FWizGy5ivMwVExj3qMb0QabwxvwHDyeMYDnu8t7tNHk68fGbxqh2Hhg3K1GG9f3i5iQabUn893SpxRqTCXT2XyVZLrZCGQaEWJ5ScRJi6AtDEwd8k14qrptLNJSEUt4YFnF2GNLXMrjzB1aa9KmHmy0RFaprfUFpYzgLQSCvXaqhzUcXgrKdcVFzPzi4eWLLUgS5diL5baeeWE7py3MciKimRT8eCQFQaS9wzax17iv6e4XDGtezhhrLX7ncvFfLM8GTzK7PufcqdPNmzpN9GwGwnu9PzQ1rAB6zWD9TTyULUCmjHjGJJUMAa9q8bXBpwc5nbZbEfHQcYHdGuwM989qdckACWzV3H46cGLCVBP7GvD0871kEQ5nK2jKxg6CNPNWKtL30GM5qFQvVfQzeVKWhPbjZ8X9GbRvc4ujYrJ8WwNyXPHXNDv9w8crP9iiLaV5LjJLftEy0S1fG6Bii0awRQKDdt1Cn54gfWrQnqQ97AbC4X1dWavjdGneirtfTH9XTNY6DzkeEBdt179T6nwVSGQHt0nQKaH56Qk8KyX3Vw16APtG5EcX9e2ZnJWnZNH5WCfxZpCvBWYEwFzX5tFFJKPVKpXSA1brU9dbrR0LzBv1wrVDz6J1bw8hVWp3qvTh2kpx4LqqgQq07GE7LGNMyzS8u5gLw8idb106Z24cfdake8WJwL07eK4MWXM4JRq2mtpDGg5iFBZSGZLjcid7cpLHT4r6EWLbDg0vaaV4PJxyq9mFXDgxxxQad7tGqTddBXuKHJWvqKZaxWVfgHfWW2z2y2hDbN0W5nbvyESaSp6zYN2jH2S3DX9wWcxMYYVrDahuVynmbNQcLmUB2qwTYdwUbPN9ph1kGRhRuThQF8AvdWvw7hAyXbk3gtHJKgqBB2w6xWTb5UBEKZH4XLMgkfzm7bDvTJcwnBVVMaReA3Mdw6zPyGRvU3kVLM7rM2HnCKcX4mWxYDTEXtpS8ZGUMH444HbMupYZq2rfyVZ2E0YCkXBuXLaQdHCU6rXnAtT3ZmkewHGrhcRNwXnUS9gDHwFTqPzHuVY8eKSUP6M0z1aBiJn3EUWZ3AxUqE5Ku1xBL3aH7fJQWxEaHwDmtFN4Jjw7a8WY4KqevDwyiHVEQr09Um8bLeNeib0ke10ZYAG2ErX9fEQg6xcaJrxP5GE5jBF7GjNE7dS7vwCVYvLzUDbdFbRVMmPhDV4jF2H40zUmhGqUk8DiKB64pmrPVEJi03b4xNVtGFeuyjB5BkfBa0PpmAryuEYvUxWp8YaQZp89u4XcNwjEBiYp96Li7c0m3Qzj9fNWY92HppNy0SrwLruPTgipGEnLiRRTZAj7rjFRFKkkyKAki3K7ieCn4RbNbSQa5DGnJS0meNdLbvT1VM9Uj0naUpR1gG7Bvktdrf1AcnGPDwkvmRnNrLb8ZGPwGbuyBLHQDqXxBrEy9qBh37XmTjtXqWaTefQ8yDkHFJrpWSM9u9TgqTWMh4x5m5WCfq3JyF9mtjPPgngNCGUVFrQcSSAcv2NVG3Keur9btfm8MVmbFYuLjby7q5aMQpn1ZGEmVP34c75KnNcbECZyg0ewbUuiyLLX2NaMRCdaekD41e8DeryW10Z8L0jdDq081KVHrrcg8VhH56zwU5yUFgGdgg8j9RVQGdiqw7c1zJwVNSLK7rxjc6kV3AiU6d1PACg73TaUZinYuWpmRu3jNvU2DFcFvJV7fXL7SeLLDrdtaPj9FDMF9B81p5t7bep20wtfFArAdej2Et2Jqx9vfzNgLE0cMLPJnxEv9EXnHmwDHj862Caxjw5x28wRydwjbFw35tZUkQwTQMaSunDKcVbcUgfb5f1NTf6JNSRmSFQZKNkvrdZj7Y1Va11951mc7Ju0cBFdVDQVq5ULyV4UCepchvemHhu8566di7B1Gp06GJa5trXE5WP8Ur8Ymq0hGP6PDmx4EGxNQfMHUD3ZAEA7phpb7cASucb8jwuYpBT23AeXhY1JTxctPK1qvYMyd2zPuqbYVtfeiDi7Jd0HSnYJbDS5GRwExvRhr97tj7W2pP2YgWJMJGcC63TT0NeeLCVEhxvHZVGrNZWHwat0FYXztRyQe5Vi9f3Yg44QzUq6viaa3VU7qrqcdpX3xfnLpEmywmBXh9JEpB9jtDEBaJJJ5SLQ25ZFv61jtX3ZEYY498h4vZfieTk2MEKzVea9zrQWSUC8YGMmbYK6U2vqx1qd2cDa8FrHg9mXGdZfy9m4hZBkDiHL649w2ZN9XvEkriV222T1a48eMQUjnw6ALAMDWJu1h8U61L7VRr8bV8xhZDcZqDbTafBdE17D4Xxmmj8mKmHLaXPncnZZyvnCchiKJrn06WF8qNnuE8SNyNcdFkqgddKTSE2QLTkRFhubx3Q21dAkqCVZHhiqLNXHNb3d8ah9AA84n6DSMVxt5ZpN5SMgZUBCe7h7TehYgcEb40RU4YZP57VdSWeinP6ykJ9eBJk9TV5XZ8QKFNfTehR6mENMPxe3bP6Mu4uC7qR8PqckhbGAqJ1Fi9x14NW5tQiQKYPtknvb8PKpiBZ6tA8mwmRS9e0KrzqiTxJc1WTC8ZSydNUJhp3vtnQxx1chbEbY1fgPm3yBWyp7gWxHm1L7jYTqvCAPNFhcT8eP28z8agDdQLayqVbHL44b1JzQp0UcMkqDTwgTNwD4mB5VE0a29Uagv3F1gxppRNWZVU1ewhCwB157FwNYc37i6FhqHjCbLD8rZXGUq3wU5Qhqg6Q9y2i9jByimVdgbXDiEe4ZP5A7Qi7LUZJJEFHwF80eTkKGBqdyaZqNYz0zDrXPEKyWAKdYtq5BvXVAgyxJxYRb6fze5D94TXLWNBak8ZYTYhj9TZL40fFimcCUB0gCx9JrM3DMj5twCDM5c2NqDHjdqrjmuKawbX9gcvQtGq9nmZPABEMtJz32PQaYvVU5xXH62CHkwi0yqSY6UH36Hu77V6u51SchTxSA6PdiJZJRbS38bDwHynBdKWDu3abtQm9LYfd6pE2fYz1TGkivKyD0YTCYp5kFHQbhEdTw2DiLA1mUMSGwM0ZV22YJ6YR8Dw3egb7j5BjNqU7 ------END CERTIFICATE----- -""" - - for i in range(max_processors*2): - executable_file = f"./test/job_{i}.py" - inputs = [f"./test/wrapper_{i}.py", f"./test/wrapper_{i}.json"] - - jobDesc = {"jobID": i, "resourceParams": CE_DICT} - bundleCE.submitJob(f"./test/job_{i}.py", dummy_proxy, numberOfProcessors=1, jobDesc=jobDesc, inputs=inputs) \ No newline at end of file + return self.innerCE.getCEStatus() diff --git a/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py index 5535d74d541..62171c9af71 100644 --- a/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py +++ b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py @@ -21,4 +21,4 @@ def __init__(self, url=None, **kwargs): self.serverURL = "WorkloadManagement/Bundler" else: - self.serverURL = url \ No newline at end of file + self.serverURL = url diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 2288e6d9ddf..6fb228f5b8b 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -5,7 +5,8 @@ from DIRAC.FrameworkSystem.Client.Logger import contextLogger # This might not be necessary -BUNDLE_STATUS = ('Storing', 'Full', 'Sent', 'Finalized') +BUNDLE_STATUS = ("Storing", "Full", "Sent", "Finalized") + def formatSelectOutput(listOfResults, keys): retVal = [] @@ -18,6 +19,7 @@ def formatSelectOutput(listOfResults, keys): return retVal + class BundleDB(DB): """BundleDB MySQL Database Manager""" @@ -35,21 +37,21 @@ def log(self, value): def getBundleIdFromJobId(self, jobID): result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobID}) - + if not result["OK"]: return result - + if not result["Value"]: return S_ERROR("JobId not present in any bundle") return S_OK(result["Value"][0][0]) - def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): + def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): result = self.__getBundlesFromCEDict(ceDict) - + if not result["OK"]: return result - + bundles = result["Value"] # No bundles matching ceDict, so create a new one @@ -69,11 +71,11 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): # Check the best possible bundle to insert the job bundleId = self.__selectBestBundle(bundles, processors) - + # If it does not fit in an already created bundle, create a new one if not bundleId: result = self.__createNewBundle(ceDict) - + if not result["OK"]: return result @@ -93,7 +95,21 @@ def getBundle(self, bundleId): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], ["BundleID", "ProcessorSum", "MaxProcessors", "Site", "CE", "Queue", "CEDict", "ExecTemplate", "TaskID", "Status"]) + retVal = formatSelectOutput( + result["Value"], + [ + "BundleID", + "ProcessorSum", + "MaxProcessors", + "Site", + "CE", + "Queue", + "CEDict", + "ExecTemplate", + "TaskID", + "Status", + ], + ) return S_OK(retVal[0]) def getJobsOfBundle(self, bundleId): @@ -101,7 +117,6 @@ def getJobsOfBundle(self, bundleId): if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], ["JobID", "ExecutablePath", "Inputs"]) retVal["Inputs"] = retVal["Inputs"].split(" ") return S_OK(retVal) @@ -122,38 +137,25 @@ def __createNewBundle(self, ceDict): "ProcessorSum": 0, "MaxProcessors": ceDict["NumberOfProcessors"], "ExecTemplate": ceDict["ExecTemplate"], - "Site": ceDict['Site'], - "CE": ceDict['GridCE'], - "Queue": ceDict['Queue'], - "CEDict": str(ceDict) + "Site": ceDict["Site"], + "CE": ceDict["GridCE"], + "Queue": ceDict["Queue"], + "CEDict": str(ceDict), } - result = self.insertFields( - "BundlesInfo", - list(insertInfo.keys()), - list(insertInfo.values()) - ) + result = self.insertFields("BundlesInfo", list(insertInfo.keys()), list(insertInfo.values())) if not result["OK"]: return result # Returns the ID of the Bundle (which is automatically incremented) return S_OK(result["lastRowId"]) - + def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): # Insert the job into the bundle - insertInfo = { - "JobID": jobId, - "BundleID": bundleId, - "ExecutablePath": executable, - "Inputs": ' '.join(inputs) - } + insertInfo = {"JobID": jobId, "BundleID": bundleId, "ExecutablePath": executable, "Inputs": " ".join(inputs)} - result = self.insertFields( - "JobToBundle", - list(insertInfo.keys()), - list(insertInfo.values()) - ) + result = self.insertFields("JobToBundle", list(insertInfo.keys()), list(insertInfo.values())) if not result["OK"]: return result @@ -167,7 +169,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): if not result["OK"]: return result - # Obtain the current Sum and the Max available + # Obtain the current Sum and the Max available result = self.getFields("BundlesInfo", ["ProcessorSum", "MaxProcessors"], {"BundleID": bundleId}) if not result["OK"]: @@ -181,41 +183,53 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): def __getBundlesFromCEDict(self, ceDict): conditions = { - "Site": ceDict['Site'], - "CE": ceDict['GridCE'], - "Queue": ceDict['Queue'], + "Site": ceDict["Site"], + "CE": ceDict["GridCE"], + "Queue": ceDict["Queue"], } result = self.getFields("BundlesInfo", [], conditions) if not result["OK"]: return result - + if not result["Value"]: return S_OK() - + # TODO: This line is awful, should change to something easier to scale - retVal = formatSelectOutput(result["Value"], ["BundleID", "ProcessorSum", "MaxProcessors", "Site", "CE", "Queue", "CEDict", "ExecTemplate", "TaskID", "Status"]) - return S_OK(retVal) + retVal = formatSelectOutput( + result["Value"], + [ + "BundleID", + "ProcessorSum", + "MaxProcessors", + "Site", + "CE", + "Queue", + "CEDict", + "ExecTemplate", + "TaskID", + "Status", + ], + ) + return S_OK(retVal) def __updateBundleStatus(self, bundleId, newStatus): if newStatus not in BUNDLE_STATUS: - msg = "The new status '{}' does not correspond with the possible statuses:".format(newStatus) + msg = f"The new status '{newStatus}' does not correspond with the possible statuses:" return S_ERROR(msg, BUNDLE_STATUS) - - cmd = "UPDATE BundlesInfo SET Status = {} WHERE BundleID = {};".format( - newStatus, bundleId - ) + + cmd = f"UPDATE BundlesInfo SET Status = {newStatus} WHERE BundleID = {bundleId};" result = self._query(cmd) if not result["OK"]: return result - return S_OK() + return S_OK() # This is function quite dumb, and should not work like this, but for a fist # aproximation is fine (I guess). - # + # # The best way (in my opinion) of approching this is by taking advantage of # dynamic programming. # We could approach this by considering the bundles as sacks and selecting @@ -224,10 +238,10 @@ def __updateBundleStatus(self, bundleId, newStatus): # REF: https://en.wikipedia.org/wiki/Knapsack_problem # # Each bundle that relates to the same CE would be a Knapsack and each item - # would be a different job. The job would have its 'weight' and 'price' set + # would be a different job. The job would have its 'weight' and 'price' set # to the number of processors it needs, and the algorithm would optimize # how they are distributed around the bundles. - # + # # By having multiple bundles, this would relate more to the Bin Packing Problem, # which is an abstaction of the Knapsack Problem. # @@ -249,10 +263,9 @@ def __selectBestBundle(self, bundles, nProcessors): elif newProcSum > maxProcs: continue - + elif newProcSum > currentBestProcs: currentBestProcs = newProcSum bestBundleId = bundleId - + return bestBundleId - diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index ac44194e97a..2dd04318b91 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -39,4 +39,4 @@ CREATE TABLE `JobToBundle` ( `Inputs` TEXT NOT NULL, PRIMARY KEY (`JobID`), FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) -) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; \ No newline at end of file +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 7ae6c8a2ee5..df7cf0c302d 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -12,7 +12,6 @@ class BundlerHandler(RequestHandler): - @classmethod def initializeHandler(cls, serviceInfoDict): try: @@ -22,12 +21,12 @@ def initializeHandler(cls, serviceInfoDict): cls.bundleDB = result["Value"](parentLogger=cls.log) cls.jobToCE = {} cls.ceFactory = ComputingElementFactory() - + except RuntimeError as excp: return S_ERROR(f"Can't connect to DB: {excp}") return S_OK() - + types_storeInBundle = [str, str, list, str, int, dict] def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceDict): @@ -77,7 +76,7 @@ def export_getOutput(self, jobId): self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) return result - bundleID = result["Value"] + bundleID = result["Value"] # TODO: THIS CAN BE CACHED ce = self.__getJobCE(jobId) @@ -85,12 +84,12 @@ def export_getOutput(self, jobId): if not result["OK"]: self.log.error("Failed to obtain Job Output of JobId ", str(jobId)) - + return result def __getJobBundle(self, jobId): result = self.bundleDB.getBundleIdFromJobId(jobId) - + if not result["OK"]: self.log.error("Failed to obtain BundleId of JobId ", str(jobId)) return result @@ -108,7 +107,7 @@ def __getJobCE(self, jobId): if jobId not in self.jobToCE: # Look for it in the DB result = self.__getJobBundle(jobId) - + if not result["OK"]: self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) return result @@ -117,13 +116,13 @@ def __getJobCE(self, jobId): ceDict = literal_eval(result["Value"]["CEDict"]) # Build the ce obtained from the DB result = self.ceFactory.getCE(ceParametersDict=ceDict) - + if not result["OK"]: self.log.error("Failed to CE of JobId ", str(jobId)) return result - + self.jobToCE[jobId] = result["Value"] - + return self.jobToCE[jobId] def __getJobTask(self, jobId): @@ -137,7 +136,7 @@ def __getJobTask(self, jobId): def __wrapBundle(self, bundleId): result = self.bundleDB.getBundle(bundleId) - + if not result["OK"]: self.log.error("Failed to obtain bundle while wrapping. BundleID=", str(bundleId)) return result @@ -145,29 +144,29 @@ def __wrapBundle(self, bundleId): bundle = result["Value"] result = self.bundleDB.getJobsOfBundle(bundleId) - + if not result["OK"]: self.log.error("Failed to obtain bundled job while wrapping. BundleID=", str(bundleId)) return result jobs = result["Value"] - + template = bundle["ExecTemplate"] inputs = [] for job in jobs: inputs.append(job["ExecutablePath"]) inputs.append(job["Inputs"]) - + result = generate_template(template, inputs) - + if not result["OK"]: return result - + wrappedBundle = result["Value"] wrapperPath = f"/tmp/bundle_wrapper_{bundleId}" with open(wrapperPath, "x") as f: f.write(wrappedBundle) - return wrapperPath, inputs \ No newline at end of file + return wrapperPath, inputs diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 0e333d2f9dd..196886e8641 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -5,14 +5,15 @@ def generate_template(template: str, inputs: list[str]): template_lower = template.lower() func_name = "_generate_" + template_lower generator = globals()[func_name] - + if not generator: return S_ERROR("Template not found") - + template, formatted_inputs = generator(inputs) return S_OK(template.format(inputs=formatted_inputs)) + def _generate_bash(inputs: list[str]): template = """\ #!/bin/bash @@ -41,7 +42,7 @@ def _generate_bash(inputs: list[str]): }} # execute tasks -for input in ${{INPUT}}; do +for input in ${{INPUT}}; do [ -f "$input" ] || break taskdir="task_$(get_id ${{input}})" mkdir ${{taskdir}} && cd "$_" && @@ -53,6 +54,6 @@ def _generate_bash(inputs: list[str]): wait """ - formatted_inputs = '(' + ', '.join(inputs) + ')' + formatted_inputs = "(" + ", ".join(inputs) + ")" return template, formatted_inputs From be44c0da048a458cb0e831aedb09597ae134ab05 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:41:33 +0200 Subject: [PATCH 11/47] chore(BundleDB): Change status to a PilotStatus style --- .../WorkloadManagementSystem/DB/BundleDB.py | 31 +++++++++++++++++-- .../WorkloadManagementSystem/DB/BundleDB.sql | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 6fb228f5b8b..ba91de647f0 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -4,8 +4,7 @@ from DIRAC.Core.Base.DB import DB from DIRAC.FrameworkSystem.Client.Logger import contextLogger -# This might not be necessary -BUNDLE_STATUS = ("Storing", "Full", "Sent", "Finalized") +BUNDLE_STATUS = ("Storing", "Sent", "Finalized", "Failed") def formatSelectOutput(listOfResults, keys): @@ -112,13 +111,23 @@ def getBundle(self, bundleId): ) return S_OK(retVal[0]) + def getBundleStatus(self, bundleId): + result = self.getFields("BundlesInfo", ["Status"], {"BundleID": bundleId}) + + if not result["Value"]: + return S_ERROR("Failed to get bundle Status") + + return S_OK(result["Value"][0][0]) + def getJobsOfBundle(self, bundleId): result = self.getFields("JobToBundle", ["JobID", "ExecutablePath", "Inputs"], {"BundleID": bundleId}) if not result["OK"]: return result retVal = formatSelectOutput(result["Value"], ["JobID", "ExecutablePath", "Inputs"]) - retVal["Inputs"] = retVal["Inputs"].split(" ") + for i in range(len(retVal)): + retVal[i]["Inputs"] = retVal[i]["Inputs"].split(" ") + return S_OK(retVal) def setTaskId(self, bundleId, taskId): @@ -128,6 +137,22 @@ def setTaskId(self, bundleId, taskId): return result return S_OK() + + def getTaskId(self, bundleId): + result = self.getFields("BundlesInfo", ["TaskID"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK(result["Value"][0][0]) + + def setBundleAsFinalized(self, bundleId): + result = self.__updateBundleStatus(bundleId, "Finalized") + return result + + def setBundleAsFailed(self, bundleId): + result = self.__updateBundleStatus(bundleId, "Failed") + return result def __createNewBundle(self, ceDict): if "ExecTemplate" not in ceDict: diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 2dd04318b91..f9dd840f064 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -27,7 +27,7 @@ CREATE TABLE `BundlesInfo` ( `CEDict` TEXT NOT NULL, `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` INTEGER(11) UNSIGNED, - `Status` ENUM('Storing', 'Full', 'Sent', 'Finalized') NOT NULL DEFAULT 'Storing', + `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; From beccb6baeb8974a6c82ebbbefa21e5a453231e86 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:42:48 +0200 Subject: [PATCH 12/47] fix: refactor bundle bash template string into a constant --- .../Utilities/BundlerTemplates.py | 46 +++++++++++-------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 196886e8641..8595eab3eab 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -1,21 +1,6 @@ from DIRAC import S_ERROR, S_OK - -def generate_template(template: str, inputs: list[str]): - template_lower = template.lower() - func_name = "_generate_" + template_lower - generator = globals()[func_name] - - if not generator: - return S_ERROR("Template not found") - - template, formatted_inputs = generator(inputs) - - return S_OK(template.format(inputs=formatted_inputs)) - - -def _generate_bash(inputs: list[str]): - template = """\ +GENERIC_BASH_TEMPLATE = """\ #!/bin/bash set -e @@ -31,7 +16,7 @@ def _generate_bash(inputs: list[str]): local task_id=$(get_id ${{input}}) >&2 echo "Executing task ${{task_id}}" - >&2 bash ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & + >&2 {command} ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & local task_pid=$! >&2 echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." @@ -54,6 +39,29 @@ def _generate_bash(inputs: list[str]): wait """ - formatted_inputs = "(" + ", ".join(inputs) + ")" +def generate_template(template: str, inputs: list): + template = template.lower().replace("-", "_") + func_name = "_generate_" + template + generator = locals()[func_name] + + if not generator: + return S_ERROR("Template not found") - return template, formatted_inputs + result = generator(inputs) + if not result["OK"]: + return result + + return S_OK(result["Value"]) + +def _generate_lb_prod_run(inputs: list): + template = __generate_generic_bash("lb-prod-run", inputs) + return S_OK(template) + +def _generate_bash(inputs: list): + template = __generate_generic_bash("bash", inputs) + return S_OK(template) + +def __generate_generic_bash(command, inputs): + formatted_inputs = "(" + ", ".join(inputs) + ")" + template = GENERIC_BASH_TEMPLATE.format(command=command, inputs=formatted_inputs) + return template From f879cce733cc0a33671c941cbf20b189f44b390e Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Jun 2025 12:52:42 +0200 Subject: [PATCH 13/47] chore: Adapt Bundle status management and CE Building --- .../Computing/BundleComputingElement.py | 121 +++++++-- .../Service/BundlerHandler.py | 231 +++++++++++++++--- 2 files changed, 287 insertions(+), 65 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index ff63c698c3a..52dbd9102bf 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -69,11 +69,14 @@ **Code Documentation** """ +import copy +import inspect import uuid from DIRAC import S_ERROR, S_OK from DIRAC.Resources.Computing.ComputingElement import ComputingElement from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory +from DIRAC.WorkloadManagementSystem.Client import PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient @@ -85,6 +88,7 @@ def __init__(self, ceUniqueID): self.mandatoryParameters = ["ExecTemplate", "InnerCEType"] self.innerCE = None + self.innerCEParams = {} self.bundler = BundlerClient() self.ceFactory = ComputingElementFactory() @@ -94,20 +98,51 @@ def _reset(self): self.ceParameters["AsyncSubmission"] = True # Create the InnerCE from the config obtained from the BundleCE - innerCEParams = self.ceParameters.copy() + innerCEParams = copy.deepcopy(self.ceParameters) innerCEType = innerCEParams.pop("InnerCEType") innerCEParams["CEType"] = innerCEType + innerCeName = self.ceParameters["GridCE"].split("bundled-")[1] + innerCEParams["GridCE"] = innerCeName + # Building of the InnerCE - self.innerCE = self.ceFactory.getCE(ceType=innerCEType, ceParametersDict=innerCEParams) + result = self.ceFactory.getCE(ceType=innerCEType, ceName=innerCeName, ceParametersDict=innerCEParams) + + if not result["OK"]: + self.log.error("Failure while creating the InnerCE") + return result + + self.innerCE = result["Value"] + self.innerCE.setParameters(innerCEParams) + self.innerCEParams = innerCEParams + + self.innerCEMethods = [ + name + for name, _ in + self.inspect.getmembers(self.innerCE, predicate=inspect.ismethod) + if name[0] != "_" + ] - def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=None): - # Create a unique ID that cannot clash with other BundleCEs and Jobs in the database - jobId = f"BUNDLE_{self.ceUniqueID}_{uuid.uuid4()}" + return S_OK() + + def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=None, outputs=[]): + jobId = f"BUNDLE_{self.ceName}_{uuid.uuid4().hex}" # Store the job in a bundle using the ceDict of the InnerCE (containing the template) - ceDict = self.innerCE.getDescription() - result = self.bundler.storeInBundle(jobId, executableFiles, inputs, proxy, numberOfProcessors, ceDict) + result = proxy.dumpAllToString() + + if not result["OK"]: + self.log.error("Error while encoding proxy as string") + return result + + result = self.bundler.storeInBundle( + jobId, + executableFiles, + inputs, + result["Value"], + numberOfProcessors, + self.innerCEParams + ) if not result["OK"]: self.log.error("Failure while storing in the Bundle") @@ -126,37 +161,73 @@ def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=No self.log.info("Submitting job to CE: ", self.ce.ceName) # Return the id of the job (NOT THE BUNDLE) - return S_OK([jobId]) + return S_OK(jobId) - def getJobOutput(self, jobIDList): - resultDict = {} + def getJobOutput(self, jobId, workingDirectory=None): + if ":::" in jobId: + jobId = jobId.split(":::")[0] - for jobId in jobIDList: - result = self.bundler.getJobOutput(jobId) + result = self.bundler.getJobTask(jobId) - if not result["OK"]: - return result + if not result["OK"]: + return result - resultDict[jobId] = result["Value"] + bundleId, taskId = result["Value"] + self.innerCE.getJobOutput(taskId) + + return () - return resultDict + def getJobStatus(self, jobIDList): + resultDict = {} + + if not isinstance(jobIDList, list): + jobIDList = [jobIDList] - # def getJobStatus(self, jobIDList): - # pass + for job in jobIDList: + if ":::" in job: + job = job.split(":::")[0] + + result = self.bundler.getBundleStatusOfJob(job) + + if not result["OK"]: + self.log.error(result["Message"]) + resultDict[job] = PilotStatus.FAILED + else: + if result["Value"] == "Finalized": + resultDict[job] = PilotStatus.DONE + elif result["Value"] == "Failed": + resultDict[job] = PilotStatus.DONE + else: + resultDict[job] = PilotStatus.RUNNING + + return S_OK(resultDict) - # - # CAN THIS BE IMPLEMENETED ?? - # def killJob(self, jobIDList): resultDict = {} for jobId in jobIDList: - resultDict[jobId] = S_ERROR("Bundled jobs cannot be killed at the moment") + result = self.bundler.tryToKillJob(jobId) + resultDict[jobId] = result return resultDict - def getDescription(self): - return self.innerCE.getDescription() - def getCEStatus(self): return self.innerCE.getCEStatus() + + def setProxy(self, proxy): + super().setProxy(proxy) + self.innerCE.setProxy(proxy) + + def setToken(self, token): + super().setToken(token) + self.innerCE.setToken(token) + + def cleanJob(self, jobIDList): + if "cleanJob" not in self.innerCEMethods: + self.log.error(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") + return S_ERROR() + + for job in jobIDList: + if ":::" in job: + job = job.split(":::")[0] + self.bundler.cleanJob(job) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index df7cf0c302d..9e6e21096c6 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -6,8 +6,11 @@ from DIRAC import S_ERROR, S_OK from DIRAC.Core.DISET.RequestHandler import RequestHandler +from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory +from DIRAC.WorkloadManagementSystem.Client import PilotStatus +from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import generate_template @@ -18,19 +21,36 @@ def initializeHandler(cls, serviceInfoDict): result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.BundleDB", "BundleDB") if not result["OK"]: return result - cls.bundleDB = result["Value"](parentLogger=cls.log) + cls.bundleDB : BundleDB = result["Value"](parentLogger=cls.log) + + # Dictionaries entries should be removed afer some time cls.jobToCE = {} + cls.bundleToCE = {} + cls.jobToBundle = {} + cls.ceFactory = ComputingElementFactory() + cls.killBundleOnError = True except RuntimeError as excp: return S_ERROR(f"Can't connect to DB: {excp}") return S_OK() + ############################################################################# + types_storeInBundle = [str, str, list, str, int, dict] - def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceDict): - result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceParametersDict=ceDict) + def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, ceDict): + self.log.debug(f"Received: \n\tjobID={jobId}\n\texecutable={executable}\n\tinputs={inputs}\n\tprocessors={processors}\n\tceDict={ceDict}") + + proxy = X509Chain() + result = proxy.loadChainFromString(proxy) + if not result["OK"]: + self.log.error("Failed to obtain proxy from the input string") + self.log.debug(f"Obtained proxy string:\n{proxy}") + return result + + result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["CEName"] ,ceParametersDict=ceDict) if not result["OK"]: self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) @@ -46,12 +66,18 @@ def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceD bundleId = result["Value"]["BundleId"] readyForSubmission = result["Value"]["Ready"] + self.bundleToCE[bundleId] = ce + self.log.info("Job inserted in bundle successfully") if readyForSubmission: self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") - bundle_exe, bundle_inputs = self.__wrapBundle(bundleId) + result = self._wrapBundle(bundleId) + if not result["OK"]: + return result + bundle_exe, bundle_inputs = result["Value"] + result = ce.submitJob(bundle_exe, inputs=bundle_inputs, proxy=proxy) if not result["OK"]: @@ -67,78 +93,166 @@ def export_storeInBundle(self, jobId, executable, inputs, proxy, processors, ceD return S_OK({"BundleID": bundleId, "Executing": readyForSubmission}) - types_getOutput = [str] + ############################################################################# - def export_getOutput(self, jobId): - result = self.bundleDB.getBundleIdFromJobId(jobId) + types_getJobTask = [str] + + def export_getJobTask(self, jobId): + result = self._getBundleIdFromJobId(jobId) if not result["OK"]: self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) return result - bundleID = result["Value"] + bundleId = result["Value"] + + result = self.bundleDB.getBundleStatus(bundleId) + + if not result["OK"]: + self.log.error("Failed to obtain status of bundle ", str(bundleId)) + return result + + status = result["Value"] + if status == "Storing": + return S_OK() - # TODO: THIS CAN BE CACHED - ce = self.__getJobCE(jobId) - result = ce.getJobOutput(bundleID) + result = self.bundleDB.getTaskId(bundleId) if not result["OK"]: self.log.error("Failed to obtain Job Output of JobId ", str(jobId)) + else: + self.bundleDB.setBundleAsFinalized(bundleId) - return result + return S_OK((result["Value"])) + + ############################################################################# - def __getJobBundle(self, jobId): - result = self.bundleDB.getBundleIdFromJobId(jobId) + types_bundleIdFromJobId = [str] + + def export_bundleIdFromJobId(self, jobId): + return self._getBundleIdFromJobId(jobId) + + ############################################################################# + types_tryToKillJob = [str] + + def export_tryToKillJob(self, jobId): + result = self._killJob(jobId) + if result["OK"]: + self.log.info(f"Job {jobId} killed successfully") + return result + + self.log.warn("Failed to ONLY kill the job with id ", str(jobId)) + + if self.killBundleOnError: + self.log.warn("KillBundleOnError is on, killing the WHOLE bundle containing the job") + result = self._killBundleOfJob(jobId) + if not result["OK"]: + return result + + bundleId = result["Value"] + self.log.info(f"Bundle {bundleId} of Job {jobId} killed successfully") + return S_OK() + + else: + self.log.warn("KillBundleOnError is off, doing nothing") + return S_ERROR(message="KillBundleOnError is off, won't kill the bundle") + + def _killJob(self, jobId): + return S_ERROR() + + def _killBundleOfJob(self, jobId): + ce = self.__getJobCE(jobId) + result = self._getBundleIdFromJobId(jobId) if not result["OK"]: - self.log.error("Failed to obtain BundleId of JobId ", str(jobId)) return result + return ce.killJob(result["Value"]) + + ############################################################################# + types_cleanJob = [str] + + def export_cleanJob(self, jobId): + result = self._getBundleIdFromJobId(jobId) + if not result["OK"]: + return result bundleId = result["Value"] - result = self.bundleDB.getBundle(bundleId) + result = self.bundleDB.getBundleStatus(jobId) + if not result["OK"]: + return result + status = result["Value"] + + if status != "Finalized": + return S_OK("There are jobs running, cleaning is not permitted") + + ce = self.__getJobCE(jobId) + return self._cleanBundle(ce, bundleId) + + def _cleanBundle(self, ce, bundleId): + try: + ce.cleanJob(bundleId) + except AttributeError as e: # If the CE has no method 'cleanJob' + return S_ERROR(e) + return S_OK() + + ############################################################################# + + types_getJobStatus = [str] + + def export_getJobStatus(self, jobId): + return self._getJobStatus(jobId) + + def _getJobStatus(self, jobId): + result = self._getBundleIdFromJobId(jobId) if not result["OK"]: - self.log.error + return result + + bundleId = result["Value"] - return result + result = self.bundleDB.getBundleStatus(bundleId) - def __getJobCE(self, jobId): - if jobId not in self.jobToCE: - # Look for it in the DB - result = self.__getJobBundle(jobId) + if not result["OK"]: + return result - if not result["OK"]: - self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) - return result + status=result["Value"] - # Convert the CEDict from string to a dictionary - ceDict = literal_eval(result["Value"]["CEDict"]) - # Build the ce obtained from the DB - result = self.ceFactory.getCE(ceParametersDict=ceDict) + if status == "Sent": + ce = self.__getJobCE(jobId) + result = ce.getJobStatus(bundleId) if not result["OK"]: - self.log.error("Failed to CE of JobId ", str(jobId)) return result + + if result["Value"] == PilotStatus.DONE: + self.bundleDB.setBundleAsFinalized() + status = "Finalized" + + elif result["Value"] == PilotStatus.FAILED | result["Value"] == PilotStatus.ABORTED: + self.bundleDB.setBundleAsFailed() + status = "Failed" - self.jobToCE[jobId] = result["Value"] + return S_OK(status) - return self.jobToCE[jobId] + ############################################################################# - def __getJobTask(self, jobId): - result = self.__getJobBundle(jobId) + def _getBundleIdFromJobId(self, jobId): + if self.jobToBundle[jobId]: + return self.jobToBundle[jobId] + result = self.bundleDB.getBundleIdFromJobId(jobId) if not result["OK"]: - self.log.error("Failed to obtain task id of Job ", str(jobId)) return result + + self.jobToBundle[jobId] = result["Value"] + return result - return result["Value"]["TaskID"] - - def __wrapBundle(self, bundleId): + def _wrapBundle(self, bundleId): result = self.bundleDB.getBundle(bundleId) if not result["OK"]: - self.log.error("Failed to obtain bundle while wrapping. BundleID=", str(bundleId)) + self.log.error("Failed to obtain bundle while wrapping. BundleID ", str(bundleId)) return result bundle = result["Value"] @@ -161,6 +275,7 @@ def __wrapBundle(self, bundleId): result = generate_template(template, inputs) if not result["OK"]: + self.log.error("Error while generating wrapper") return result wrappedBundle = result["Value"] @@ -169,4 +284,40 @@ def __wrapBundle(self, bundleId): with open(wrapperPath, "x") as f: f.write(wrappedBundle) - return wrapperPath, inputs + return S_OK((wrapperPath, inputs)) + + def _getCeDict(self, jobId): + result = self._getBundleIdFromJobId(jobId) + if not result["OK"]: + return result + bundleId = result["Value"] + + result = self.bundleDB.getBundleCE(bundleId) + if not result["OK"]: + return result + + # Convert the CEDict from string to a dictionary + ceDict = literal_eval(result["Value"]) + return S_OK(ceDict) + + def __getJobCE(self, jobId): + if jobId not in self.jobToCE: + # Look for it in the DB + result = self._getCeDict(jobId) + + if not result["OK"]: + self.log.error("Failed to obtain CE Dict of Bundle with JobId ", str(jobId)) + return result + + ceDict = result["Value"] + + # Build the ce obtained from the DB + result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) + + if not result["OK"]: + self.log.error("Failed to CE of JobId ", str(jobId)) + return result + + self.jobToCE[jobId] = result["Value"] + + return self.jobToCE[jobId] From b855db6bde4dbe88315f46b977743a462897dd58 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 6 Jun 2025 16:16:34 +0200 Subject: [PATCH 14/47] chore: Improve BundleCE, BundleDB and BundleService readability --- .../Computing/BundleComputingElement.py | 115 ++++++++++---- .../WorkloadManagementSystem/DB/BundleDB.py | 141 +++++++++++------- .../WorkloadManagementSystem/DB/BundleDB.sql | 5 +- .../Service/BundlerHandler.py | 128 ++++++++-------- 4 files changed, 246 insertions(+), 143 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 52dbd9102bf..6c26852d617 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -79,8 +79,21 @@ from DIRAC.WorkloadManagementSystem.Client import PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient +class BundleTaskDict(dict): + def __init__(self, getProperty): + self.getProperty = getProperty + + def __getitem__(self, jobId): + if jobId in self: + return super().__getitem__(jobId) + + res = self.getProperty(jobId) + if res: + super().__setitem__(jobId, res) + return res class BundleComputingElement(ComputingElement): + def __init__(self, ceUniqueID): """Standard constructor.""" super().__init__(ceUniqueID) @@ -93,6 +106,10 @@ def __init__(self, ceUniqueID): self.bundler = BundlerClient() self.ceFactory = ComputingElementFactory() + self.taskResults = BundleTaskDict(self.__getTraskResult) + + ############################################################################# + def _reset(self): # Force the CE to make the job submissions asynchronous self.ceParameters["AsyncSubmission"] = True @@ -119,15 +136,20 @@ def _reset(self): self.innerCEMethods = [ name for name, _ in - self.inspect.getmembers(self.innerCE, predicate=inspect.ismethod) + inspect.getmembers(self.innerCE, predicate=inspect.ismethod) if name[0] != "_" ] return S_OK() + ############################################################################# + def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=None, outputs=[]): jobId = f"BUNDLE_{self.ceName}_{uuid.uuid4().hex}" + if not proxy: + proxy = self.proxy + # Store the job in a bundle using the ceDict of the InnerCE (containing the template) result = proxy.dumpAllToString() @@ -149,33 +171,54 @@ def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=No return result bundleId = result["Value"]["BundleID"] - submitted = result["Value"]["Executing"] + submitted = result["Value"]["Executing"] # For logging purposes + + result = S_OK([jobId]) + result["PilotStampDict"] = {jobId: bundleId} - # The bundle is not being executed in the InnerCE if not submitted: self.log.info(f"Job {jobId} stored successfully in bundle: ", bundleId) - # Return the bundle id as if it was the task id of the asynchronous executing job - return S_OK([jobId]) - else: self.log.info("Submitting job to CE: ", self.ce.ceName) # Return the id of the job (NOT THE BUNDLE) - return S_OK(jobId) + return result def getJobOutput(self, jobId, workingDirectory=None): + bundleId = None if ":::" in jobId: - jobId = jobId.split(":::")[0] + jobId, bundleId = jobId.split(":::") + + if workingDirectory is None: + workingDirectory = "." + + if not bundleId: + bundleId = self.bundler.bundleIdFromJobId(jobId) - result = self.bundler.getJobTask(jobId) + result = self.bundler.getTaskInfo(bundleId) if not result["OK"]: return result - bundleId, taskId = result["Value"] - self.innerCE.getJobOutput(taskId) + if result["Value"]["Status"] not in PilotStatus.PILOT_FINAL_STATES: + return S_ERROR("Output not ready yet") + + # If the output path of all of the jobs hasn't been defined yet + if outputPath := result["Value"]["OutputPath"] is None: + taskId = result["Value"]["TaskId"] + result = self.innerCE.getJobOutput(taskId, workingDirectory) + + if not result["OK"]: + return result + + self.bundler.setOutputPath(taskId, workingDirectory) + + self.log.notice(f"Outputs at: {outputPath}") - return () + error = f"{outputPath}/{jobId}/{jobId}.err" + output = f"{outputPath}/{jobId}/{jobId}.out" + + return S_OK((output, error)) def getJobStatus(self, jobIDList): resultDict = {} @@ -185,7 +228,7 @@ def getJobStatus(self, jobIDList): for job in jobIDList: if ":::" in job: - job = job.split(":::")[0] + jobId, bundleId = job.split(":::") result = self.bundler.getBundleStatusOfJob(job) @@ -193,23 +236,11 @@ def getJobStatus(self, jobIDList): self.log.error(result["Message"]) resultDict[job] = PilotStatus.FAILED else: - if result["Value"] == "Finalized": - resultDict[job] = PilotStatus.DONE - elif result["Value"] == "Failed": - resultDict[job] = PilotStatus.DONE - else: - resultDict[job] = PilotStatus.RUNNING + resultDict[job] = result["Value"] return S_OK(resultDict) - def killJob(self, jobIDList): - resultDict = {} - - for jobId in jobIDList: - result = self.bundler.tryToKillJob(jobId) - resultDict[jobId] = result - - return resultDict + ############################################################################# def getCEStatus(self): return self.innerCE.getCEStatus() @@ -229,5 +260,33 @@ def cleanJob(self, jobIDList): for job in jobIDList: if ":::" in job: - job = job.split(":::")[0] + job, bundleId = job.split(":::") self.bundler.cleanJob(job) + + def killJob(self, jobIDList): + resultDict = {} + + for job in jobIDList: + if ":::" in job: + jobId, bundleId = job.split(":::") + + result = self.bundler.tryToKillJob(jobId) + resultDict[jobId] = result + + return resultDict + + ############################################################################# + + def __getTraskResult(self, jobId): + result = self.bundler.getJobStatus(jobId) + + if not result["OK"]: + return result + + if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: + return None + + if result["Value"] == PilotStatus.DONE: + return S_OK(0) + + return S_OK(1) \ No newline at end of file diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index ba91de647f0..b5a7f165105 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -1,11 +1,33 @@ """ BundleDB class is a front-end to the bundle db """ +import uuid + from DIRAC import S_ERROR, S_OK from DIRAC.Core.Base.DB import DB from DIRAC.FrameworkSystem.Client.Logger import contextLogger - -BUNDLE_STATUS = ("Storing", "Sent", "Finalized", "Failed") - +from DIRAC.WorkloadManagementSystem.Client import PilotStatus + +STATUS_MAP = { + "Storing": PilotStatus.WAITING, + "Sent": PilotStatus.RUNNING, + "Finalized": PilotStatus.DONE, + "Failed": PilotStatus.FAILED, +} + +COLUMNS = [ + "BundleID", + "Stamp" + "ProcessorSum", + "MaxProcessors", + "Site", + "CE", + "Queue", + "CEDict", + "ExecTemplate", + "TaskID", + "Status", + "OutputPath", +] def formatSelectOutput(listOfResults, keys): retVal = [] @@ -34,16 +56,7 @@ def log(self): def log(self, value): self._defaultLogger = value - def getBundleIdFromJobId(self, jobID): - result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobID}) - - if not result["OK"]: - return result - - if not result["Value"]: - return S_ERROR("JobId not present in any bundle") - - return S_OK(result["Value"][0][0]) + ############################################################################# def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): result = self.__getBundlesFromCEDict(ceDict) @@ -66,7 +79,7 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): if not result["OK"]: return result - return S_OK({"BundleId": bundleId, "Ready": result["Value"]}) + return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) # Check the best possible bundle to insert the job bundleId = self.__selectBestBundle(bundles, processors) @@ -86,30 +99,20 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): if not result["OK"]: return result - return S_OK({"BundleId": bundleId, "Ready": result["Value"]}) + return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) - def getBundle(self, bundleId): - result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) + ############################################################################# + + def getBundleIdFromJobId(self, jobID): + result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobID}) if not result["OK"]: return result - retVal = formatSelectOutput( - result["Value"], - [ - "BundleID", - "ProcessorSum", - "MaxProcessors", - "Site", - "CE", - "Queue", - "CEDict", - "ExecTemplate", - "TaskID", - "Status", - ], - ) - return S_OK(retVal[0]) + if not result["Value"]: + return S_ERROR("JobId not present in any bundle") + + return S_OK(result["Value"][0][0]) def getBundleStatus(self, bundleId): result = self.getFields("BundlesInfo", ["Status"], {"BundleID": bundleId}) @@ -117,7 +120,7 @@ def getBundleStatus(self, bundleId): if not result["Value"]: return S_ERROR("Failed to get bundle Status") - return S_OK(result["Value"][0][0]) + return S_OK(STATUS_MAP[result["Value"][0][0]]) def getJobsOfBundle(self, bundleId): result = self.getFields("JobToBundle", ["JobID", "ExecutablePath", "Inputs"], {"BundleID": bundleId}) @@ -130,6 +133,8 @@ def getJobsOfBundle(self, bundleId): return S_OK(retVal) + ############################################################################# + def setTaskId(self, bundleId, taskId): result = self.updateFields("BundlesInfo", ["TaskID"], [taskId], {"BundleID": bundleId}) @@ -146,6 +151,8 @@ def getTaskId(self, bundleId): return S_OK(result["Value"][0][0]) + ############################################################################# + def setBundleAsFinalized(self, bundleId): result = self.__updateBundleStatus(bundleId, "Finalized") return result @@ -154,11 +161,45 @@ def setBundleAsFailed(self, bundleId): result = self.__updateBundleStatus(bundleId, "Failed") return result + ############################################################################# + + def setOutputPath(self, bundleId, outputPath): + result = self.updateFields("BundlesInfo", ["OutputPath"], [outputPath], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK() + + def getOutputPath(self, bundleId): + result = self.getFields("BundlesInfo", ["OutputPath"], {"BundleID": bundleId}) + + if not result["Value"]: + return S_ERROR("Failed to get bundle Output Path") + + return S_OK(result["Value"][0][0]) + + ############################################################################# + + def getWholeBundle(self, bundleId): + result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + bundleDict = result["Value"] + bundleDict["Status"] = STATUS_MAP[bundleDict["Status"]] + + return S_OK(formatSelectOutput(bundleDict, COLUMNS)[0]) + + ############################################################################# + def __createNewBundle(self, ceDict): if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") insertInfo = { + "BundleID": uuid.uuid4().hex, "ProcessorSum": 0, "MaxProcessors": ceDict["NumberOfProcessors"], "ExecTemplate": ceDict["ExecTemplate"], @@ -195,16 +236,22 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): return result # Obtain the current Sum and the Max available - result = self.getFields("BundlesInfo", ["ProcessorSum", "MaxProcessors"], {"BundleID": bundleId}) + result = self.getFields("BundlesInfo", ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId}) if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], ["ProcessorSum", "MaxProcessors"]) + retVal = formatSelectOutput(result["Value"], ["ProcessorSum", "MaxProcessors", "Status"]) selection = retVal[0] + selection["Ready"] = selection["ProcessorSum"] == selection["MaxProcessors"] + + selection.pop("ProcessorSum") + selection.pop("MaxProcessors") + + selection["Staus"] = STATUS_MAP[selection["Staus"]] # TODO: Change this to a strategy based selection and remove self.__selectBestBundle(...) - return S_OK(selection["ProcessorSum"] == selection["MaxProcessors"]) + return S_OK(selection) def __getBundlesFromCEDict(self, ceDict): conditions = { @@ -219,30 +266,18 @@ def __getBundlesFromCEDict(self, ceDict): return result if not result["Value"]: - return S_OK() + return S_OK([]) - # TODO: This line is awful, should change to something easier to scale retVal = formatSelectOutput( result["Value"], - [ - "BundleID", - "ProcessorSum", - "MaxProcessors", - "Site", - "CE", - "Queue", - "CEDict", - "ExecTemplate", - "TaskID", - "Status", - ], + COLUMNS, ) return S_OK(retVal) def __updateBundleStatus(self, bundleId, newStatus): - if newStatus not in BUNDLE_STATUS: + if newStatus not in STATUS_MAP.keys(): msg = f"The new status '{newStatus}' does not correspond with the possible statuses:" - return S_ERROR(msg, BUNDLE_STATUS) + return S_ERROR(msg, STATUS_MAP.keys()) cmd = f"UPDATE BundlesInfo SET Status = {newStatus} WHERE BundleID = {bundleId};" result = self._query(cmd) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index f9dd840f064..643fb3bafca 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -18,7 +18,7 @@ DROP TABLE IF EXISTS `JobToBundle`; DROP TABLE IF EXISTS `BundlesInfo`; CREATE TABLE `BundlesInfo` ( - `BundleID` INT(11) UNSIGNED NOT NULL AUTO_INCREMENT, + `BundleID` VARCHAR(32) NOT NULL, `ProcessorSum` INT(5) UNSIGNED NOT NULL DEFAULT 0, `MaxProcessors` INT(5) UNSIGNED NOT NULL, `Site` VARCHAR(128) NOT NULL, @@ -28,13 +28,14 @@ CREATE TABLE `BundlesInfo` ( `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` INTEGER(11) UNSIGNED, `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', + `OutputPath` VARCHAR(255), PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ CREATE TABLE `JobToBundle` ( `JobID` VARCHAR(255) NOT NULL, - `BundleID` INTEGER(11) UNSIGNED NOT NULL, + `BundleID` VARCHAR(32) NOT NULL, `ExecutablePath` VARCHAR(255) NOT NULL, `Inputs` TEXT NOT NULL, PRIMARY KEY (`JobID`), diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 9e6e21096c6..d0c8d189951 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -42,14 +42,8 @@ def initializeHandler(cls, serviceInfoDict): def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, ceDict): self.log.debug(f"Received: \n\tjobID={jobId}\n\texecutable={executable}\n\tinputs={inputs}\n\tprocessors={processors}\n\tceDict={ceDict}") - - proxy = X509Chain() - result = proxy.loadChainFromString(proxy) - if not result["OK"]: - self.log.error("Failed to obtain proxy from the input string") - self.log.debug(f"Obtained proxy string:\n{proxy}") - return result + # Prepare the CE result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["CEName"] ,ceParametersDict=ceDict) if not result["OK"]: @@ -59,6 +53,7 @@ def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, ce = result["Value"] self.jobToCE[jobId] = ce + # Insert the Job into the DB result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, processors, ceDict) if not result["OK"]: self.log.error("Failed to insert into a bundle the job with id ", str(jobId)) @@ -71,6 +66,14 @@ def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, self.log.info("Job inserted in bundle successfully") if readyForSubmission: + # Try to load the Proxy + proxy = X509Chain() + result = proxy.loadChainFromString(proxy) + if not result["OK"]: + self.log.error("Failed to obtain proxy from the input string") + self.log.debug(f"Obtained proxy string:\n{proxy}") + return result + self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") result = self._wrapBundle(bundleId) @@ -84,8 +87,10 @@ def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, self.log.error("Failed to submit job to with id ", str(jobId)) return result - taskID = result["Value"] - result = self.bundleDB.setTaskId(bundleId, taskID) + innerJobId = result["Value"][0] + taskId = innerJobId + ":::" + result[["PilotStampDict"]][innerJobId] + + result = self.bundleDB.setTaskId(bundleId, taskId) if not result["OK"]: self.log.error("Failed to set task id of JobId ", str(jobId)) @@ -95,36 +100,27 @@ def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, ############################################################################# - types_getJobTask = [str] + types_getTaskInfo = [str] - def export_getJobTask(self, jobId): - result = self._getBundleIdFromJobId(jobId) - - if not result["OK"]: - self.log.error("Failed to obtain Bundle of JobId ", str(jobId)) - return result - - bundleId = result["Value"] - - result = self.bundleDB.getBundleStatus(bundleId) + def export_getTaskInfo(self, bundleId): + return self._getTaskInfo(bundleId) + + def _getTaskInfo(self, bundleId): + result = self.bundleDB.getWholeBundle(bundleId) if not result["OK"]: - self.log.error("Failed to obtain status of bundle ", str(bundleId)) + self.log.error("Failed to obtain bundle ", str(bundleId)) return result + + bundleDict = result["Value"] + resultDict = {"Status": bundleDict["Status"]} - status = result["Value"] - if status == "Storing": - return S_OK() - - result = self.bundleDB.getTaskId(bundleId) - - if not result["OK"]: - self.log.error("Failed to obtain Job Output of JobId ", str(jobId)) - else: - self.bundleDB.setBundleAsFinalized(bundleId) - - return S_OK((result["Value"])) - + if bundleDict["Status"] not in PilotStatus.PILOT_FINAL_STATES: + resultDict["TaskID"] = bundleDict["TaskID"] + resultDict["OutputPath"] = bundleDict["OutputPath"] + + return S_OK(resultDict) + ############################################################################# types_bundleIdFromJobId = [str] @@ -158,15 +154,31 @@ def export_tryToKillJob(self, jobId): self.log.warn("KillBundleOnError is off, doing nothing") return S_ERROR(message="KillBundleOnError is off, won't kill the bundle") - def _killJob(self, jobId): - return S_ERROR() - def _killBundleOfJob(self, jobId): ce = self.__getJobCE(jobId) result = self._getBundleIdFromJobId(jobId) + + if not result["OK"]: + return result + + bundleId = result["Value"] + result = self._getTaskInfo(bundleId) if not result["OK"]: return result - return ce.killJob(result["Value"]) + + if result["Value"]["Status"] in PilotStatus.PILOT_FINAL_STATES: + return S_ERROR("Cannot kill finished jobs") + + result = ce.killJob([result["Value"]["TaskID"]]) + + if not result["OK"]: + return result + + self.bundleDB.setBundleAsFailed() + return + + def _killJob(self, jobId): + return S_ERROR("CAN'T STOP JOBS") ############################################################################# @@ -178,32 +190,27 @@ def export_cleanJob(self, jobId): return result bundleId = result["Value"] - result = self.bundleDB.getBundleStatus(jobId) + result = self._getTaskInfo(bundleId) + if not result["OK"]: return result - status = result["Value"] - - if status != "Finalized": - return S_OK("There are jobs running, cleaning is not permitted") + status = result["Value"]["Status"] + + if status not in PilotStatus.PILOT_FINAL_STATES: + return S_ERROR(f"The bundle hasn't finished, cleaning is not permitted. Current Status: {status}") ce = self.__getJobCE(jobId) - return self._cleanBundle(ce, bundleId) - - def _cleanBundle(self, ce, bundleId): try: - ce.cleanJob(bundleId) + ce.cleanJob(result["Value"]["TaskID"]) except AttributeError as e: # If the CE has no method 'cleanJob' return S_ERROR(e) return S_OK() - + ############################################################################# types_getJobStatus = [str] def export_getJobStatus(self, jobId): - return self._getJobStatus(jobId) - - def _getJobStatus(self, jobId): result = self._getBundleIdFromJobId(jobId) if not result["OK"]: @@ -211,27 +218,28 @@ def _getJobStatus(self, jobId): bundleId = result["Value"] - result = self.bundleDB.getBundleStatus(bundleId) + result = self._getTaskInfo(bundleId) if not result["OK"]: return result - status=result["Value"] + status = result["Value"]["Status"] - if status == "Sent": + if status not in PilotStatus.PILOT_FINAL_STATES: ce = self.__getJobCE(jobId) - result = ce.getJobStatus(bundleId) + + task = result["Value"]["TaskID"] + result = ce.getJobStatus(task) if not result["OK"]: return result + status = result["Value"][task] + if result["Value"] == PilotStatus.DONE: self.bundleDB.setBundleAsFinalized() - status = "Finalized" - - elif result["Value"] == PilotStatus.FAILED | result["Value"] == PilotStatus.ABORTED: + elif result["Value"] in PilotStatus.PILOT_FINAL_STATES: self.bundleDB.setBundleAsFailed() - status = "Failed" return S_OK(status) @@ -249,7 +257,7 @@ def _getBundleIdFromJobId(self, jobId): return result def _wrapBundle(self, bundleId): - result = self.bundleDB.getBundle(bundleId) + result = self.bundleDB.getWholeBundle(bundleId) if not result["OK"]: self.log.error("Failed to obtain bundle while wrapping. BundleID ", str(bundleId)) From 34abe0a8d50e45974a7f37bd26cd91ac06740110 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 25 Jun 2025 14:41:59 +0200 Subject: [PATCH 15/47] feat: Add outputs and proxy to bundle creation fix: pre-commit and typos fix: Set RemoteRunner input paths to be absolute --- .../Computing/AREXComputingElement.py | 2 +- .../Computing/BundleComputingElement.py | 73 +++---- .../WorkloadManagementSystem/DB/BundleDB.py | 140 ++++++++++---- .../WorkloadManagementSystem/DB/BundleDB.sql | 7 +- .../Service/BundlerHandler.py | 181 ++++++++++++------ .../Utilities/BundlerTemplates.py | 38 ++-- .../Utilities/RemoteRunner.py | 5 +- 7 files changed, 293 insertions(+), 153 deletions(-) diff --git a/src/DIRAC/Resources/Computing/AREXComputingElement.py b/src/DIRAC/Resources/Computing/AREXComputingElement.py index c70a6a6a28c..1c58d85b102 100755 --- a/src/DIRAC/Resources/Computing/AREXComputingElement.py +++ b/src/DIRAC/Resources/Computing/AREXComputingElement.py @@ -490,7 +490,7 @@ def _writeXRSL(self, executableFile, inputs, outputs): def _bundlePreamble(self, executableFile): """Bundle the preamble with the executable file""" - wrapperContent = f"{self.preamble}\n./{executableFile}" + wrapperContent = f"{self.preamble}\n./{os.path.basename(executableFile)}" # We need to make sure the executable file can be executed by the wrapper # By adding the execution mode to the file, the file will be processed as an "executable" in the XRSL diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 6c26852d617..65a64cdfc3c 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -79,6 +79,7 @@ from DIRAC.WorkloadManagementSystem.Client import PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient + class BundleTaskDict(dict): def __init__(self, getProperty): self.getProperty = getProperty @@ -92,8 +93,8 @@ def __getitem__(self, jobId): super().__setitem__(jobId, res) return res -class BundleComputingElement(ComputingElement): +class BundleComputingElement(ComputingElement): def __init__(self, ceUniqueID): """Standard constructor.""" super().__init__(ceUniqueID) @@ -134,36 +135,36 @@ def _reset(self): self.innerCEParams = innerCEParams self.innerCEMethods = [ - name - for name, _ in - inspect.getmembers(self.innerCE, predicate=inspect.ismethod) - if name[0] != "_" + name for name, _ in inspect.getmembers(self.innerCE, predicate=inspect.ismethod) if name[0] != "_" ] return S_OK() ############################################################################# - def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=None, outputs=[]): - jobId = f"BUNDLE_{self.ceName}_{uuid.uuid4().hex}" + def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=[], outputs=[]): + jobId = str(uuid.uuid4().hex) + + proxy = self.proxy if self.proxy else proxy if not proxy: - proxy = self.proxy - + self.log.error("Proxy not defined. Use setProxy or send proxy during job submission") + return S_ERROR("PROXY NOT DEFINED") + # Store the job in a bundle using the ceDict of the InnerCE (containing the template) - result = proxy.dumpAllToString() + if isinstance(proxy, str): + return S_ERROR("PROXY CANNOT BE IN A STRING FORMAT") + + proxyStr = proxy.dumpAllToString()["Value"] + result = self.writeProxyToFile(proxyStr) if not result["OK"]: - self.log.error("Error while encoding proxy as string") return result + proxyPath = result["Value"] + result = self.bundler.storeInBundle( - jobId, - executableFiles, - inputs, - result["Value"], - numberOfProcessors, - self.innerCEParams + jobId, executableFiles, inputs, outputs, proxyPath, numberOfProcessors, self.innerCEParams ) if not result["OK"]: @@ -171,7 +172,7 @@ def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=No return result bundleId = result["Value"]["BundleID"] - submitted = result["Value"]["Executing"] # For logging purposes + submitted = result["Value"]["Executing"] # For logging purposes result = S_OK([jobId]) result["PilotStampDict"] = {jobId: bundleId} @@ -207,14 +208,14 @@ def getJobOutput(self, jobId, workingDirectory=None): if outputPath := result["Value"]["OutputPath"] is None: taskId = result["Value"]["TaskId"] result = self.innerCE.getJobOutput(taskId, workingDirectory) - + if not result["OK"]: return result self.bundler.setOutputPath(taskId, workingDirectory) self.log.notice(f"Outputs at: {outputPath}") - + error = f"{outputPath}/{jobId}/{jobId}.err" output = f"{outputPath}/{jobId}/{jobId}.out" @@ -230,8 +231,8 @@ def getJobStatus(self, jobIDList): if ":::" in job: jobId, bundleId = job.split(":::") - result = self.bundler.getBundleStatusOfJob(job) - + result = self.bundler.getJobStatus(job) + if not result["OK"]: self.log.error(result["Message"]) resultDict[job] = PilotStatus.FAILED @@ -244,19 +245,19 @@ def getJobStatus(self, jobIDList): def getCEStatus(self): return self.innerCE.getCEStatus() - - def setProxy(self, proxy): - super().setProxy(proxy) - self.innerCE.setProxy(proxy) - - def setToken(self, token): - super().setToken(token) - self.innerCE.setToken(token) + + def setProxy(self, proxy, valid=0): + super().setProxy(proxy, valid) + self.innerCE.setProxy(proxy, valid) + + def setToken(self, token, valid=0): + super().setToken(token, valid) + self.innerCE.setToken(token, valid) def cleanJob(self, jobIDList): if "cleanJob" not in self.innerCEMethods: - self.log.error(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") - return S_ERROR() + self.log.error(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") + return S_ERROR() for job in jobIDList: if ":::" in job: @@ -284,9 +285,9 @@ def __getTraskResult(self, jobId): return result if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: - return None - + return S_OK() + if result["Value"] == PilotStatus.DONE: return S_OK(0) - - return S_OK(1) \ No newline at end of file + + return S_OK(1) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index b5a7f165105..c869a3f6969 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -1,6 +1,7 @@ """ BundleDB class is a front-end to the bundle db """ import uuid +from ast import literal_eval from DIRAC import S_ERROR, S_OK from DIRAC.Core.Base.DB import DB @@ -8,15 +9,14 @@ from DIRAC.WorkloadManagementSystem.Client import PilotStatus STATUS_MAP = { - "Storing": PilotStatus.WAITING, - "Sent": PilotStatus.RUNNING, - "Finalized": PilotStatus.DONE, + "Storing": PilotStatus.WAITING, + "Sent": PilotStatus.RUNNING, + "Finalized": PilotStatus.DONE, "Failed": PilotStatus.FAILED, } -COLUMNS = [ +BUNDLES_INFO_COLUMNS = [ "BundleID", - "Stamp" "ProcessorSum", "MaxProcessors", "Site", @@ -27,8 +27,19 @@ "TaskID", "Status", "OutputPath", + "ProxyPath", ] +JOB_TO_BUNDLE_COLUMNS = [ + "JobID", + "BundleID", + "ExecutablePath", + "Inputs", + "Outputs", + "Processors", +] + + def formatSelectOutput(listOfResults, keys): retVal = [] @@ -58,7 +69,7 @@ def log(self, value): ############################################################################# - def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): + def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDict, proxyPath): result = self.__getBundlesFromCEDict(ceDict) if not result["OK"]: @@ -74,7 +85,7 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): return result bundleId = result["Value"] - result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath) if not result["OK"]: return result @@ -94,17 +105,36 @@ def insertJobToBundle(self, jobId, executable, inputs, processors, ceDict): bundleId = result["Value"] # Insert it and obtain if it is ready to be submitted - result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors) + result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors, proxyPath) if not result["OK"]: return result return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) + def removeJobFromBundle(self, jobId): + result = self.getFields("JobToBundle", ["BundleID", "Processors"], {"JobID": jobId}) + + if not result["OK"]: + return result + + jobInfo = result["Value"][0] + bundleId, procs = jobInfo[0], jobInfo[1] + + result = self.__reduceProcessorSum(bundleId, procs) + + if not result["OK"]: + return result + + result = self.deleteEntries("JobToBundle", {"JobID": jobId}) + + # Rollback on error?? Can this Fail?? + return result + ############################################################################# - def getBundleIdFromJobId(self, jobID): - result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobID}) + def getBundleIdFromJobId(self, jobId): + result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobId}) if not result["OK"]: return result @@ -116,21 +146,26 @@ def getBundleIdFromJobId(self, jobID): def getBundleStatus(self, bundleId): result = self.getFields("BundlesInfo", ["Status"], {"BundleID": bundleId}) - + if not result["Value"]: return S_ERROR("Failed to get bundle Status") return S_OK(STATUS_MAP[result["Value"][0][0]]) def getJobsOfBundle(self, bundleId): - result = self.getFields("JobToBundle", ["JobID", "ExecutablePath", "Inputs"], {"BundleID": bundleId}) + fields = ["JobID", "ExecutablePath", "Inputs", "Outputs"] + + result = self.getFields("JobToBundle", fields, {"BundleID": bundleId}) if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], ["JobID", "ExecutablePath", "Inputs"]) + + retVal = formatSelectOutput(result["Value"], fields) + for i in range(len(retVal)): - retVal[i]["Inputs"] = retVal[i]["Inputs"].split(" ") - + retVal[i]["Inputs"] = literal_eval(retVal[i]["Inputs"]) + retVal[i]["Outputs"] = literal_eval(retVal[i]["Outputs"]) + return S_OK(retVal) ############################################################################# @@ -142,7 +177,7 @@ def setTaskId(self, bundleId, taskId): return result return S_OK() - + def getTaskId(self, bundleId): result = self.getFields("BundlesInfo", ["TaskID"], {"BundleID": bundleId}) @@ -156,7 +191,7 @@ def getTaskId(self, bundleId): def setBundleAsFinalized(self, bundleId): result = self.__updateBundleStatus(bundleId, "Finalized") return result - + def setBundleAsFailed(self, bundleId): result = self.__updateBundleStatus(bundleId, "Failed") return result @@ -178,7 +213,7 @@ def getOutputPath(self, bundleId): return S_ERROR("Failed to get bundle Output Path") return S_OK(result["Value"][0][0]) - + ############################################################################# def getWholeBundle(self, bundleId): @@ -187,19 +222,37 @@ def getWholeBundle(self, bundleId): if not result["OK"]: return result - bundleDict = result["Value"] + if not result["Value"]: + return S_ERROR(f"No bundle with id {bundleId}") + + bundleDict = formatSelectOutput(result["Value"], BUNDLES_INFO_COLUMNS)[0] bundleDict["Status"] = STATUS_MAP[bundleDict["Status"]] - return S_OK(formatSelectOutput(bundleDict, COLUMNS)[0]) + return S_OK(bundleDict) + + def getBundleCE(self, bundleId): + result = self.getFields("BundlesInfo", ["CEDict", "ProxyPath"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK(formatSelectOutput(result["Value"], ["CEDict", "ProxyPath"])[0]) ############################################################################# + def __reduceProcessorSum(self, bundleId, nProcessors): + cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum - {} WHERE BundleID = "{}";'.format( + nProcessors, bundleId + ) + return self._query(cmd) + def __createNewBundle(self, ceDict): if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") + bundleId = uuid.uuid4().hex insertInfo = { - "BundleID": uuid.uuid4().hex, + "BundleID": bundleId, "ProcessorSum": 0, "MaxProcessors": ceDict["NumberOfProcessors"], "ExecTemplate": ceDict["ExecTemplate"], @@ -214,12 +267,19 @@ def __createNewBundle(self, ceDict): if not result["OK"]: return result - # Returns the ID of the Bundle (which is automatically incremented) - return S_OK(result["lastRowId"]) + return S_OK(bundleId) - def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): + def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath): # Insert the job into the bundle - insertInfo = {"JobID": jobId, "BundleID": bundleId, "ExecutablePath": executable, "Inputs": " ".join(inputs)} + insertInfo = { + "JobID": jobId, + "BundleID": bundleId, + "ExecutablePath": executable, + "Inputs": str(inputs), + "Outputs": str(outputs), + "Processors": nProcessors, + "ProxyPath": proxyPath, + } result = self.insertFields("JobToBundle", list(insertInfo.keys()), list(insertInfo.values())) @@ -227,7 +287,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): return result # Modify the number of processors that will be used by the bundle - cmd = "UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {} WHERE BundleID = {};".format( + cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {} WHERE BundleID = "{}";'.format( nProcessors, bundleId ) result = self._query(cmd) @@ -244,23 +304,29 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, nProcessors): retVal = formatSelectOutput(result["Value"], ["ProcessorSum", "MaxProcessors", "Status"]) selection = retVal[0] selection["Ready"] = selection["ProcessorSum"] == selection["MaxProcessors"] - + selection.pop("ProcessorSum") selection.pop("MaxProcessors") - selection["Staus"] = STATUS_MAP[selection["Staus"]] + selection["Status"] = STATUS_MAP[selection["Status"]] # TODO: Change this to a strategy based selection and remove self.__selectBestBundle(...) return S_OK(selection) def __getBundlesFromCEDict(self, ceDict): - conditions = { - "Site": ceDict["Site"], - "CE": ceDict["GridCE"], - "Queue": ceDict["Queue"], - } - - result = self.getFields("BundlesInfo", [], conditions) + # conditions = { + # "Site": ceDict["Site"], + # "CE": ceDict["GridCE"], + # "Queue": ceDict["Queue"], + # } + + cmd = 'SELECT * FROM BundlesInfo WHERE Site = "{Site}" AND CE = "{CE}" AND Queue = "{Queue}";'.format( + Site=ceDict["Site"], + CE=ceDict["GridCE"], + Queue=ceDict["Queue"], + ) + result = self._query(cmd) + # result = self.getFields("BundlesInfo", [], conditions) if not result["OK"]: return result @@ -270,7 +336,7 @@ def __getBundlesFromCEDict(self, ceDict): retVal = formatSelectOutput( result["Value"], - COLUMNS, + BUNDLES_INFO_COLUMNS, ) return S_OK(retVal) @@ -279,7 +345,7 @@ def __updateBundleStatus(self, bundleId, newStatus): msg = f"The new status '{newStatus}' does not correspond with the possible statuses:" return S_ERROR(msg, STATUS_MAP.keys()) - cmd = f"UPDATE BundlesInfo SET Status = {newStatus} WHERE BundleID = {bundleId};" + cmd = f'UPDATE BundlesInfo SET Status = "{newStatus}" WHERE BundleID = "{bundleId}";' result = self._query(cmd) if not result["OK"]: diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 643fb3bafca..f6abb7e16f7 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -26,9 +26,10 @@ CREATE TABLE `BundlesInfo` ( `Queue` VARCHAR(128) NOT NULL, `CEDict` TEXT NOT NULL, `ExecTemplate` VARCHAR(25) NOT NULL, - `TaskID` INTEGER(11) UNSIGNED, + `TaskID` VARCHAR(255), `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', `OutputPath` VARCHAR(255), + `ProxyPath` VARCHAR(255), PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; @@ -37,7 +38,9 @@ CREATE TABLE `JobToBundle` ( `JobID` VARCHAR(255) NOT NULL, `BundleID` VARCHAR(32) NOT NULL, `ExecutablePath` VARCHAR(255) NOT NULL, - `Inputs` TEXT NOT NULL, + `Inputs` VARCHAR(255) NOT NULL, + `Outputs` VARCHAR(255) NOT NULL, + `Processors` INT(5) UNSIGNED NOT NULL DEFAULT 1, PRIMARY KEY (`JobID`), FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index d0c8d189951..42a88b0e56a 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -2,10 +2,13 @@ It connects to a BundleDB to store and retrive bundles. """ +import os +import shutil from ast import literal_eval from DIRAC import S_ERROR, S_OK from DIRAC.Core.DISET.RequestHandler import RequestHandler +from DIRAC.Core.Security.ProxyInfo import getProxyInfo from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory @@ -21,13 +24,13 @@ def initializeHandler(cls, serviceInfoDict): result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.BundleDB", "BundleDB") if not result["OK"]: return result - cls.bundleDB : BundleDB = result["Value"](parentLogger=cls.log) - + cls.bundleDB: BundleDB = result["Value"](parentLogger=cls.log) + # Dictionaries entries should be removed afer some time cls.jobToCE = {} cls.bundleToCE = {} cls.jobToBundle = {} - + cls.ceFactory = ComputingElementFactory() cls.killBundleOnError = True @@ -38,58 +41,52 @@ def initializeHandler(cls, serviceInfoDict): ############################################################################# - types_storeInBundle = [str, str, list, str, int, dict] + types_storeInBundle = [str, str, list, list, str, int, dict] - def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, ceDict): - self.log.debug(f"Received: \n\tjobID={jobId}\n\texecutable={executable}\n\tinputs={inputs}\n\tprocessors={processors}\n\tceDict={ceDict}") - - # Prepare the CE - result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["CEName"] ,ceParametersDict=ceDict) + def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, processors, ceDict): + result = self.__setupCE(ceDict, proxyPath) if not result["OK"]: - self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) return result - ce = result["Value"] + ce = result["Value"]["CE"] + proxy = result["Value"]["Proxy"] + self.jobToCE[jobId] = ce # Insert the Job into the DB - result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, processors, ceDict) + result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, outputs, processors, ceDict) if not result["OK"]: self.log.error("Failed to insert into a bundle the job with id ", str(jobId)) return result bundleId = result["Value"]["BundleId"] readyForSubmission = result["Value"]["Ready"] + self.bundleToCE[bundleId] = ce self.log.info("Job inserted in bundle successfully") if readyForSubmission: - # Try to load the Proxy - proxy = X509Chain() - result = proxy.loadChainFromString(proxy) - if not result["OK"]: - self.log.error("Failed to obtain proxy from the input string") - self.log.debug(f"Obtained proxy string:\n{proxy}") - return result - self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") result = self._wrapBundle(bundleId) if not result["OK"]: return result - bundle_exe, bundle_inputs = result["Value"] - result = ce.submitJob(bundle_exe, inputs=bundle_inputs, proxy=proxy) + jobIds, bundle_exe, bundle_inputs, bundle_outputs = result["Value"] + extra_outputs = [item for job_id in jobIds for item in [f"{job_id}.out", f"{job_id}.status"]] + bundle_outputs.extend(extra_outputs) + + result = ce.submitJob(bundle_exe, proxy=proxy, inputs=bundle_inputs, outputs=bundle_outputs) if not result["OK"]: self.log.error("Failed to submit job to with id ", str(jobId)) return result innerJobId = result["Value"][0] - taskId = innerJobId + ":::" + result[["PilotStampDict"]][innerJobId] - + taskId = innerJobId + ":::" + result["PilotStampDict"][innerJobId] + result = self.bundleDB.setTaskId(bundleId, taskId) if not result["OK"]: @@ -104,27 +101,27 @@ def export_storeInBundle(self, jobId, executable, inputs, proxyDict, processors, def export_getTaskInfo(self, bundleId): return self._getTaskInfo(bundleId) - + def _getTaskInfo(self, bundleId): result = self.bundleDB.getWholeBundle(bundleId) if not result["OK"]: self.log.error("Failed to obtain bundle ", str(bundleId)) return result - + bundleDict = result["Value"] resultDict = {"Status": bundleDict["Status"]} if bundleDict["Status"] not in PilotStatus.PILOT_FINAL_STATES: resultDict["TaskID"] = bundleDict["TaskID"] resultDict["OutputPath"] = bundleDict["OutputPath"] - + return S_OK(resultDict) - + ############################################################################# types_bundleIdFromJobId = [str] - + def export_bundleIdFromJobId(self, jobId): return self._getBundleIdFromJobId(jobId) @@ -145,7 +142,7 @@ def export_tryToKillJob(self, jobId): result = self._killBundleOfJob(jobId) if not result["OK"]: return result - + bundleId = result["Value"] self.log.info(f"Bundle {bundleId} of Job {jobId} killed successfully") return S_OK() @@ -153,9 +150,12 @@ def export_tryToKillJob(self, jobId): else: self.log.warn("KillBundleOnError is off, doing nothing") return S_ERROR(message="KillBundleOnError is off, won't kill the bundle") - + def _killBundleOfJob(self, jobId): - ce = self.__getJobCE(jobId) + result = self.__getJobCE(jobId) + if not result["OK"]: + return result + ce = result["Value"] result = self._getBundleIdFromJobId(jobId) if not result["OK"]: @@ -174,8 +174,8 @@ def _killBundleOfJob(self, jobId): if not result["OK"]: return result - self.bundleDB.setBundleAsFailed() - return + self.bundleDB.setBundleAsFailed() + return def _killJob(self, jobId): return S_ERROR("CAN'T STOP JOBS") @@ -191,7 +191,7 @@ def export_cleanJob(self, jobId): bundleId = result["Value"] result = self._getTaskInfo(bundleId) - + if not result["OK"]: return result status = result["Value"]["Status"] @@ -199,13 +199,19 @@ def export_cleanJob(self, jobId): if status not in PilotStatus.PILOT_FINAL_STATES: return S_ERROR(f"The bundle hasn't finished, cleaning is not permitted. Current Status: {status}") - ce = self.__getJobCE(jobId) + result = self.__getJobCE(jobId) + if not result["OK"]: + return result + ce = result["Value"] try: ce.cleanJob(result["Value"]["TaskID"]) - except AttributeError as e: # If the CE has no method 'cleanJob' + except AttributeError as e: # If the CE has no method 'cleanJob' return S_ERROR(e) + + os.remove(f"/tmp/bundle_{bundleId}") + return S_OK() - + ############################################################################# types_getJobStatus = [str] @@ -215,7 +221,7 @@ def export_getJobStatus(self, jobId): if not result["OK"]: return result - + bundleId = result["Value"] result = self._getTaskInfo(bundleId) @@ -224,16 +230,24 @@ def export_getJobStatus(self, jobId): return result status = result["Value"]["Status"] + task = result["Value"]["TaskID"] if status not in PilotStatus.PILOT_FINAL_STATES: - ce = self.__getJobCE(jobId) + if not task: + return S_OK(PilotStatus.FAILED) + + result = self.__getJobCE(jobId) + + if not result["OK"]: + return result + + ce = result["Value"] - task = result["Value"]["TaskID"] result = ce.getJobStatus(task) if not result["OK"]: return result - + status = result["Value"][task] if result["Value"] == PilotStatus.DONE: @@ -246,13 +260,13 @@ def export_getJobStatus(self, jobId): ############################################################################# def _getBundleIdFromJobId(self, jobId): - if self.jobToBundle[jobId]: + if jobId in self.jobToBundle: return self.jobToBundle[jobId] result = self.bundleDB.getBundleIdFromJobId(jobId) if not result["OK"]: return result - + self.jobToBundle[jobId] = result["Value"] return result @@ -274,27 +288,50 @@ def _wrapBundle(self, bundleId): jobs = result["Value"] template = bundle["ExecTemplate"] + executables = [] inputs = [] + outputs = () + jobIds = [] + + basedir = f"/tmp/bundle_{bundleId}" + os.mkdir(basedir) for job in jobs: - inputs.append(job["ExecutablePath"]) - inputs.append(job["Inputs"]) + jobId = job["JobID"] + jobIds.append(jobId) + + # Copy the original file in a new location with the rest + job_executable = job["ExecutablePath"] + job_executable_dst = os.path.join(basedir, jobId + "_" + os.path.basename(job_executable)) + + shutil.copy(job_executable, job_executable_dst) - result = generate_template(template, inputs) + executables.append(os.path.basename(job_executable_dst)) + inputs.append(job_executable_dst) + + for job_input in job["Inputs"]: + job_input_dst = os.path.join(basedir, jobId + "_" + os.path.basename(job_input)) + shutil.copy(job_input, job_input_dst) + inputs.append(job_input_dst) + + for job_output in job["Outputs"]: + outputs += job_output + + result = generate_template(template, executables) if not result["OK"]: self.log.error("Error while generating wrapper") return result wrappedBundle = result["Value"] - wrapperPath = f"/tmp/bundle_wrapper_{bundleId}" + wrapperPath = os.path.join(basedir, "bundle_wrapper") with open(wrapperPath, "x") as f: f.write(wrappedBundle) - return S_OK((wrapperPath, inputs)) + return S_OK((jobIds, wrapperPath, inputs, outputs)) - def _getCeDict(self, jobId): + def _getCE(self, jobId): result = self._getBundleIdFromJobId(jobId) if not result["OK"]: return result @@ -305,27 +342,53 @@ def _getCeDict(self, jobId): return result # Convert the CEDict from string to a dictionary - ceDict = literal_eval(result["Value"]) - return S_OK(ceDict) + ceDict = literal_eval(result["Value"]["CEDict"]) + + return S_OK(ceDict, result["Value"]["ProxyPath"]) def __getJobCE(self, jobId): if jobId not in self.jobToCE: # Look for it in the DB - result = self._getCeDict(jobId) + result = self._getCE(jobId) if not result["OK"]: self.log.error("Failed to obtain CE Dict of Bundle with JobId ", str(jobId)) return result - ceDict = result["Value"] - - # Build the ce obtained from the DB - result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) + result = self.__setupCE(result["Value"]["CEDict"], result["Value"]["ProxyPath"]) if not result["OK"]: - self.log.error("Failed to CE of JobId ", str(jobId)) return result self.jobToCE[jobId] = result["Value"] - return self.jobToCE[jobId] + return S_OK(self.jobToCE[jobId]) + + def __setupCE(self, ceDict, proxyPath): + result = getProxyInfo(proxy=proxyPath) + + if not result["OK"]: + self.log.error("Failed to obtain proxy from path") + return result + + proxy = result["Value"]["chain"] + + result = proxy.getRemainingSecs() + if not result["OK"]: + self.log.error("Failed to obtain remaining seconds of proxy") + return result + + valid = result["Value"] + + # Setup CE + result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) + + if not result["OK"]: + self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) + return result + + ce = result["Value"] + + ce.setProxy(proxy, valid) + + return S_OK({"CE": ce, "Proxy": proxy}) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 8595eab3eab..53f39fdad60 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -8,59 +8,63 @@ INPUT={inputs} get_id() {{ - basename ${{1}} .json + basename ${{1}} _workloadExec.sh }} run_task() {{ local input=$1 local task_id=$(get_id ${{input}}) - >&2 echo "Executing task ${{task_id}}" - >&2 {command} ${{BASEDIR}}/${{input}} >task_${{task_id}}.log 2>&1 & + # Setup + touch ${{task_id}}.status + touch ${{task_id}}.out + + echo "Executing task ${{task_id}}" + {command} ${{BASEDIR}}/${{input}} >${{task_id}}.out 2>&1 & local task_pid=$! - >&2 echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." + echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." wait ${{task_pid}} ; local task_status=$? - # report status - echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee task_${{task_id}}.status + # Report status + echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee ${{task_id}}.status }} # execute tasks -for input in ${{INPUT}}; do +for input in ${{INPUT[@]}}; do [ -f "$input" ] || break - taskdir="task_$(get_id ${{input}})" - mkdir ${{taskdir}} && cd "$_" && - run_task ${{input}} >> ${{BASEDIR}}/tasks_status.log & - cd ${{BASEDIR}} + run_task ${{input}} & done # wait for all tasks wait """ + def generate_template(template: str, inputs: list): template = template.lower().replace("-", "_") func_name = "_generate_" + template - generator = locals()[func_name] + generator = globals()[func_name] if not generator: return S_ERROR("Template not found") - result = generator(inputs) - if not result["OK"]: - return result - - return S_OK(result["Value"]) + if inputs is None: + inputs = [] + + return generator(inputs) + def _generate_lb_prod_run(inputs: list): template = __generate_generic_bash("lb-prod-run", inputs) return S_OK(template) + def _generate_bash(inputs: list): template = __generate_generic_bash("bash", inputs) return S_OK(template) + def __generate_generic_bash(command, inputs): formatted_inputs = "(" + ", ".join(inputs) + ")" template = GENERIC_BASH_TEMPLATE.format(command=command, inputs=formatted_inputs) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py b/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py index 5a2eb8a13e5..fbb73b92a9d 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py @@ -73,13 +73,16 @@ def execute(self, command, workingDirectory=".", numberOfProcessors=1, cleanRemo # Request the whole directory as output outputs = ["/"] + absWorkingDirectory = os.path.abspath(workingDirectory) + asbInputs = [os.path.join(absWorkingDirectory, _input) for _input in inputs] + # Interactions with the CE might be unstable, we need to retry the operations maxRetries = 10 timeBetweenRetries = 120 # Submit the command as a job with retries for _ in range(maxRetries): - result = workloadCE.submitJob(self.executable, workloadCE.proxy, inputs=inputs, outputs=outputs) + result = workloadCE.submitJob(self.executable, workloadCE.proxy, inputs=asbInputs, outputs=outputs) if result["OK"]: break else: From 4bcf49e67e4ca3bf77870552b2e590d1b2d3c824 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 9 Jul 2025 10:49:43 +0200 Subject: [PATCH 16/47] fix: Couple of bugs at BundleCE, BundleDB and BundleService chore: Add Alvaro's AREXEnhancedComputingElement --- .../Computing/AREXEnhancedComputingElement.py | 121 ++++++++++++++++++ .../Computing/BundleComputingElement.py | 64 ++++++--- .../WorkloadManagementSystem/DB/BundleDB.py | 41 ++---- .../WorkloadManagementSystem/DB/BundleDB.sql | 1 - .../Service/BundlerHandler.py | 59 +++++---- .../Utilities/BundlerTemplates.py | 22 +++- 6 files changed, 228 insertions(+), 80 deletions(-) create mode 100644 src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py diff --git a/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py new file mode 100644 index 00000000000..7826501df27 --- /dev/null +++ b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py @@ -0,0 +1,121 @@ +import os +import sys +import time +from DIRAC.Core.Security.X509Chain import X509Chain +from DIRAC.Resources.Computing.AREXComputingElement import AREXComputingElement + +# AREXComputingElement redefinition +import os +import json +import requests +import shutil +from DIRAC import S_OK, S_ERROR + + +class AREXEnhancedComputingElement(AREXComputingElement): + def _getListOfAvailableOutputs(self, jobID, arcJobID, path=None): + """Request a list of outputs available for a given jobID. + + :param str jobID: job reference without the DIRAC stamp + :param str arcJobID: ARC job ID + :param str path: remote path + :return list: names of the available outputs + """ + query = self._urlJoin(os.path.join("jobs", arcJobID, "session", path or '')) + + # Submit the GET request to retrieve the names of the outputs + #self.log.debug(f"Retrieving the names of the outputs for {jobID}") + self.log.debug(f"Retrieving the names of the outputs with {query}") + result = self._request("get", query) + if not result["OK"]: + self.log.error("Failed to retrieve at least some outputs", f"for {jobID}: {result['Message']}") + return S_ERROR(f"Failed to retrieve at least some outputs for {jobID}") + response = result["Value"] + + if not response.text: + return S_ERROR(f"There is no output for job {jobID}") + + #return S_OK(response.json()["file"]) + return S_OK(response.json()) + + def getJobOutput(self, jobID, workingDirectory=None, path=None): + """Get the outputs of the given job reference. + + Outputs and stored in workingDirectory if present, else in a new directory named . + + :param str jobID: job reference followed by the DIRAC stamp. + :param str workingDirectory: name of the directory containing the retrieved outputs. + :param str path: remote path + :return: content of stdout and stderr + """ + result = self._checkSession() + if not result["OK"]: + self.log.error("Cannot get job outputs", result["Message"]) + return result + + # Extract stamp from the Job ID + if ":::" in jobID: + jobRef, stamp = jobID.split(":::") + else: + return S_ERROR(f"DIRAC stamp not defined for {jobID}") + arcJob = self._jobReferenceToArcID(jobRef) + + # Get the list of available outputs + result = self._getListOfAvailableOutputs(jobRef, arcJob, path) + if not result["OK"]: + return result + remoteOutputs = result["Value"] + self.log.debug("Outputs to get are", remoteOutputs) + + remoteOutputsFiles = [] + if 'file' in remoteOutputs: + remoteOutputsFiles = remoteOutputs["file"] + + remoteOutputsDirs = [] + if 'dir' in remoteOutputs: + remoteOutputsDirs = remoteOutputs["dir"] + + if not workingDirectory: + if "WorkingDirectory" in self.ceParameters: + # We assume that workingDirectory exists + workingDirectory = os.path.join(self.ceParameters["WorkingDirectory"], arcJob) + else: + workingDirectory = arcJob + + if not os.path.exists(workingDirectory): + os.mkdir(workingDirectory) + + # Directories + for remoteOutput in remoteOutputsDirs: + self.getJobOutput(jobID, + workingDirectory=os.path.join(workingDirectory, remoteOutput), + path=os.path.join(path or '', remoteOutput)) + + # Files + stdout = None + stderr = None + for remoteOutput in remoteOutputsFiles: + # Prepare the command + #query = self._urlJoin(os.path.join("jobs", arcJob, "session", remoteOutput)) + query = self._urlJoin(os.path.join("jobs", arcJob, "session", path or '', remoteOutput)) + + # Submit the GET request to retrieve outputs + result = self._request("get", query, stream=True) + if not result["OK"]: + self.log.error("Error downloading", f"{remoteOutput} for {arcJob}: {result['Message']}") + return S_ERROR(f"Error downloading {remoteOutput} for {jobID}") + response = result["Value"] + + localOutput = os.path.join(workingDirectory, remoteOutput) + with open(localOutput, "wb") as f: + shutil.copyfileobj(response.raw, f) + + if remoteOutput == f"{stamp}.out": + with open(localOutput) as f: + stdout = f.read() + if remoteOutput == f"{stamp}.err": + with open(localOutput) as f: + stderr = f.read() + + + return S_OK((stdout, stderr)) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 65a64cdfc3c..7a81b5fe0d9 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -71,6 +71,7 @@ import copy import inspect +import os import uuid from DIRAC import S_ERROR, S_OK @@ -180,19 +181,16 @@ def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=[] if not submitted: self.log.info(f"Job {jobId} stored successfully in bundle: ", bundleId) else: - self.log.info("Submitting job to CE: ", self.ce.ceName) + self.log.info("Submitting job to CE: ", self.innerCE.ceName) # Return the id of the job (NOT THE BUNDLE) return result - def getJobOutput(self, jobId, workingDirectory=None): + def getJobOutput(self, jobId, workingDirectory="."): bundleId = None if ":::" in jobId: jobId, bundleId = jobId.split(":::") - if workingDirectory is None: - workingDirectory = "." - if not bundleId: bundleId = self.bundler.bundleIdFromJobId(jobId) @@ -205,19 +203,44 @@ def getJobOutput(self, jobId, workingDirectory=None): return S_ERROR("Output not ready yet") # If the output path of all of the jobs hasn't been defined yet - if outputPath := result["Value"]["OutputPath"] is None: - taskId = result["Value"]["TaskId"] - result = self.innerCE.getJobOutput(taskId, workingDirectory) + taskId = result["Value"]["TaskID"] - if not result["OK"]: - return result + _, innerStamp = taskId.split(":::") + + result = self.innerCE.getJobOutput(taskId, workingDirectory) - self.bundler.setOutputPath(taskId, workingDirectory) + if not result["OK"]: + return result + outputPath = os.path.abspath(workingDirectory) self.log.notice(f"Outputs at: {outputPath}") - error = f"{outputPath}/{jobId}/{jobId}.err" - output = f"{outputPath}/{jobId}/{jobId}.out" + # Change the name of the files containing the stamp of the real job to the BundleID + for item in os.listdir(outputPath): + if os.path.isfile(item): + if innerStamp in item: + newName = item.replace(innerStamp, bundleId) + os.rename(item, newName) + + error = os.path.join(outputPath, jobId, f"{jobId}.err") + output = os.path.join(outputPath, jobId, f"{jobId}.out") + + if os.path.exists("md5Checksum.txt"): + with open("md5Checksum.txt", "r+") as f: + content = f.read() + content = content.replace(innerStamp, bundleId) + f.seek(0) + f.write(content) + f.truncate() + + if not os.path.exists(output) or not os.path.exists(error): + return S_ERROR("Outputs unable to be obtained") + + with open(output, "r") as f: + output = f.read() + + with open(error, "r") as f: + error = f.read() return S_OK((output, error)) @@ -228,16 +251,16 @@ def getJobStatus(self, jobIDList): jobIDList = [jobIDList] for job in jobIDList: + jobId = job if ":::" in job: - jobId, bundleId = job.split(":::") + jobId, _ = job.split(":::") - result = self.bundler.getJobStatus(job) + result = self.bundler.getJobStatus(jobId) if not result["OK"]: - self.log.error(result["Message"]) - resultDict[job] = PilotStatus.FAILED + return S_ERROR("Failed to obtain the status of the job") else: - resultDict[job] = result["Value"] + resultDict[jobId] = result["Value"] return S_OK(resultDict) @@ -257,12 +280,13 @@ def setToken(self, token, valid=0): def cleanJob(self, jobIDList): if "cleanJob" not in self.innerCEMethods: self.log.error(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") - return S_ERROR() + return S_ERROR(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") for job in jobIDList: + if ":::" in job: job, bundleId = job.split(":::") - self.bundler.cleanJob(job) + return self.bundler.cleanJob(job) def killJob(self, jobIDList): resultDict = {} diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index c869a3f6969..cfb0ae6f971 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -26,7 +26,6 @@ "ExecTemplate", "TaskID", "Status", - "OutputPath", "ProxyPath", ] @@ -79,7 +78,7 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi # No bundles matching ceDict, so create a new one if not bundles: - result = self.__createNewBundle(ceDict) + result = self.__createNewBundle(ceDict, proxyPath) if not result["OK"]: return result @@ -97,15 +96,17 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi # If it does not fit in an already created bundle, create a new one if not bundleId: - result = self.__createNewBundle(ceDict) + result = self.__createNewBundle(ceDict, proxyPath) if not result["OK"]: return result bundleId = result["Value"] + # TODO: CHECK IF THE JOB IS ALREADY IN THE BUNDLE + # Insert it and obtain if it is ready to be submitted - result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, processors, proxyPath) + result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath) if not result["OK"]: return result @@ -171,12 +172,8 @@ def getJobsOfBundle(self, bundleId): ############################################################################# def setTaskId(self, bundleId, taskId): - result = self.updateFields("BundlesInfo", ["TaskID"], [taskId], {"BundleID": bundleId}) - - if not result["OK"]: - return result - - return S_OK() + result = self.updateFields("BundlesInfo", ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId}) + return result def getTaskId(self, bundleId): result = self.getFields("BundlesInfo", ["TaskID"], {"BundleID": bundleId}) @@ -198,24 +195,6 @@ def setBundleAsFailed(self, bundleId): ############################################################################# - def setOutputPath(self, bundleId, outputPath): - result = self.updateFields("BundlesInfo", ["OutputPath"], [outputPath], {"BundleID": bundleId}) - - if not result["OK"]: - return result - - return S_OK() - - def getOutputPath(self, bundleId): - result = self.getFields("BundlesInfo", ["OutputPath"], {"BundleID": bundleId}) - - if not result["Value"]: - return S_ERROR("Failed to get bundle Output Path") - - return S_OK(result["Value"][0][0]) - - ############################################################################# - def getWholeBundle(self, bundleId): result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) @@ -228,6 +207,8 @@ def getWholeBundle(self, bundleId): bundleDict = formatSelectOutput(result["Value"], BUNDLES_INFO_COLUMNS)[0] bundleDict["Status"] = STATUS_MAP[bundleDict["Status"]] + self.log.debug(f"Look at this cool bundle: {bundleDict}") + return S_OK(bundleDict) def getBundleCE(self, bundleId): @@ -246,7 +227,7 @@ def __reduceProcessorSum(self, bundleId, nProcessors): ) return self._query(cmd) - def __createNewBundle(self, ceDict): + def __createNewBundle(self, ceDict, proxyPath): if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") @@ -260,6 +241,7 @@ def __createNewBundle(self, ceDict): "CE": ceDict["GridCE"], "Queue": ceDict["Queue"], "CEDict": str(ceDict), + "ProxyPath": proxyPath, } result = self.insertFields("BundlesInfo", list(insertInfo.keys()), list(insertInfo.values())) @@ -278,7 +260,6 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro "Inputs": str(inputs), "Outputs": str(outputs), "Processors": nProcessors, - "ProxyPath": proxyPath, } result = self.insertFields("JobToBundle", list(insertInfo.keys()), list(insertInfo.values())) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index f6abb7e16f7..6c7bb62934b 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -28,7 +28,6 @@ CREATE TABLE `BundlesInfo` ( `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` VARCHAR(255), `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', - `OutputPath` VARCHAR(255), `ProxyPath` VARCHAR(255), PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 42a88b0e56a..15b65df157c 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -55,7 +55,7 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr self.jobToCE[jobId] = ce # Insert the Job into the DB - result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, outputs, processors, ceDict) + result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, outputs, processors, ceDict, proxyPath) if not result["OK"]: self.log.error("Failed to insert into a bundle the job with id ", str(jobId)) return result @@ -103,18 +103,25 @@ def export_getTaskInfo(self, bundleId): return self._getTaskInfo(bundleId) def _getTaskInfo(self, bundleId): - result = self.bundleDB.getWholeBundle(bundleId) + result = self.bundleDB.getBundleStatus(bundleId) if not result["OK"]: - self.log.error("Failed to obtain bundle ", str(bundleId)) + self.log.error("Failed to obtain status of bundle ", str(bundleId)) return result - bundleDict = result["Value"] - resultDict = {"Status": bundleDict["Status"]} + resultDict = {"Status": result["Value"]} + + # If it hasn't been uploaded yet + if resultDict["Status"] == PilotStatus.WAITING: + return S_OK(resultDict) + + result = self.bundleDB.getTaskId(bundleId) + + if not result["OK"]: + self.log.error("Failed to obtain taskId of bundle ", str(bundleId)) + return result - if bundleDict["Status"] not in PilotStatus.PILOT_FINAL_STATES: - resultDict["TaskID"] = bundleDict["TaskID"] - resultDict["OutputPath"] = bundleDict["OutputPath"] + resultDict["TaskID"] = result["Value"] return S_OK(resultDict) @@ -199,12 +206,14 @@ def export_cleanJob(self, jobId): if status not in PilotStatus.PILOT_FINAL_STATES: return S_ERROR(f"The bundle hasn't finished, cleaning is not permitted. Current Status: {status}") + taskId = result["Value"]["TaskID"] + result = self.__getJobCE(jobId) if not result["OK"]: return result ce = result["Value"] try: - ce.cleanJob(result["Value"]["TaskID"]) + ce.cleanJob(taskId) except AttributeError as e: # If the CE has no method 'cleanJob' return S_ERROR(e) @@ -230,11 +239,12 @@ def export_getJobStatus(self, jobId): return result status = result["Value"]["Status"] - task = result["Value"]["TaskID"] - + if status not in PilotStatus.PILOT_FINAL_STATES: - if not task: - return S_OK(PilotStatus.FAILED) + task = result["Value"]["TaskID"] + + if ":::" in task: + task = task.split(":::")[0] result = self.__getJobCE(jobId) @@ -250,10 +260,10 @@ def export_getJobStatus(self, jobId): status = result["Value"][task] - if result["Value"] == PilotStatus.DONE: - self.bundleDB.setBundleAsFinalized() - elif result["Value"] in PilotStatus.PILOT_FINAL_STATES: - self.bundleDB.setBundleAsFailed() + if status == PilotStatus.DONE: + self.bundleDB.setBundleAsFinalized(bundleId) + elif status in PilotStatus.PILOT_FINAL_STATES: + self.bundleDB.setBundleAsFailed(bundleId) return S_OK(status) @@ -261,7 +271,7 @@ def export_getJobStatus(self, jobId): def _getBundleIdFromJobId(self, jobId): if jobId in self.jobToBundle: - return self.jobToBundle[jobId] + return S_OK(self.jobToBundle[jobId]) result = self.bundleDB.getBundleIdFromJobId(jobId) if not result["OK"]: @@ -290,7 +300,7 @@ def _wrapBundle(self, bundleId): template = bundle["ExecTemplate"] executables = [] inputs = [] - outputs = () + outputs = [] jobIds = [] basedir = f"/tmp/bundle_{bundleId}" @@ -314,8 +324,7 @@ def _wrapBundle(self, bundleId): shutil.copy(job_input, job_input_dst) inputs.append(job_input_dst) - for job_output in job["Outputs"]: - outputs += job_output + outputs.extend(job["Outputs"]) result = generate_template(template, executables) @@ -329,6 +338,10 @@ def _wrapBundle(self, bundleId): with open(wrapperPath, "x") as f: f.write(wrappedBundle) + # outputs = list(set(outputs)) + # if "/" in outputs: + # outputs = outputs.remove("/") + return S_OK((jobIds, wrapperPath, inputs, outputs)) def _getCE(self, jobId): @@ -344,7 +357,7 @@ def _getCE(self, jobId): # Convert the CEDict from string to a dictionary ceDict = literal_eval(result["Value"]["CEDict"]) - return S_OK(ceDict, result["Value"]["ProxyPath"]) + return S_OK({"CEDict": ceDict, "ProxyPath": result["Value"]["ProxyPath"]}) def __getJobCE(self, jobId): if jobId not in self.jobToCE: @@ -360,7 +373,7 @@ def __getJobCE(self, jobId): if not result["OK"]: return result - self.jobToCE[jobId] = result["Value"] + self.jobToCE[jobId] = result["Value"]["CE"] return S_OK(self.jobToCE[jobId]) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 53f39fdad60..ba2403866ea 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -15,29 +15,39 @@ local input=$1 local task_id=$(get_id ${{input}}) + cd "$task_id" + # Setup touch ${{task_id}}.status - touch ${{task_id}}.out + #touch ${{task_id}}.out + + echo "[${{task_id}}] Executing task" + + {command} ${{BASEDIR}}/${{input}} \\ + 1> >(tee ${{task_id}}.out) \\ + 2> >(tee ${{task_id}}.err 1>&2) & - echo "Executing task ${{task_id}}" - {command} ${{BASEDIR}}/${{input}} >${{task_id}}.out 2>&1 & local task_pid=$! - echo "Task ${{task_id}} waiting for pid ${{task_pid}}..." + echo "[${{task_id}}] Waiting for pid ${{task_pid}}..." wait ${{task_pid}} ; local task_status=$? # Report status - echo "${{task_id}} ${{task_pid}} ${{task_status}}" | tee ${{task_id}}.status + echo "[${{task_id}}] ${{task_pid}} ${{task_status}}" | tee ${{task_id}}.status }} # execute tasks for input in ${{INPUT[@]}}; do [ -f "$input" ] || break + mkdir $(get_id ${{input}}) run_task ${{input}} & done # wait for all tasks wait + +# Checksum of all files in the root and the job subdirectories +find -H ! -type d ! -name md5Checksum.txt -exec md5sum {{}} + >md5Checksum.txt """ @@ -66,6 +76,6 @@ def _generate_bash(inputs: list): def __generate_generic_bash(command, inputs): - formatted_inputs = "(" + ", ".join(inputs) + ")" + formatted_inputs = "(" + " ".join(inputs) + ")" template = GENERIC_BASH_TEMPLATE.format(command=command, inputs=formatted_inputs) return template From f9b74cb94961c560780038656aad56f029838350 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 21 Jul 2025 10:52:10 +0200 Subject: [PATCH 17/47] feat: Add a proper job status notification and output retrival --- .../Computing/BundleComputingElement.py | 184 +++++++++++++++--- .../WorkloadManagementSystem/DB/BundleDB.py | 41 ++-- .../WorkloadManagementSystem/DB/BundleDB.sql | 1 + .../Service/BundlerHandler.py | 86 +++++--- .../Utilities/BundlerTemplates.py | 58 +++--- 5 files changed, 268 insertions(+), 102 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 7a81b5fe0d9..b2e05f3666e 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -72,9 +72,12 @@ import copy import inspect import os +import shutil import uuid -from DIRAC import S_ERROR, S_OK +from filelock import FileLock, Timeout + +from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory from DIRAC.WorkloadManagementSystem.Client import PilotStatus @@ -85,6 +88,17 @@ class BundleTaskDict(dict): def __init__(self, getProperty): self.getProperty = getProperty + def __contains__(self, jobId): + if super().__contains__(jobId): + return True + + res = self.getProperty(jobId) + if res: + self.__setitem__(jobId, res) + return True + + return False + def __getitem__(self, jobId): if jobId in self: return super().__getitem__(jobId) @@ -92,6 +106,7 @@ def __getitem__(self, jobId): res = self.getProperty(jobId) if res: super().__setitem__(jobId, res) + return res @@ -139,6 +154,8 @@ def _reset(self): name for name, _ in inspect.getmembers(self.innerCE, predicate=inspect.ismethod) if name[0] != "_" ] + self.bundlesBaseDir = gConfig.getValue("/LocalSite/BundlesBaseDir", "/tmp/bundles") + return S_OK() ############################################################################# @@ -202,36 +219,47 @@ def getJobOutput(self, jobId, workingDirectory="."): if result["Value"]["Status"] not in PilotStatus.PILOT_FINAL_STATES: return S_ERROR("Output not ready yet") - # If the output path of all of the jobs hasn't been defined yet taskId = result["Value"]["TaskID"] - _, innerStamp = taskId.split(":::") - result = self.innerCE.getJobOutput(taskId, workingDirectory) + result = self.__getOutputPath(bundleId, taskId) if not result["OK"]: return result + + outputsPath = result["Value"] + outputAbsPath = os.path.abspath(workingDirectory) + + jobBaseDir = os.path.join(outputsPath, jobId) - outputPath = os.path.abspath(workingDirectory) - self.log.notice(f"Outputs at: {outputPath}") + if not os.path.exists(jobBaseDir): + return S_ERROR("Failed to locate job output files from base output directory") - # Change the name of the files containing the stamp of the real job to the BundleID - for item in os.listdir(outputPath): - if os.path.isfile(item): - if innerStamp in item: - newName = item.replace(innerStamp, bundleId) - os.rename(item, newName) + self.log.notice(f"Outputs at: {jobBaseDir}") - error = os.path.join(outputPath, jobId, f"{jobId}.err") - output = os.path.join(outputPath, jobId, f"{jobId}.out") + # Move all items from + for item in os.listdir(jobBaseDir): + # newName = item - if os.path.exists("md5Checksum.txt"): - with open("md5Checksum.txt", "r+") as f: - content = f.read() - content = content.replace(innerStamp, bundleId) - f.seek(0) - f.write(content) - f.truncate() + # if jobId in item: + # newName = item.replace(jobId, bundleId) + + # move the item to the working directory + # os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, newName)) + #os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) + shutil.copy2(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) + + # checksumFile = os.path.join(outputAbsPath, "md5Checksum.txt") + # if os.path.exists(checksumFile): + # with open(checksumFile, "r+") as f: + # content = f.read() + # content = content.replace(innerStamp, bundleId) + # f.seek(0) + # f.write(content) + # f.truncate() + + error = os.path.join(workingDirectory, f"{bundleId}.err") + output = os.path.join(workingDirectory, f"{bundleId}.out") if not os.path.exists(output) or not os.path.exists(error): return S_ERROR("Outputs unable to be obtained") @@ -252,15 +280,72 @@ def getJobStatus(self, jobIDList): for job in jobIDList: jobId = job + bundleId = None if ":::" in job: - jobId, _ = job.split(":::") + jobId, bundleId = job.split(":::") + + if not bundleId: + result = self.bundler.bundleIdFromJobId(jobId) + if not result["OK"]: + return result + bundleId = result["Value"] - result = self.bundler.getJobStatus(jobId) + self.log.debug(f"Obtaining the status of job: '{jobId}' with bundleID: '{bundleId}'") + result = self.bundler.getBundleStatus(bundleId) if not result["OK"]: - return S_ERROR("Failed to obtain the status of the job") - else: - resultDict[jobId] = result["Value"] + return result + + # Default Value: The one from the Bundle + resultDict[jobId] = result["Value"] + self.log.debug(f"Status of bundle '{bundleId}': {result['Value']}") + + # Check if the bundle has ended + if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: + continue + + # If the bundle Failed, we asume all of the jobs failed + if result["Value"] != PilotStatus.DONE: + resultDict[jobId] = PilotStatus.FAILED + continue + + # If the bundle ended properly, get the status of the independent job + result = self.bundler.getTaskInfo(bundleId) + + if not result["OK"]: + return result + + taskId = result["Value"]["TaskID"] + self.log.debug(f"Obtaining bundle output of '{bundleId}'") + result = self.__getOutputPath(bundleId, taskId) + + if not result["OK"]: + return result + + outputPath = result["Value"] + + # The file that contains a singular line with the following format: + # {JobId} {processId} {jobStatus} + jobStatusFile = os.path.join(outputPath, f"{jobId}.status") + + # If it was not created or is empty, the job failed + if not os.path.exists(jobStatusFile) or os.path.getsize(jobStatusFile) == 0: + self.log.warn(f".status file of job '{jobId}' not found or is empty. Assuming it failed") + resultDict[jobId] = PilotStatus.FAILED + continue + + # Read the exit value of the process launched + with open(jobStatusFile, "r") as f: + jobStatus = f.readline() + jobStatus = int(jobStatus.split()[2]) + + # 0 -> All ok Any other -> Fail + if jobStatus == 0: + resultDict[jobId] = PilotStatus.DONE + else: + resultDict[jobId] = PilotStatus.FAILED + + self.log.debug(f"Status of job '{jobId}': {resultDict[jobId]}") return S_OK(resultDict) @@ -282,10 +367,13 @@ def cleanJob(self, jobIDList): self.log.error(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") return S_ERROR(f"Inner CE {self.innerCE.ceName} has no function called 'cleanJob'") + if not isinstance(jobIDList, list): + jobIDList = [jobIDList] + for job in jobIDList: - if ":::" in job: job, bundleId = job.split(":::") + return self.bundler.cleanJob(job) def killJob(self, jobIDList): @@ -303,15 +391,51 @@ def killJob(self, jobIDList): ############################################################################# def __getTraskResult(self, jobId): - result = self.bundler.getJobStatus(jobId) + result = self.getJobStatus(jobId) if not result["OK"]: return result - if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: + if ":::" in jobId: + jobId, _ = jobId.split(":::") + + status = result["Value"][jobId] + + if status not in PilotStatus.PILOT_FINAL_STATES: return S_OK() - if result["Value"] == PilotStatus.DONE: + if status == PilotStatus.DONE: return S_OK(0) return S_OK(1) + + def __getOutputPath(self, bundleId, innerTaskId): + """Returns the output path of the whole bundle + If it hasn't been created yet, it obtains the output from the Inner CE. + """ + self.log.debug(f"Obtaining the output path of bundle '{bundleId}' with task '{innerTaskId}'") + + basePath = os.path.join(self.bundlesBaseDir, bundleId) + lock = FileLock(os.path.join(basePath, "outputs.lock")) + + outputsPath = os.path.join(basePath, "outputs") + + try: + # Always acquire the lock before checking anything + with lock.acquire(timeout=60): + self.log.debug("Acquiring outputs lock") + # If the output does not exist, dowload the outputs + if not os.path.exists(outputsPath): + os.mkdir(outputsPath) + self.log.debug(f"Saving inner CE outputs from task '{innerTaskId}' into '{outputsPath}'") + result = self.innerCE.getJobOutput(innerTaskId, outputsPath) + + if not result["OK"]: + self.log.error("Failed to obtain the outputs, removing the directory") + os.rmdir(outputsPath) + return result + + except TimeoutError: + return S_ERROR("Outputs not available yet") + + return S_OK(outputsPath) \ No newline at end of file diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index cfb0ae6f971..eb03968205a 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -27,6 +27,7 @@ "TaskID", "Status", "ProxyPath", + "Cleaned", ] JOB_TO_BUNDLE_COLUMNS = [ @@ -58,6 +59,9 @@ def __init__(self, parentLogger=None): super().__init__("BundleDB", "WorkloadManagement/BundleDB", parentLogger=parentLogger) self._defaultLogger = self.log + self.BUNDLES_INFO_TABLE = "BundlesInfo" + self.JOB_TO_BUNDLE_TABLE = "JobToBundle" + @property def log(self): return contextLogger.get() or self._defaultLogger @@ -114,7 +118,7 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) def removeJobFromBundle(self, jobId): - result = self.getFields("JobToBundle", ["BundleID", "Processors"], {"JobID": jobId}) + result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID", "Processors"], {"JobID": jobId}) if not result["OK"]: return result @@ -127,7 +131,7 @@ def removeJobFromBundle(self, jobId): if not result["OK"]: return result - result = self.deleteEntries("JobToBundle", {"JobID": jobId}) + result = self.deleteEntries(self.JOB_TO_BUNDLE_TABLE, {"JobID": jobId}) # Rollback on error?? Can this Fail?? return result @@ -135,7 +139,7 @@ def removeJobFromBundle(self, jobId): ############################################################################# def getBundleIdFromJobId(self, jobId): - result = self.getFields("JobToBundle", ["BundleID"], {"JobID": jobId}) + result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID"], {"JobID": jobId}) if not result["OK"]: return result @@ -146,7 +150,7 @@ def getBundleIdFromJobId(self, jobId): return S_OK(result["Value"][0][0]) def getBundleStatus(self, bundleId): - result = self.getFields("BundlesInfo", ["Status"], {"BundleID": bundleId}) + result = self.getFields(self.BUNDLES_INFO_TABLE, ["Status"], {"BundleID": bundleId}) if not result["Value"]: return S_ERROR("Failed to get bundle Status") @@ -156,7 +160,7 @@ def getBundleStatus(self, bundleId): def getJobsOfBundle(self, bundleId): fields = ["JobID", "ExecutablePath", "Inputs", "Outputs"] - result = self.getFields("JobToBundle", fields, {"BundleID": bundleId}) + result = self.getFields(self.JOB_TO_BUNDLE_TABLE, fields, {"BundleID": bundleId}) if not result["OK"]: return result @@ -172,11 +176,11 @@ def getJobsOfBundle(self, bundleId): ############################################################################# def setTaskId(self, bundleId, taskId): - result = self.updateFields("BundlesInfo", ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId}) + result = self.updateFields(self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId}) return result def getTaskId(self, bundleId): - result = self.getFields("BundlesInfo", ["TaskID"], {"BundleID": bundleId}) + result = self.getFields(self.BUNDLES_INFO_TABLE, ["TaskID"], {"BundleID": bundleId}) if not result["OK"]: return result @@ -193,10 +197,21 @@ def setBundleAsFailed(self, bundleId): result = self.__updateBundleStatus(bundleId, "Failed") return result + def setBundleAsCleaned(self, bundleId): + return self.updateFields(self.BUNDLES_INFO_TABLE, ["Cleaned"], [True], {"BundleID": bundleId}) + + def isBundleCleaned(self, bundleId): + result = self.getFields(self.BUNDLES_INFO_TABLE, ["Cleaned"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK(result["Value"][0][0]) + ############################################################################# def getWholeBundle(self, bundleId): - result = self.getFields("BundlesInfo", [], {"BundleID": bundleId}) + result = self.getFields(self.BUNDLES_INFO_TABLE, [], {"BundleID": bundleId}) if not result["OK"]: return result @@ -212,7 +227,7 @@ def getWholeBundle(self, bundleId): return S_OK(bundleDict) def getBundleCE(self, bundleId): - result = self.getFields("BundlesInfo", ["CEDict", "ProxyPath"], {"BundleID": bundleId}) + result = self.getFields(self.BUNDLES_INFO_TABLE, ["CEDict", "ProxyPath"], {"BundleID": bundleId}) if not result["OK"]: return result @@ -244,7 +259,7 @@ def __createNewBundle(self, ceDict, proxyPath): "ProxyPath": proxyPath, } - result = self.insertFields("BundlesInfo", list(insertInfo.keys()), list(insertInfo.values())) + result = self.insertFields(self.BUNDLES_INFO_TABLE, list(insertInfo.keys()), list(insertInfo.values())) if not result["OK"]: return result @@ -262,7 +277,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro "Processors": nProcessors, } - result = self.insertFields("JobToBundle", list(insertInfo.keys()), list(insertInfo.values())) + result = self.insertFields(self.JOB_TO_BUNDLE_TABLE, list(insertInfo.keys()), list(insertInfo.values())) if not result["OK"]: return result @@ -277,7 +292,7 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro return result # Obtain the current Sum and the Max available - result = self.getFields("BundlesInfo", ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId}) + result = self.getFields(self.BUNDLES_INFO_TABLE, ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId}) if not result["OK"]: return result @@ -307,7 +322,7 @@ def __getBundlesFromCEDict(self, ceDict): Queue=ceDict["Queue"], ) result = self._query(cmd) - # result = self.getFields("BundlesInfo", [], conditions) + # result = self.getFields(self.BUNDLES_INFO_TABLE, [], conditions) if not result["OK"]: return result diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 6c7bb62934b..2061f9c3f0d 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -29,6 +29,7 @@ CREATE TABLE `BundlesInfo` ( `TaskID` VARCHAR(255), `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', `ProxyPath` VARCHAR(255), + `Cleaned` BOOLEAN DEFAULT FALSE, PRIMARY KEY (BundleID) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 15b65df157c..b74b98e11e7 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -9,7 +9,6 @@ from DIRAC import S_ERROR, S_OK from DIRAC.Core.DISET.RequestHandler import RequestHandler from DIRAC.Core.Security.ProxyInfo import getProxyInfo -from DIRAC.Core.Security.X509Chain import X509Chain # pylint: disable=import-error from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory from DIRAC.WorkloadManagementSystem.Client import PilotStatus @@ -32,13 +31,19 @@ def initializeHandler(cls, serviceInfoDict): cls.jobToBundle = {} cls.ceFactory = ComputingElementFactory() - cls.killBundleOnError = True except RuntimeError as excp: return S_ERROR(f"Can't connect to DB: {excp}") return S_OK() + def initialize(self): + self.killBundleOnError = self.getCSOption("KillBundleOnError", True) + self.bundlesBaseDir = self.getCSOption("/LocalSite/BundlesBaseDir", "/tmp/bundles") + + if not os.path.exists(self.bundlesBaseDir): + os.mkdir(self.bundlesBaseDir) + ############################################################################# types_storeInBundle = [str, str, list, list, str, int, dict] @@ -81,6 +86,7 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr result = ce.submitJob(bundle_exe, proxy=proxy, inputs=bundle_inputs, outputs=bundle_outputs) if not result["OK"]: + self.bundleDB.setBundleAsFailed(bundleId) self.log.error("Failed to submit job to with id ", str(jobId)) return result @@ -90,6 +96,7 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr result = self.bundleDB.setTaskId(bundleId, taskId) if not result["OK"]: + self.bundleDB.setBundleAsFailed(bundleId) self.log.error("Failed to set task id of JobId ", str(jobId)) return result @@ -197,6 +204,15 @@ def export_cleanJob(self, jobId): return result bundleId = result["Value"] + result = self.bundleDB.isBundleCleaned(bundleId) + + if not result["OK"]: + return result + + # Bundle already got cleaned + if result["Value"]: + return S_OK() + result = self._getTaskInfo(bundleId) if not result["OK"]: @@ -212,27 +228,28 @@ def export_cleanJob(self, jobId): if not result["OK"]: return result ce = result["Value"] + try: - ce.cleanJob(taskId) + result = ce.cleanJob(taskId) + if result["OK"]: + self.bundleDB.setBundleAsCleaned(bundleId) except AttributeError as e: # If the CE has no method 'cleanJob' return S_ERROR(e) - os.remove(f"/tmp/bundle_{bundleId}") - + # Remove bundle specific files (NOT THE OUTPUTS OF THE JOBS) + bundlePath = os.path.join(self.bundlesBaseDir, bundleId) + for item in os.listdir(bundlePath): + itemPath = os.path.join(bundlePath, item) + if os.path.isfile(item): + os.remove(itemPath) + return S_OK() ############################################################################# - types_getJobStatus = [str] - - def export_getJobStatus(self, jobId): - result = self._getBundleIdFromJobId(jobId) - - if not result["OK"]: - return result - - bundleId = result["Value"] + types_getBundleStatus = [str] + def export_getBundleStatus(self, bundleId): result = self._getTaskInfo(bundleId) if not result["OK"]: @@ -240,18 +257,23 @@ def export_getJobStatus(self, jobId): status = result["Value"]["Status"] - if status not in PilotStatus.PILOT_FINAL_STATES: + if status == PilotStatus.RUNNING: task = result["Value"]["TaskID"] if ":::" in task: task = task.split(":::")[0] - result = self.__getJobCE(jobId) + result = self.__getBundleCE(bundleId) + + if not result["OK"]: + return result + + result = self.__setupCE(result["Value"]["CEDict"], result["Value"]["ProxyPath"]) if not result["OK"]: return result - ce = result["Value"] + ce = result["Value"]["CE"] result = ce.getJobStatus(task) @@ -262,7 +284,7 @@ def export_getJobStatus(self, jobId): if status == PilotStatus.DONE: self.bundleDB.setBundleAsFinalized(bundleId) - elif status in PilotStatus.PILOT_FINAL_STATES: + elif status in PilotStatus.PILOT_FINAL_STATES: # ABORTED, DELETED or FAILED self.bundleDB.setBundleAsFailed(bundleId) return S_OK(status) @@ -303,8 +325,8 @@ def _wrapBundle(self, bundleId): outputs = [] jobIds = [] - basedir = f"/tmp/bundle_{bundleId}" - os.mkdir(basedir) + bundlePath = os.path.join(self.bundlesBaseDir, bundleId) + os.mkdir(bundlePath) for job in jobs: jobId = job["JobID"] @@ -312,7 +334,7 @@ def _wrapBundle(self, bundleId): # Copy the original file in a new location with the rest job_executable = job["ExecutablePath"] - job_executable_dst = os.path.join(basedir, jobId + "_" + os.path.basename(job_executable)) + job_executable_dst = os.path.join(bundlePath, jobId + "_" + os.path.basename(job_executable)) shutil.copy(job_executable, job_executable_dst) @@ -320,20 +342,20 @@ def _wrapBundle(self, bundleId): inputs.append(job_executable_dst) for job_input in job["Inputs"]: - job_input_dst = os.path.join(basedir, jobId + "_" + os.path.basename(job_input)) + job_input_dst = os.path.join(bundlePath, jobId + "_" + os.path.basename(job_input)) shutil.copy(job_input, job_input_dst) inputs.append(job_input_dst) outputs.extend(job["Outputs"]) - result = generate_template(template, executables) + result = generate_template(template, executables, bundleId) if not result["OK"]: self.log.error("Error while generating wrapper") return result wrappedBundle = result["Value"] - wrapperPath = os.path.join(basedir, "bundle_wrapper") + wrapperPath = os.path.join(bundlePath, "bundle_wrapper") with open(wrapperPath, "x") as f: f.write(wrappedBundle) @@ -344,12 +366,7 @@ def _wrapBundle(self, bundleId): return S_OK((jobIds, wrapperPath, inputs, outputs)) - def _getCE(self, jobId): - result = self._getBundleIdFromJobId(jobId) - if not result["OK"]: - return result - bundleId = result["Value"] - + def __getBundleCE(self, bundleId): result = self.bundleDB.getBundleCE(bundleId) if not result["OK"]: return result @@ -359,6 +376,15 @@ def _getCE(self, jobId): return S_OK({"CEDict": ceDict, "ProxyPath": result["Value"]["ProxyPath"]}) + def _getCE(self, jobId): + result = self._getBundleIdFromJobId(jobId) + + if not result["OK"]: + return result + bundleId = result["Value"] + + return self.__getBundleCE(bundleId) + def __getJobCE(self, jobId): if jobId not in self.jobToCE: # Look for it in the DB diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index ba2403866ea..49fa439d700 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -1,45 +1,53 @@ from DIRAC import S_ERROR, S_OK -GENERIC_BASH_TEMPLATE = """\ +BASH_TEMPLATE = """\ #!/bin/bash -set -e - BASEDIR=${{PWD}} INPUT={inputs} +BUNDLE_ID={bundleId} get_id() {{ basename ${{1}} _workloadExec.sh }} run_task() {{ - local input=$1 - local task_id=$(get_id ${{input}}) + local task_id=$(get_id $1) + local input=${{1#${{task_id}}_*}} cd "$task_id" - # Setup - touch ${{task_id}}.status - #touch ${{task_id}}.out - echo "[${{task_id}}] Executing task" - {command} ${{BASEDIR}}/${{input}} \\ - 1> >(tee ${{task_id}}.out) \\ - 2> >(tee ${{task_id}}.err 1>&2) & + # 'set -e' inside the job execution to obtain the real exit status in case of failure + bash -e ${{input}} \\ + 1> >(tee ${{BUNDLE_ID}}.out) \\ + 2> >(tee ${{BUNDLE_ID}}.err 1>&2) & local task_pid=$! echo "[${{task_id}}] Waiting for pid ${{task_pid}}..." - wait ${{task_pid}} ; local task_status=$? + + wait ${{task_pid}} + local task_status=$? # Report status - echo "[${{task_id}}] ${{task_pid}} ${{task_status}}" | tee ${{task_id}}.status + echo "[${{task_id}}] ${{task_pid}} ${{task_status}}" | tee ${{BASEDIR}}/${{task_id}}.status }} # execute tasks for input in ${{INPUT[@]}}; do [ -f "$input" ] || break - mkdir $(get_id ${{input}}) + + jobId=$(get_id ${{input}}) + mkdir ${{jobId}} + + for filename in ${{jobId}}*; do + [ -f ${{filename}} ] || continue + touch ${{jobId}}.status + # Move the job specific files to its directory, removing the jobId from its name + mv $filename ${{jobId}}/${{filename#${{jobId}}_*}} + done + run_task ${{input}} & done @@ -51,7 +59,7 @@ """ -def generate_template(template: str, inputs: list): +def generate_template(template: str, inputs: list, bundleId: str): template = template.lower().replace("-", "_") func_name = "_generate_" + template generator = globals()[func_name] @@ -62,20 +70,12 @@ def generate_template(template: str, inputs: list): if inputs is None: inputs = [] - return generator(inputs) - - -def _generate_lb_prod_run(inputs: list): - template = __generate_generic_bash("lb-prod-run", inputs) - return S_OK(template) + template, formatMap = generator(inputs) + formatMap["bundleId"] = bundleId + return S_OK(template.format(**formatMap)) def _generate_bash(inputs: list): - template = __generate_generic_bash("bash", inputs) - return S_OK(template) - - -def __generate_generic_bash(command, inputs): formatted_inputs = "(" + " ".join(inputs) + ")" - template = GENERIC_BASH_TEMPLATE.format(command=command, inputs=formatted_inputs) - return template + formatMap = {"inputs": formatted_inputs} + return BASH_TEMPLATE, formatMap From 9d688f9c82b83dbbc56270c65db02496122fb9fc Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:03:29 +0200 Subject: [PATCH 18/47] chore(BundleCE): Setup bundled CE proxy --- .../Computing/BundleComputingElement.py | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index b2e05f3666e..f251a9b6a3d 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -91,7 +91,7 @@ def __init__(self, getProperty): def __contains__(self, jobId): if super().__contains__(jobId): return True - + res = self.getProperty(jobId) if res: self.__setitem__(jobId, res) @@ -106,7 +106,7 @@ def __getitem__(self, jobId): res = self.getProperty(jobId) if res: super().__setitem__(jobId, res) - + return res @@ -128,6 +128,8 @@ def __init__(self, ceUniqueID): ############################################################################# def _reset(self): + self.taskResults = BundleTaskDict(self.__getTraskResult) + # Force the CE to make the job submissions asynchronous self.ceParameters["AsyncSubmission"] = True @@ -160,7 +162,7 @@ def _reset(self): ############################################################################# - def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=[], outputs=[]): + def submitJob(self, executableFile, proxy=None, numberOfProcessors=1, inputs=[], outputs=[]): jobId = str(uuid.uuid4().hex) proxy = self.proxy if self.proxy else proxy @@ -182,11 +184,11 @@ def submitJob(self, executableFiles, proxy=None, numberOfProcessors=1, inputs=[] proxyPath = result["Value"] result = self.bundler.storeInBundle( - jobId, executableFiles, inputs, outputs, proxyPath, numberOfProcessors, self.innerCEParams + jobId, executableFile, inputs, outputs, proxyPath, numberOfProcessors, self.innerCEParams ) if not result["OK"]: - self.log.error("Failure while storing in the Bundle") + self.log.error(f"Failure while storing in the Bundle: {result}") return result bundleId = result["Value"]["BundleID"] @@ -226,18 +228,18 @@ def getJobOutput(self, jobId, workingDirectory="."): if not result["OK"]: return result - + outputsPath = result["Value"] outputAbsPath = os.path.abspath(workingDirectory) - jobBaseDir = os.path.join(outputsPath, jobId) + jobBaseDir = os.path.join(outputsPath, f"{jobId}") if not os.path.exists(jobBaseDir): return S_ERROR("Failed to locate job output files from base output directory") self.log.notice(f"Outputs at: {jobBaseDir}") - # Move all items from + # Move all items from for item in os.listdir(jobBaseDir): # newName = item @@ -246,9 +248,9 @@ def getJobOutput(self, jobId, workingDirectory="."): # move the item to the working directory # os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, newName)) - #os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) + # os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) shutil.copy2(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) - + # checksumFile = os.path.join(outputAbsPath, "md5Checksum.txt") # if os.path.exists(checksumFile): # with open(checksumFile, "r+") as f: @@ -258,15 +260,15 @@ def getJobOutput(self, jobId, workingDirectory="."): # f.write(content) # f.truncate() - error = os.path.join(workingDirectory, f"{bundleId}.err") - output = os.path.join(workingDirectory, f"{bundleId}.out") + error = os.path.join(workingDirectory, f"{bundleId}.err") + output = os.path.join(workingDirectory, f"{bundleId}.out") if not os.path.exists(output) or not os.path.exists(error): return S_ERROR("Outputs unable to be obtained") with open(output, "r") as f: output = f.read() - + with open(error, "r") as f: error = f.read() @@ -283,7 +285,7 @@ def getJobStatus(self, jobIDList): bundleId = None if ":::" in job: jobId, bundleId = job.split(":::") - + if not bundleId: result = self.bundler.bundleIdFromJobId(jobId) if not result["OK"]: @@ -308,13 +310,13 @@ def getJobStatus(self, jobIDList): if result["Value"] != PilotStatus.DONE: resultDict[jobId] = PilotStatus.FAILED continue - + # If the bundle ended properly, get the status of the independent job result = self.bundler.getTaskInfo(bundleId) if not result["OK"]: return result - + taskId = result["Value"]["TaskID"] self.log.debug(f"Obtaining bundle output of '{bundleId}'") result = self.__getOutputPath(bundleId, taskId) @@ -325,7 +327,6 @@ def getJobStatus(self, jobIDList): outputPath = result["Value"] # The file that contains a singular line with the following format: - # {JobId} {processId} {jobStatus} jobStatusFile = os.path.join(outputPath, f"{jobId}.status") # If it was not created or is empty, the job failed @@ -335,9 +336,9 @@ def getJobStatus(self, jobIDList): continue # Read the exit value of the process launched + # - The file contains a singular line with just the status with open(jobStatusFile, "r") as f: - jobStatus = f.readline() - jobStatus = int(jobStatus.split()[2]) + jobStatus = int(f.readline()) # 0 -> All ok Any other -> Fail if jobStatus == 0: @@ -354,9 +355,9 @@ def getJobStatus(self, jobIDList): def getCEStatus(self): return self.innerCE.getCEStatus() - def setProxy(self, proxy, valid=0): - super().setProxy(proxy, valid) - self.innerCE.setProxy(proxy, valid) + def setProxy(self, proxy): + super().setProxy(proxy) + self.innerCE.setProxy(proxy) def setToken(self, token, valid=0): super().setToken(token, valid) @@ -391,6 +392,8 @@ def killJob(self, jobIDList): ############################################################################# def __getTraskResult(self, jobId): + self.log.debug(f"Obtaining the task results of {jobId}") + result = self.getJobStatus(jobId) if not result["OK"]: @@ -411,7 +414,7 @@ def __getTraskResult(self, jobId): def __getOutputPath(self, bundleId, innerTaskId): """Returns the output path of the whole bundle - If it hasn't been created yet, it obtains the output from the Inner CE. + If it hasn't been created yet, it obtains the output from the Inner CE. """ self.log.debug(f"Obtaining the output path of bundle '{bundleId}' with task '{innerTaskId}'") @@ -434,8 +437,8 @@ def __getOutputPath(self, bundleId, innerTaskId): self.log.error("Failed to obtain the outputs, removing the directory") os.rmdir(outputsPath) return result - + except TimeoutError: return S_ERROR("Outputs not available yet") - - return S_OK(outputsPath) \ No newline at end of file + + return S_OK(outputsPath) From 8cc13398b50ac25e40107027691e814e1dcb9136 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:12:49 +0200 Subject: [PATCH 19/47] feat(BundleDB): Add new table for long input treatment --- .../WorkloadManagementSystem/DB/BundleDB.py | 86 ++++++++++++++++--- .../WorkloadManagementSystem/DB/BundleDB.sql | 10 ++- 2 files changed, 81 insertions(+), 15 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index eb03968205a..e0306758187 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -1,5 +1,5 @@ -""" BundleDB class is a front-end to the bundle db -""" +"""BundleDB class is a front-end to the bundle db""" + import uuid from ast import literal_eval @@ -34,11 +34,16 @@ "JobID", "BundleID", "ExecutablePath", - "Inputs", "Outputs", "Processors", ] +JOB_INPUTS_COLUMNS = [ + "InputID", + "JobID", + "InputPath", +] + def formatSelectOutput(listOfResults, keys): retVal = [] @@ -61,6 +66,7 @@ def __init__(self, parentLogger=None): self.BUNDLES_INFO_TABLE = "BundlesInfo" self.JOB_TO_BUNDLE_TABLE = "JobToBundle" + self.JOB_INPUTS_TABLE = "JobInputs" @property def log(self): @@ -158,25 +164,64 @@ def getBundleStatus(self, bundleId): return S_OK(STATUS_MAP[result["Value"][0][0]]) def getJobsOfBundle(self, bundleId): - fields = ["JobID", "ExecutablePath", "Inputs", "Outputs"] + cmd = f"""\ + SELECT JobToBundle.JobID, ExecutablePath, Outputs, InputPath + FROM JobToBundle + LEFT JOIN JobInputs + ON JobToBundle.JobID = JobInputs.JobID + WHERE BundleID = "{bundleId}";""" - result = self.getFields(self.JOB_TO_BUNDLE_TABLE, fields, {"BundleID": bundleId}) + result = self._query(cmd) if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], fields) + rows = list(result["Value"]) + retVal = {} + + # For each row (JobID, ExecutablePath, Outputs, [InputPath | Empty]) + for row in rows: + # The job has no input + if len(row) == 3: + jobID, jobExecutablePath, jobOutputs = row + jobInputPath = "" + else: + jobID, jobExecutablePath, jobOutputs, jobInputPath = row + + if jobID not in retVal: + retVal[jobID] = { + "ExecutablePath": jobExecutablePath, + "Inputs": [], + "Outputs": [], + } + + retVal[jobID]["Outputs"].extend(literal_eval(jobOutputs)) + + if jobInputPath: + retVal[jobID]["Inputs"].append(jobInputPath) + + # for i in range(len(retVal)): + # result = self.getFields(self.JOB_INPUTS_TABLE, "InputPath", {"JobID": retVal[i]["JobID"]}) + # if not result["OK"]: + # return result + + # inputs = list(result["Value"]) - for i in range(len(retVal)): - retVal[i]["Inputs"] = literal_eval(retVal[i]["Inputs"]) - retVal[i]["Outputs"] = literal_eval(retVal[i]["Outputs"]) + # # Go through every input path + # for idx, item in inputs: + # inputs[idx] = item[0] # Just the input path + + # retVal[i]["Inputs"] = inputs + # retVal[i]["Outputs"] = literal_eval(retVal[i]["Outputs"]) return S_OK(retVal) ############################################################################# def setTaskId(self, bundleId, taskId): - result = self.updateFields(self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId}) + result = self.updateFields( + self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId} + ) return result def getTaskId(self, bundleId): @@ -202,10 +247,10 @@ def setBundleAsCleaned(self, bundleId): def isBundleCleaned(self, bundleId): result = self.getFields(self.BUNDLES_INFO_TABLE, ["Cleaned"], {"BundleID": bundleId}) - + if not result["OK"]: return result - + return S_OK(result["Value"][0][0]) ############################################################################# @@ -272,7 +317,6 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro "JobID": jobId, "BundleID": bundleId, "ExecutablePath": executable, - "Inputs": str(inputs), "Outputs": str(outputs), "Processors": nProcessors, } @@ -282,6 +326,18 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro if not result["OK"]: return result + # Insert the Inputs + for _input in inputs: + insertInfo = { + "JobID": jobId, + "InputPath": _input, + } + + result = self.insertFields(self.JOB_INPUTS_TABLE, list(insertInfo.keys()), list(insertInfo.values())) + + if not result["OK"]: + return result + # Modify the number of processors that will be used by the bundle cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {} WHERE BundleID = "{}";'.format( nProcessors, bundleId @@ -292,7 +348,9 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro return result # Obtain the current Sum and the Max available - result = self.getFields(self.BUNDLES_INFO_TABLE, ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId}) + result = self.getFields( + self.BUNDLES_INFO_TABLE, ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId} + ) if not result["OK"]: return result diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 2061f9c3f0d..8e8b3464cd4 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -38,9 +38,17 @@ CREATE TABLE `JobToBundle` ( `JobID` VARCHAR(255) NOT NULL, `BundleID` VARCHAR(32) NOT NULL, `ExecutablePath` VARCHAR(255) NOT NULL, - `Inputs` VARCHAR(255) NOT NULL, `Outputs` VARCHAR(255) NOT NULL, `Processors` INT(5) UNSIGNED NOT NULL DEFAULT 1, PRIMARY KEY (`JobID`), FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + +-- ------------------------------------------------------------------------------ +CREATE TABLE `JobInputs` ( + `InputID` INTEGER NOT NULL AUTO_INCREMENT, + `JobID` VARCHAR(255) NOT NULL, + `InputPath` VARCHAR(255) NOT NULL, + PRIMARY KEY (`InputID`), + FOREIGN KEY (`JobID`) REFERENCES `JobToBundle`(`JobID`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; From 08edb0b3eac3064f31ffa4e05c6509aefb15009c Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:15:04 +0200 Subject: [PATCH 20/47] chore(BundleService): Change input insertion and status retrieval --- .../Service/BundlerHandler.py | 37 ++++++++----------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index b74b98e11e7..03edd4f4ad3 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -1,7 +1,8 @@ -""" The Bundler service provides an interface for bundling jobs into a a big job +"""The Bundler service provides an interface for bundling jobs into a a big job - It connects to a BundleDB to store and retrive bundles. +It connects to a BundleDB to store and retrive bundles. """ + import os import shutil from ast import literal_eval @@ -39,7 +40,7 @@ def initializeHandler(cls, serviceInfoDict): def initialize(self): self.killBundleOnError = self.getCSOption("KillBundleOnError", True) - self.bundlesBaseDir = self.getCSOption("/LocalSite/BundlesBaseDir", "/tmp/bundles") + self.bundlesBaseDir = self.getCSOption("/LocalSite/BundlesBaseDir", "/tmp/bundles") if not os.path.exists(self.bundlesBaseDir): os.mkdir(self.bundlesBaseDir) @@ -242,7 +243,7 @@ def export_cleanJob(self, jobId): itemPath = os.path.join(bundlePath, item) if os.path.isfile(item): os.remove(itemPath) - + return S_OK() ############################################################################# @@ -256,7 +257,7 @@ def export_getBundleStatus(self, bundleId): return result status = result["Value"]["Status"] - + if status == PilotStatus.RUNNING: task = result["Value"]["TaskID"] @@ -264,7 +265,7 @@ def export_getBundleStatus(self, bundleId): task = task.split(":::")[0] result = self.__getBundleCE(bundleId) - + if not result["OK"]: return result @@ -284,7 +285,7 @@ def export_getBundleStatus(self, bundleId): if status == PilotStatus.DONE: self.bundleDB.setBundleAsFinalized(bundleId) - elif status in PilotStatus.PILOT_FINAL_STATES: # ABORTED, DELETED or FAILED + elif status in PilotStatus.PILOT_FINAL_STATES: # ABORTED, DELETED or FAILED self.bundleDB.setBundleAsFailed(bundleId) return S_OK(status) @@ -317,7 +318,7 @@ def _wrapBundle(self, bundleId): self.log.error("Failed to obtain bundled job while wrapping. BundleID=", str(bundleId)) return result - jobs = result["Value"] + jobs: dict = result["Value"] template = bundle["ExecTemplate"] executables = [] @@ -328,12 +329,11 @@ def _wrapBundle(self, bundleId): bundlePath = os.path.join(self.bundlesBaseDir, bundleId) os.mkdir(bundlePath) - for job in jobs: - jobId = job["JobID"] + for jobId, jobInfo in jobs.items(): jobIds.append(jobId) # Copy the original file in a new location with the rest - job_executable = job["ExecutablePath"] + job_executable = jobInfo["ExecutablePath"] job_executable_dst = os.path.join(bundlePath, jobId + "_" + os.path.basename(job_executable)) shutil.copy(job_executable, job_executable_dst) @@ -341,12 +341,12 @@ def _wrapBundle(self, bundleId): executables.append(os.path.basename(job_executable_dst)) inputs.append(job_executable_dst) - for job_input in job["Inputs"]: + for job_input in jobInfo["Inputs"]: job_input_dst = os.path.join(bundlePath, jobId + "_" + os.path.basename(job_input)) shutil.copy(job_input, job_input_dst) inputs.append(job_input_dst) - outputs.extend(job["Outputs"]) + outputs.extend(list(set(jobInfo["Outputs"]))) # Remove duplicated entries result = generate_template(template, executables, bundleId) @@ -378,7 +378,7 @@ def __getBundleCE(self, bundleId): def _getCE(self, jobId): result = self._getBundleIdFromJobId(jobId) - + if not result["OK"]: return result bundleId = result["Value"] @@ -412,13 +412,6 @@ def __setupCE(self, ceDict, proxyPath): proxy = result["Value"]["chain"] - result = proxy.getRemainingSecs() - if not result["OK"]: - self.log.error("Failed to obtain remaining seconds of proxy") - return result - - valid = result["Value"] - # Setup CE result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) @@ -428,6 +421,6 @@ def __setupCE(self, ceDict, proxyPath): ce = result["Value"] - ce.setProxy(proxy, valid) + ce.setProxy(proxy) return S_OK({"CE": ce, "Proxy": proxy}) From 7a5e17e83b30f026402ad22d2157e992b7977ae2 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:16:42 +0200 Subject: [PATCH 21/47] chore(BundleTemplates): Remove unnecessary background process --- .../Utilities/BundlerTemplates.py | 42 ++++++++++++++----- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 49fa439d700..0eab7d6eb45 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -6,8 +6,26 @@ INPUT={inputs} BUNDLE_ID={bundleId} +monitor_job() {{ + local job_pid=$1 + + #First time with headers + ps -p "$job_pid" -o pid,psr,%cpu,%mem,time,wchan,class,vsz,drs,rss,uss,size,rops,wops,wbytes + + while : ; do + sleep 5 + + # If the job finished, kill the monitoring + if ! kill -0 "$job_pid" 2>/dev/null; then + break + fi + + ps -p -h "$job_pid" -o pid,psr,%cpu,%mem,time,wchan,class,vsz,drs,rss,uss,size,rops,wops,wbytes + done +}} + get_id() {{ - basename ${{1}} _workloadExec.sh + echo $1 | cut -d '_' -f 1 }} run_task() {{ @@ -21,24 +39,21 @@ # 'set -e' inside the job execution to obtain the real exit status in case of failure bash -e ${{input}} \\ 1> >(tee ${{BUNDLE_ID}}.out) \\ - 2> >(tee ${{BUNDLE_ID}}.err 1>&2) & - - local task_pid=$! - - echo "[${{task_id}}] Waiting for pid ${{task_pid}}..." + 2> >(tee ${{BUNDLE_ID}}.err 1>&2) - wait ${{task_pid}} local task_status=$? - # Report status - echo "[${{task_id}}] ${{task_pid}} ${{task_status}}" | tee ${{BASEDIR}}/${{task_id}}.status + # Report job ending and status + echo "[${{task_id}}] Task Finished" + echo "${{task_status}}" 1>${{BASEDIR}}/${{task_id}}.status + echo "[${{task_id}}] Process final status: ${{task_status}}" }} # execute tasks for input in ${{INPUT[@]}}; do [ -f "$input" ] || break - jobId=$(get_id ${{input}}) + local jobId=$(get_id ${{input}}) mkdir ${{jobId}} for filename in ${{jobId}}*; do @@ -49,10 +64,14 @@ done run_task ${{input}} & + pid=$! + pids+=($pid) + + monitor_job "$pid" > ${{jobId}}/monitoring.stats & done # wait for all tasks -wait +wait "${{pids[@]}}" # Checksum of all files in the root and the job subdirectories find -H ! -type d ! -name md5Checksum.txt -exec md5sum {{}} + >md5Checksum.txt @@ -75,6 +94,7 @@ def generate_template(template: str, inputs: list, bundleId: str): return S_OK(template.format(**formatMap)) + def _generate_bash(inputs: list): formatted_inputs = "(" + " ".join(inputs) + ")" formatMap = {"inputs": formatted_inputs} From c84915e118f2d85861985a7e10ee784e9c314ff7 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:17:29 +0200 Subject: [PATCH 22/47] fix: Obtain node variables at job wrapper offline wrapper (temporary) --- .../JobWrapper/JobWrapperOfflineTemplate.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py index bd2a5d297be..a58066e8595 100644 --- a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py +++ b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py @@ -40,7 +40,17 @@ def execute(arguments: dict): gLogger.exception("JobWrapper failed the initialization phase", lException=exc) return 1 - payloadResult = job.process(**payloadParams) + result = job.preProcess() + if not result["OK"]: + gLogger.error("JobWrapper failed the pre-processing phase") + return 1 + + payloadParams = result["Value"] + + payloadResult = job.process( + command=payloadParams["command"], + env=payloadParams["env"], + ) if not payloadResult["OK"]: return 1 From fd031bd71867a3e92bbfc7e7aa08ef2dab13f7d5 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 1 Oct 2025 16:12:49 +0200 Subject: [PATCH 23/47] chore(BundleCE): Improve output retrieval --- .../Computing/BundleComputingElement.py | 51 +++++++++---------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index f251a9b6a3d..35274c9832f 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -229,36 +229,24 @@ def getJobOutput(self, jobId, workingDirectory="."): if not result["OK"]: return result - outputsPath = result["Value"] + # The output obtation Timed Out, we need to wait a little longer + if not result["Value"]["Available"]: + return S_ERROR("Outputs not yet available") + + outputsPath = result["Value"]["Path"] outputAbsPath = os.path.abspath(workingDirectory) - jobBaseDir = os.path.join(outputsPath, f"{jobId}") + jobOutputDir = os.path.join(outputsPath, f"{jobId}") - if not os.path.exists(jobBaseDir): + if not os.path.exists(jobOutputDir): return S_ERROR("Failed to locate job output files from base output directory") - self.log.notice(f"Outputs at: {jobBaseDir}") - - # Move all items from - for item in os.listdir(jobBaseDir): - # newName = item - - # if jobId in item: - # newName = item.replace(jobId, bundleId) + self.log.notice(f"Outputs at: {jobOutputDir}") - # move the item to the working directory - # os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, newName)) - # os.rename(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) - shutil.copy2(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) - - # checksumFile = os.path.join(outputAbsPath, "md5Checksum.txt") - # if os.path.exists(checksumFile): - # with open(checksumFile, "r+") as f: - # content = f.read() - # content = content.replace(innerStamp, bundleId) - # f.seek(0) - # f.write(content) - # f.truncate() + # Move all outputs from the temporary directory, to where they should belong + for item in os.listdir(jobOutputDir): + # shutil.move(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) + shutil.copy2(os.path.join(jobOutputDir, item), os.path.join(outputAbsPath, item)) error = os.path.join(workingDirectory, f"{bundleId}.err") output = os.path.join(workingDirectory, f"{bundleId}.out") @@ -304,17 +292,20 @@ def getJobStatus(self, jobIDList): # Check if the bundle has ended if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: + self.log.debug("Bundle still running") continue # If the bundle Failed, we asume all of the jobs failed if result["Value"] != PilotStatus.DONE: resultDict[jobId] = PilotStatus.FAILED + self.log.error("Bundle FAILED") continue # If the bundle ended properly, get the status of the independent job result = self.bundler.getTaskInfo(bundleId) if not result["OK"]: + self.log.error("Couldn't get the TaskID of the Bundle") return result taskId = result["Value"]["TaskID"] @@ -324,7 +315,13 @@ def getJobStatus(self, jobIDList): if not result["OK"]: return result - outputPath = result["Value"] + # The output obtation Timed Out, we need to wait a little longer + if not result["Value"]["Available"]: + self.log.debug("Outputs not yet available") + resultDict[jobId] = PilotStatus.RUNNING + continue + + outputPath = result["Value"]["Path"] # The file that contains a singular line with the following format: jobStatusFile = os.path.join(outputPath, f"{jobId}.status") @@ -439,6 +436,6 @@ def __getOutputPath(self, bundleId, innerTaskId): return result except TimeoutError: - return S_ERROR("Outputs not available yet") + return S_OK({"Available": False}) - return S_OK(outputsPath) + return S_OK({"Available": True, "Path": outputsPath}) From 644203b0662b440ae64d3b738e6c25847e837405 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 1 Oct 2025 16:15:51 +0200 Subject: [PATCH 24/47] chore(BundleTemplates): Added extra runner file at wrapper --- .../Service/BundlerHandler.py | 13 +- .../Utilities/BundlerTemplates.py | 149 ++++++++++++++---- 2 files changed, 124 insertions(+), 38 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 03edd4f4ad3..25c75d43d45 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -14,7 +14,7 @@ from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory from DIRAC.WorkloadManagementSystem.Client import PilotStatus from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB -from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import generate_template +from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import BASH_RUN_TASK, generate_template class BundlerHandler(RequestHandler): @@ -342,7 +342,8 @@ def _wrapBundle(self, bundleId): inputs.append(job_executable_dst) for job_input in jobInfo["Inputs"]: - job_input_dst = os.path.join(bundlePath, jobId + "_" + os.path.basename(job_input)) + inputBasename = os.path.basename(job_input) + job_input_dst = os.path.join(bundlePath, jobId + "_" + inputBasename) shutil.copy(job_input, job_input_dst) inputs.append(job_input_dst) @@ -356,13 +357,15 @@ def _wrapBundle(self, bundleId): wrappedBundle = result["Value"] wrapperPath = os.path.join(bundlePath, "bundle_wrapper") + runnerPath = os.path.join(bundlePath, "run_task.sh") with open(wrapperPath, "x") as f: f.write(wrappedBundle) - # outputs = list(set(outputs)) - # if "/" in outputs: - # outputs = outputs.remove("/") + with open(runnerPath, "x") as f: + f.write(BASH_RUN_TASK) + + inputs.append(runnerPath) return S_OK((jobIds, wrapperPath, inputs, outputs)) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 0eab7d6eb45..4f1cf13a821 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -6,54 +6,104 @@ INPUT={inputs} BUNDLE_ID={bundleId} -monitor_job() {{ - local job_pid=$1 +PROC_MONITOR_VARS=(Pid Name State Threads Cpus_allowed_list) - #First time with headers - ps -p "$job_pid" -o pid,psr,%cpu,%mem,time,wchan,class,vsz,drs,rss,uss,size,rops,wops,wbytes +OLD_IFS=$IFS - while : ; do - sleep 5 +# cpu management +bundler_pid=$$ +allowed_cpus=$(grep -w Cpus_allowed_list /proc/"$bundler_pid"/status | awk '{{print $2}}') +IFS=',' read -a cpu_ranges <<< "$allowed_cpus" - # If the job finished, kill the monitoring - if ! kill -0 "$job_pid" 2>/dev/null; then - break - fi +IFS=$OLD_IFS + +first_allowed_cpu=$(cut -d "-" -f 1 - <<<"${{cpu_ranges[0]}}") +last_allowed_cpu=$(cut -d "-" -f 2 - <<<"${{cpu_ranges[-1]}}") +cpu_offset=0 +total_allowed_cpus=0 - ps -p -h "$job_pid" -o pid,psr,%cpu,%mem,time,wchan,class,vsz,drs,rss,uss,size,rops,wops,wbytes +calc_total_cpus() {{ + for range in "${{cpu_ranges[@]}}"; do + local min=$(cut -d "-" -f 1 - <<<"$range") + local max=$(cut -d "-" -f 2 - <<<"$range") + total_allowed_cpus=$(($total_allowed_cpus+$max-$min+1)) done + + # Hypercharged cores check + local inputs_len=${{#INPUT[@]}} + if (( ($inputs_len * 2) == $total_allowed_cpus )); then + cpu_offset=$inputs_len + fi }} -get_id() {{ - echo $1 | cut -d '_' -f 1 +next_allowed_cpu() {{ + echo $allowed_cpus + return 0 + + local desired_cpu=$(( ($1 + $cpu_offset) % $total_allowed_cpus )) + local cpu=$first_allowed_cpu + + for range in "${{cpu_ranges[@]}}"; do + local min=$(cut -d "-" -f 1 - <<<"$range") + local max=$(cut -d "-" -f 2 - <<<"$range") + local real_cpu=$(($min+$desired_cpu)) + + if (( $real_cpu <= $max )); then + cpu=$real_cpu + break + fi + + # Check next range + local cpus_on_range=$(($max-$min+1)) + local desired_cpu=$(($desired_cpu-$cpus_on_range)) + done + + # Return cpu + echo $cpu }} -run_task() {{ - local task_id=$(get_id $1) - local input=${{1#${{task_id}}_*}} +calc_total_cpus + +echo This machine has "$total_allowed_cpus" valid cores +echo Ranges: "${{cpu_ranges[@]}}" - cd "$task_id" +monitor_job() {{ + local job_pid=$1 + local job_id=$2 + local log_file=$3 + + echo ID Timestamp CPU ${{PROC_MONITOR_VARS[*]}} | sed 's/ /\\t/g' > $log_file + + while : ; do + # If the job finished, finish the monitoring + if ! kill -0 "$job_pid" 2>/dev/null; then + break + fi - echo "[${{task_id}}] Executing task" + local cpu=$(ps -h -p "$job_pid" -o psr) + local timestamp=$(date "+%Y-%m-%d_%H:%M:%S") + local vars=() - # 'set -e' inside the job execution to obtain the real exit status in case of failure - bash -e ${{input}} \\ - 1> >(tee ${{BUNDLE_ID}}.out) \\ - 2> >(tee ${{BUNDLE_ID}}.err 1>&2) + for var in ${{PROC_MONITOR_VARS[@]}}; do + vars+=($(grep -w "$var" /proc/"$pid"/status | awk '{{print $2}}')) + done - local task_status=$? + echo $timestamp $job_id $cpu ${{vars[*]}} | sed 's/ /\\t/g' >> $log_file + sleep 5 + done +}} - # Report job ending and status - echo "[${{task_id}}] Task Finished" - echo "${{task_status}}" 1>${{BASEDIR}}/${{task_id}}.status - echo "[${{task_id}}] Process final status: ${{task_status}}" +get_id() {{ + echo $1 | cut -d '_' -f 1 }} +job_number=0 + # execute tasks for input in ${{INPUT[@]}}; do [ -f "$input" ] || break - local jobId=$(get_id ${{input}}) + jobId=$(get_id ${{input}}) mkdir ${{jobId}} for filename in ${{jobId}}*; do @@ -63,20 +113,53 @@ mv $filename ${{jobId}}/${{filename#${{jobId}}_*}} done - run_task ${{input}} & + # run_task ${{input}} & + # pid=$! + # pids+=($pid) + + cpu=$(next_allowed_cpu $job_number) + + chmod u+x run_task.sh + taskset -c $cpu ${{BASEDIR}}/run_task.sh ${{jobId}} ${{input}} ${{BUNDLE_ID}} ${{BASEDIR}} & pid=$! pids+=($pid) - monitor_job "$pid" > ${{jobId}}/monitoring.stats & + taskset -cp $cpu $pid + job_number=$(($job_number+1)) + + monitor_job "$pid" "$jobId" "$jobId/monitoring.stats" & + pid=$! + monitor_pids+=($pid) done # wait for all tasks wait "${{pids[@]}}" - -# Checksum of all files in the root and the job subdirectories -find -H ! -type d ! -name md5Checksum.txt -exec md5sum {{}} + >md5Checksum.txt +wait "${{monitor_pids[@]}}" """ +BASH_RUN_TASK = """\ +#!/bin/bash +task_id=$1 +input=${2#${task_id}_*} +bundle_id=$3 +base_dir=$4 + +cd "$task_id" + +echo "[${task_id}] Executing task" + +# 'set -e' inside the job execution to obtain the real exit status in case of failure +bash -e ${input} \\ + 1> ${bundle_id}.out \\ + 2> ${bundle_id}.err + +task_status=$? + +# Report job ending and status +echo "[${task_id}] Task Finished" +echo "${task_status}" 1>${base_dir}/${task_id}.status +echo "[${task_id}] Process final status: ${task_status}" +""" def generate_template(template: str, inputs: list, bundleId: str): template = template.lower().replace("-", "_") From d8ce358aaadcc5637ab7150463bccbddb9971bd5 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 1 Oct 2025 16:20:01 +0200 Subject: [PATCH 25/47] feat(BundleDB): Add a timestamp to avoid bundle stallin --- .../WorkloadManagementSystem/DB/BundleDB.py | 89 ++++++++++++++----- .../WorkloadManagementSystem/DB/BundleDB.sql | 32 ++++--- 2 files changed, 86 insertions(+), 35 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index e0306758187..b9aad339f05 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -2,6 +2,7 @@ import uuid from ast import literal_eval +from datetime import datetime, timedelta, timezone from DIRAC import S_ERROR, S_OK from DIRAC.Core.Base.DB import DB @@ -28,6 +29,8 @@ "Status", "ProxyPath", "Cleaned", + "FirstTimestamp", + "LastTimestamp" ] JOB_TO_BUNDLE_COLUMNS = [ @@ -44,6 +47,8 @@ "InputPath", ] +MYSQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' + def formatSelectOutput(listOfResults, keys): retVal = [] @@ -68,6 +73,8 @@ def __init__(self, parentLogger=None): self.JOB_TO_BUNDLE_TABLE = "JobToBundle" self.JOB_INPUTS_TABLE = "JobInputs" + self.maxMinsInBundle = self.getCSOption("MaxMinutesInBundle", 60) + @property def log(self): return contextLogger.get() or self._defaultLogger @@ -113,8 +120,6 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi bundleId = result["Value"] - # TODO: CHECK IF THE JOB IS ALREADY IN THE BUNDLE - # Insert it and obtain if it is ready to be submitted result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath) @@ -144,6 +149,24 @@ def removeJobFromBundle(self, jobId): ############################################################################# + def getFinishedBundles(self): + result = self.getFields(self.BUNDLES_INFO_TABLE, ["BundleID"], {"Status": "FINISHED"}) + + if not result["OK"]: + return result + + return S_OK([entry[0] for entry in result["Value"]]) + + def getWaitingBundles(self): + result = self.getFields(self.BUNDLES_INFO_TABLE, ["BundleID"], {"Status": "WAITING"}) + + if not result["OK"]: + return result + + return S_OK([entry[0] for entry in result["Value"]]) + + ############################################################################# + def getBundleIdFromJobId(self, jobId): result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID"], {"JobID": jobId}) @@ -216,6 +239,17 @@ def getJobsOfBundle(self, bundleId): return S_OK(retVal) + def getJobIDsOfBundle(self, bundleId): + result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["JobID"], {"BundleID": bundleId}) + + if not result["OK"]: + return result + + return S_OK([entry[0] for entry in result["Value"]]) + + def removeJobInputs(self, jobId): + return self.deleteEntries(self.JOB_INPUTS_TABLE, {"JobID": jobId}) + ############################################################################# def setTaskId(self, bundleId, taskId): @@ -291,6 +325,8 @@ def __createNewBundle(self, ceDict, proxyPath): if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") + timestamp = datetime.now(tz=timezone.utc).strftime(MYSQL_DATETIME_FORMAT) + bundleId = uuid.uuid4().hex insertInfo = { "BundleID": bundleId, @@ -302,6 +338,8 @@ def __createNewBundle(self, ceDict, proxyPath): "Queue": ceDict["Queue"], "CEDict": str(ceDict), "ProxyPath": proxyPath, + "FirstTimestamp": timestamp, + "LastTimestamp": timestamp, } result = self.insertFields(self.BUNDLES_INFO_TABLE, list(insertInfo.keys()), list(insertInfo.values())) @@ -312,6 +350,8 @@ def __createNewBundle(self, ceDict, proxyPath): return S_OK(bundleId) def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath): + timestamp = datetime.now(tz=timezone.utc).strftime(MYSQL_DATETIME_FORMAT) + # Insert the job into the bundle insertInfo = { "JobID": jobId, @@ -339,48 +379,53 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro return result # Modify the number of processors that will be used by the bundle - cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {} WHERE BundleID = "{}";'.format( - nProcessors, bundleId + cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {}, LastTimestamp = "{}" WHERE BundleID = "{}";'.format( + nProcessors, timestamp, bundleId ) result = self._query(cmd) if not result["OK"]: return result - - # Obtain the current Sum and the Max available + + # Obtain the info to be returned to the Service result = self.getFields( - self.BUNDLES_INFO_TABLE, ["ProcessorSum", "MaxProcessors", "Status"], {"BundleID": bundleId} + self.BUNDLES_INFO_TABLE, + ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"], + {"BundleID": bundleId} ) if not result["OK"]: return result - retVal = formatSelectOutput(result["Value"], ["ProcessorSum", "MaxProcessors", "Status"]) - selection = retVal[0] - selection["Ready"] = selection["ProcessorSum"] == selection["MaxProcessors"] + selection = formatSelectOutput( + result["Value"], + ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"] + ) + selection = selection[0] + + ready = self.__getBundleRediness(selection) + + return S_OK({"BundleId": bundleId, "Ready": ready}) - selection.pop("ProcessorSum") - selection.pop("MaxProcessors") + def __getBundleRediness(self, bundleInfo): + elapsedTime : timedelta = bundleInfo["LastTimestamp"] - bundleInfo["FirstTimestamp"] + elapsedMinutes = elapsedTime.total_seconds() // 60 - selection["Status"] = STATUS_MAP[selection["Status"]] + if elapsedMinutes > self.maxMinsInBundle: + return True - # TODO: Change this to a strategy based selection and remove self.__selectBestBundle(...) - return S_OK(selection) + if bundleInfo["ProcessorSum"] == bundleInfo["MaxProcessors"]: + return True - def __getBundlesFromCEDict(self, ceDict): - # conditions = { - # "Site": ceDict["Site"], - # "CE": ceDict["GridCE"], - # "Queue": ceDict["Queue"], - # } + return False + def __getBundlesFromCEDict(self, ceDict): cmd = 'SELECT * FROM BundlesInfo WHERE Site = "{Site}" AND CE = "{CE}" AND Queue = "{Queue}";'.format( Site=ceDict["Site"], CE=ceDict["GridCE"], Queue=ceDict["Queue"], ) result = self._query(cmd) - # result = self.getFields(self.BUNDLES_INFO_TABLE, [], conditions) if not result["OK"]: return result diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 8e8b3464cd4..21756f50bd2 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -14,23 +14,29 @@ USE BundleDB; -- ------------------------------------------------------------------------------ +DROP TABLE IF EXISTS `JobInputs`; DROP TABLE IF EXISTS `JobToBundle`; DROP TABLE IF EXISTS `BundlesInfo`; CREATE TABLE `BundlesInfo` ( - `BundleID` VARCHAR(32) NOT NULL, - `ProcessorSum` INT(5) UNSIGNED NOT NULL DEFAULT 0, - `MaxProcessors` INT(5) UNSIGNED NOT NULL, - `Site` VARCHAR(128) NOT NULL, - `CE` VARCHAR(128) NOT NULL, - `Queue` VARCHAR(128) NOT NULL, - `CEDict` TEXT NOT NULL, - `ExecTemplate` VARCHAR(25) NOT NULL, - `TaskID` VARCHAR(255), - `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', - `ProxyPath` VARCHAR(255), - `Cleaned` BOOLEAN DEFAULT FALSE, - PRIMARY KEY (BundleID) + `BundleID` VARCHAR(32) NOT NULL, + `ProcessorSum` INT(5) UNSIGNED NOT NULL DEFAULT 0, + `MaxProcessors` INT(5) UNSIGNED NOT NULL, + `Site` VARCHAR(128) NOT NULL, + `CE` VARCHAR(128) NOT NULL, + `Queue` VARCHAR(128) NOT NULL, + `CEDict` TEXT NOT NULL, + `ExecTemplate` VARCHAR(25) NOT NULL, + `TaskID` VARCHAR(255), + `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', + `ProxyPath` VARCHAR(255), + `Cleaned` BOOLEAN DEFAULT FALSE, + `FirstTimestamp` DATETIME, + `LastTimestamp` DATETIME, + PRIMARY KEY (`BundleID`), + INDEX (`Site`,`CE`,`Queue`), + INDEX (`Status`), + INDEX (`Cleaned`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ From 3f8ac484bf329bf097ff1d1ea385e90708a48922 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 8 Oct 2025 10:51:30 +0200 Subject: [PATCH 26/47] chore: Remove unnecesary status files --- .../Computing/BundleComputingElement.py | 55 ------------------- .../Utilities/BundlerTemplates.py | 6 -- 2 files changed, 61 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 35274c9832f..7acda21a389 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -290,61 +290,6 @@ def getJobStatus(self, jobIDList): resultDict[jobId] = result["Value"] self.log.debug(f"Status of bundle '{bundleId}': {result['Value']}") - # Check if the bundle has ended - if result["Value"] not in PilotStatus.PILOT_FINAL_STATES: - self.log.debug("Bundle still running") - continue - - # If the bundle Failed, we asume all of the jobs failed - if result["Value"] != PilotStatus.DONE: - resultDict[jobId] = PilotStatus.FAILED - self.log.error("Bundle FAILED") - continue - - # If the bundle ended properly, get the status of the independent job - result = self.bundler.getTaskInfo(bundleId) - - if not result["OK"]: - self.log.error("Couldn't get the TaskID of the Bundle") - return result - - taskId = result["Value"]["TaskID"] - self.log.debug(f"Obtaining bundle output of '{bundleId}'") - result = self.__getOutputPath(bundleId, taskId) - - if not result["OK"]: - return result - - # The output obtation Timed Out, we need to wait a little longer - if not result["Value"]["Available"]: - self.log.debug("Outputs not yet available") - resultDict[jobId] = PilotStatus.RUNNING - continue - - outputPath = result["Value"]["Path"] - - # The file that contains a singular line with the following format: - jobStatusFile = os.path.join(outputPath, f"{jobId}.status") - - # If it was not created or is empty, the job failed - if not os.path.exists(jobStatusFile) or os.path.getsize(jobStatusFile) == 0: - self.log.warn(f".status file of job '{jobId}' not found or is empty. Assuming it failed") - resultDict[jobId] = PilotStatus.FAILED - continue - - # Read the exit value of the process launched - # - The file contains a singular line with just the status - with open(jobStatusFile, "r") as f: - jobStatus = int(f.readline()) - - # 0 -> All ok Any other -> Fail - if jobStatus == 0: - resultDict[jobId] = PilotStatus.DONE - else: - resultDict[jobId] = PilotStatus.FAILED - - self.log.debug(f"Status of job '{jobId}': {resultDict[jobId]}") - return S_OK(resultDict) ############################################################################# diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 4f1cf13a821..ca0fa879662 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -108,15 +108,10 @@ for filename in ${{jobId}}*; do [ -f ${{filename}} ] || continue - touch ${{jobId}}.status # Move the job specific files to its directory, removing the jobId from its name mv $filename ${{jobId}}/${{filename#${{jobId}}_*}} done - # run_task ${{input}} & - # pid=$! - # pids+=($pid) - cpu=$(next_allowed_cpu $job_number) chmod u+x run_task.sh @@ -157,7 +152,6 @@ # Report job ending and status echo "[${task_id}] Task Finished" -echo "${task_status}" 1>${base_dir}/${task_id}.status echo "[${task_id}] Process final status: ${task_status}" """ From 6114b59fb25d3adff51814b889ef00f67a574a5b Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 8 Oct 2025 11:47:14 +0200 Subject: [PATCH 27/47] fix(BundleDB): Avoid job insertion in running or finished bundles --- src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index b9aad339f05..39fda0f44b1 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -480,16 +480,17 @@ def __selectBestBundle(self, bundles, nProcessors): bundleId = bundle["BundleID"] procs = bundle["ProcessorSum"] maxProcs = bundle["MaxProcessors"] + status = bundle["Status"] newProcSum = procs + nProcessors + if status != "Storing": + continue + if newProcSum == maxProcs: return bundleId - elif newProcSum > maxProcs: - continue - - elif newProcSum > currentBestProcs: + if newProcSum > currentBestProcs: currentBestProcs = newProcSum bestBundleId = bundleId From 8fb75bc2db732c9480557fe5d7f356700b62fc4d Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 8 Oct 2025 16:05:21 +0200 Subject: [PATCH 28/47] chore: Remove debugging code --- .../WorkloadManagementSystem/Utilities/BundlerTemplates.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index ca0fa879662..280df181ba9 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -28,12 +28,6 @@ local max=$(cut -d "-" -f 2 - <<<"$range") total_allowed_cpus=$(($total_allowed_cpus+$max-$min+1)) done - - # Hypercharged cores check - local inputs_len=${{#INPUT[@]}} - if (( ($inputs_len * 2) == $total_allowed_cpus )); then - cpu_offset=$inputs_len - fi }} next_allowed_cpu() {{ From c7140cc9c93a6277aba52d3eacf2a6d90cb8f5b5 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Wed, 8 Oct 2025 17:00:23 +0200 Subject: [PATCH 29/47] feat: Add agent to monitor bundles (untested) --- .../Agent/BundleManagerAgent.py | 85 +++++++++++++++++++ .../ConfigTemplate.cfg | 6 ++ 2 files changed, 91 insertions(+) create mode 100644 src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py new file mode 100644 index 00000000000..bf47284291d --- /dev/null +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -0,0 +1,85 @@ +import os + +from DIRAC import S_ERROR, S_OK, gConfig +from DIRAC.Core.Base.AgentModule import AgentModule +from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader +from DIRAC.WorkloadManagementSystem.Client import PilotStatus +from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient +from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient +from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB +from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import generate_template +from DIRAC.WorkloadManagementSystem.Client import JobStatus + +class BundleManagerAgent(AgentModule): + def __init__(self, agentName, loadName, baseAgentName=False, properties=None): + if not properties: + properties = {} + super().__init__(agentName, loadName, baseAgentName, properties) + + self.bundleDB = None + + ############################################################################# + + def initialize(self): + self.bundleDB = BundleDB() + self.jobMonitor = JobMonitoringClient() + self.bundler = BundlerClient() + + def execute(self): + self._sendStalledBundles() + self._cleanFinishedBundles() + self._removeKilledJobs() + + def finalize(self): + pass + + ############################################################################# + + def _cleanFinishedBundles(self): + self.log.info("Cleaning inputs of finished bundles bundles") + + result = self.bundleDB.getFinishedBundles() + if not result["OK"]: + return result + bundleIDs = result["Value"] + + for bundleId in bundleIDs: + result = self.getJobIDsOfBundle(bundleId) + if not result["OK"]: + return result + jobIDs = result["Value"] + + for jobId in jobIDs: + result = self.bundleDB.removeJobInputs(jobId) + if not result["OK"]: + self.log.error(f"Failed to remove inputs of job {jobId} from bundle {bundleId}, skipping...") + self.log.error(result) + + return S_OK() + + def _removeKilledJobs(self): + killedJobs = [] + + result = self.bundleDB.getWaitingBundles() + if not result["OK"]: + return result + + for bundleId in result["Value"]: + result = self.bundleDB.getJobsOfBundle(bundleId) + if not result["OK"]: + return result + + result = self.jobMonitor.getJobsStatus(result["Value"]) + if not result["OK"]: + return result + + statusDict = result["Value"] + for job, status in statusDict.items(): + if status == JobStatus.KILLED: + killedJobs.append(job) + + result = self.bundleDB.removeJobs(killedJobs) + return result + + def _sendStalledBundles(self): + pass \ No newline at end of file diff --git a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg index 7b212e3c5c9..cc3d08af6c2 100644 --- a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +++ b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg @@ -336,6 +336,12 @@ Agents PollingTime = 120 } ##END + ##BEGIN BundleManagerAgent + BundleManagerAgent + { + PollingTime = 120 + } + ##END } Executors { From 3c71969bcfc66ae3218bb48f101b43ddd1c9109c Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:21:05 +0200 Subject: [PATCH 30/47] chore(PushJobAgent): Modify ce.submitJob to be the same as the submission through a JobAgent --- .../WorkloadManagementSystem/Agent/PushJobAgent.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py index 492c1c0e200..9f60b79113f 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py @@ -366,6 +366,9 @@ def execute(self): resourceParams=ceDict, optimizerParams=optimizerParams, processors=submissionParams["processors"], + wholeNode=submissionParams["wholeNode"], + maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"], + mpTag=submissionParams["mpTag"], ) if not result["OK"]: self.failedQueues[queueName] += 1 @@ -521,6 +524,9 @@ def _submitJobWrapper( resourceParams: dict, optimizerParams: dict, processors: int, + wholeNode: bool, + maxNumberOfProcessors: int, + mpTag: bool, ): """Submit a JobWrapper to the remote site @@ -618,6 +624,13 @@ def _submitJobWrapper( proxy=None, inputs=inputs, outputs=outputs, + numberOfProcessors=processors, + maxNumberOfProcessors=maxNumberOfProcessors, + wholeNode=wholeNode, + mpTag=mpTag, + jobDesc=jobDesc, + log=self.log, + logLevel=self.logLevel, ) if not result["OK"]: rescheduleResult = rescheduleFailedJob( From c71ca70dbb9f5f0b2938a05afd43b8c0d1792057 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:22:01 +0200 Subject: [PATCH 31/47] chore: Add BundleManagerAgent to ConfigTemplate --- src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg index cc3d08af6c2..859d71cf6c2 100644 --- a/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +++ b/src/DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg @@ -340,6 +340,8 @@ Agents BundleManagerAgent { PollingTime = 120 + MaxMinutesInBundle = 60 + MaxDaysInDB = 2 } ##END } From c1e38f518cae1a78f22bb828d5445f27effdd614 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:26:02 +0200 Subject: [PATCH 32/47] chore(BundleTemplate): Remove debug bundle monitoring --- .../Utilities/BundlerTemplates.py | 75 +++++++++---------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 280df181ba9..cdc312ddb23 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -1,13 +1,12 @@ from DIRAC import S_ERROR, S_OK -BASH_TEMPLATE = """\ +# DEPRECATED +BASH_TESTING_TEMPLATE = """\ #!/bin/bash BASEDIR=${{PWD}} INPUT={inputs} BUNDLE_ID={bundleId} -PROC_MONITOR_VARS=(Pid Name State Threads Cpus_allowed_list) - OLD_IFS=$IFS # cpu management @@ -61,37 +60,12 @@ echo This machine has "$total_allowed_cpus" valid cores echo Ranges: "${{cpu_ranges[@]}}" -monitor_job() {{ - local job_pid=$1 - local job_id=$2 - local log_file=$3 - - echo ID Timestamp CPU ${{PROC_MONITOR_VARS[*]}} | sed 's/ /\\t/g' > $log_file - - while : ; do - # If the job finished, finish the monitoring - if ! kill -0 "$job_pid" 2>/dev/null; then - break - fi - - local cpu=$(ps -h -p "$job_pid" -o psr) - local timestamp=$(date "+%Y-%m-%d_%H:%M:%S") - local vars=() - - for var in ${{PROC_MONITOR_VARS[@]}}; do - vars+=($(grep -w "$var" /proc/"$pid"/status | awk '{{print $2}}')) - done - - echo $timestamp $job_id $cpu ${{vars[*]}} | sed 's/ /\\t/g' >> $log_file - sleep 5 - done -}} - get_id() {{ echo $1 | cut -d '_' -f 1 }} job_number=0 +chmod u+x run_task.sh # execute tasks for input in ${{INPUT[@]}}; do @@ -107,23 +81,48 @@ done cpu=$(next_allowed_cpu $job_number) - - chmod u+x run_task.sh taskset -c $cpu ${{BASEDIR}}/run_task.sh ${{jobId}} ${{input}} ${{BUNDLE_ID}} ${{BASEDIR}} & pid=$! - pids+=($pid) - taskset -cp $cpu $pid + pids+=($pid) job_number=$(($job_number+1)) - - monitor_job "$pid" "$jobId" "$jobId/monitoring.stats" & - pid=$! - monitor_pids+=($pid) done # wait for all tasks wait "${{pids[@]}}" -wait "${{monitor_pids[@]}}" +""" + +BASH_TEMPLATE = """\ +#!/bin/bash +BASEDIR=${{PWD}} +INPUT={inputs} +BUNDLE_ID={bundleId} + +get_id() {{ + echo $1 | cut -d '_' -f 1 +}} + +job_number=0 +chmod u+x run_task.sh + +# execute tasks +for input in ${{INPUT[@]}}; do + [ -f "$input" ] || break + + jobId=$(get_id ${{input}}) + mkdir ${{jobId}} + + for filename in ${{jobId}}*; do + [ -f ${{filename}} ] || continue + # Move the job specific files to its directory, removing the jobId from its name + mv $filename ${{jobId}}/${{filename#${{jobId}}_*}} + done + + ${{BASEDIR}}/run_task.sh ${{jobId}} ${{input}} ${{BUNDLE_ID}} ${{BASEDIR}} & +done + +# wait for all tasks +wait """ BASH_RUN_TASK = """\ From 100bd0fda4e1fdf0c6d3546f8755d7962644b1ac Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:31:18 +0200 Subject: [PATCH 33/47] feat(BundleDB): Add flags to control Bundle stages and accept the JobID obtained though the matcher --- .../WorkloadManagementSystem/DB/BundleDB.py | 263 +++++++++--------- .../WorkloadManagementSystem/DB/BundleDB.sql | 9 +- 2 files changed, 144 insertions(+), 128 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 39fda0f44b1..038b5099561 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -2,53 +2,13 @@ import uuid from ast import literal_eval -from datetime import datetime, timedelta, timezone +from datetime import datetime, timezone from DIRAC import S_ERROR, S_OK from DIRAC.Core.Base.DB import DB from DIRAC.FrameworkSystem.Client.Logger import contextLogger from DIRAC.WorkloadManagementSystem.Client import PilotStatus -STATUS_MAP = { - "Storing": PilotStatus.WAITING, - "Sent": PilotStatus.RUNNING, - "Finalized": PilotStatus.DONE, - "Failed": PilotStatus.FAILED, -} - -BUNDLES_INFO_COLUMNS = [ - "BundleID", - "ProcessorSum", - "MaxProcessors", - "Site", - "CE", - "Queue", - "CEDict", - "ExecTemplate", - "TaskID", - "Status", - "ProxyPath", - "Cleaned", - "FirstTimestamp", - "LastTimestamp" -] - -JOB_TO_BUNDLE_COLUMNS = [ - "JobID", - "BundleID", - "ExecutablePath", - "Outputs", - "Processors", -] - -JOB_INPUTS_COLUMNS = [ - "InputID", - "JobID", - "InputPath", -] - -MYSQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' - def formatSelectOutput(listOfResults, keys): retVal = [] @@ -73,7 +33,52 @@ def __init__(self, parentLogger=None): self.JOB_TO_BUNDLE_TABLE = "JobToBundle" self.JOB_INPUTS_TABLE = "JobInputs" - self.maxMinsInBundle = self.getCSOption("MaxMinutesInBundle", 60) + self.BUNDLES_INFO_COLUMNS = [ + "BundleID", + "ProcessorSum", + "MaxProcessors", + "Site", + "CE", + "Queue", + "CEDict", + "ExecTemplate", + "TaskID", + "Status", + "ProxyPath", + "Flags", + "FirstTimestamp", + "LastTimestamp" + ] + + self.JOB_TO_BUNDLE_COLUMNS = [ + "JobID", + "BundleID", + "DiracID", + "ExecutablePath", + "Outputs", + "Processors", + ] + + self.JOB_INPUTS_COLUMNS = [ + "InputID", + "JobID", + "InputPath", + ] + + self.STATUS_MAP = { + "Storing": PilotStatus.WAITING, + "Sent": PilotStatus.RUNNING, + "Finalized": PilotStatus.DONE, + "Failed": PilotStatus.FAILED, + } + + self.MYSQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' + + self.BUNDLE_FLAGS = { + "Cleaned": 1, + "Purged": 1 << 1, + } + @property def log(self): @@ -85,8 +90,8 @@ def log(self, value): ############################################################################# - def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDict, proxyPath): - result = self.__getBundlesFromCEDict(ceDict) + def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDict, proxyPath, diracId): + result = self._getBundlesFromCEDict(ceDict) if not result["OK"]: return result @@ -95,13 +100,13 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi # No bundles matching ceDict, so create a new one if not bundles: - result = self.__createNewBundle(ceDict, proxyPath) + result = self._createNewBundle(ceDict, proxyPath) if not result["OK"]: return result bundleId = result["Value"] - result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath) + result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId) if not result["OK"]: return result @@ -113,7 +118,7 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi # If it does not fit in an already created bundle, create a new one if not bundleId: - result = self.__createNewBundle(ceDict, proxyPath) + result = self._createNewBundle(ceDict, proxyPath) if not result["OK"]: return result @@ -121,36 +126,39 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi bundleId = result["Value"] # Insert it and obtain if it is ready to be submitted - result = self.__insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath) + result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId) if not result["OK"]: return result return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) - def removeJobFromBundle(self, jobId): - result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID", "Processors"], {"JobID": jobId}) + def removeJobsFromBundle(self, jobIds): + for jobId in jobIds: + result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID", "Processors"], {"JobID": jobId}) - if not result["OK"]: - return result - - jobInfo = result["Value"][0] - bundleId, procs = jobInfo[0], jobInfo[1] + if not result["OK"]: + return result - result = self.__reduceProcessorSum(bundleId, procs) + jobInfo = result["Value"][0] + bundleId, procs = jobInfo[0], jobInfo[1] - if not result["OK"]: - return result + result = self._reduceProcessorSum(bundleId, procs) - result = self.deleteEntries(self.JOB_TO_BUNDLE_TABLE, {"JobID": jobId}) + if not result["OK"]: + return result - # Rollback on error?? Can this Fail?? - return result + result = self.deleteEntries(self.JOB_TO_BUNDLE_TABLE, {"JobID": jobIds}) + return S_OK(result) ############################################################################# - def getFinishedBundles(self): - result = self.getFields(self.BUNDLES_INFO_TABLE, ["BundleID"], {"Status": "FINISHED"}) + def getUnpurgedBundles(self): + cmd = 'SELECT BundleID FROM BundlesInfo WHERE Status = "Finalized" AND Flags & {flag} != {flag};'.format( + flag=self.BUNDLE_FLAGS["Purged"] + ) + + result = self._query(cmd) if not result["OK"]: return result @@ -158,12 +166,13 @@ def getFinishedBundles(self): return S_OK([entry[0] for entry in result["Value"]]) def getWaitingBundles(self): - result = self.getFields(self.BUNDLES_INFO_TABLE, ["BundleID"], {"Status": "WAITING"}) + result = self.getFields(self.BUNDLES_INFO_TABLE, self.BUNDLES_INFO_COLUMNS, {"Status": "Storing"}) if not result["OK"]: return result - return S_OK([entry[0] for entry in result["Value"]]) + bundlesDict = formatSelectOutput(result["Value"], self.BUNDLES_INFO_COLUMNS) + return S_OK(bundlesDict) ############################################################################# @@ -184,15 +193,17 @@ def getBundleStatus(self, bundleId): if not result["Value"]: return S_ERROR("Failed to get bundle Status") - return S_OK(STATUS_MAP[result["Value"][0][0]]) + return S_OK(self.STATUS_MAP[result["Value"][0][0]]) def getJobsOfBundle(self, bundleId): - cmd = f"""\ - SELECT JobToBundle.JobID, ExecutablePath, Outputs, InputPath + cmd = """\ + SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, InputPath FROM JobToBundle LEFT JOIN JobInputs ON JobToBundle.JobID = JobInputs.JobID - WHERE BundleID = "{bundleId}";""" + WHERE BundleID = "{bundleId}";""".format( + bundleId=bundleId + ) result = self._query(cmd) @@ -202,18 +213,19 @@ def getJobsOfBundle(self, bundleId): rows = list(result["Value"]) retVal = {} - # For each row (JobID, ExecutablePath, Outputs, [InputPath | Empty]) + # For each row (JobID, ExecutablePath, Outputs, [InputPath]) for row in rows: # The job has no input - if len(row) == 3: - jobID, jobExecutablePath, jobOutputs = row + if len(row) == 4: + jobID, diracId, jobExecutablePath, jobOutputs = row jobInputPath = "" else: - jobID, jobExecutablePath, jobOutputs, jobInputPath = row + jobID, diracId, jobExecutablePath, jobOutputs, jobInputPath = row if jobID not in retVal: retVal[jobID] = { "ExecutablePath": jobExecutablePath, + "DiracID": diracId, "Inputs": [], "Outputs": [], } @@ -223,20 +235,6 @@ def getJobsOfBundle(self, bundleId): if jobInputPath: retVal[jobID]["Inputs"].append(jobInputPath) - # for i in range(len(retVal)): - # result = self.getFields(self.JOB_INPUTS_TABLE, "InputPath", {"JobID": retVal[i]["JobID"]}) - # if not result["OK"]: - # return result - - # inputs = list(result["Value"]) - - # # Go through every input path - # for idx, item in inputs: - # inputs[idx] = item[0] # Just the input path - - # retVal[i]["Inputs"] = inputs - # retVal[i]["Outputs"] = literal_eval(retVal[i]["Outputs"]) - return S_OK(retVal) def getJobIDsOfBundle(self, bundleId): @@ -247,8 +245,11 @@ def getJobIDsOfBundle(self, bundleId): return S_OK([entry[0] for entry in result["Value"]]) - def removeJobInputs(self, jobId): - return self.deleteEntries(self.JOB_INPUTS_TABLE, {"JobID": jobId}) + def removeJobInputs(self, jobIds): + if not isinstance(jobIds, list): + jobIds = [jobIds] + + return self.deleteEntries(self.JOB_INPUTS_TABLE, {"JobID": jobIds}) ############################################################################# @@ -269,23 +270,40 @@ def getTaskId(self, bundleId): ############################################################################# def setBundleAsFinalized(self, bundleId): - result = self.__updateBundleStatus(bundleId, "Finalized") + result = self._updateBundleStatus(bundleId, "Finalized") return result def setBundleAsFailed(self, bundleId): - result = self.__updateBundleStatus(bundleId, "Failed") + result = self._updateBundleStatus(bundleId, "Failed") return result + def setBundleAsPurged(self, bundleId): + cmd = 'UPDATE BundlesInfo SET Flags = Flags | {flag} WHERE BundleID = "{bundleId}";'.format( + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Purged"] + ) + + return self._query(cmd) + def setBundleAsCleaned(self, bundleId): - return self.updateFields(self.BUNDLES_INFO_TABLE, ["Cleaned"], [True], {"BundleID": bundleId}) + cmd = 'UPDATE BundlesInfo SET Flags = Flags | {flag} WHERE BundleID = "{bundleId}";'.format( + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] + ) + + return self._query(cmd) def isBundleCleaned(self, bundleId): - result = self.getFields(self.BUNDLES_INFO_TABLE, ["Cleaned"], {"BundleID": bundleId}) + cmd = 'SELECT BundleID FROM BundlesInfo WHERE BundleID = "{bundleId}" AND Flags & {flag} = {flag};'.format( + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] + ) + + result = self._query(cmd) if not result["OK"]: return result + + cleaned = result["Value"] != [] - return S_OK(result["Value"][0][0]) + return S_OK(cleaned) ############################################################################# @@ -298,8 +316,8 @@ def getWholeBundle(self, bundleId): if not result["Value"]: return S_ERROR(f"No bundle with id {bundleId}") - bundleDict = formatSelectOutput(result["Value"], BUNDLES_INFO_COLUMNS)[0] - bundleDict["Status"] = STATUS_MAP[bundleDict["Status"]] + bundleDict = formatSelectOutput(result["Value"], self.BUNDLES_INFO_COLUMNS)[0] + bundleDict["Status"] = self.STATUS_MAP[bundleDict["Status"]] self.log.debug(f"Look at this cool bundle: {bundleDict}") @@ -315,17 +333,17 @@ def getBundleCE(self, bundleId): ############################################################################# - def __reduceProcessorSum(self, bundleId, nProcessors): - cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum - {} WHERE BundleID = "{}";'.format( - nProcessors, bundleId + def _reduceProcessorSum(self, bundleId, nProcessors): + cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum - {nProcs} WHERE BundleID = "{bundleId}";'.format( + bundleId=bundleId, nProcs=nProcessors ) return self._query(cmd) - def __createNewBundle(self, ceDict, proxyPath): + def _createNewBundle(self, ceDict, proxyPath): if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") - timestamp = datetime.now(tz=timezone.utc).strftime(MYSQL_DATETIME_FORMAT) + timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) bundleId = uuid.uuid4().hex insertInfo = { @@ -349,8 +367,8 @@ def __createNewBundle(self, ceDict, proxyPath): return S_OK(bundleId) - def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath): - timestamp = datetime.now(tz=timezone.utc).strftime(MYSQL_DATETIME_FORMAT) + def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath, diracId): + timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) # Insert the job into the bundle insertInfo = { @@ -361,6 +379,9 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro "Processors": nProcessors, } + if diracId: + insertInfo["DiracID"] = diracId + result = self.insertFields(self.JOB_TO_BUNDLE_TABLE, list(insertInfo.keys()), list(insertInfo.values())) if not result["OK"]: @@ -379,8 +400,12 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro return result # Modify the number of processors that will be used by the bundle - cmd = 'UPDATE BundlesInfo SET ProcessorSum = ProcessorSum + {}, LastTimestamp = "{}" WHERE BundleID = "{}";'.format( - nProcessors, timestamp, bundleId + cmd = """\ + UPDATE BundlesInfo + SET ProcessorSum = ProcessorSum + {nProcs}, LastTimestamp = "{timestamp}" + WHERE BundleID = "{bundleId}"; + """.format( + bundleId=bundleId, nProcs=nProcessors, timestamp=timestamp ) result = self._query(cmd) @@ -403,23 +428,11 @@ def __insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nPro ) selection = selection[0] - ready = self.__getBundleRediness(selection) + ready = selection["ProcessorSum"] == selection["MaxProcessors"] return S_OK({"BundleId": bundleId, "Ready": ready}) - def __getBundleRediness(self, bundleInfo): - elapsedTime : timedelta = bundleInfo["LastTimestamp"] - bundleInfo["FirstTimestamp"] - elapsedMinutes = elapsedTime.total_seconds() // 60 - - if elapsedMinutes > self.maxMinsInBundle: - return True - - if bundleInfo["ProcessorSum"] == bundleInfo["MaxProcessors"]: - return True - - return False - - def __getBundlesFromCEDict(self, ceDict): + def _getBundlesFromCEDict(self, ceDict): cmd = 'SELECT * FROM BundlesInfo WHERE Site = "{Site}" AND CE = "{CE}" AND Queue = "{Queue}";'.format( Site=ceDict["Site"], CE=ceDict["GridCE"], @@ -435,16 +448,18 @@ def __getBundlesFromCEDict(self, ceDict): retVal = formatSelectOutput( result["Value"], - BUNDLES_INFO_COLUMNS, + self.BUNDLES_INFO_COLUMNS, ) return S_OK(retVal) - def __updateBundleStatus(self, bundleId, newStatus): - if newStatus not in STATUS_MAP.keys(): + def _updateBundleStatus(self, bundleId, newStatus): + if newStatus not in self.STATUS_MAP.keys(): msg = f"The new status '{newStatus}' does not correspond with the possible statuses:" - return S_ERROR(msg, STATUS_MAP.keys()) + return S_ERROR(msg, self.STATUS_MAP.keys()) - cmd = f'UPDATE BundlesInfo SET Status = "{newStatus}" WHERE BundleID = "{bundleId}";' + cmd = 'UPDATE BundlesInfo SET Status = "{status}" WHERE BundleID = "{bundleId}";'.format( + bundleId=bundleId, status=newStatus + ) result = self._query(cmd) if not result["OK"]: diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 21756f50bd2..49aac51375c 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -30,24 +30,25 @@ CREATE TABLE `BundlesInfo` ( `TaskID` VARCHAR(255), `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', `ProxyPath` VARCHAR(255), - `Cleaned` BOOLEAN DEFAULT FALSE, + `Flags` SET('Cleaned', 'Purged') NOT NULL DEFAULT '', `FirstTimestamp` DATETIME, `LastTimestamp` DATETIME, PRIMARY KEY (`BundleID`), INDEX (`Site`,`CE`,`Queue`), - INDEX (`Status`), - INDEX (`Cleaned`) + INDEX (`Status`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ CREATE TABLE `JobToBundle` ( `JobID` VARCHAR(255) NOT NULL, `BundleID` VARCHAR(32) NOT NULL, + `DiracID` INTEGER, `ExecutablePath` VARCHAR(255) NOT NULL, `Outputs` VARCHAR(255) NOT NULL, `Processors` INT(5) UNSIGNED NOT NULL DEFAULT 1, PRIMARY KEY (`JobID`), - FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`) + FOREIGN KEY (`BundleID`) REFERENCES `BundlesInfo`(`BundleID`), + INDEX (`DiracID`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; -- ------------------------------------------------------------------------------ From b0c181aa5935e32c3a014b07e09e3a7c0c6503d7 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Fri, 17 Oct 2025 12:34:13 +0200 Subject: [PATCH 34/47] feat(BundleManagerAgent): Add possibility to force-submit bundles chore: Accommodate Bundle Service and CE to the schema of the DB fix: pre-commit --- .../Computing/AREXEnhancedComputingElement.py | 31 ++-- .../Computing/BundleComputingElement.py | 18 +- .../Agent/BundleManagerAgent.py | 119 ++++++++++-- .../WorkloadManagementSystem/DB/BundleDB.py | 52 +++--- .../Service/BundlerHandler.py | 175 ++++++++++-------- .../Utilities/BundlerTemplates.py | 11 +- 6 files changed, 262 insertions(+), 144 deletions(-) diff --git a/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py index 7826501df27..ab5001dca7b 100644 --- a/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py +++ b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py @@ -21,10 +21,10 @@ def _getListOfAvailableOutputs(self, jobID, arcJobID, path=None): :param str path: remote path :return list: names of the available outputs """ - query = self._urlJoin(os.path.join("jobs", arcJobID, "session", path or '')) + query = self._urlJoin(os.path.join("jobs", arcJobID, "session", path or "")) # Submit the GET request to retrieve the names of the outputs - #self.log.debug(f"Retrieving the names of the outputs for {jobID}") + # self.log.debug(f"Retrieving the names of the outputs for {jobID}") self.log.debug(f"Retrieving the names of the outputs with {query}") result = self._request("get", query) if not result["OK"]: @@ -35,9 +35,9 @@ def _getListOfAvailableOutputs(self, jobID, arcJobID, path=None): if not response.text: return S_ERROR(f"There is no output for job {jobID}") - #return S_OK(response.json()["file"]) + # return S_OK(response.json()["file"]) return S_OK(response.json()) - + def getJobOutput(self, jobID, workingDirectory=None, path=None): """Get the outputs of the given job reference. @@ -68,11 +68,11 @@ def getJobOutput(self, jobID, workingDirectory=None, path=None): self.log.debug("Outputs to get are", remoteOutputs) remoteOutputsFiles = [] - if 'file' in remoteOutputs: + if "file" in remoteOutputs: remoteOutputsFiles = remoteOutputs["file"] - remoteOutputsDirs = [] - if 'dir' in remoteOutputs: + remoteOutputsDirs = [] + if "dir" in remoteOutputs: remoteOutputsDirs = remoteOutputs["dir"] if not workingDirectory: @@ -81,23 +81,25 @@ def getJobOutput(self, jobID, workingDirectory=None, path=None): workingDirectory = os.path.join(self.ceParameters["WorkingDirectory"], arcJob) else: workingDirectory = arcJob - + if not os.path.exists(workingDirectory): os.mkdir(workingDirectory) - + # Directories for remoteOutput in remoteOutputsDirs: - self.getJobOutput(jobID, - workingDirectory=os.path.join(workingDirectory, remoteOutput), - path=os.path.join(path or '', remoteOutput)) + self.getJobOutput( + jobID, + workingDirectory=os.path.join(workingDirectory, remoteOutput), + path=os.path.join(path or "", remoteOutput), + ) # Files stdout = None stderr = None for remoteOutput in remoteOutputsFiles: # Prepare the command - #query = self._urlJoin(os.path.join("jobs", arcJob, "session", remoteOutput)) - query = self._urlJoin(os.path.join("jobs", arcJob, "session", path or '', remoteOutput)) + # query = self._urlJoin(os.path.join("jobs", arcJob, "session", remoteOutput)) + query = self._urlJoin(os.path.join("jobs", arcJob, "session", path or "", remoteOutput)) # Submit the GET request to retrieve outputs result = self._request("get", query, stream=True) @@ -117,5 +119,4 @@ def getJobOutput(self, jobID, workingDirectory=None, path=None): with open(localOutput) as f: stderr = f.read() - return S_OK((stdout, stderr)) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 7acda21a389..be055e062a3 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -113,6 +113,9 @@ def __getitem__(self, jobId): class BundleComputingElement(ComputingElement): def __init__(self, ceUniqueID): """Standard constructor.""" + if not ceUniqueID.startswith("bundled-"): + ceUniqueID = f"bundled-{ceUniqueID}" + super().__init__(ceUniqueID) self.mandatoryParameters = ["ExecTemplate", "InnerCEType"] @@ -138,7 +141,8 @@ def _reset(self): innerCEType = innerCEParams.pop("InnerCEType") innerCEParams["CEType"] = innerCEType - innerCeName = self.ceParameters["GridCE"].split("bundled-")[1] + innerCeName = self.ceParameters["GridCE"][len("bundled-") :] + innerCEParams["GridCE"] = innerCeName # Building of the InnerCE @@ -162,7 +166,7 @@ def _reset(self): ############################################################################# - def submitJob(self, executableFile, proxy=None, numberOfProcessors=1, inputs=[], outputs=[]): + def submitJob(self, executableFile, proxy=None, numberOfProcessors=1, inputs=[], outputs=[], **kwargs): jobId = str(uuid.uuid4().hex) proxy = self.proxy if self.proxy else proxy @@ -183,8 +187,12 @@ def submitJob(self, executableFile, proxy=None, numberOfProcessors=1, inputs=[], proxyPath = result["Value"] + diracId = kwargs.get("jobDesc", {}).get("jobID", None) + if diracId: + diracId = int(diracId) + result = self.bundler.storeInBundle( - jobId, executableFile, inputs, outputs, proxyPath, numberOfProcessors, self.innerCEParams + jobId, executableFile, inputs, outputs, proxyPath, numberOfProcessors, self.innerCEParams, diracId ) if not result["OK"]: @@ -254,10 +262,10 @@ def getJobOutput(self, jobId, workingDirectory="."): if not os.path.exists(output) or not os.path.exists(error): return S_ERROR("Outputs unable to be obtained") - with open(output, "r") as f: + with open(output) as f: output = f.read() - with open(error, "r") as f: + with open(error) as f: error = f.read() return S_OK((output, error)) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index bf47284291d..fc4abd046a7 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -1,14 +1,14 @@ import os +from datetime import datetime, timedelta, timezone from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Core.Base.AgentModule import AgentModule from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader -from DIRAC.WorkloadManagementSystem.Client import PilotStatus +from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB -from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import generate_template -from DIRAC.WorkloadManagementSystem.Client import JobStatus + class BundleManagerAgent(AgentModule): def __init__(self, agentName, loadName, baseAgentName=False, properties=None): @@ -24,37 +24,62 @@ def initialize(self): self.bundleDB = BundleDB() self.jobMonitor = JobMonitoringClient() self.bundler = BundlerClient() + self.maxMinsInBundle = self.am_getOption("MaxMinutesInBundle", defaultValue=10) + return S_OK() def execute(self): - self._sendStalledBundles() - self._cleanFinishedBundles() - self._removeKilledJobs() + self.log.info("Sending stalled Bundles") + result = self._sendStalledBundles() + if not result["OK"]: + self.log.warn(f"Failed send the bundles: {result}") + + self.log.info("Cleaning inputs of finished bundles bundles") + result = self._cleanFinishedBundles() + if not result["OK"]: + self.log.warn(f"Failed to clean the inputs: {result}") + + self.log.info("Deleting killed jobs from bundles") + result = self._removeKilledJobs() + if not result["OK"]: + self.log.warn(f"Failed to delete the inputs: {result}") + + return S_OK() def finalize(self): - pass + return S_OK() ############################################################################# def _cleanFinishedBundles(self): - self.log.info("Cleaning inputs of finished bundles bundles") - - result = self.bundleDB.getFinishedBundles() + result = self.bundleDB.getUnpurgedBundles() if not result["OK"]: return result + bundleIDs = result["Value"] + self.log.debug(f"> Found {len(bundleIDs)} finished and unpurged bundles") for bundleId in bundleIDs: - result = self.getJobIDsOfBundle(bundleId) + success = True + result = self.bundleDB.getJobIDsOfBundle(bundleId) if not result["OK"]: + self.log.error(f"Failed to obtain the jobs of the bundle {bundleId}") return result + jobIDs = result["Value"] + self.log.debug(f"> Purging inputs of bundle with ID '{bundleId}'") + for jobId in jobIDs: result = self.bundleDB.removeJobInputs(jobId) if not result["OK"]: + success = False self.log.error(f"Failed to remove inputs of job {jobId} from bundle {bundleId}, skipping...") self.log.error(result) + if success: + self.log.info(f"> Inputs of bundle with ID '{bundleId}' were removed from DB") + self.bundleDB.setBundleAsPurged(bundleId) + return S_OK() def _removeKilledJobs(self): @@ -64,22 +89,80 @@ def _removeKilledJobs(self): if not result["OK"]: return result - for bundleId in result["Value"]: + bundles = result["Value"] + self.log.debug(f"> Found {len(bundles)} waiting bundles") + + for bundleInfo in bundles: + bundleId = bundleInfo["BundleID"] + result = self.bundleDB.getJobsOfBundle(bundleId) if not result["OK"]: + self.log.error(f"Failed to get the jobs of the bundle '{bundleId}'") return result - result = self.jobMonitor.getJobsStatus(result["Value"]) + jobs = result["Value"] + jobIds = list(jobs.keys()) + + diracIds = [] + diracIdToJobId = {} + for jobId in jobIds: + if "DiracID" not in jobs[jobId]: + continue + + diracId = jobs[jobId]["DiracID"] + if diracId: + diracIds.append(diracId) + diracIdToJobId[diracId] = jobId + + result = self.jobMonitor.getJobsStatus(diracIds) if not result["OK"]: + self.log.error(f"Failed to get the status of the jobs with ids: {diracIds}") + self.log.error(result) return result statusDict = result["Value"] - for job, status in statusDict.items(): + for diracId, status in statusDict.items(): if status == JobStatus.KILLED: - killedJobs.append(job) + self.log.info(f"> Status of job '{diracId}' is 'Killed', adding it to the deletion list") + killedJobs.append(diracIdToJobId[diracId]) + + result = self.bundleDB.removeJobsFromBundle(killedJobs) + if not result["OK"]: + return result - result = self.bundleDB.removeJobs(killedJobs) - return result + deletedDict = result["Value"] + + failedDeletions = {} + for jobId, jobResult in deletedDict.items(): + if not jobResult["OK"]: + failedDeletions[jobId] = jobResult + + if failedDeletions: + return S_ERROR(f"Failed to delete the following jobs: {failedDeletions}") + + return S_OK() def _sendStalledBundles(self): - pass \ No newline at end of file + result = self.bundleDB.getWaitingBundles() + if not result["OK"]: + return result + + bundles = result["Value"] + self.log.debug(f"> Found {len(bundles)} waiting bundles") + + bundleIds = [] + currentTime = datetime.now(tz=timezone.utc).replace(tzinfo=None) + + for bundleInfo in bundles: + elapsedTime: timedelta = currentTime - bundleInfo["LastTimestamp"] + elapsedMinutes = elapsedTime.total_seconds() // 60 + + if elapsedMinutes > self.maxMinsInBundle: + _id = bundleInfo["BundleID"] + bundleIds.append(bundleInfo["BundleID"]) + + if bundleIds: + self.log.info(f"> Force-Submitting {len(bundleIds)} bundles due to timeout, IDs: ({bundleIds})") + result = self.bundler.forceSubmitBundles(bundleIds) + + return S_OK() diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 038b5099561..adbd3fabc34 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -47,7 +47,7 @@ def __init__(self, parentLogger=None): "ProxyPath", "Flags", "FirstTimestamp", - "LastTimestamp" + "LastTimestamp", ] self.JOB_TO_BUNDLE_COLUMNS = [ @@ -72,14 +72,13 @@ def __init__(self, parentLogger=None): "Failed": PilotStatus.FAILED, } - self.MYSQL_DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' + self.MYSQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" self.BUNDLE_FLAGS = { - "Cleaned": 1, - "Purged": 1 << 1, + "Cleaned": 1, + "Purged": 1 << 1, } - @property def log(self): return contextLogger.get() or self._defaultLogger @@ -106,7 +105,9 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi return result bundleId = result["Value"] - result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId) + result = self._insertJobInBundle( + jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId + ) if not result["OK"]: return result @@ -173,7 +174,7 @@ def getWaitingBundles(self): bundlesDict = formatSelectOutput(result["Value"], self.BUNDLES_INFO_COLUMNS) return S_OK(bundlesDict) - + ############################################################################# def getBundleIdFromJobId(self, jobId): @@ -198,9 +199,9 @@ def getBundleStatus(self, bundleId): def getJobsOfBundle(self, bundleId): cmd = """\ SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, InputPath - FROM JobToBundle - LEFT JOIN JobInputs - ON JobToBundle.JobID = JobInputs.JobID + FROM JobToBundle + LEFT JOIN JobInputs + ON JobToBundle.JobID = JobInputs.JobID WHERE BundleID = "{bundleId}";""".format( bundleId=bundleId ) @@ -248,7 +249,7 @@ def getJobIDsOfBundle(self, bundleId): def removeJobInputs(self, jobIds): if not isinstance(jobIds, list): jobIds = [jobIds] - + return self.deleteEntries(self.JOB_INPUTS_TABLE, {"JobID": jobIds}) ############################################################################# @@ -279,31 +280,31 @@ def setBundleAsFailed(self, bundleId): def setBundleAsPurged(self, bundleId): cmd = 'UPDATE BundlesInfo SET Flags = Flags | {flag} WHERE BundleID = "{bundleId}";'.format( - bundleId=bundleId, flag=self.BUNDLE_FLAGS["Purged"] + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Purged"] ) return self._query(cmd) def setBundleAsCleaned(self, bundleId): cmd = 'UPDATE BundlesInfo SET Flags = Flags | {flag} WHERE BundleID = "{bundleId}";'.format( - bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] ) return self._query(cmd) def isBundleCleaned(self, bundleId): cmd = 'SELECT BundleID FROM BundlesInfo WHERE BundleID = "{bundleId}" AND Flags & {flag} = {flag};'.format( - bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] ) result = self._query(cmd) if not result["OK"]: return result - + cleaned = result["Value"] != [] - return S_OK(cleaned) + return S_OK(cleaned) ############################################################################# @@ -344,7 +345,7 @@ def _createNewBundle(self, ceDict, proxyPath): return S_ERROR("CE must have a properly formatted ExecTemplate") timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) - + bundleId = uuid.uuid4().hex insertInfo = { "BundleID": bundleId, @@ -401,8 +402,8 @@ def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProc # Modify the number of processors that will be used by the bundle cmd = """\ - UPDATE BundlesInfo - SET ProcessorSum = ProcessorSum + {nProcs}, LastTimestamp = "{timestamp}" + UPDATE BundlesInfo + SET ProcessorSum = ProcessorSum + {nProcs}, LastTimestamp = "{timestamp}" WHERE BundleID = "{bundleId}"; """.format( bundleId=bundleId, nProcs=nProcessors, timestamp=timestamp @@ -411,23 +412,22 @@ def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProc if not result["OK"]: return result - + # Obtain the info to be returned to the Service result = self.getFields( - self.BUNDLES_INFO_TABLE, - ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"], - {"BundleID": bundleId} + self.BUNDLES_INFO_TABLE, + ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"], + {"BundleID": bundleId}, ) if not result["OK"]: return result selection = formatSelectOutput( - result["Value"], - ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"] + result["Value"], ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"] ) selection = selection[0] - + ready = selection["ProcessorSum"] == selection["MaxProcessors"] return S_OK({"BundleId": bundleId, "Ready": ready}) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 25c75d43d45..6e9e9cc8d4e 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -47,21 +47,18 @@ def initialize(self): ############################################################################# - types_storeInBundle = [str, str, list, list, str, int, dict] + types_storeInBundle = [str, str, list, list, str, int, dict, [int, type(None)]] - def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, processors, ceDict): - result = self.__setupCE(ceDict, proxyPath) + def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, processors, ceDict, diracId): + result = self._setupCE(ceDict, proxyPath) if not result["OK"]: return result - ce = result["Value"]["CE"] - proxy = result["Value"]["Proxy"] - - self.jobToCE[jobId] = ce - # Insert the Job into the DB - result = self.bundleDB.insertJobToBundle(jobId, executable, inputs, outputs, processors, ceDict, proxyPath) + result = self.bundleDB.insertJobToBundle( + jobId, executable, inputs, outputs, processors, ceDict, proxyPath, diracId + ) if not result["OK"]: self.log.error("Failed to insert into a bundle the job with id ", str(jobId)) return result @@ -69,37 +66,10 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr bundleId = result["Value"]["BundleId"] readyForSubmission = result["Value"]["Ready"] - self.bundleToCE[bundleId] = ce - self.log.info("Job inserted in bundle successfully") if readyForSubmission: - self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") - - result = self._wrapBundle(bundleId) - if not result["OK"]: - return result - - jobIds, bundle_exe, bundle_inputs, bundle_outputs = result["Value"] - extra_outputs = [item for job_id in jobIds for item in [f"{job_id}.out", f"{job_id}.status"]] - bundle_outputs.extend(extra_outputs) - - result = ce.submitJob(bundle_exe, proxy=proxy, inputs=bundle_inputs, outputs=bundle_outputs) - - if not result["OK"]: - self.bundleDB.setBundleAsFailed(bundleId) - self.log.error("Failed to submit job to with id ", str(jobId)) - return result - - innerJobId = result["Value"][0] - taskId = innerJobId + ":::" + result["PilotStampDict"][innerJobId] - - result = self.bundleDB.setTaskId(bundleId, taskId) - - if not result["OK"]: - self.bundleDB.setBundleAsFailed(bundleId) - self.log.error("Failed to set task id of JobId ", str(jobId)) - return result + self._submitBundle(bundleId) return S_OK({"BundleID": bundleId, "Executing": readyForSubmission}) @@ -167,10 +137,10 @@ def export_tryToKillJob(self, jobId): return S_ERROR(message="KillBundleOnError is off, won't kill the bundle") def _killBundleOfJob(self, jobId): - result = self.__getJobCE(jobId) + result = self._getJobCE(jobId) if not result["OK"]: return result - ce = result["Value"] + ce = result["Value"]["CE"] result = self._getBundleIdFromJobId(jobId) if not result["OK"]: @@ -225,10 +195,10 @@ def export_cleanJob(self, jobId): taskId = result["Value"]["TaskID"] - result = self.__getJobCE(jobId) + result = self._getJobCE(jobId) if not result["OK"]: return result - ce = result["Value"] + ce = result["Value"]["CE"] try: result = ce.cleanJob(taskId) @@ -264,12 +234,7 @@ def export_getBundleStatus(self, bundleId): if ":::" in task: task = task.split(":::")[0] - result = self.__getBundleCE(bundleId) - - if not result["OK"]: - return result - - result = self.__setupCE(result["Value"]["CEDict"], result["Value"]["ProxyPath"]) + result = self._getBundleCE(bundleId) if not result["OK"]: return result @@ -292,6 +257,58 @@ def export_getBundleStatus(self, bundleId): ############################################################################# + types_forceSubmitBundles = [list] + + def export_forceSubmitBundles(self, bundleIds): + resultDict = {} + + if not isinstance(bundleIds, list): + bundleIds = [bundleIds] + + for bundleId in bundleIds: + result = self._submitBundle(bundleId) + resultDict[bundleId] = result + + return S_OK(resultDict) + + def _submitBundle(self, bundleId): + result = self._getBundleCE(bundleId) + + if not result["OK"]: + return result + + ce = result["Value"]["CE"] + proxy = result["Value"]["Proxy"] + + result = self._wrapBundle(bundleId) + if not result["OK"]: + return result + + jobIds, bundle_exe, bundle_inputs, bundle_outputs = result["Value"] + extra_outputs = [item for job_id in jobIds for item in [f"{job_id}.out", f"{job_id}.status"]] + bundle_outputs.extend(extra_outputs) + + self.log.info(f"Submitting bundle '{bundleId}' to CE '{ce.ceName}'") + + ce.ceParameters["NumberOfProcessors"] = len(jobIds) + result = ce.submitJob(bundle_exe, proxy=proxy, inputs=bundle_inputs, outputs=bundle_outputs) + + if not result["OK"]: + self.bundleDB.setBundleAsFailed(bundleId) + return result + + innerJobId = result["Value"][0] + taskId = innerJobId + ":::" + result["PilotStampDict"][innerJobId] + + result = self.bundleDB.setTaskId(bundleId, taskId) + + if not result["OK"]: + return S_ERROR("Failed to set the task id of the Bundle") + + return S_OK() + + ############################################################################# + def _getBundleIdFromJobId(self, jobId): if jobId in self.jobToBundle: return S_OK(self.jobToBundle[jobId]) @@ -369,7 +386,7 @@ def _wrapBundle(self, bundleId): return S_OK((jobIds, wrapperPath, inputs, outputs)) - def __getBundleCE(self, bundleId): + def _getBundleCEDict(self, bundleId): result = self.bundleDB.getBundleCE(bundleId) if not result["OK"]: return result @@ -379,51 +396,59 @@ def __getBundleCE(self, bundleId): return S_OK({"CEDict": ceDict, "ProxyPath": result["Value"]["ProxyPath"]}) - def _getCE(self, jobId): - result = self._getBundleIdFromJobId(jobId) + def _setupCE(self, ceDict, proxyPath): + result = getProxyInfo(proxy=proxyPath) if not result["OK"]: + self.log.error("Failed to obtain proxy from path") return result - bundleId = result["Value"] - return self.__getBundleCE(bundleId) + proxy = result["Value"]["chain"] - def __getJobCE(self, jobId): - if jobId not in self.jobToCE: - # Look for it in the DB - result = self._getCE(jobId) + # Setup CE + result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) + + if not result["OK"]: + self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) + return result + + ce = result["Value"] + + ce.setProxy(proxy) + + return S_OK({"CE": ce, "Proxy": proxy}) + + def _getBundleCE(self, bundleId): + if bundleId not in self.bundleToCE: + result = self._getBundleCEDict(bundleId) if not result["OK"]: - self.log.error("Failed to obtain CE Dict of Bundle with JobId ", str(jobId)) return result - result = self.__setupCE(result["Value"]["CEDict"], result["Value"]["ProxyPath"]) + result = self._setupCE(result["Value"]["CEDict"], result["Value"]["ProxyPath"]) if not result["OK"]: return result - self.jobToCE[jobId] = result["Value"]["CE"] - - return S_OK(self.jobToCE[jobId]) + self.bundleToCE[bundleId] = result["Value"] # CE + Proxy - def __setupCE(self, ceDict, proxyPath): - result = getProxyInfo(proxy=proxyPath) + return S_OK(self.bundleToCE[bundleId]) - if not result["OK"]: - self.log.error("Failed to obtain proxy from path") - return result + def _getJobCE(self, jobId): + if jobId not in self.jobToCE: + result = self._getBundleIdFromJobId(jobId) - proxy = result["Value"]["chain"] + if not result["OK"]: + self.log.error("Failed to obtain BundleId with JobId ", str(jobId)) + return result - # Setup CE - result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) + bundleId = result["Value"] - if not result["OK"]: - self.log.error("Failed obtain the CE with configuration: ", str(ceDict)) - return result + result = self._getBundleCE(bundleId) - ce = result["Value"] + if not result["OK"]: + return result - ce.setProxy(proxy) + self.jobToCE[jobId] = result["Value"] - return S_OK({"CE": ce, "Proxy": proxy}) + return S_OK(self.jobToCE[jobId]) diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index cdc312ddb23..25d0714097d 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -32,7 +32,7 @@ next_allowed_cpu() {{ echo $allowed_cpus return 0 - + local desired_cpu=$(( ($1 + $cpu_offset) % $total_allowed_cpus )) local cpu=$first_allowed_cpu @@ -40,12 +40,12 @@ local min=$(cut -d "-" -f 1 - <<<"$range") local max=$(cut -d "-" -f 2 - <<<"$range") local real_cpu=$(($min+$desired_cpu)) - + if (( $real_cpu <= $max )); then cpu=$real_cpu break fi - + # Check next range local cpus_on_range=$(($max-$min+1)) local desired_cpu=$(($desired_cpu-$cpus_on_range)) @@ -73,7 +73,7 @@ jobId=$(get_id ${{input}}) mkdir ${{jobId}} - + for filename in ${{jobId}}*; do [ -f ${{filename}} ] || continue # Move the job specific files to its directory, removing the jobId from its name @@ -111,7 +111,7 @@ jobId=$(get_id ${{input}}) mkdir ${{jobId}} - + for filename in ${{jobId}}*; do [ -f ${{filename}} ] || continue # Move the job specific files to its directory, removing the jobId from its name @@ -148,6 +148,7 @@ echo "[${task_id}] Process final status: ${task_status}" """ + def generate_template(template: str, inputs: list, bundleId: str): template = template.lower().replace("-", "_") func_name = "_generate_" + template From 6e2bd4456bc71d54b09fc50f4f49083a5b591050 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:37:34 +0200 Subject: [PATCH 35/47] chore(BundleDB): Generalize Bundle Status using PilotStatus --- .../WorkloadManagementSystem/DB/BundleDB.py | 105 +++++++++--------- .../WorkloadManagementSystem/DB/BundleDB.sql | 2 +- .../Service/BundlerHandler.py | 24 +++- 3 files changed, 76 insertions(+), 55 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index adbd3fabc34..d9254cf4d09 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -65,13 +65,6 @@ def __init__(self, parentLogger=None): "InputPath", ] - self.STATUS_MAP = { - "Storing": PilotStatus.WAITING, - "Sent": PilotStatus.RUNNING, - "Finalized": PilotStatus.DONE, - "Failed": PilotStatus.FAILED, - } - self.MYSQL_DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S" self.BUNDLE_FLAGS = { @@ -155,8 +148,8 @@ def removeJobsFromBundle(self, jobIds): ############################################################################# def getUnpurgedBundles(self): - cmd = 'SELECT BundleID FROM BundlesInfo WHERE Status = "Finalized" AND Flags & {flag} != {flag};'.format( - flag=self.BUNDLE_FLAGS["Purged"] + cmd = 'SELECT BundleID FROM BundlesInfo WHERE Status = "{status}" AND Flags & {flag} != {flag};'.format( + status=PilotStatus.DONE, flag=self.BUNDLE_FLAGS["Purged"] ) result = self._query(cmd) @@ -166,8 +159,28 @@ def getUnpurgedBundles(self): return S_OK([entry[0] for entry in result["Value"]]) + def isBundleCleaned(self, bundleId): + cmd = 'SELECT BundleID FROM BundlesInfo WHERE BundleID = "{bundleId}" AND Flags & {flag} = {flag};'.format( + bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] + ) + + result = self._query(cmd) + + if not result["OK"]: + return result + + cleaned = result["Value"] != [] + + return S_OK(cleaned) + def getWaitingBundles(self): - result = self.getFields(self.BUNDLES_INFO_TABLE, self.BUNDLES_INFO_COLUMNS, {"Status": "Storing"}) + return self._getBundlesWithStatus(PilotStatus.WAITING) + + def getRunningBundles(self): + return self._getBundlesWithStatus(PilotStatus.RUNNING) + + def _getBundlesWithStatus(self, status): + result = self.getFields(self.BUNDLES_INFO_TABLE, self.BUNDLES_INFO_COLUMNS, {"Status": status}) if not result["OK"]: return result @@ -194,17 +207,25 @@ def getBundleStatus(self, bundleId): if not result["Value"]: return S_ERROR("Failed to get bundle Status") - return S_OK(self.STATUS_MAP[result["Value"][0][0]]) + return S_OK(result["Value"][0][0]) - def getJobsOfBundle(self, bundleId): - cmd = """\ - SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, InputPath - FROM JobToBundle - LEFT JOIN JobInputs - ON JobToBundle.JobID = JobInputs.JobID - WHERE BundleID = "{bundleId}";""".format( - bundleId=bundleId - ) + def getJobsOfBundle(self, bundleId, noInputs=False): + if noInputs: + cmd = """\ + SELECT JobID, DiracID, ExecutablePath, Outputs, Processors + FROM JobToBundle + WHERE BundleID = "{bundleId}";""".format( + bundleId=bundleId + ) + else: + cmd = """\ + SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, Processors, InputPath + FROM JobToBundle + LEFT JOIN JobInputs + ON JobToBundle.JobID = JobInputs.JobID + WHERE BundleID = "{bundleId}";""".format( + bundleId=bundleId + ) result = self._query(cmd) @@ -214,23 +235,26 @@ def getJobsOfBundle(self, bundleId): rows = list(result["Value"]) retVal = {} - # For each row (JobID, ExecutablePath, Outputs, [InputPath]) + # For each row (JobID, ExecutablePath, Outputs, Processors, [InputPath]) for row in rows: # The job has no input - if len(row) == 4: - jobID, diracId, jobExecutablePath, jobOutputs = row + if len(row) == len(self.JOB_TO_BUNDLE_COLUMNS) - 1: # All columns except BundleID + jobID, diracId, jobExecutablePath, jobOutputs, processors = row jobInputPath = "" else: - jobID, diracId, jobExecutablePath, jobOutputs, jobInputPath = row + jobID, diracId, jobExecutablePath, jobOutputs, processors, jobInputPath = row if jobID not in retVal: retVal[jobID] = { "ExecutablePath": jobExecutablePath, "DiracID": diracId, - "Inputs": [], "Outputs": [], + "Processors": processors, } + if not noInputs: + retVal[jobID]["Inputs"] = [] + retVal[jobID]["Outputs"].extend(literal_eval(jobOutputs)) if jobInputPath: @@ -256,7 +280,7 @@ def removeJobInputs(self, jobIds): def setTaskId(self, bundleId, taskId): result = self.updateFields( - self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, "Sent"], {"BundleID": bundleId} + self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, PilotStatus.RUNNING], {"BundleID": bundleId} ) return result @@ -270,12 +294,12 @@ def getTaskId(self, bundleId): ############################################################################# - def setBundleAsFinalized(self, bundleId): - result = self._updateBundleStatus(bundleId, "Finalized") + def setBundleAsDone(self, bundleId): + result = self._updateBundleStatus(bundleId, PilotStatus.DONE) return result def setBundleAsFailed(self, bundleId): - result = self._updateBundleStatus(bundleId, "Failed") + result = self._updateBundleStatus(bundleId, PilotStatus.FAILED) return result def setBundleAsPurged(self, bundleId): @@ -292,20 +316,6 @@ def setBundleAsCleaned(self, bundleId): return self._query(cmd) - def isBundleCleaned(self, bundleId): - cmd = 'SELECT BundleID FROM BundlesInfo WHERE BundleID = "{bundleId}" AND Flags & {flag} = {flag};'.format( - bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] - ) - - result = self._query(cmd) - - if not result["OK"]: - return result - - cleaned = result["Value"] != [] - - return S_OK(cleaned) - ############################################################################# def getWholeBundle(self, bundleId): @@ -318,9 +328,6 @@ def getWholeBundle(self, bundleId): return S_ERROR(f"No bundle with id {bundleId}") bundleDict = formatSelectOutput(result["Value"], self.BUNDLES_INFO_COLUMNS)[0] - bundleDict["Status"] = self.STATUS_MAP[bundleDict["Status"]] - - self.log.debug(f"Look at this cool bundle: {bundleDict}") return S_OK(bundleDict) @@ -453,10 +460,6 @@ def _getBundlesFromCEDict(self, ceDict): return S_OK(retVal) def _updateBundleStatus(self, bundleId, newStatus): - if newStatus not in self.STATUS_MAP.keys(): - msg = f"The new status '{newStatus}' does not correspond with the possible statuses:" - return S_ERROR(msg, self.STATUS_MAP.keys()) - cmd = 'UPDATE BundlesInfo SET Status = "{status}" WHERE BundleID = "{bundleId}";'.format( bundleId=bundleId, status=newStatus ) @@ -499,7 +502,7 @@ def __selectBestBundle(self, bundles, nProcessors): newProcSum = procs + nProcessors - if status != "Storing": + if status != PilotStatus.WAITING: continue if newProcSum == maxProcs: diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 49aac51375c..8aed8525dc1 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -28,7 +28,7 @@ CREATE TABLE `BundlesInfo` ( `CEDict` TEXT NOT NULL, `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` VARCHAR(255), - `Status` ENUM('Storing', 'Sent', 'Finalized', 'Failed') NOT NULL DEFAULT 'Storing', + `Status` ENUM('Waiting', 'Running', 'Done', 'Failed') NOT NULL DEFAULT 'Waiting', `ProxyPath` VARCHAR(255), `Flags` SET('Cleaned', 'Purged') NOT NULL DEFAULT '', `FirstTimestamp` DATETIME, diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 6e9e9cc8d4e..4ac622b26f9 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -13,6 +13,7 @@ from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory from DIRAC.WorkloadManagementSystem.Client import PilotStatus +from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import BASH_RUN_TASK, generate_template @@ -33,6 +34,8 @@ def initializeHandler(cls, serviceInfoDict): cls.ceFactory = ComputingElementFactory() + cls.jobReports: dict[int, JobReport] = {} + except RuntimeError as excp: return S_ERROR(f"Can't connect to DB: {excp}") @@ -68,6 +71,11 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr self.log.info("Job inserted in bundle successfully") + if diracId: + self.jobReports[jobId] = JobReport(diracId, self.__class__.__name__) + + self.__reportJob(jobId, PilotStatus.RUNNING, "Job Stored in a bundle") + if readyForSubmission: self._submitBundle(bundleId) @@ -249,7 +257,7 @@ def export_getBundleStatus(self, bundleId): status = result["Value"][task] if status == PilotStatus.DONE: - self.bundleDB.setBundleAsFinalized(bundleId) + self.bundleDB.setBundleAsDone(bundleId) elif status in PilotStatus.PILOT_FINAL_STATES: # ABORTED, DELETED or FAILED self.bundleDB.setBundleAsFailed(bundleId) @@ -305,6 +313,9 @@ def _submitBundle(self, bundleId): if not result["OK"]: return S_ERROR("Failed to set the task id of the Bundle") + for jobId in jobIds: + self.__reportJob(jobId, PilotStatus.RUNNING, "Bundle of Job submitted to CE") + return S_OK() ############################################################################# @@ -364,7 +375,7 @@ def _wrapBundle(self, bundleId): shutil.copy(job_input, job_input_dst) inputs.append(job_input_dst) - outputs.extend(list(set(jobInfo["Outputs"]))) # Remove duplicated entries + outputs.extend(jobInfo["Outputs"]) result = generate_template(template, executables, bundleId) @@ -384,7 +395,7 @@ def _wrapBundle(self, bundleId): inputs.append(runnerPath) - return S_OK((jobIds, wrapperPath, inputs, outputs)) + return S_OK((jobIds, wrapperPath, inputs, list(set(outputs)))) def _getBundleCEDict(self, bundleId): result = self.bundleDB.getBundleCE(bundleId) @@ -452,3 +463,10 @@ def _getJobCE(self, jobId): self.jobToCE[jobId] = result["Value"] return S_OK(self.jobToCE[jobId]) + + def __reportJob(self, jobId: int, status: PilotStatus, info: str): + if jobId not in self.jobReports: + return + + self.jobReports[jobId].setJobStatus(status=status, minorStatus=info) + self.jobReports[jobId].commit() From b2894abf4003b1442b96f962a9e799385564e3ab Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Thu, 23 Oct 2025 09:39:40 +0200 Subject: [PATCH 36/47] feat(BundleHandler): Send heartbeat to keep bundles alive --- .../Agent/BundleManagerAgent.py | 37 +++++++++++++++++-- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index fc4abd046a7..15826522292 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -4,10 +4,11 @@ from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Core.Base.AgentModule import AgentModule from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader -from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus +from DIRAC.WorkloadManagementSystem.Client import PilotStatus, JobStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB +from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB class BundleManagerAgent(AgentModule): @@ -17,11 +18,13 @@ def __init__(self, agentName, loadName, baseAgentName=False, properties=None): super().__init__(agentName, loadName, baseAgentName, properties) self.bundleDB = None + self.jobDB = None ############################################################################# def initialize(self): self.bundleDB = BundleDB() + self.jobDB = JobDB() self.jobMonitor = JobMonitoringClient() self.bundler = BundlerClient() self.maxMinsInBundle = self.am_getOption("MaxMinutesInBundle", defaultValue=10) @@ -43,6 +46,8 @@ def execute(self): if not result["OK"]: self.log.warn(f"Failed to delete the inputs: {result}") + self._checkHeartBeat() + return S_OK() def finalize(self): @@ -56,7 +61,7 @@ def _cleanFinishedBundles(self): return result bundleIDs = result["Value"] - self.log.debug(f"> Found {len(bundleIDs)} finished and unpurged bundles") + self.log.verbose(f"> Found {len(bundleIDs)} finished and unpurged bundles") for bundleId in bundleIDs: success = True @@ -67,7 +72,7 @@ def _cleanFinishedBundles(self): jobIDs = result["Value"] - self.log.debug(f"> Purging inputs of bundle with ID '{bundleId}'") + self.log.verbose(f"> Purging inputs of bundle with ID '{bundleId}'") for jobId in jobIDs: result = self.bundleDB.removeJobInputs(jobId) @@ -95,7 +100,7 @@ def _removeKilledJobs(self): for bundleInfo in bundles: bundleId = bundleInfo["BundleID"] - result = self.bundleDB.getJobsOfBundle(bundleId) + result = self.bundleDB.getJobsOfBundle(bundleId, noInputs=True) if not result["OK"]: self.log.error(f"Failed to get the jobs of the bundle '{bundleId}'") return result @@ -126,6 +131,10 @@ def _removeKilledJobs(self): self.log.info(f"> Status of job '{diracId}' is 'Killed', adding it to the deletion list") killedJobs.append(diracIdToJobId[diracId]) + if not killedJobs: + self.log.verbose("Nothing to delete...") + return S_OK() + result = self.bundleDB.removeJobsFromBundle(killedJobs) if not result["OK"]: return result @@ -166,3 +175,23 @@ def _sendStalledBundles(self): result = self.bundler.forceSubmitBundles(bundleIds) return S_OK() + + def _checkHeartBeat(self): + """Hack to avoid stalled jobs when they are not""" + self.log.info("Sending heartbeats to running bundles") + result = self.bundleDB.getRunningBundles() + if not result["OK"]: + return result + + for bundleInfo in result["Value"]: + if bundleInfo["Status"] == PilotStatus.RUNNING: + result = self.bundleDB.getJobsOfBundle(bundleInfo["BundleID"], noInputs=True) + if not result["OK"]: + continue + + for _, jobDesc in result["Value"].items(): + diracId = jobDesc["DiracID"] + if not diracId: + continue + + self.jobDB.setHeartBeatData(diracId, {}) From 3861d939864aed55e3b5b821422ba90f295ff54f Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:24:53 +0100 Subject: [PATCH 37/47] fix(PushJobAgent): Bug while obtaining job output in failed job wrappers --- src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py index 9f60b79113f..a33243815f2 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py @@ -814,6 +814,14 @@ def _checkSubmittedJobWrappers(self, ce: ComputingElement, site: str): self.log.exception("JobWrapper failed the initialization phase", jobID) continue + if status == PilotStatus.FAILED: + job.jobReport.setJobStatus( + status=JobStatus.FAILED, minorStatus="Payload failed", sendFlag=False + ) + job.sendFailoverRequest() + job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC) + continue + # Get the output of the job self.log.info(f"Getting the outputs of taskID {taskID} for {jobID}") if not (result := ce.getJobOutput(f"{taskID}:::{stamp}", job.jobIDPath))["OK"]: From 0ad48f069d3e18930f84b57ae2e59745b1b35d3b Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:48:19 +0100 Subject: [PATCH 38/47] chore: Clean and document --- .../Computing/BundleComputingElement.py | 11 ++-- .../Agent/BundleManagerAgent.py | 2 +- .../Client/BundlerClient.py | 2 - .../WorkloadManagementSystem/DB/BundleDB.py | 56 ++++++++++--------- .../Service/BundlerHandler.py | 34 ++++++++++- 5 files changed, 66 insertions(+), 39 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index be055e062a3..5a68c9f7c74 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -75,7 +75,7 @@ import shutil import uuid -from filelock import FileLock, Timeout +from filelock import FileLock from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement @@ -210,7 +210,7 @@ def submitJob(self, executableFile, proxy=None, numberOfProcessors=1, inputs=[], else: self.log.info("Submitting job to CE: ", self.innerCE.ceName) - # Return the id of the job (NOT THE BUNDLE) + # Return the id of the job, setting the "PilotStamp" to the BundleID return result def getJobOutput(self, jobId, workingDirectory="."): @@ -251,10 +251,9 @@ def getJobOutput(self, jobId, workingDirectory="."): self.log.notice(f"Outputs at: {jobOutputDir}") - # Move all outputs from the temporary directory, to where they should belong + # Move all outputs from the temporary directory, to the job working directory for item in os.listdir(jobOutputDir): - # shutil.move(os.path.join(jobBaseDir, item), os.path.join(outputAbsPath, item)) - shutil.copy2(os.path.join(jobOutputDir, item), os.path.join(outputAbsPath, item)) + shutil.move(os.path.join(jobOutputDir, item), os.path.join(outputAbsPath, item)) error = os.path.join(workingDirectory, f"{bundleId}.err") output = os.path.join(workingDirectory, f"{bundleId}.out") @@ -376,7 +375,7 @@ def __getOutputPath(self, bundleId, innerTaskId): try: # Always acquire the lock before checking anything with lock.acquire(timeout=60): - self.log.debug("Acquiring outputs lock") + self.log.debug("Outputs lock acquired") # If the output does not exist, dowload the outputs if not os.path.exists(outputsPath): os.mkdir(outputsPath) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index 15826522292..c3480dbddec 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -4,7 +4,7 @@ from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Core.Base.AgentModule import AgentModule from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader -from DIRAC.WorkloadManagementSystem.Client import PilotStatus, JobStatus +from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB diff --git a/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py index 62171c9af71..2ffdd986d8a 100644 --- a/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py +++ b/src/DIRAC/WorkloadManagementSystem/Client/BundlerClient.py @@ -2,7 +2,6 @@ """ from DIRAC.Core.Base.Client import Client, createClient -from DIRAC.Core.Utilities.DEncode import ignoreEncodeWarning @createClient("WorkloadManagement/Bundler") @@ -11,7 +10,6 @@ class BundlerClient(Client): This inherits the DIRAC base Client for direct execution of server functionality. The following methods are available (although not visible here). - """ def __init__(self, url=None, **kwargs): diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index d9254cf4d09..c5b9df14247 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -83,6 +83,7 @@ def log(self, value): ############################################################################# def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDict, proxyPath, diracId): + """Inserts a new job in a new or existing Bundle depending of the CE to be submitted.""" result = self._getBundlesFromCEDict(ceDict) if not result["OK"]: @@ -128,6 +129,10 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi return S_OK({"BundleId": bundleId, "Ready": result["Value"]["Ready"]}) def removeJobsFromBundle(self, jobIds): + """Receives a list of DIRAC JobIds, matches them to their corresponding bundle and removes them.""" + if not isinstance(jobIds, list): + jobIds = list(jobIds) + for jobId in jobIds: result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID", "Processors"], {"JobID": jobId}) @@ -135,9 +140,10 @@ def removeJobsFromBundle(self, jobIds): return result jobInfo = result["Value"][0] - bundleId, procs = jobInfo[0], jobInfo[1] + bundleId = jobInfo[0] + nProcs = jobInfo[1] - result = self._reduceProcessorSum(bundleId, procs) + result = self._reduceProcessorSum(bundleId, nProcs) if not result["OK"]: return result @@ -148,6 +154,7 @@ def removeJobsFromBundle(self, jobIds): ############################################################################# def getUnpurgedBundles(self): + """Obtains the list of Bundles that inputs haven't been removed locally.""" cmd = 'SELECT BundleID FROM BundlesInfo WHERE Status = "{status}" AND Flags & {flag} != {flag};'.format( status=PilotStatus.DONE, flag=self.BUNDLE_FLAGS["Purged"] ) @@ -160,6 +167,7 @@ def getUnpurgedBundles(self): return S_OK([entry[0] for entry in result["Value"]]) def isBundleCleaned(self, bundleId): + """Check if ce.cleanJob has been performed properly.""" cmd = 'SELECT BundleID FROM BundlesInfo WHERE BundleID = "{bundleId}" AND Flags & {flag} = {flag};'.format( bundleId=bundleId, flag=self.BUNDLE_FLAGS["Cleaned"] ) @@ -180,6 +188,7 @@ def getRunningBundles(self): return self._getBundlesWithStatus(PilotStatus.RUNNING) def _getBundlesWithStatus(self, status): + """Get Bundles that match certain status.""" result = self.getFields(self.BUNDLES_INFO_TABLE, self.BUNDLES_INFO_COLUMNS, {"Status": status}) if not result["OK"]: @@ -191,6 +200,7 @@ def _getBundlesWithStatus(self, status): ############################################################################# def getBundleIdFromJobId(self, jobId): + """Returns the BundleId that corresponds to a DIRAC JobId.""" result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["BundleID"], {"JobID": jobId}) if not result["OK"]: @@ -202,6 +212,7 @@ def getBundleIdFromJobId(self, jobId): return S_OK(result["Value"][0][0]) def getBundleStatus(self, bundleId): + """Obtain the status of the Bundle.""" result = self.getFields(self.BUNDLES_INFO_TABLE, ["Status"], {"BundleID": bundleId}) if not result["Value"]: @@ -209,7 +220,9 @@ def getBundleStatus(self, bundleId): return S_OK(result["Value"][0][0]) + # TODO: This whole function is incomprehensible, needs to be split in 2 def getJobsOfBundle(self, bundleId, noInputs=False): + """Get every Job that comprise a Bundle.""" if noInputs: cmd = """\ SELECT JobID, DiracID, ExecutablePath, Outputs, Processors @@ -241,7 +254,7 @@ def getJobsOfBundle(self, bundleId, noInputs=False): if len(row) == len(self.JOB_TO_BUNDLE_COLUMNS) - 1: # All columns except BundleID jobID, diracId, jobExecutablePath, jobOutputs, processors = row jobInputPath = "" - else: + else: # All columns except BundleID but with the inputs jobID, diracId, jobExecutablePath, jobOutputs, processors, jobInputPath = row if jobID not in retVal: @@ -263,6 +276,7 @@ def getJobsOfBundle(self, bundleId, noInputs=False): return S_OK(retVal) def getJobIDsOfBundle(self, bundleId): + """Returns the list of JobIds that are contained in a bundle""" result = self.getFields(self.JOB_TO_BUNDLE_TABLE, ["JobID"], {"BundleID": bundleId}) if not result["OK"]: @@ -271,6 +285,7 @@ def getJobIDsOfBundle(self, bundleId): return S_OK([entry[0] for entry in result["Value"]]) def removeJobInputs(self, jobIds): + """Removes the contents of the JobInputs table for each corresponding JobID.""" if not isinstance(jobIds, list): jobIds = [jobIds] @@ -279,12 +294,14 @@ def removeJobInputs(self, jobIds): ############################################################################# def setTaskId(self, bundleId, taskId): + """Sets the value of the TaskID generetad by the real CE during Bundle submission.""" result = self.updateFields( self.BUNDLES_INFO_TABLE, ["TaskID", "Status"], [taskId, PilotStatus.RUNNING], {"BundleID": bundleId} ) return result def getTaskId(self, bundleId): + """Returns the value of the TaskId stored.""" result = self.getFields(self.BUNDLES_INFO_TABLE, ["TaskID"], {"BundleID": bundleId}) if not result["OK"]: @@ -348,6 +365,7 @@ def _reduceProcessorSum(self, bundleId, nProcessors): return self._query(cmd) def _createNewBundle(self, ceDict, proxyPath): + """Initialize a new Bundle.""" if "ExecTemplate" not in ceDict: return S_ERROR("CE must have a properly formatted ExecTemplate") @@ -376,9 +394,10 @@ def _createNewBundle(self, ceDict, proxyPath): return S_OK(bundleId) def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath, diracId): + """Add the info of a Job to a Bundle.""" timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) - # Insert the job into the bundle + # Job Insertion insertInfo = { "JobID": jobId, "BundleID": bundleId, @@ -395,7 +414,6 @@ def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProc if not result["OK"]: return result - # Insert the Inputs for _input in inputs: insertInfo = { "JobID": jobId, @@ -420,6 +438,7 @@ def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProc if not result["OK"]: return result + # TODO: Move all of this out of the function # Obtain the info to be returned to the Service result = self.getFields( self.BUNDLES_INFO_TABLE, @@ -432,14 +451,14 @@ def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProc selection = formatSelectOutput( result["Value"], ["ProcessorSum", "MaxProcessors", "Status", "FirstTimestamp", "LastTimestamp"] - ) - selection = selection[0] + )[0] ready = selection["ProcessorSum"] == selection["MaxProcessors"] return S_OK({"BundleId": bundleId, "Ready": ready}) def _getBundlesFromCEDict(self, ceDict): + """Returns the bundles that match a CE (Site, CE and Queue).""" cmd = 'SELECT * FROM BundlesInfo WHERE Site = "{Site}" AND CE = "{CE}" AND Queue = "{Queue}";'.format( Site=ceDict["Site"], CE=ceDict["GridCE"], @@ -460,6 +479,7 @@ def _getBundlesFromCEDict(self, ceDict): return S_OK(retVal) def _updateBundleStatus(self, bundleId, newStatus): + """Changes the status of a Bundle.""" cmd = 'UPDATE BundlesInfo SET Status = "{status}" WHERE BundleID = "{bundleId}";'.format( bundleId=bundleId, status=newStatus ) @@ -470,27 +490,9 @@ def _updateBundleStatus(self, bundleId, newStatus): return S_OK() - # This is function quite dumb, and should not work like this, but for a fist - # aproximation is fine (I guess). - # - # The best way (in my opinion) of approching this is by taking advantage of - # dynamic programming. - # We could approach this by considering the bundles as sacks and selecting - # the bundle to insert the same way it is done in the Knapsack Problem. - # - # REF: https://en.wikipedia.org/wiki/Knapsack_problem - # - # Each bundle that relates to the same CE would be a Knapsack and each item - # would be a different job. The job would have its 'weight' and 'price' set - # to the number of processors it needs, and the algorithm would optimize - # how they are distributed around the bundles. - # - # By having multiple bundles, this would relate more to the Bin Packing Problem, - # which is an abstaction of the Knapsack Problem. - # - # REF: https://en.wikipedia.org/wiki/Bin_packing_problem - # def __selectBestBundle(self, bundles, nProcessors): + """Return the BundleID of the best match from a list of bundles and the number of processors requested. + """ bestBundleId = None currentBestProcs = 0 diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 4ac622b26f9..768240fa704 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -53,6 +53,9 @@ def initialize(self): types_storeInBundle = [str, str, list, list, str, int, dict, [int, type(None)]] def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, processors, ceDict, diracId): + """Stores a job in a bundle depending on the information on the CEDict. + If the bundle fills, it automatically gets send. + """ result = self._setupCE(ceDict, proxyPath) if not result["OK"]: @@ -86,9 +89,15 @@ def export_storeInBundle(self, jobId, executable, inputs, outputs, proxyPath, pr types_getTaskInfo = [str] def export_getTaskInfo(self, bundleId): + """Return the TaskID of the submitted bundle + If the Bundle hasn't been submitted yet, returns S_ERROR + """ return self._getTaskInfo(bundleId) def _getTaskInfo(self, bundleId): + """Return the TaskID of the submitted bundle + If the Bundle hasn't been submitted yet, returns S_ERROR + """ result = self.bundleDB.getBundleStatus(bundleId) if not result["OK"]: @@ -116,6 +125,7 @@ def _getTaskInfo(self, bundleId): types_bundleIdFromJobId = [str] def export_bundleIdFromJobId(self, jobId): + """Returns the BundleID of a specific job from its ID.""" return self._getBundleIdFromJobId(jobId) ############################################################################# @@ -178,6 +188,7 @@ def _killJob(self, jobId): types_cleanJob = [str] def export_cleanJob(self, jobId): + """Tries to clean the working directory of a specific job both locally and remotely.""" result = self._getBundleIdFromJobId(jobId) if not result["OK"]: return result @@ -229,6 +240,12 @@ def export_cleanJob(self, jobId): types_getBundleStatus = [str] def export_getBundleStatus(self, bundleId): + """Reports the Bundle status. + Waiting -> Bundle still waiting for more jobs + Running -> Bundle submitted to CE + Done -> Bundle finished (not the specific job) + Failed -> Bundle failed to execute + """ result = self._getTaskInfo(bundleId) if not result["OK"]: @@ -268,6 +285,9 @@ def export_getBundleStatus(self, bundleId): types_forceSubmitBundles = [list] def export_forceSubmitBundles(self, bundleIds): + """Forcibly submits a list of bundles. + This is useful for stalled bundles. + """ resultDict = {} if not isinstance(bundleIds, list): @@ -280,6 +300,7 @@ def export_forceSubmitBundles(self, bundleIds): return S_OK(resultDict) def _submitBundle(self, bundleId): + """Submits a Bundle from its ID.""" result = self._getBundleCE(bundleId) if not result["OK"]: @@ -321,6 +342,7 @@ def _submitBundle(self, bundleId): ############################################################################# def _getBundleIdFromJobId(self, jobId): + """Obtains the BundleID corresponding to a JobID.""" if jobId in self.jobToBundle: return S_OK(self.jobToBundle[jobId]) @@ -332,6 +354,7 @@ def _getBundleIdFromJobId(self, jobId): return result def _wrapBundle(self, bundleId): + """Bundles the jobs in a bundle for its submission.""" result = self.bundleDB.getWholeBundle(bundleId) if not result["OK"]: @@ -398,6 +421,7 @@ def _wrapBundle(self, bundleId): return S_OK((jobIds, wrapperPath, inputs, list(set(outputs)))) def _getBundleCEDict(self, bundleId): + """Returns the CEDict of a specific Bundle as a dictionary.""" result = self.bundleDB.getBundleCE(bundleId) if not result["OK"]: return result @@ -408,6 +432,7 @@ def _getBundleCEDict(self, bundleId): return S_OK({"CEDict": ceDict, "ProxyPath": result["Value"]["ProxyPath"]}) def _setupCE(self, ceDict, proxyPath): + """Prepares the CE instance.""" result = getProxyInfo(proxy=proxyPath) if not result["OK"]: @@ -416,7 +441,7 @@ def _setupCE(self, ceDict, proxyPath): proxy = result["Value"]["chain"] - # Setup CE + # CE Initialization result = self.ceFactory.getCE(ceType=ceDict["CEType"], ceName=ceDict["GridCE"], ceParametersDict=ceDict) if not result["OK"]: @@ -430,6 +455,7 @@ def _setupCE(self, ceDict, proxyPath): return S_OK({"CE": ce, "Proxy": proxy}) def _getBundleCE(self, bundleId): + """Returns the CE of a the corresponding Bundle from its ID.""" if bundleId not in self.bundleToCE: result = self._getBundleCEDict(bundleId) @@ -446,6 +472,7 @@ def _getBundleCE(self, bundleId): return S_OK(self.bundleToCE[bundleId]) def _getJobCE(self, jobId): + """Returns the CE of a the corresponding Job from its ID.""" if jobId not in self.jobToCE: result = self._getBundleIdFromJobId(jobId) @@ -464,9 +491,10 @@ def _getJobCE(self, jobId): return S_OK(self.jobToCE[jobId]) - def __reportJob(self, jobId: int, status: PilotStatus, info: str): + def __reportJob(self, jobId, status, minorStatus): + """Calls the JobReport of the Job if possible.""" if jobId not in self.jobReports: return - self.jobReports[jobId].setJobStatus(status=status, minorStatus=info) + self.jobReports[jobId].setJobStatus(status=status, minorStatus=minorStatus) self.jobReports[jobId].commit() From be0ebf8931add17af28abfee9352f9b9715cc58a Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 15:59:36 +0100 Subject: [PATCH 39/47] chore(BundleManagerAgent): Remove unnecesary _cleanFinishedBundles --- .../Agent/BundleManagerAgent.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index c3480dbddec..32c9baedd90 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -36,11 +36,6 @@ def execute(self): if not result["OK"]: self.log.warn(f"Failed send the bundles: {result}") - self.log.info("Cleaning inputs of finished bundles bundles") - result = self._cleanFinishedBundles() - if not result["OK"]: - self.log.warn(f"Failed to clean the inputs: {result}") - self.log.info("Deleting killed jobs from bundles") result = self._removeKilledJobs() if not result["OK"]: @@ -55,38 +50,6 @@ def finalize(self): ############################################################################# - def _cleanFinishedBundles(self): - result = self.bundleDB.getUnpurgedBundles() - if not result["OK"]: - return result - - bundleIDs = result["Value"] - self.log.verbose(f"> Found {len(bundleIDs)} finished and unpurged bundles") - - for bundleId in bundleIDs: - success = True - result = self.bundleDB.getJobIDsOfBundle(bundleId) - if not result["OK"]: - self.log.error(f"Failed to obtain the jobs of the bundle {bundleId}") - return result - - jobIDs = result["Value"] - - self.log.verbose(f"> Purging inputs of bundle with ID '{bundleId}'") - - for jobId in jobIDs: - result = self.bundleDB.removeJobInputs(jobId) - if not result["OK"]: - success = False - self.log.error(f"Failed to remove inputs of job {jobId} from bundle {bundleId}, skipping...") - self.log.error(result) - - if success: - self.log.info(f"> Inputs of bundle with ID '{bundleId}' were removed from DB") - self.bundleDB.setBundleAsPurged(bundleId) - - return S_OK() - def _removeKilledJobs(self): killedJobs = [] From 5e241f647b1df86dd05c82f8e38fc20418c480be Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 16:00:46 +0100 Subject: [PATCH 40/47] chore(BundleDB): Remove unnecesary proxyPath fix: pre-commit --- .../Agent/PushJobAgent.py | 4 +--- .../WorkloadManagementSystem/DB/BundleDB.py | 13 +++++------- .../Service/BundlerHandler.py | 2 +- .../WorkloadManagementSystem/Test_BundleDB.py | 21 ++++++++++--------- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py index a33243815f2..e3724d514c7 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py @@ -815,9 +815,7 @@ def _checkSubmittedJobWrappers(self, ce: ComputingElement, site: str): continue if status == PilotStatus.FAILED: - job.jobReport.setJobStatus( - status=JobStatus.FAILED, minorStatus="Payload failed", sendFlag=False - ) + job.jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Payload failed", sendFlag=False) job.sendFailoverRequest() job.sendJobAccounting(status=JobStatus.FAILED, minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC) continue diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index c5b9df14247..70bf720e1a1 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -99,9 +99,7 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi return result bundleId = result["Value"] - result = self._insertJobInBundle( - jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId - ) + result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, diracId) if not result["OK"]: return result @@ -121,7 +119,7 @@ def insertJobToBundle(self, jobId, executable, inputs, outputs, processors, ceDi bundleId = result["Value"] # Insert it and obtain if it is ready to be submitted - result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, proxyPath, diracId) + result = self._insertJobInBundle(jobId, bundleId, executable, inputs, outputs, processors, diracId) if not result["OK"]: return result @@ -254,7 +252,7 @@ def getJobsOfBundle(self, bundleId, noInputs=False): if len(row) == len(self.JOB_TO_BUNDLE_COLUMNS) - 1: # All columns except BundleID jobID, diracId, jobExecutablePath, jobOutputs, processors = row jobInputPath = "" - else: # All columns except BundleID but with the inputs + else: # All columns except BundleID but with the inputs jobID, diracId, jobExecutablePath, jobOutputs, processors, jobInputPath = row if jobID not in retVal: @@ -393,7 +391,7 @@ def _createNewBundle(self, ceDict, proxyPath): return S_OK(bundleId) - def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, proxyPath, diracId): + def _insertJobInBundle(self, jobId, bundleId, executable, inputs, outputs, nProcessors, diracId): """Add the info of a Job to a Bundle.""" timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) @@ -491,8 +489,7 @@ def _updateBundleStatus(self, bundleId, newStatus): return S_OK() def __selectBestBundle(self, bundles, nProcessors): - """Return the BundleID of the best match from a list of bundles and the number of processors requested. - """ + """Return the BundleID of the best match from a list of bundles and the number of processors requested.""" bestBundleId = None currentBestProcs = 0 diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 768240fa704..0c47ad55342 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -241,7 +241,7 @@ def export_cleanJob(self, jobId): def export_getBundleStatus(self, bundleId): """Reports the Bundle status. - Waiting -> Bundle still waiting for more jobs + Waiting -> Bundle still waiting for more jobs Running -> Bundle submitted to CE Done -> Bundle finished (not the specific job) Failed -> Bundle failed to execute diff --git a/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py b/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py index b6cfc7a76ac..c3b3fc11a6d 100644 --- a/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py +++ b/tests/Integration/WorkloadManagementSystem/Test_BundleDB.py @@ -10,7 +10,7 @@ @pytest.fixture(name="jobInfos") def fixtureJobInfo(): - return [ + return [ { "Executable": "./executable1.sh", "Inputs": ["./input1.py", "./input1.json"], @@ -22,7 +22,7 @@ def fixtureJobInfo(): "Site": "DIRAC.Site1.fake", "GridCE": "FakeCE", "Queue": "FakeQueue", - } + }, }, { "Executable": "./executable2.sh", @@ -35,7 +35,7 @@ def fixtureJobInfo(): "Site": "DIRAC.Site1.fake", "GridCE": "FakeCE", "Queue": "FakeQueue", - } + }, }, { "Executable": "./executable3.sh", @@ -48,7 +48,7 @@ def fixtureJobInfo(): "Site": "DIRAC.Site2.fake", "GridCE": "FakeCE", "Queue": "FakeQueue", - } + }, }, { "Executable": "./executable4.sh", @@ -61,21 +61,23 @@ def fixtureJobInfo(): "Site": "DIRAC.Site1.fake", "GridCE": "FakeCE", "Queue": "FakeQueue", - } + }, }, ] + @pytest.fixture(name="bundleDB") def fixtureBundleDB(): db = BundleDB() yield db db._query("DELETE FROM JobToBundle") db._query("DELETE FROM BundlesInfo") - + +@pytest.mark.skip(reason="Old tests, need to be remade") def test_AddToBundle(bundleDB: BundleDB, jobInfos): jobId = 0 - + # # Should return error result = bundleDB.getBundleIdFromJobId(jobId) @@ -101,7 +103,7 @@ def test_AddToBundle(bundleDB: BundleDB, jobInfos): assert result["Value"] == bundleId1 jobId += 1 - + # # Should create a new bundle because it does not fit job = jobInfos[1] @@ -113,7 +115,7 @@ def test_AddToBundle(bundleDB: BundleDB, jobInfos): assert bundleId2 != bundleId1 jobId += 1 - + # # Should create a new bundle because a different CE job = jobInfos[2] @@ -144,4 +146,3 @@ def test_AddToBundle(bundleDB: BundleDB, jobInfos): assert result["Value"] jobIds = [job["JobID"] for job in result["Value"]] assert jobId1 in jobIds and jobId4 in jobIds - \ No newline at end of file From 6ba3e9efb2b2ef14b69faefa18967d90bec99929 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 16:18:07 +0100 Subject: [PATCH 41/47] chore: Remove ExecTemplate from BundleCE and BundleDB --- .../Computing/BundleComputingElement.py | 8 +- .../WorkloadManagementSystem/DB/BundleDB.py | 5 - .../WorkloadManagementSystem/DB/BundleDB.sql | 1 - .../Service/BundlerHandler.py | 20 +-- .../Utilities/BundlerTemplates.py | 119 +----------------- 5 files changed, 6 insertions(+), 147 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 5a68c9f7c74..56ecd2b5d04 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -7,11 +7,6 @@ Configuration for the BundleComputingElemenet submission can be done via the configuration system. Below, you can find a list of parameters specific to the BundleCE. -ExecTemplate: - Name of the execution template to be used to bundle the jobs. - This template will the one that be passed to the CE to be executed alongside - each jobExecutable file and input as the inputs of the template. - InnerCEType: Type of the CE that will end up executing the templated wrapper. @@ -50,7 +45,6 @@ { CEType = BUNDLE InnerCEType = SSH - ExecTemplate = BASH SSHHost = host SSHUser = user @@ -118,7 +112,7 @@ def __init__(self, ceUniqueID): super().__init__(ceUniqueID) - self.mandatoryParameters = ["ExecTemplate", "InnerCEType"] + self.mandatoryParameters = ["InnerCEType"] self.innerCE = None self.innerCEParams = {} diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 70bf720e1a1..219f27efc21 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -41,7 +41,6 @@ def __init__(self, parentLogger=None): "CE", "Queue", "CEDict", - "ExecTemplate", "TaskID", "Status", "ProxyPath", @@ -364,9 +363,6 @@ def _reduceProcessorSum(self, bundleId, nProcessors): def _createNewBundle(self, ceDict, proxyPath): """Initialize a new Bundle.""" - if "ExecTemplate" not in ceDict: - return S_ERROR("CE must have a properly formatted ExecTemplate") - timestamp = datetime.now(tz=timezone.utc).strftime(self.MYSQL_DATETIME_FORMAT) bundleId = uuid.uuid4().hex @@ -374,7 +370,6 @@ def _createNewBundle(self, ceDict, proxyPath): "BundleID": bundleId, "ProcessorSum": 0, "MaxProcessors": ceDict["NumberOfProcessors"], - "ExecTemplate": ceDict["ExecTemplate"], "Site": ceDict["Site"], "CE": ceDict["GridCE"], "Queue": ceDict["Queue"], diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql index 8aed8525dc1..ba5f3ac4f4f 100644 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.sql @@ -26,7 +26,6 @@ CREATE TABLE `BundlesInfo` ( `CE` VARCHAR(128) NOT NULL, `Queue` VARCHAR(128) NOT NULL, `CEDict` TEXT NOT NULL, - `ExecTemplate` VARCHAR(25) NOT NULL, `TaskID` VARCHAR(255), `Status` ENUM('Waiting', 'Running', 'Done', 'Failed') NOT NULL DEFAULT 'Waiting', `ProxyPath` VARCHAR(255), diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 0c47ad55342..14ec0cff783 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -15,7 +15,7 @@ from DIRAC.WorkloadManagementSystem.Client import PilotStatus from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport from DIRAC.WorkloadManagementSystem.DB.BundleDB import BundleDB -from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import BASH_RUN_TASK, generate_template +from DIRAC.WorkloadManagementSystem.Utilities.BundlerTemplates import BASH_RUN_TASK, BASH_WRAPPER class BundlerHandler(RequestHandler): @@ -355,14 +355,6 @@ def _getBundleIdFromJobId(self, jobId): def _wrapBundle(self, bundleId): """Bundles the jobs in a bundle for its submission.""" - result = self.bundleDB.getWholeBundle(bundleId) - - if not result["OK"]: - self.log.error("Failed to obtain bundle while wrapping. BundleID ", str(bundleId)) - return result - - bundle = result["Value"] - result = self.bundleDB.getJobsOfBundle(bundleId) if not result["OK"]: @@ -371,7 +363,6 @@ def _wrapBundle(self, bundleId): jobs: dict = result["Value"] - template = bundle["ExecTemplate"] executables = [] inputs = [] outputs = [] @@ -400,13 +391,10 @@ def _wrapBundle(self, bundleId): outputs.extend(jobInfo["Outputs"]) - result = generate_template(template, executables, bundleId) - - if not result["OK"]: - self.log.error("Error while generating wrapper") - return result + formatted_inputs = "(" + " ".join(inputs) + ")" + formatMap = {"inputs": formatted_inputs, "bundleId": bundleId} + wrappedBundle = BASH_WRAPPER.format(**formatMap) - wrappedBundle = result["Value"] wrapperPath = os.path.join(bundlePath, "bundle_wrapper") runnerPath = os.path.join(bundlePath, "run_task.sh") diff --git a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py index 25d0714097d..31fb49e43e8 100644 --- a/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py +++ b/src/DIRAC/WorkloadManagementSystem/Utilities/BundlerTemplates.py @@ -1,98 +1,4 @@ -from DIRAC import S_ERROR, S_OK - -# DEPRECATED -BASH_TESTING_TEMPLATE = """\ -#!/bin/bash -BASEDIR=${{PWD}} -INPUT={inputs} -BUNDLE_ID={bundleId} - -OLD_IFS=$IFS - -# cpu management -bundler_pid=$$ -allowed_cpus=$(grep -w Cpus_allowed_list /proc/"$bundler_pid"/status | awk '{{print $2}}') -IFS=',' read -a cpu_ranges <<< "$allowed_cpus" - -IFS=$OLD_IFS - -first_allowed_cpu=$(cut -d "-" -f 1 - <<<"${{cpu_ranges[0]}}") -last_allowed_cpu=$(cut -d "-" -f 2 - <<<"${{cpu_ranges[-1]}}") -cpu_offset=0 -total_allowed_cpus=0 - -calc_total_cpus() {{ - for range in "${{cpu_ranges[@]}}"; do - local min=$(cut -d "-" -f 1 - <<<"$range") - local max=$(cut -d "-" -f 2 - <<<"$range") - total_allowed_cpus=$(($total_allowed_cpus+$max-$min+1)) - done -}} - -next_allowed_cpu() {{ - echo $allowed_cpus - return 0 - - local desired_cpu=$(( ($1 + $cpu_offset) % $total_allowed_cpus )) - local cpu=$first_allowed_cpu - - for range in "${{cpu_ranges[@]}}"; do - local min=$(cut -d "-" -f 1 - <<<"$range") - local max=$(cut -d "-" -f 2 - <<<"$range") - local real_cpu=$(($min+$desired_cpu)) - - if (( $real_cpu <= $max )); then - cpu=$real_cpu - break - fi - - # Check next range - local cpus_on_range=$(($max-$min+1)) - local desired_cpu=$(($desired_cpu-$cpus_on_range)) - done - - # Return cpu - echo $cpu -}} - -calc_total_cpus - -echo This machine has "$total_allowed_cpus" valid cores -echo Ranges: "${{cpu_ranges[@]}}" - -get_id() {{ - echo $1 | cut -d '_' -f 1 -}} - -job_number=0 -chmod u+x run_task.sh - -# execute tasks -for input in ${{INPUT[@]}}; do - [ -f "$input" ] || break - - jobId=$(get_id ${{input}}) - mkdir ${{jobId}} - - for filename in ${{jobId}}*; do - [ -f ${{filename}} ] || continue - # Move the job specific files to its directory, removing the jobId from its name - mv $filename ${{jobId}}/${{filename#${{jobId}}_*}} - done - - cpu=$(next_allowed_cpu $job_number) - taskset -c $cpu ${{BASEDIR}}/run_task.sh ${{jobId}} ${{input}} ${{BUNDLE_ID}} ${{BASEDIR}} & - pid=$! - - pids+=($pid) - job_number=$(($job_number+1)) -done - -# wait for all tasks -wait "${{pids[@]}}" -""" - -BASH_TEMPLATE = """\ +BASH_WRAPPER = """\ #!/bin/bash BASEDIR=${{PWD}} INPUT={inputs} @@ -147,26 +53,3 @@ echo "[${task_id}] Task Finished" echo "[${task_id}] Process final status: ${task_status}" """ - - -def generate_template(template: str, inputs: list, bundleId: str): - template = template.lower().replace("-", "_") - func_name = "_generate_" + template - generator = globals()[func_name] - - if not generator: - return S_ERROR("Template not found") - - if inputs is None: - inputs = [] - - template, formatMap = generator(inputs) - formatMap["bundleId"] = bundleId - - return S_OK(template.format(**formatMap)) - - -def _generate_bash(inputs: list): - formatted_inputs = "(" + " ".join(inputs) + ")" - formatMap = {"inputs": formatted_inputs} - return BASH_TEMPLATE, formatMap From 7cfc79eee12b004fd8722c794126cebec7f61589 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Mon, 2 Mar 2026 16:43:59 +0100 Subject: [PATCH 42/47] chore(JobWrapperOfflineTemplate): Remove temporary debugging code --- .../JobWrapper/JobWrapperOfflineTemplate.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py index a58066e8595..bd2a5d297be 100644 --- a/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py +++ b/src/DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py @@ -40,17 +40,7 @@ def execute(arguments: dict): gLogger.exception("JobWrapper failed the initialization phase", lException=exc) return 1 - result = job.preProcess() - if not result["OK"]: - gLogger.error("JobWrapper failed the pre-processing phase") - return 1 - - payloadParams = result["Value"] - - payloadResult = job.process( - command=payloadParams["command"], - env=payloadParams["env"], - ) + payloadResult = job.process(**payloadParams) if not payloadResult["OK"]: return 1 From a3e162318f46ee289f87c745a64d0357dcf158ff Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Mar 2026 10:58:11 +0100 Subject: [PATCH 43/47] chore(BundleDB): Split getJobsOfBundle in 2 functions --- .../Agent/BundleManagerAgent.py | 4 +- .../WorkloadManagementSystem/DB/BundleDB.py | 78 +++++++++++-------- .../Service/BundlerHandler.py | 2 +- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index 32c9baedd90..487f586d2a9 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -63,7 +63,7 @@ def _removeKilledJobs(self): for bundleInfo in bundles: bundleId = bundleInfo["BundleID"] - result = self.bundleDB.getJobsOfBundle(bundleId, noInputs=True) + result = self.bundleDB.getJobsOfBundle(bundleId) if not result["OK"]: self.log.error(f"Failed to get the jobs of the bundle '{bundleId}'") return result @@ -148,7 +148,7 @@ def _checkHeartBeat(self): for bundleInfo in result["Value"]: if bundleInfo["Status"] == PilotStatus.RUNNING: - result = self.bundleDB.getJobsOfBundle(bundleInfo["BundleID"], noInputs=True) + result = self.bundleDB.getJobsOfBundle(bundleInfo["BundleID"]) if not result["OK"]: continue diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 219f27efc21..4f9a060a610 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -217,25 +217,17 @@ def getBundleStatus(self, bundleId): return S_OK(result["Value"][0][0]) - # TODO: This whole function is incomprehensible, needs to be split in 2 - def getJobsOfBundle(self, bundleId, noInputs=False): - """Get every Job that comprise a Bundle.""" - if noInputs: - cmd = """\ - SELECT JobID, DiracID, ExecutablePath, Outputs, Processors - FROM JobToBundle - WHERE BundleID = "{bundleId}";""".format( - bundleId=bundleId - ) - else: - cmd = """\ - SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, Processors, InputPath - FROM JobToBundle - LEFT JOIN JobInputs - ON JobToBundle.JobID = JobInputs.JobID - WHERE BundleID = "{bundleId}";""".format( - bundleId=bundleId - ) + def getJobsAndInputsOfBundle(self, bundleId): + """Get every Job and Inputs that comprise a Bundle.""" + + cmd = """\ + SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, Processors, InputPath + FROM JobToBundle + LEFT JOIN JobInputs + ON JobToBundle.JobID = JobInputs.JobID + WHERE BundleID = "{bundleId}";""".format( + bundleId=bundleId + ) result = self._query(cmd) @@ -245,14 +237,9 @@ def getJobsOfBundle(self, bundleId, noInputs=False): rows = list(result["Value"]) retVal = {} - # For each row (JobID, ExecutablePath, Outputs, Processors, [InputPath]) + # For each row (JobID, ExecutablePath, Outputs, Processors, InputPath) for row in rows: - # The job has no input - if len(row) == len(self.JOB_TO_BUNDLE_COLUMNS) - 1: # All columns except BundleID - jobID, diracId, jobExecutablePath, jobOutputs, processors = row - jobInputPath = "" - else: # All columns except BundleID but with the inputs - jobID, diracId, jobExecutablePath, jobOutputs, processors, jobInputPath = row + jobID, diracId, jobExecutablePath, jobOutputs, processors, jobInputPath = row if jobID not in retVal: retVal[jobID] = { @@ -260,15 +247,44 @@ def getJobsOfBundle(self, bundleId, noInputs=False): "DiracID": diracId, "Outputs": [], "Processors": processors, + "Inputs": [], } - if not noInputs: - retVal[jobID]["Inputs"] = [] - retVal[jobID]["Outputs"].extend(literal_eval(jobOutputs)) + retVal[jobID]["Inputs"].append(jobInputPath) + + return S_OK(retVal) + + def getJobsOfBundle(self, bundleId): + """Get every Job that comprise a Bundle.""" + cmd = """\ + SELECT JobID, DiracID, ExecutablePath, Outputs, Processors + FROM JobToBundle + WHERE BundleID = "{bundleId}";""".format( + bundleId=bundleId + ) + + result = self._query(cmd) + + if not result["OK"]: + return result - if jobInputPath: - retVal[jobID]["Inputs"].append(jobInputPath) + rows = list(result["Value"]) + retVal = {} + + # For each row (JobID, ExecutablePath, Outputs, Processors) + for row in rows: + jobID, diracId, jobExecutablePath, jobOutputs, processors = row + + if jobID not in retVal: + retVal[jobID] = { + "ExecutablePath": jobExecutablePath, + "DiracID": diracId, + "Outputs": [], + "Processors": processors, + } + + retVal[jobID]["Outputs"].extend(literal_eval(jobOutputs)) return S_OK(retVal) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 14ec0cff783..1b3ac2fff51 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -355,7 +355,7 @@ def _getBundleIdFromJobId(self, jobId): def _wrapBundle(self, bundleId): """Bundles the jobs in a bundle for its submission.""" - result = self.bundleDB.getJobsOfBundle(bundleId) + result = self.bundleDB.getJobsAndInputsOfBundle(bundleId) if not result["OK"]: self.log.error("Failed to obtain bundled job while wrapping. BundleID=", str(bundleId)) From 6599af5c5575f9f028182794b506447943a50a5b Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:11:21 +0100 Subject: [PATCH 44/47] fix(BundleService): Wrapping inputs instead of executables --- src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py index 1b3ac2fff51..3c5ceaa81ad 100644 --- a/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py +++ b/src/DIRAC/WorkloadManagementSystem/Service/BundlerHandler.py @@ -391,7 +391,7 @@ def _wrapBundle(self, bundleId): outputs.extend(jobInfo["Outputs"]) - formatted_inputs = "(" + " ".join(inputs) + ")" + formatted_inputs = "(" + " ".join(executables) + ")" formatMap = {"inputs": formatted_inputs, "bundleId": bundleId} wrappedBundle = BASH_WRAPPER.format(**formatMap) From 906ba461c86dde3fcc697ec9fa46fef7ca4f53b4 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:17:23 +0100 Subject: [PATCH 45/47] chore(BundleCE): Improve job output obtaining, tmp dir no longer needed --- .../Computing/BundleComputingElement.py | 58 +------------------ 1 file changed, 1 insertion(+), 57 deletions(-) diff --git a/src/DIRAC/Resources/Computing/BundleComputingElement.py b/src/DIRAC/Resources/Computing/BundleComputingElement.py index 56ecd2b5d04..da28a7cf928 100644 --- a/src/DIRAC/Resources/Computing/BundleComputingElement.py +++ b/src/DIRAC/Resources/Computing/BundleComputingElement.py @@ -66,11 +66,8 @@ import copy import inspect import os -import shutil import uuid -from filelock import FileLock - from DIRAC import S_ERROR, S_OK, gConfig from DIRAC.Resources.Computing.ComputingElement import ComputingElement from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory @@ -224,30 +221,8 @@ def getJobOutput(self, jobId, workingDirectory="."): return S_ERROR("Output not ready yet") taskId = result["Value"]["TaskID"] - _, innerStamp = taskId.split(":::") - - result = self.__getOutputPath(bundleId, taskId) - - if not result["OK"]: - return result - - # The output obtation Timed Out, we need to wait a little longer - if not result["Value"]["Available"]: - return S_ERROR("Outputs not yet available") - - outputsPath = result["Value"]["Path"] - outputAbsPath = os.path.abspath(workingDirectory) - - jobOutputDir = os.path.join(outputsPath, f"{jobId}") - - if not os.path.exists(jobOutputDir): - return S_ERROR("Failed to locate job output files from base output directory") - self.log.notice(f"Outputs at: {jobOutputDir}") - - # Move all outputs from the temporary directory, to the job working directory - for item in os.listdir(jobOutputDir): - shutil.move(os.path.join(jobOutputDir, item), os.path.join(outputAbsPath, item)) + result = self.innerCE.getJobOutput(taskId, workingDirectory=workingDirectory, path=jobId) error = os.path.join(workingDirectory, f"{bundleId}.err") output = os.path.join(workingDirectory, f"{bundleId}.out") @@ -354,34 +329,3 @@ def __getTraskResult(self, jobId): return S_OK(0) return S_OK(1) - - def __getOutputPath(self, bundleId, innerTaskId): - """Returns the output path of the whole bundle - If it hasn't been created yet, it obtains the output from the Inner CE. - """ - self.log.debug(f"Obtaining the output path of bundle '{bundleId}' with task '{innerTaskId}'") - - basePath = os.path.join(self.bundlesBaseDir, bundleId) - lock = FileLock(os.path.join(basePath, "outputs.lock")) - - outputsPath = os.path.join(basePath, "outputs") - - try: - # Always acquire the lock before checking anything - with lock.acquire(timeout=60): - self.log.debug("Outputs lock acquired") - # If the output does not exist, dowload the outputs - if not os.path.exists(outputsPath): - os.mkdir(outputsPath) - self.log.debug(f"Saving inner CE outputs from task '{innerTaskId}' into '{outputsPath}'") - result = self.innerCE.getJobOutput(innerTaskId, outputsPath) - - if not result["OK"]: - self.log.error("Failed to obtain the outputs, removing the directory") - os.rmdir(outputsPath) - return result - - except TimeoutError: - return S_OK({"Available": False}) - - return S_OK({"Available": True, "Path": outputsPath}) From d4ce6a4555f1116d7d6092438a79cbaee38d1481 Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:19:35 +0100 Subject: [PATCH 46/47] chore: Remove unused imports --- .../Computing/AREXEnhancedComputingElement.py | 13 +++---------- .../Agent/BundleManagerAgent.py | 4 +--- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py index ab5001dca7b..45c1cbe76a6 100644 --- a/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py +++ b/src/DIRAC/Resources/Computing/AREXEnhancedComputingElement.py @@ -1,15 +1,8 @@ import os -import sys -import time -from DIRAC.Core.Security.X509Chain import X509Chain -from DIRAC.Resources.Computing.AREXComputingElement import AREXComputingElement - -# AREXComputingElement redefinition -import os -import json -import requests import shutil -from DIRAC import S_OK, S_ERROR + +from DIRAC import S_ERROR, S_OK +from DIRAC.Resources.Computing.AREXComputingElement import AREXComputingElement class AREXEnhancedComputingElement(AREXComputingElement): diff --git a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py index 487f586d2a9..2ac6054684c 100644 --- a/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py +++ b/src/DIRAC/WorkloadManagementSystem/Agent/BundleManagerAgent.py @@ -1,9 +1,7 @@ -import os from datetime import datetime, timedelta, timezone -from DIRAC import S_ERROR, S_OK, gConfig +from DIRAC import S_ERROR, S_OK from DIRAC.Core.Base.AgentModule import AgentModule -from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus from DIRAC.WorkloadManagementSystem.Client.BundlerClient import BundlerClient from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient From 944f38b4563c27afccb5b84afc7393ce2daa166f Mon Sep 17 00:00:00 2001 From: Jorge Lisa <64639359+AcquaDiGiorgio@users.noreply.github.com> Date: Tue, 3 Mar 2026 15:40:32 +0100 Subject: [PATCH 47/47] fix(BundleDB): Add extra safeguard while selecting best bundle --- src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py index 4f9a060a610..d8f0c20bac5 100755 --- a/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py +++ b/src/DIRAC/WorkloadManagementSystem/DB/BundleDB.py @@ -219,7 +219,7 @@ def getBundleStatus(self, bundleId): def getJobsAndInputsOfBundle(self, bundleId): """Get every Job and Inputs that comprise a Bundle.""" - + cmd = """\ SELECT JobToBundle.JobID, DiracID, ExecutablePath, Outputs, Processors, InputPath FROM JobToBundle @@ -257,12 +257,9 @@ def getJobsAndInputsOfBundle(self, bundleId): def getJobsOfBundle(self, bundleId): """Get every Job that comprise a Bundle.""" - cmd = """\ - SELECT JobID, DiracID, ExecutablePath, Outputs, Processors + cmd = f""" SELECT JobID, DiracID, ExecutablePath, Outputs, Processors FROM JobToBundle - WHERE BundleID = "{bundleId}";""".format( - bundleId=bundleId - ) + WHERE BundleID = "{bundleId}";""" result = self._query(cmd) @@ -512,7 +509,7 @@ def __selectBestBundle(self, bundles, nProcessors): newProcSum = procs + nProcessors - if status != PilotStatus.WAITING: + if status != PilotStatus.WAITING or newProcSum > maxProcs: continue if newProcSum == maxProcs: