Skip to content

Commit f9cf1bb

Browse files
committed
Only hold for specific reason
1 parent ca9a359 commit f9cf1bb

1 file changed

Lines changed: 12 additions & 2 deletions

File tree

src/slurm_plugin/clustermgtd.py

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,6 +152,7 @@ class ClustermgtdConfig:
152152
"orphaned_instance_timeout": 300,
153153
"ec2_instance_missing_max_count": 0,
154154
"hold_drain_nodes_timeout": 30,
155+
"hold_drain_nodes_reasons": ["Prolog error"],
155156
# Health check configs
156157
"disable_ec2_health_check": False,
157158
"disable_scheduled_event_health_check": False,
@@ -297,6 +298,14 @@ def _get_terminate_config(self, config):
297298
self.hold_drain_nodes_timeout = config.getint(
298299
"clustermgtd", "hold_drain_nodes_timeout", fallback=self.DEFAULTS.get("hold_drain_nodes_timeout")
299300
)
301+
# Parse comma-separated list of reasons
302+
hold_drain_nodes_reasons_str = config.get(
303+
"clustermgtd", "hold_drain_nodes_reasons", fallback=None
304+
)
305+
if hold_drain_nodes_reasons_str:
306+
self.hold_drain_nodes_reasons = [r.strip() for r in hold_drain_nodes_reasons_str.split(",")]
307+
else:
308+
self.hold_drain_nodes_reasons = self.DEFAULTS.get("hold_drain_nodes_reasons")
300309
self.terminate_down_nodes = config.getboolean(
301310
"clustermgtd", "terminate_down_nodes", fallback=self.DEFAULTS.get("terminate_down_nodes")
302311
)
@@ -788,9 +797,10 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes):
788797
# do not consider as unhealthy the nodes reserved for capacity blocks
789798
continue
790799

791-
# Track when the node was first found unhealthy
800+
# Track when the node was first found unhealthy, only if drain reason matches configured reasons
792801
if node.name not in self._held_compute_resources:
793-
self._held_compute_resources[node.name] = self._current_time
802+
if node.reason and any(reason in node.reason for reason in self._config.hold_drain_nodes_reasons):
803+
self._held_compute_resources[node.name] = self._current_time
794804

795805
all_unhealthy_nodes.append(node)
796806

0 commit comments

Comments
 (0)