Skip to content

Commit cba7a37

Browse files
committed
Make timeout in minutes
1 parent dfcd0dd commit cba7a37

1 file changed

Lines changed: 5 additions & 5 deletions

File tree

src/slurm_plugin/clustermgtd.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -151,7 +151,7 @@ class ClustermgtdConfig:
151151
"terminate_down_nodes": True,
152152
"orphaned_instance_timeout": 300,
153153
"ec2_instance_missing_max_count": 0,
154-
"hold_drain_nodes_timeout": 30,
154+
"hold_drain_nodes_timeout": 5,
155155
# Health check configs
156156
"disable_ec2_health_check": False,
157157
"disable_scheduled_event_health_check": False,
@@ -848,14 +848,14 @@ def _handle_unhealthy_dynamic_nodes(self, unhealthy_dynamic_nodes):
848848
for node in unhealthy_dynamic_nodes:
849849
if node.name not in self._held_compute_resources:
850850
nodes_to_terminate.append(node)
851-
elif time_is_up(self._held_compute_resources[node.name], self._current_time, self._config.hold_drain_nodes_timeout):
851+
elif time_is_up(self._held_compute_resources[node.name], self._current_time, self._config.hold_drain_nodes_timeout * 60):
852852
nodes_to_terminate.append(node)
853853
self._held_compute_resources.pop(node.name, None)
854854

855855
nodes_being_held = set(node.name for node in unhealthy_dynamic_nodes) - set(node.name for node in nodes_to_terminate)
856856
if nodes_being_held:
857857
log.info(
858-
"Holding termination for unhealthy dynamic nodes (timeout: %ss): %s",
858+
"Holding termination for unhealthy dynamic nodes (timeout: %sm): %s",
859859
self._config.hold_drain_nodes_timeout,
860860
print_with_count(nodes_being_held),
861861
)
@@ -918,14 +918,14 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
918918
for node in unhealthy_static_nodes:
919919
if node.name not in self._held_compute_resources:
920920
nodes_to_terminate.append(node)
921-
elif time_is_up(self._held_compute_resources[node.name], self._current_time, self._config.hold_drain_nodes_timeout):
921+
elif time_is_up(self._held_compute_resources[node.name], self._current_time, self._config.hold_drain_nodes_timeout * 60):
922922
nodes_to_terminate.append(node)
923923
self._held_compute_resources.pop(node.name, None)
924924

925925
nodes_being_held = set(node.name for node in unhealthy_static_nodes) - set(node.name for node in nodes_to_terminate)
926926
if nodes_being_held:
927927
log.info(
928-
"Holding termination for unhealthy static nodes (timeout: %ss): %s",
928+
"Holding termination for unhealthy static nodes (timeout: %sm): %s",
929929
self._config.hold_drain_nodes_timeout,
930930
print_with_count(nodes_being_held),
931931
)

0 commit comments

Comments (0)