@@ -151,7 +151,7 @@ class ClustermgtdConfig:
151151 "terminate_down_nodes" : True ,
152152 "orphaned_instance_timeout" : 300 ,
153153 "ec2_instance_missing_max_count" : 0 ,
154- "hold_drain_nodes_timeout" : 30 ,
154+ "hold_drain_nodes_timeout" : 5 ,
155155 # Health check configs
156156 "disable_ec2_health_check" : False ,
157157 "disable_scheduled_event_health_check" : False ,
@@ -848,14 +848,14 @@ def _handle_unhealthy_dynamic_nodes(self, unhealthy_dynamic_nodes):
848848 for node in unhealthy_dynamic_nodes :
849849 if node .name not in self ._held_compute_resources :
850850 nodes_to_terminate .append (node )
851- elif time_is_up (self ._held_compute_resources [node .name ], self ._current_time , self ._config .hold_drain_nodes_timeout ):
851+ elif time_is_up (self ._held_compute_resources [node .name ], self ._current_time , self ._config .hold_drain_nodes_timeout * 60 ):
852852 nodes_to_terminate .append (node )
853853 self ._held_compute_resources .pop (node .name , None )
854854
855855 nodes_being_held = set (node .name for node in unhealthy_dynamic_nodes ) - set (node .name for node in nodes_to_terminate )
856856 if nodes_being_held :
857857 log .info (
858- "Holding termination for unhealthy dynamic nodes (timeout: %ss ): %s" ,
858+ "Holding termination for unhealthy dynamic nodes (timeout: %sm ): %s" ,
859859 self ._config .hold_drain_nodes_timeout ,
860860 print_with_count (nodes_being_held ),
861861 )
@@ -918,14 +918,14 @@ def _handle_unhealthy_static_nodes(self, unhealthy_static_nodes):
918918 for node in unhealthy_static_nodes :
919919 if node .name not in self ._held_compute_resources :
920920 nodes_to_terminate .append (node )
921- elif time_is_up (self ._held_compute_resources [node .name ], self ._current_time , self ._config .hold_drain_nodes_timeout ):
921+ elif time_is_up (self ._held_compute_resources [node .name ], self ._current_time , self ._config .hold_drain_nodes_timeout * 60 ):
922922 nodes_to_terminate .append (node )
923923 self ._held_compute_resources .pop (node .name , None )
924924
925925 nodes_being_held = set (node .name for node in unhealthy_static_nodes ) - set (node .name for node in nodes_to_terminate )
926926 if nodes_being_held :
927927 log .info (
928- "Holding termination for unhealthy static nodes (timeout: %ss ): %s" ,
928+ "Holding termination for unhealthy static nodes (timeout: %sm ): %s" ,
929929 self ._config .hold_drain_nodes_timeout ,
930930 print_with_count (nodes_being_held ),
931931 )
0 commit comments