@@ -152,6 +152,7 @@ class ClustermgtdConfig:
         "orphaned_instance_timeout": 300,
         "ec2_instance_missing_max_count": 0,
         "hold_drain_nodes_timeout": 30,
+        "hold_drain_nodes_reasons": ["Prolog error"],
         # Health check configs
         "disable_ec2_health_check": False,
         "disable_scheduled_event_health_check": False,
@@ -297,6 +298,14 @@ def _get_terminate_config(self, config):
         self.hold_drain_nodes_timeout = config.getint(
             "clustermgtd", "hold_drain_nodes_timeout", fallback=self.DEFAULTS.get("hold_drain_nodes_timeout")
         )
+        # Parse the comma-separated list of reasons, ignoring empty entries (e.g. from a trailing comma)
+        hold_drain_nodes_reasons_str = config.get(
+            "clustermgtd", "hold_drain_nodes_reasons", fallback=None
+        )
+        if hold_drain_nodes_reasons_str:
+            self.hold_drain_nodes_reasons = [r.strip() for r in hold_drain_nodes_reasons_str.split(",") if r.strip()]
+        else:
+            self.hold_drain_nodes_reasons = self.DEFAULTS.get("hold_drain_nodes_reasons")
         self.terminate_down_nodes = config.getboolean(
             "clustermgtd", "terminate_down_nodes", fallback=self.DEFAULTS.get("terminate_down_nodes")
         )
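For illustration, a minimal, self-contained sketch of how this option would be read from a clustermgtd config file, assuming the INI layout implied by the configparser calls above. The option values shown are hypothetical; only the "Prolog error" default comes from the diff.

# Standalone sketch of the parsing added above; config contents are illustrative.
from configparser import ConfigParser

config = ConfigParser()
config.read_string(
    """
[clustermgtd]
hold_drain_nodes_timeout = 30
hold_drain_nodes_reasons = Prolog error, Epilog error
"""
)

reasons_str = config.get("clustermgtd", "hold_drain_nodes_reasons", fallback=None)
# Strip whitespace around each entry and drop empties (e.g. from a trailing comma)
reasons = [r.strip() for r in reasons_str.split(",") if r.strip()] if reasons_str else ["Prolog error"]
print(reasons)  # ['Prolog error', 'Epilog error']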
@@ -788,9 +797,10 @@ def _find_unhealthy_slurm_nodes(self, slurm_nodes):
                 # do not consider as unhealthy the nodes reserved for capacity blocks
                 continue
 
-            # Track when the node was first found unhealthy
+            # Track when the node was first found unhealthy, only if the drain reason matches a configured reason
             if node.name not in self._held_compute_resources:
-                self._held_compute_resources[node.name] = self._current_time
+                if node.reason and any(reason in node.reason for reason in self._config.hold_drain_nodes_reasons):
+                    self._held_compute_resources[node.name] = self._current_time
 
             all_unhealthy_nodes.append(node)
 
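The hold decision above is a substring match against the node's Slurm reason string; a minimal sketch of that behavior (NodeStub and the sample reason strings are illustrative, not from the codebase):

# Illustrative stand-in for a Slurm node record; only name and reason matter here.
from dataclasses import dataclass
from typing import Optional

@dataclass
class NodeStub:
    name: str
    reason: Optional[str]

hold_drain_nodes_reasons = ["Prolog error"]
nodes = [
    NodeStub("queue1-dy-cr1-1", "Prolog error: job 42 failed"),
    NodeStub("queue1-dy-cr1-2", "Not responding"),
    NodeStub("queue1-dy-cr1-3", None),  # no reason set: never held
]

# A node is held only when its reason contains one of the configured strings
held = [
    n.name
    for n in nodes
    if n.reason and any(reason in n.reason for reason in hold_drain_nodes_reasons)
]
print(held)  # ['queue1-dy-cr1-1']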