From a9e35d3df1c478aa2b676cf64054245ff7b242cf Mon Sep 17 00:00:00 2001 From: Uday Bhaskar Date: Fri, 3 Apr 2026 02:54:01 +0530 Subject: [PATCH] anr - fixes (#1268) * anr - fixes for applylabels step * multiple anr fixes (cherry picked from commit b33e4c9cc14ac69c6eab868e86bc20b295414c03) --- .../template-patch/config-manager-rbac.yaml | 2 ++ .../template-patch/default-deviceconfig.yaml | 5 +-- .../templates/config-manager-rbac.yaml | 2 ++ .../templates/default-deviceconfig.yaml | 5 +-- .../remediation/scripts/applylabels.sh | 8 +++++ .../remediation/scripts/removelabels.sh | 8 +++++ .../controllers/remediation/scripts/test.sh | 9 +++-- internal/controllers/remediation_handler.go | 35 +++++++++---------- 8 files changed, 46 insertions(+), 28 deletions(-) diff --git a/hack/k8s-patch/template-patch/config-manager-rbac.yaml b/hack/k8s-patch/template-patch/config-manager-rbac.yaml index 212deda8e..4f14ce281 100644 --- a/hack/k8s-patch/template-patch/config-manager-rbac.yaml +++ b/hack/k8s-patch/template-patch/config-manager-rbac.yaml @@ -25,10 +25,12 @@ rules: - list - watch - update + - patch - apiGroups: - "apps" resources: - daemonsets + - deployments verbs: - get - list diff --git a/hack/k8s-patch/template-patch/default-deviceconfig.yaml b/hack/k8s-patch/template-patch/default-deviceconfig.yaml index 52c57b529..3a6708d32 100644 --- a/hack/k8s-patch/template-patch/default-deviceconfig.yaml +++ b/hack/k8s-patch/template-patch/default-deviceconfig.yaml @@ -523,9 +523,10 @@ spec: {{- toYaml . | nindent 6 }} {{- end }} - {{- with .autoStartWorkflow }} - autoStartWorkflow: {{ . }} + {{- if hasKey . "autoStartWorkflow" }} + autoStartWorkflow: {{ .autoStartWorkflow }} {{- end }} + {{- end }} {{- end }} diff --git a/helm-charts-k8s/templates/config-manager-rbac.yaml b/helm-charts-k8s/templates/config-manager-rbac.yaml index 212deda8e..4f14ce281 100644 --- a/helm-charts-k8s/templates/config-manager-rbac.yaml +++ b/helm-charts-k8s/templates/config-manager-rbac.yaml @@ -25,10 +25,12 @@ rules: - list - watch - update + - patch - apiGroups: - "apps" resources: - daemonsets + - deployments verbs: - get - list diff --git a/helm-charts-k8s/templates/default-deviceconfig.yaml b/helm-charts-k8s/templates/default-deviceconfig.yaml index 52c57b529..3a6708d32 100644 --- a/helm-charts-k8s/templates/default-deviceconfig.yaml +++ b/helm-charts-k8s/templates/default-deviceconfig.yaml @@ -523,9 +523,10 @@ spec: {{- toYaml . | nindent 6 }} {{- end }} - {{- with .autoStartWorkflow }} - autoStartWorkflow: {{ . }} + {{- if hasKey . "autoStartWorkflow" }} + autoStartWorkflow: {{ .autoStartWorkflow }} {{- end }} + {{- end }} {{- end }} diff --git a/internal/controllers/remediation/scripts/applylabels.sh b/internal/controllers/remediation/scripts/applylabels.sh index 2a265f3be..a462d498e 100644 --- a/internal/controllers/remediation/scripts/applylabels.sh +++ b/internal/controllers/remediation/scripts/applylabels.sh @@ -18,6 +18,8 @@ fi echo "Applying $LENGTH labels to node '$NODE_NAME'..." +FAILED=0 + # Loop through each label in the JSON array for i in $(seq 0 $((LENGTH - 1))); do LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]") @@ -33,7 +35,13 @@ for i in $(seq 0 $((LENGTH - 1))); do echo "Successfully applied label '$LABEL'" else echo "Failed to apply label '$LABEL'" + FAILED=1 fi done +if [ "$FAILED" -eq 1 ]; then + echo "One or more labels failed to apply on node '$NODE_NAME'" + exit 1 +fi + echo "Done applying all labels to node '$NODE_NAME'" diff --git a/internal/controllers/remediation/scripts/removelabels.sh b/internal/controllers/remediation/scripts/removelabels.sh index 2318771b4..5e8250551 100644 --- a/internal/controllers/remediation/scripts/removelabels.sh +++ b/internal/controllers/remediation/scripts/removelabels.sh @@ -18,6 +18,8 @@ fi echo "Removing $LENGTH labels from node '$NODE_NAME'..." +FAILED=0 + # Loop through each label in the JSON array for i in $(seq 0 $((LENGTH - 1))); do LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]") @@ -41,7 +43,13 @@ for i in $(seq 0 $((LENGTH - 1))); do echo "Successfully removed label '$LABEL_KEY'" else echo "Failed to remove label '$LABEL_KEY'" + FAILED=1 fi done +if [ "$FAILED" -eq 1 ]; then + echo "One or more labels failed to be removed from node '$NODE_NAME'" + exit 1 +fi + echo "Done removing all labels from node '$NODE_NAME'" \ No newline at end of file diff --git a/internal/controllers/remediation/scripts/test.sh b/internal/controllers/remediation/scripts/test.sh index 170e29a9a..e1fb7c41f 100644 --- a/internal/controllers/remediation/scripts/test.sh +++ b/internal/controllers/remediation/scripts/test.sh @@ -7,14 +7,14 @@ WORKFLOW_NAME='{{workflow.name}}' WORKFLOW_UID='{{workflow.uid}}' # Extract last 8 characters of workflow UID for uniqueness -SHORT_UID="${WORKFLOW_UID: -8}" +SHORT_UID=$(printf '%s' "$WORKFLOW_UID" | tail -c 8) # Calculate max length for workflow name prefix (63 - 8 (uid) - 1 (dash) - 5 ("-test") - 1 (dash) = 48) MAX_PREFIX_LEN=48 # Truncate workflow name if needed if [ ${#WORKFLOW_NAME} -gt $MAX_PREFIX_LEN ]; then - WORKFLOW_PREFIX="${WORKFLOW_NAME:0:$MAX_PREFIX_LEN}" + WORKFLOW_PREFIX=$(printf '%.48s' "$WORKFLOW_NAME") else WORKFLOW_PREFIX="$WORKFLOW_NAME" fi @@ -39,7 +39,6 @@ if [ -n "$TESTRUNNERIMAGESECRET" ]; then OLD_IFS="$IFS" IFS=',' for secret in $TESTRUNNERIMAGESECRET; do - secret=$(echo "$secret" | xargs) IMAGE_PULL_SECRETS="${IMAGE_PULL_SECRETS} - name: ${secret}" done @@ -188,7 +187,7 @@ EOF sleep 20 echo "Verifying Job creation..." -if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then +if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" >/dev/null 2>&1; then echo "Error: Job $JOB_NAME was not created in namespace $NAMESPACE" exit 1 fi @@ -199,7 +198,7 @@ echo "Overall timeout for the job is set to $timeout seconds." echo "Waiting for Job $JOB_NAME to complete..." while true; do - if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then + if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" >/dev/null 2>&1; then echo "Error: Job $JOB_NAME is not found in namespace $NAMESPACE" exit 1 fi diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go index d56917f0b..61eb77175 100644 --- a/internal/controllers/remediation_handler.go +++ b/internal/controllers/remediation_handler.go @@ -271,14 +271,11 @@ func (n *remediationMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1al log.FromContext(ctx).Info(fmt.Sprintf("Deleted workflow: %s", wf.Name)) } - var cfgMapName string - if deviceConfig.Spec.RemediationWorkflow.Config != nil { - cfgMapName = deviceConfig.Spec.RemediationWorkflow.Config.Name - } else { - cfgMapName = deviceConfig.Name + "-" + DefaultConfigMapSuffix - } - if err := n.helper.deleteConfigMap(ctx, cfgMapName, deviceConfig.Namespace); err == nil { - log.FromContext(ctx).Info(fmt.Sprintf("Deleted ConfigMap: %s", cfgMapName)) + if deviceConfig.Spec.RemediationWorkflow.Config == nil || deviceConfig.Spec.RemediationWorkflow.Config.Name == "" { + cfgMapName := deviceConfig.Name + "-" + DefaultConfigMapSuffix + if err := n.helper.deleteConfigMap(ctx, cfgMapName, deviceConfig.Namespace); err == nil { + log.FromContext(ctx).Info(fmt.Sprintf("Deleted ConfigMap: %s", cfgMapName)) + } } return @@ -527,7 +524,10 @@ func (h *remediationMgrHelper) checkIfTaintExists(node *v1.Node, devConfig *amdv func (h *remediationMgrHelper) getWorkflowList(ctx context.Context, namespace string) (*workflowv1alpha1.WorkflowList, error) { wfList := &workflowv1alpha1.WorkflowList{} - if err := h.client.List(ctx, wfList, &client.ListOptions{Namespace: namespace}); err != nil { + labelSelector := labels.SelectorFromSet(map[string]string{ + ArgoWorkflowInstaceIDLabelKey: ArgoWorkflowInstaceIDLabelValue, + }) + if err := h.client.List(ctx, wfList, &client.ListOptions{Namespace: namespace, LabelSelector: labelSelector}); err != nil { return nil, err } return wfList, nil @@ -690,7 +690,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context Metadata: instanceIDMeta, Steps: []workflowv1alpha1.ParallelSteps{ {Steps: []workflowv1alpha1.WorkflowStep{{Name: "awaitapproval", Template: "suspend", When: "{{workflow.parameters.auto_start}} == false"}}}, // If auto start is disabled, workflow will be created in suspended state and needs to be manually resumed by user - {Steps: []workflowv1alpha1.WorkflowStep{{Name: "applylabels", Template: "applylabels"}}}, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "applylabels", Template: "applylabels", When: "{{workflow.parameters.node_labels}} != []"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "taint", Template: "taint"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "drain", Template: "drain"}}}, {Steps: []workflowv1alpha1.WorkflowStep{ @@ -725,7 +725,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, }, - {Steps: []workflowv1alpha1.WorkflowStep{{Name: "failurecleanup", Template: "removelabels", When: "{{steps.test.exitCode}} != 0"}}}, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "failurecleanup", Template: "removelabels", When: "{{steps.test.exitCode}} != 0 && {{workflow.parameters.node_labels}} != []"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "failworkflow", Template: "failworkflow", When: "{{steps.test.exitCode}} != 0"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "wait", Template: "wait", When: "{{steps.test.exitCode}} == 0"}}}, {Steps: []workflowv1alpha1.WorkflowStep{{Name: "untaint", Template: "untaint", When: "{{steps.test.exitCode}} == 0"}}}, @@ -744,7 +744,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context }, }, }, - {Steps: []workflowv1alpha1.WorkflowStep{{Name: "successcleanup", Template: "removelabels", When: "{{steps.test.exitCode}} == 0"}}}, + {Steps: []workflowv1alpha1.WorkflowStep{{Name: "successcleanup", Template: "removelabels", When: "{{steps.test.exitCode}} == 0 && {{workflow.parameters.node_labels}} != []"}}}, }, }, { @@ -854,6 +854,10 @@ containers: Name: "initContainerImage", Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.initContainerImage}}"), }, + { + Name: "testRunnerImageSecret", + Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.testRunnerImageSecret}}"), + }, }, }, Script: &workflowv1alpha1.ScriptTemplate{ @@ -1673,13 +1677,6 @@ func (h *remediationMgrHelper) removeForceResumeWorkflowLabelFromNode(ctx contex func (h *remediationMgrHelper) canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping, stageName string) bool { logger := log.FromContext(ctx) - // Check if the recovery policy is violated, if so, do not allow resumption - recoveryPolicyViolated := h.isRecoveryPolicyViolated(ctx, node.Name, mapping) - if recoveryPolicyViolated { - logger.Info(fmt.Sprintf("Recovery policy is violated for node %s with condition %s, not allowing workflow resumption", node.Name, mapping.NodeCondition)) - return false - } - // if no physical action is needed, allow resumption of workflow if !mapping.PhysicalActionNeeded && stageName != "autostart" { return true