Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions hack/k8s-patch/template-patch/config-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ rules:
- list
- watch
- update
- patch
- apiGroups:
- "apps"
resources:
- daemonsets
- deployments
verbs:
- get
- list
Expand Down
5 changes: 3 additions & 2 deletions hack/k8s-patch/template-patch/default-deviceconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,10 @@ spec:
{{- toYaml . | nindent 6 }}
{{- end }}

{{- with .autoStartWorkflow }}
autoStartWorkflow: {{ . }}
{{- if hasKey . "autoStartWorkflow" }}
autoStartWorkflow: {{ .autoStartWorkflow }}
{{- end }}

{{- end }}

{{- end }}
Expand Down
2 changes: 2 additions & 0 deletions helm-charts-k8s/templates/config-manager-rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@ rules:
- list
- watch
- update
- patch
- apiGroups:
- "apps"
resources:
- daemonsets
- deployments
verbs:
- get
- list
Expand Down
5 changes: 3 additions & 2 deletions helm-charts-k8s/templates/default-deviceconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -523,9 +523,10 @@ spec:
{{- toYaml . | nindent 6 }}
{{- end }}

{{- with .autoStartWorkflow }}
autoStartWorkflow: {{ . }}
{{- if hasKey . "autoStartWorkflow" }}
autoStartWorkflow: {{ .autoStartWorkflow }}
{{- end }}

{{- end }}

{{- end }}
Expand Down
8 changes: 8 additions & 0 deletions internal/controllers/remediation/scripts/applylabels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ fi

echo "Applying $LENGTH labels to node '$NODE_NAME'..."

FAILED=0

# Loop through each label in the JSON array
for i in $(seq 0 $((LENGTH - 1))); do
LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]")
Expand All @@ -33,7 +35,13 @@ for i in $(seq 0 $((LENGTH - 1))); do
echo "Successfully applied label '$LABEL'"
else
echo "Failed to apply label '$LABEL'"
FAILED=1
fi
done

if [ "$FAILED" -eq 1 ]; then
echo "One or more labels failed to apply on node '$NODE_NAME'"
exit 1
fi

echo "Done applying all labels to node '$NODE_NAME'"
8 changes: 8 additions & 0 deletions internal/controllers/remediation/scripts/removelabels.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ fi

echo "Removing $LENGTH labels from node '$NODE_NAME'..."

FAILED=0

# Loop through each label in the JSON array
for i in $(seq 0 $((LENGTH - 1))); do
LABEL=$(echo "$NODE_LABELS" | jq -r ".[$i]")
Expand All @@ -41,7 +43,13 @@ for i in $(seq 0 $((LENGTH - 1))); do
echo "Successfully removed label '$LABEL_KEY'"
else
echo "Failed to remove label '$LABEL_KEY'"
FAILED=1
fi
done

if [ "$FAILED" -eq 1 ]; then
echo "One or more labels failed to be removed from node '$NODE_NAME'"
exit 1
fi

echo "Done removing all labels from node '$NODE_NAME'"
9 changes: 4 additions & 5 deletions internal/controllers/remediation/scripts/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@ WORKFLOW_NAME='{{workflow.name}}'
WORKFLOW_UID='{{workflow.uid}}'

# Extract last 8 characters of workflow UID for uniqueness
SHORT_UID="${WORKFLOW_UID: -8}"
SHORT_UID=$(printf '%s' "$WORKFLOW_UID" | tail -c 8)

# Calculate max length for workflow name prefix (63 - 8 (uid) - 1 (dash) - 5 ("-test") - 1 (dash) = 48)
MAX_PREFIX_LEN=48

# Truncate workflow name if needed
if [ ${#WORKFLOW_NAME} -gt $MAX_PREFIX_LEN ]; then
WORKFLOW_PREFIX="${WORKFLOW_NAME:0:$MAX_PREFIX_LEN}"
WORKFLOW_PREFIX=$(printf '%.48s' "$WORKFLOW_NAME")
else
WORKFLOW_PREFIX="$WORKFLOW_NAME"
fi
Expand All @@ -39,7 +39,6 @@ if [ -n "$TESTRUNNERIMAGESECRET" ]; then
OLD_IFS="$IFS"
IFS=','
for secret in $TESTRUNNERIMAGESECRET; do
secret=$(echo "$secret" | xargs)
IMAGE_PULL_SECRETS="${IMAGE_PULL_SECRETS}
- name: ${secret}"
done
Expand Down Expand Up @@ -188,7 +187,7 @@ EOF
sleep 20

echo "Verifying Job creation..."
if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then
if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" >/dev/null 2>&1; then
echo "Error: Job $JOB_NAME was not created in namespace $NAMESPACE"
exit 1
fi
Expand All @@ -199,7 +198,7 @@ echo "Overall timeout for the job is set to $timeout seconds."
echo "Waiting for Job $JOB_NAME to complete..."

while true; do
if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then
if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" >/dev/null 2>&1; then
echo "Error: Job $JOB_NAME is not found in namespace $NAMESPACE"
exit 1
fi
Expand Down
35 changes: 16 additions & 19 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -271,14 +271,11 @@ func (n *remediationMgr) HandleDelete(ctx context.Context, deviceConfig *amdv1al
log.FromContext(ctx).Info(fmt.Sprintf("Deleted workflow: %s", wf.Name))
}

var cfgMapName string
if deviceConfig.Spec.RemediationWorkflow.Config != nil {
cfgMapName = deviceConfig.Spec.RemediationWorkflow.Config.Name
} else {
cfgMapName = deviceConfig.Name + "-" + DefaultConfigMapSuffix
}
if err := n.helper.deleteConfigMap(ctx, cfgMapName, deviceConfig.Namespace); err == nil {
log.FromContext(ctx).Info(fmt.Sprintf("Deleted ConfigMap: %s", cfgMapName))
if deviceConfig.Spec.RemediationWorkflow.Config == nil || deviceConfig.Spec.RemediationWorkflow.Config.Name == "" {
cfgMapName := deviceConfig.Name + "-" + DefaultConfigMapSuffix
if err := n.helper.deleteConfigMap(ctx, cfgMapName, deviceConfig.Namespace); err == nil {
log.FromContext(ctx).Info(fmt.Sprintf("Deleted ConfigMap: %s", cfgMapName))
}
}

return
Expand Down Expand Up @@ -527,7 +524,10 @@ func (h *remediationMgrHelper) checkIfTaintExists(node *v1.Node, devConfig *amdv

func (h *remediationMgrHelper) getWorkflowList(ctx context.Context, namespace string) (*workflowv1alpha1.WorkflowList, error) {
wfList := &workflowv1alpha1.WorkflowList{}
if err := h.client.List(ctx, wfList, &client.ListOptions{Namespace: namespace}); err != nil {
labelSelector := labels.SelectorFromSet(map[string]string{
ArgoWorkflowInstaceIDLabelKey: ArgoWorkflowInstaceIDLabelValue,
})
if err := h.client.List(ctx, wfList, &client.ListOptions{Namespace: namespace, LabelSelector: labelSelector}); err != nil {
return nil, err
}
return wfList, nil
Expand Down Expand Up @@ -690,7 +690,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
Metadata: instanceIDMeta,
Steps: []workflowv1alpha1.ParallelSteps{
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "awaitapproval", Template: "suspend", When: "{{workflow.parameters.auto_start}} == false"}}}, // If auto start is disabled, workflow will be created in suspended state and needs to be manually resumed by user
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "applylabels", Template: "applylabels"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "applylabels", Template: "applylabels", When: "{{workflow.parameters.node_labels}} != []"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "taint", Template: "taint"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "drain", Template: "drain"}}},
{Steps: []workflowv1alpha1.WorkflowStep{
Expand Down Expand Up @@ -725,7 +725,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
},
},
},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failurecleanup", Template: "removelabels", When: "{{steps.test.exitCode}} != 0"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failurecleanup", Template: "removelabels", When: "{{steps.test.exitCode}} != 0 && {{workflow.parameters.node_labels}} != []"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failworkflow", Template: "failworkflow", When: "{{steps.test.exitCode}} != 0"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "wait", Template: "wait", When: "{{steps.test.exitCode}} == 0"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "untaint", Template: "untaint", When: "{{steps.test.exitCode}} == 0"}}},
Expand All @@ -744,7 +744,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
},
},
},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "successcleanup", Template: "removelabels", When: "{{steps.test.exitCode}} == 0"}}},
{Steps: []workflowv1alpha1.WorkflowStep{{Name: "successcleanup", Template: "removelabels", When: "{{steps.test.exitCode}} == 0 && {{workflow.parameters.node_labels}} != []"}}},
},
},
{
Expand Down Expand Up @@ -854,6 +854,10 @@ containers:
Name: "initContainerImage",
Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.initContainerImage}}"),
},
{
Name: "testRunnerImageSecret",
Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.testRunnerImageSecret}}"),
},
},
},
Script: &workflowv1alpha1.ScriptTemplate{
Expand Down Expand Up @@ -1673,13 +1677,6 @@ func (h *remediationMgrHelper) removeForceResumeWorkflowLabelFromNode(ctx contex
func (h *remediationMgrHelper) canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping, stageName string) bool {
logger := log.FromContext(ctx)

// Check if the recovery policy is violated, if so, do not allow resumption
recoveryPolicyViolated := h.isRecoveryPolicyViolated(ctx, node.Name, mapping)
if recoveryPolicyViolated {
logger.Info(fmt.Sprintf("Recovery policy is violated for node %s with condition %s, not allowing workflow resumption", node.Name, mapping.NodeCondition))
return false
}

// if no physical action is needed, allow resumption of workflow
if !mapping.PhysicalActionNeeded && stageName != "autostart" {
return true
Expand Down
Loading