Skip to content

Commit 08caa80

Browse files
committed
pkg/risk: Refactor alerts into a generic update-risk interface
3b365d0 (OTA-1813: Populate risks from alerts, 2026-02-27, #1329) added the alert-risk logic. But the implementation in that commit was fairly directly integrated into pkg/cvo. We want to extend update-risk detection with additional providers in the future, like preflights from the target release [1], and that could get complicated with logic tucked into pkg/cvo. And pkg/cvo is already plenty complicated, even without this functionality inline. This commit refactors to remove a few hundred lines of code from pkg/cvo out to a new pkg/risk. And that allows the pkg/cvo handling to move from "I'm integrating additional risks from alerts" to "I'm integrating additional risks from a source". As we more forward from here, we can continue to extend the functionality of the source, and will only have to tweak the source in cvo.go's New(...). Most of the code is either straightforward interface work or lifted-and-shifted pre-existing code. But the ReplaceAll for U+00A0 No-break space took some time to work out, so I'll explain that in detail here. It flies in the face of [2]: Do not depend on this output being stable. If you need the ability to programmatically interpret the difference, consider using a custom Reporter. I'm manually unwinding the uncertainty they inject in github.com/google/go-cmp/cmp /report_text.go's indentMode.appendIndent, because: * I want to test the error message content, to ensure it's sufficiently readable. * I'm ok manually updating the test expectations if a go-cpm bump brings in rendering improvements. That manual update will give us space to review to ensure we're still readable. [1]: openshift/enhancements#1930 [2]: https://pkg.go.dev/github.com/google/go-cmp/cmp#Diff
1 parent e1832cc commit 08caa80

File tree

10 files changed

+894
-606
lines changed

10 files changed

+894
-606
lines changed

pkg/cvo/availableupdates.go

Lines changed: 34 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,11 @@ import (
1212
"time"
1313

1414
"github.com/blang/semver/v4"
15-
"github.com/google/go-cmp/cmp"
16-
"github.com/google/go-cmp/cmp/cmpopts"
1715
"github.com/google/uuid"
18-
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1916

2017
"k8s.io/apimachinery/pkg/api/equality"
2118
"k8s.io/apimachinery/pkg/api/meta"
2219
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23-
utilerrors "k8s.io/apimachinery/pkg/util/errors"
2420
"k8s.io/apimachinery/pkg/util/sets"
2521
"k8s.io/klog/v2"
2622

@@ -30,6 +26,7 @@ import (
3026
"github.com/openshift/cluster-version-operator/pkg/cincinnati"
3127
"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
3228
"github.com/openshift/cluster-version-operator/pkg/internal"
29+
"github.com/openshift/cluster-version-operator/pkg/risk"
3330
)
3431

3532
const noArchitecture string = "NoArchitecture"
@@ -103,7 +100,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
103100
if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) {
104101
needsConditionalUpdateEval = true
105102
}
106-
// If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
107103
if !needsConditionalUpdateEval {
108104
klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
109105
return nil
@@ -164,7 +160,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
164160

165161
optrAvailableUpdates.AcceptRisks = acceptRisks
166162
optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
167-
optrAvailableUpdates.AlertGetter = optr.AlertGetter
163+
optrAvailableUpdates.risks = optr.risks
168164
optrAvailableUpdates.evaluateConditionalUpdates(ctx)
169165

170166
queueKey := optr.queueKey()
@@ -219,7 +215,9 @@ type availableUpdates struct {
219215
// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
220216
RiskConditions map[string][]metav1.Condition
221217

222-
AlertGetter AlertGetter
218+
// risks holds update-risk source (in-cluster alerts, etc.)
219+
// that will be aggregated into conditional update risks.
220+
risks risk.Source
223221
}
224222

225223
func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -325,7 +323,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
325323
ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
326324
AcceptRisks: optr.availableUpdates.AcceptRisks,
327325
RiskConditions: optr.availableUpdates.RiskConditions,
328-
AlertGetter: optr.availableUpdates.AlertGetter,
326+
risks: optr.risks,
329327
LastAttempt: optr.availableUpdates.LastAttempt,
330328
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
331329
Current: *optr.availableUpdates.Current.DeepCopy(),
@@ -364,9 +362,14 @@ func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[st
364362
riskCondition.Status = metav1.ConditionUnknown
365363
riskCondition.Reason = riskConditionReasonEvaluationFailed
366364
riskCondition.Message = msg
367-
} else if match {
368-
riskCondition.Status = metav1.ConditionTrue
369-
riskCondition.Reason = riskConditionReasonMatch
365+
} else {
366+
if match {
367+
riskCondition.Status = metav1.ConditionTrue
368+
riskCondition.Reason = riskConditionReasonMatch
369+
}
370+
if existingCondition := meta.FindStatusCondition(risk.Conditions, riskCondition.Type); existingCondition != nil && existingCondition.Status == riskCondition.Status {
371+
riskCondition = *existingCondition // preserve more-detailed information, LastTransitionTime, etc., from the source
372+
}
370373
}
371374
riskConditions[risk.Name] = []metav1.Condition{riskCondition}
372375
}
@@ -518,174 +521,36 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
518521
}
519522
}
520523

521-
type AlertGetter interface {
522-
Get(ctx context.Context) prometheusv1.AlertsResult
523-
}
524-
525-
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) []configv1.ConditionalUpdateRisk {
526-
if u == nil || u.AlertGetter == nil {
527-
return nil
524+
func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
525+
if u == nil {
526+
return
528527
}
529-
alertsResult := u.AlertGetter.Get(ctx)
530-
return alertsToRisks(alertsResult.Alerts)
531-
}
532-
533-
func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
534-
klog.V(2).Infof("Found %d alerts", len(alerts))
535-
risks := map[string]configv1.ConditionalUpdateRisk{}
536-
for _, alert := range alerts {
537-
var alertName string
538-
if alertName = string(alert.Labels["alertname"]); alertName == "" {
539-
continue
540-
}
541-
if alert.State == "pending" {
542-
continue
543-
}
544-
545-
var summary string
546-
if summary = string(alert.Annotations["summary"]); summary == "" {
547-
summary = alertName
548-
}
549-
if !strings.HasSuffix(summary, ".") {
550-
summary += "."
551-
}
552-
553-
var description string
554-
alertMessage := string(alert.Annotations["message"])
555-
alertDescription := string(alert.Annotations["description"])
556-
switch {
557-
case alertMessage != "" && alertDescription != "":
558-
description += " The alert description is: " + alertDescription + " | " + alertMessage
559-
case alertDescription != "":
560-
description += " The alert description is: " + alertDescription
561-
case alertMessage != "":
562-
description += " The alert description is: " + alertMessage
563-
default:
564-
description += " The alert has no description."
565-
}
566528

567-
var runbook string
568-
alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
569-
if runbook = string(alert.Annotations["runbook_url"]); runbook == "" {
570-
runbook = "<alert does not have a runbook_url annotation>"
571-
} else {
572-
alertURL = runbook
573-
}
574-
575-
details := fmt.Sprintf("%s%s %s", summary, description, runbook)
576-
577-
severity := string(alert.Labels["severity"])
578-
if severity == "critical" {
579-
if alertName == internal.AlertNamePodDisruptionBudgetLimit {
580-
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
529+
if u.ShouldReconcileAcceptRisks() {
530+
allVersions := make([]string, 0, len(u.Updates)+len(u.ConditionalUpdates))
531+
for _, u := range u.Updates {
532+
allVersions = append(allVersions, u.Version)
533+
}
534+
for _, u := range u.ConditionalUpdates {
535+
allVersions = append(allVersions, u.Release.Version)
536+
}
537+
if u.risks != nil {
538+
name := u.risks.Name()
539+
risks, versions, err := u.risks.Risks(ctx, allVersions)
540+
if err != nil {
541+
klog.Errorf("detecting risks from %s failed, which might impact risk evaluation: %v", name, err)
542+
}
543+
u.Updates, u.ConditionalUpdates, err = risk.Merge(u.Updates, u.ConditionalUpdates, risks, versions)
544+
if err != nil {
545+
klog.Errorf("merging risks from %s failed, which might impact risk evaluation: %v", name, err)
581546
}
582-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
583-
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
584-
Status: metav1.ConditionTrue,
585-
Reason: fmt.Sprintf("Alert:%s", alert.State),
586-
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details),
587-
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
588-
})
589-
continue
590-
}
591-
592-
if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
593-
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
594-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
595-
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
596-
Status: metav1.ConditionTrue,
597-
Reason: internal.AlertConditionReason(string(alert.State)),
598-
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details),
599-
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
600-
})
601-
continue
602-
}
603-
604-
if internal.HavePullWaiting.Has(alertName) ||
605-
internal.HaveNodes.Has(alertName) ||
606-
alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
607-
alertName == internal.AlertNameVMCannotBeEvicted {
608-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
609-
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
610-
Status: metav1.ConditionTrue,
611-
Reason: internal.AlertConditionReason(string(alert.State)),
612-
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details),
613-
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
614-
})
615-
continue
616-
}
617-
618-
updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
619-
if updatePrecheck == "true" {
620-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
621-
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
622-
Status: metav1.ConditionTrue,
623-
Reason: fmt.Sprintf("Alert:%s", alert.State),
624-
Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details),
625-
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
626-
})
627-
continue
628-
}
629-
}
630-
631-
klog.V(2).Infof("Got %d risks", len(risks))
632-
if len(risks) == 0 {
633-
return nil
634-
}
635-
636-
var ret []configv1.ConditionalUpdateRisk
637-
var keys []string
638-
for k := range risks {
639-
keys = append(keys, k)
640-
}
641-
sort.Strings(keys)
642-
for _, k := range keys {
643-
ret = append(ret, risks[k])
644-
}
645-
return ret
646-
}
647-
648-
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
649-
risk, ok := risks[riskName]
650-
if !ok {
651-
return configv1.ConditionalUpdateRisk{
652-
Name: riskName,
653-
Message: message,
654-
URL: url,
655-
// Always as the alert is firing
656-
MatchingRules: []configv1.ClusterCondition{{Type: "Always"}},
657-
Conditions: []metav1.Condition{condition},
658-
}
659-
}
660-
661-
if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
662-
c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
663-
if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
664-
c.LastTransitionTime = condition.LastTransitionTime
665547
}
666-
meta.SetStatusCondition(&risk.Conditions, *c)
667-
}
668-
669-
return risk
670-
}
671-
672-
func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
673-
if u == nil {
674-
return
675548
}
676549

677550
riskVersions := loadRiskVersions(u.ConditionalUpdates)
678551
risks := risksInOrder(riskVersions)
679552
u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)
680553

681-
if u.ShouldReconcileAcceptRisks() {
682-
u.attachAlertRisksToUpdates(u.evaluateAlertRisks(ctx))
683-
}
684-
685-
if err := sanityCheck(u.ConditionalUpdates); err != nil {
686-
klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
687-
}
688-
689554
for i, conditionalUpdate := range u.ConditionalUpdates {
690555
condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions)
691556

@@ -701,25 +566,6 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
701566
}
702567
}
703568

704-
func sanityCheck(updates []configv1.ConditionalUpdate) error {
705-
risks := map[string]configv1.ConditionalUpdateRisk{}
706-
var errs []error
707-
for _, update := range updates {
708-
for _, risk := range update.Risks {
709-
if v, ok := risks[risk.Name]; ok {
710-
if diff := cmp.Diff(v, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" {
711-
errs = append(errs, fmt.Errorf("found collision on risk %s: %v and %v", risk.Name, v, risk))
712-
}
713-
} else if trimmed := strings.TrimSpace(risk.Name); trimmed == "" {
714-
errs = append(errs, fmt.Errorf("found invalid name on risk %v", risk))
715-
} else {
716-
risks[risk.Name] = risk
717-
}
718-
}
719-
}
720-
return utilerrors.NewAggregate(errs)
721-
}
722-
723569
func (u *availableUpdates) addUpdate(release configv1.Release) {
724570
for _, update := range u.Updates {
725571
if update.Image == release.Image {
@@ -738,45 +584,6 @@ func (u *availableUpdates) removeUpdate(image string) {
738584
}
739585
}
740586

741-
func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) {
742-
if u == nil || len(alertRisks) == 0 {
743-
return
744-
}
745-
if u.RiskConditions == nil {
746-
u.RiskConditions = map[string][]metav1.Condition{}
747-
}
748-
for _, alertRisk := range alertRisks {
749-
u.RiskConditions[alertRisk.Name] = alertRisk.Conditions
750-
}
751-
var conditionalUpdates []configv1.ConditionalUpdate
752-
for _, update := range u.Updates {
753-
conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{
754-
Release: update,
755-
Risks: alertRisks,
756-
})
757-
}
758-
u.Updates = nil
759-
for _, conditionalUpdate := range u.ConditionalUpdates {
760-
for _, alertRisk := range alertRisks {
761-
var found bool
762-
for _, risk := range conditionalUpdate.Risks {
763-
if alertRisk.Name == risk.Name {
764-
found = true
765-
break
766-
}
767-
}
768-
if !found {
769-
conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk)
770-
}
771-
}
772-
conditionalUpdates = append(conditionalUpdates, conditionalUpdate)
773-
}
774-
sort.Slice(conditionalUpdates, func(i, j int) bool {
775-
return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version
776-
})
777-
u.ConditionalUpdates = conditionalUpdates
778-
}
779-
780587
func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string {
781588
template := `Could not evaluate exposure to update risk %s (%v)
782589
%s description: %s

0 commit comments

Comments
 (0)