From c3a28cdddd27027cd4188540f0a3ae04833e5268 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Thu, 2 Apr 2026 19:56:44 -0700 Subject: [PATCH 1/3] pkg/risk: Refactor alerts into a generic update-risk interface 3b365d0e7b (OTA-1813: Populate risks from alerts, 2026-02-27, #1329) added the alert-risk logic. But the implementation in that commit was fairly directly integrated into pkg/cvo. We want to extend update-risk detection with additional providers in the future, like preflights from the target release [1], and that could get complicated with logic tucked into pkg/cvo. And pkg/cvo is already plenty complicated, even without this functionality inline. This commit refactors to remove a few hundred lines of code from pkg/cvo out to a new pkg/risk. And that allows the pkg/cvo handling to move from "I'm integrating additional risks from alerts" to "I'm integrating additional risks from a source". As we more forward from here, we can continue to extend the functionality of the source, and will only have to tweak the source in cvo.go's New(...). Most of the code is either straightforward interface work or lifted-and-shifted pre-existing code. But the ReplaceAll for U+00A0 No-break space took some time to work out, so I'll explain that in detail here. It flies in the face of [2]: Do not depend on this output being stable. If you need the ability to programmatically interpret the difference, consider using a custom Reporter. I'm manually unwinding the uncertainty they inject in github.com/google/go-cmp/cmp /report_text.go's indentMode.appendIndent, because: * I want to test the error message content, to ensure it's sufficiently readable. * I'm ok manually updating the test expectations if a go-cpm bump brings in rendering improvements. That manual update will give us space to review to ensure we're still readable. [1]: https://github.com/openshift/enhancements/pull/1930 [2]: https://pkg.go.dev/github.com/google/go-cmp/cmp#Diff --- pkg/cvo/availableupdates.go | 261 ++---------- pkg/cvo/availableupdates_test.go | 401 ++----------------- pkg/cvo/cvo.go | 9 +- pkg/cvo/cvo_test.go | 2 +- pkg/risk/alert/alert.go | 186 +++++++++ pkg/risk/alert/alert_test.go | 198 +++++++++ pkg/{cvo => risk/alert}/testdata/alerts.json | 0 pkg/risk/mock/mock.go | 46 +++ pkg/risk/risk.go | 163 ++++++++ pkg/risk/risk_test.go | 262 ++++++++++++ 10 files changed, 922 insertions(+), 606 deletions(-) create mode 100644 pkg/risk/alert/alert.go create mode 100644 pkg/risk/alert/alert_test.go rename pkg/{cvo => risk/alert}/testdata/alerts.json (100%) create mode 100644 pkg/risk/mock/mock.go create mode 100644 pkg/risk/risk.go create mode 100644 pkg/risk/risk_test.go diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 178fd380ec..c96abaa924 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -12,15 +12,11 @@ import ( "time" "github.com/blang/semver/v4" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" - prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" @@ -30,6 +26,7 @@ import ( "github.com/openshift/cluster-version-operator/pkg/cincinnati" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" "github.com/openshift/cluster-version-operator/pkg/internal" + "github.com/openshift/cluster-version-operator/pkg/risk" ) const noArchitecture string = "NoArchitecture" @@ -103,7 +100,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) { needsConditionalUpdateEval = true } - // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval if !needsConditionalUpdateEval { klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) return nil @@ -164,7 +160,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 optrAvailableUpdates.AcceptRisks = acceptRisks optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks - optrAvailableUpdates.AlertGetter = optr.AlertGetter + optrAvailableUpdates.risks = optr.risks optrAvailableUpdates.evaluateConditionalUpdates(ctx) queueKey := optr.queueKey() @@ -219,7 +215,9 @@ type availableUpdates struct { // RiskConditions stores the condition for every risk (name, url, message, matchingRules). RiskConditions map[string][]metav1.Condition - AlertGetter AlertGetter + // risks holds update-risk source (in-cluster alerts, etc.) + // that will be aggregated into conditional update risks. + risks risk.Source } func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool { @@ -325,7 +323,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks, AcceptRisks: optr.availableUpdates.AcceptRisks, RiskConditions: optr.availableUpdates.RiskConditions, - AlertGetter: optr.availableUpdates.AlertGetter, + risks: optr.risks, LastAttempt: optr.availableUpdates.LastAttempt, LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange, Current: *optr.availableUpdates.Current.DeepCopy(), @@ -364,9 +362,14 @@ func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[st riskCondition.Status = metav1.ConditionUnknown riskCondition.Reason = riskConditionReasonEvaluationFailed riskCondition.Message = msg - } else if match { - riskCondition.Status = metav1.ConditionTrue - riskCondition.Reason = riskConditionReasonMatch + } else { + if match { + riskCondition.Status = metav1.ConditionTrue + riskCondition.Reason = riskConditionReasonMatch + } + if existingCondition := meta.FindStatusCondition(risk.Conditions, riskCondition.Type); existingCondition != nil && existingCondition.Status == riskCondition.Status { + riskCondition = *existingCondition // preserve more-detailed information, LastTransitionTime, etc., from the source + } } riskConditions[risk.Name] = []metav1.Condition{riskCondition} } @@ -518,174 +521,36 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran } } -type AlertGetter interface { - Get(ctx context.Context) prometheusv1.AlertsResult -} - -func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) []configv1.ConditionalUpdateRisk { - if u == nil || u.AlertGetter == nil { - return nil +func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { + if u == nil { + return } - alertsResult := u.AlertGetter.Get(ctx) - return alertsToRisks(alertsResult.Alerts) -} - -func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { - klog.V(2).Infof("Found %d alerts", len(alerts)) - risks := map[string]configv1.ConditionalUpdateRisk{} - for _, alert := range alerts { - var alertName string - if alertName = string(alert.Labels["alertname"]); alertName == "" { - continue - } - if alert.State == "pending" { - continue - } - - var summary string - if summary = string(alert.Annotations["summary"]); summary == "" { - summary = alertName - } - if !strings.HasSuffix(summary, ".") { - summary += "." - } - - var description string - alertMessage := string(alert.Annotations["message"]) - alertDescription := string(alert.Annotations["description"]) - switch { - case alertMessage != "" && alertDescription != "": - description += " The alert description is: " + alertDescription + " | " + alertMessage - case alertDescription != "": - description += " The alert description is: " + alertDescription - case alertMessage != "": - description += " The alert description is: " + alertMessage - default: - description += " The alert has no description." - } - var runbook string - alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound" - if runbook = string(alert.Annotations["runbook_url"]); runbook == "" { - runbook = "" - } else { - alertURL = runbook - } - - details := fmt.Sprintf("%s%s %s", summary, description, runbook) - - severity := string(alert.Labels["severity"]) - if severity == "critical" { - if alertName == internal.AlertNamePodDisruptionBudgetLimit { - details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + if u.ShouldReconcileAcceptRisks() { + allVersions := make([]string, 0, len(u.Updates)+len(u.ConditionalUpdates)) + for _, u := range u.Updates { + allVersions = append(allVersions, u.Version) + } + for _, u := range u.ConditionalUpdates { + allVersions = append(allVersions, u.Release.Version) + } + if u.risks != nil { + name := u.risks.Name() + risks, versions, err := u.risks.Risks(ctx, allVersions) + if err != nil { + klog.Errorf("detecting risks from %s failed, which might impact risk evaluation: %v", name, err) + } + u.Updates, u.ConditionalUpdates, err = risk.Merge(u.Updates, u.ConditionalUpdates, risks, versions) + if err != nil { + klog.Errorf("merging risks from %s failed, which might impact risk evaluation: %v", name, err) } - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: fmt.Sprintf("Alert:%s", alert.State), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { - details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: internal.AlertConditionReason(string(alert.State)), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - if internal.HavePullWaiting.Has(alertName) || - internal.HaveNodes.Has(alertName) || - alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || - alertName == internal.AlertNameVMCannotBeEvicted { - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: internal.AlertConditionReason(string(alert.State)), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) - if updatePrecheck == "true" { - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: fmt.Sprintf("Alert:%s", alert.State), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - } - - klog.V(2).Infof("Got %d risks", len(risks)) - if len(risks) == 0 { - return nil - } - - var ret []configv1.ConditionalUpdateRisk - var keys []string - for k := range risks { - keys = append(keys, k) - } - sort.Strings(keys) - for _, k := range keys { - ret = append(ret, risks[k]) - } - return ret -} - -func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk { - risk, ok := risks[riskName] - if !ok { - return configv1.ConditionalUpdateRisk{ - Name: riskName, - Message: message, - URL: url, - // Always as the alert is firing - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{condition}, - } - } - - if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { - c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) - if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { - c.LastTransitionTime = condition.LastTransitionTime } - meta.SetStatusCondition(&risk.Conditions, *c) - } - - return risk -} - -func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { - if u == nil { - return } riskVersions := loadRiskVersions(u.ConditionalUpdates) risks := risksInOrder(riskVersions) u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry) - if u.ShouldReconcileAcceptRisks() { - u.attachAlertRisksToUpdates(u.evaluateAlertRisks(ctx)) - } - - if err := sanityCheck(u.ConditionalUpdates); err != nil { - klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err) - } - for i, conditionalUpdate := range u.ConditionalUpdates { condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) @@ -701,25 +566,6 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { } } -func sanityCheck(updates []configv1.ConditionalUpdate) error { - risks := map[string]configv1.ConditionalUpdateRisk{} - var errs []error - for _, update := range updates { - for _, risk := range update.Risks { - if v, ok := risks[risk.Name]; ok { - if diff := cmp.Diff(v, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { - errs = append(errs, fmt.Errorf("found collision on risk %s: %v and %v", risk.Name, v, risk)) - } - } else if trimmed := strings.TrimSpace(risk.Name); trimmed == "" { - errs = append(errs, fmt.Errorf("found invalid name on risk %v", risk)) - } else { - risks[risk.Name] = risk - } - } - } - return utilerrors.NewAggregate(errs) -} - func (u *availableUpdates) addUpdate(release configv1.Release) { for _, update := range u.Updates { if update.Image == release.Image { @@ -738,45 +584,6 @@ func (u *availableUpdates) removeUpdate(image string) { } } -func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) { - if u == nil || len(alertRisks) == 0 { - return - } - if u.RiskConditions == nil { - u.RiskConditions = map[string][]metav1.Condition{} - } - for _, alertRisk := range alertRisks { - u.RiskConditions[alertRisk.Name] = alertRisk.Conditions - } - var conditionalUpdates []configv1.ConditionalUpdate - for _, update := range u.Updates { - conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{ - Release: update, - Risks: alertRisks, - }) - } - u.Updates = nil - for _, conditionalUpdate := range u.ConditionalUpdates { - for _, alertRisk := range alertRisks { - var found bool - for _, risk := range conditionalUpdate.Risks { - if alertRisk.Name == risk.Name { - found = true - break - } - } - if !found { - conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk) - } - } - conditionalUpdates = append(conditionalUpdates, conditionalUpdate) - } - sort.Slice(conditionalUpdates, func(i, j int) bool { - return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version - }) - u.ConditionalUpdates = conditionalUpdates -} - func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string { template := `Could not evaluate exposure to update risk %s (%v) %s description: %s diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 400f8a5185..86b6b6c4f2 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -2,14 +2,11 @@ package cvo import ( "context" - "encoding/json" "errors" "fmt" "net/http" "net/http/httptest" "net/url" - "os" - "path/filepath" "runtime" "testing" "time" @@ -17,15 +14,12 @@ import ( "github.com/blang/semver/v4" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/workqueue" @@ -35,6 +29,8 @@ import ( "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/mock" "github.com/openshift/cluster-version-operator/pkg/featuregates" + "github.com/openshift/cluster-version-operator/pkg/risk" + riskmock "github.com/openshift/cluster-version-operator/pkg/risk/mock" ) // notFoundProxyLister is a stub for ProxyLister @@ -221,7 +217,7 @@ var availableUpdatesCmpOpts = []cmp.Option{ cmpopts.IgnoreTypes(time.Time{}), cmpopts.IgnoreInterfaces(struct { clusterconditions.ConditionRegistry - AlertGetter + risk.Source }{}), } @@ -784,58 +780,6 @@ func TestSyncAvailableUpdatesDesiredUpdate(t *testing.T) { } } -func Test_sanityCheck(t *testing.T) { - tests := []struct { - name string - updates []configv1.ConditionalUpdate - expected error - }{ - { - name: "good", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskB"}}}, - }, - }, - { - name: "invalid risk name", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}, {Name: "", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found invalid name on risk {[] some []}")}), - }, - { - name: "bad in one update", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}, {Name: "riskA", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found collision on risk riskA: {[] riskA []} and {[] some riskA []}")}), - }, - { - name: "bad in two updates", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found collision on risk riskA: {[] riskA []} and {[] some riskA []}")}), - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - actual := sanityCheck(tt.updates) - if diff := cmp.Diff(tt.expected, actual, cmp.Comparer(func(x, y error) bool { - if x == nil || y == nil { - return x == nil && y == nil - } - return x.Error() == y.Error() - })); diff != "" { - t.Errorf("sanityCheck() mismatch (-want +got):\n%s", diff) - } - - }) - } -} - func Test_loadRiskVersions(t *testing.T) { testcases := []struct { name string @@ -957,170 +901,6 @@ func Test_loadRiskConditions(t *testing.T) { } } -type mockAlertGetter struct { - ret prometheusv1.AlertsResult - jsonFile string - t *testing.T -} - -func (m *mockAlertGetter) Get(_ context.Context) prometheusv1.AlertsResult { - var ret prometheusv1.AlertsResult - if m.jsonFile != "" { - data, err := os.ReadFile(m.jsonFile) - if err != nil { - m.t.Fatal(err) - } - err = json.Unmarshal(data, &ret) - if err != nil { - m.t.Fatal(err) - } - return ret - } - return m.ret -} - -func Test_evaluateAlertConditions(t *testing.T) { - t1 := time.Now() - t2 := time.Now().Add(-3 * time.Minute) - t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") - if err != nil { - t.Fatalf("failed to parse time: %v", err) - } - tests := []struct { - name string - u *availableUpdates - expectedErr error - expectedAlertRisks []configv1.ConditionalUpdateRisk - }{ - { - name: "basic case", - u: &availableUpdates{ - AlertGetter: &mockAlertGetter{ - t: t, - ret: prometheusv1.AlertsResult{ - Alerts: []prometheusv1.Alert{ - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), - model.LabelName("severity"): model.LabelValue("critical"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), - }, - ActiveAt: t1, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("not-important"), - }, - State: prometheusv1.AlertStatePending, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), - model.LabelName("severity"): model.LabelValue("severity"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), - }, - ActiveAt: t1, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), - model.LabelName("severity"): model.LabelValue("severity"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), - }, - ActiveAt: t2, - }, - }, - }, - }, - }, - expectedAlertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "PodDisruptionBudgetAtLimit", - Message: "summary.", - URL: "http://runbook.example.com/runbooks/bbb.md", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", - LastTransitionTime: metav1.NewTime(t2), - }}, - }, - { - Name: "PodDisruptionBudgetLimit", - Message: "summary.", - URL: "http://runbook.example.com/runbooks/abc.md", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", - LastTransitionTime: metav1.NewTime(t1), - }}, - }, - }, - }, - { - name: "from file", - u: &availableUpdates{ - AlertGetter: &mockAlertGetter{ - t: t, - jsonFile: filepath.Join("testdata", "alerts.json"), - }, - }, - expectedAlertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: metav1.NewTime(t3), - }}, - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - actual := tt.u.evaluateAlertRisks(context.TODO()) - if diff := cmp.Diff(tt.expectedAlertRisks, actual); diff != "" { - t.Errorf("AlertRisks mismatch (-want +got):\n%s", diff) - } - }) - } -} - func Test_newRecommendedReason(t *testing.T) { tests := []struct { name string @@ -1259,152 +1039,6 @@ func Test_newRecommendedReason(t *testing.T) { } } -func Test_attachAlertRisksToUpdates(t *testing.T) { - now := metav1.Now() - tests := []struct { - name string - alertRisks []configv1.ConditionalUpdateRisk - updates []configv1.Release - conditionUpdates []configv1.ConditionalUpdate - expected []configv1.Release - expectedConditionUpdates []configv1.ConditionalUpdate - expectedRiskConditions map[string][]metav1.Condition - }{ - { - name: "no alert risks", - }, - { - name: "basic case", - alertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - updates: []configv1.Release{ - { - Version: "4.21.1", - }, - { - Version: "4.21.2", - }, - }, - conditionUpdates: []configv1.ConditionalUpdate{ - { - Release: configv1.Release{ - Version: "4.21.3", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "Risk1", - }, - }, - }, - }, - expectedConditionUpdates: []configv1.ConditionalUpdate{ - { - Release: configv1.Release{ - Version: "4.21.1", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - { - Release: configv1.Release{ - Version: "4.21.2", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - { - Release: configv1.Release{ - Version: "4.21.3", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "Risk1", - }, - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - }, - expectedRiskConditions: map[string][]metav1.Condition{ - "TestAlert": []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - u := &availableUpdates{Updates: tt.updates, ConditionalUpdates: tt.conditionUpdates} - u.attachAlertRisksToUpdates(tt.alertRisks) - // attaching is idempotent - u.attachAlertRisksToUpdates(tt.alertRisks) - if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { - t.Errorf("available updates mismatch (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tt.expectedConditionUpdates, u.ConditionalUpdates); diff != "" { - t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tt.expectedRiskConditions, u.RiskConditions); diff != "" { - t.Errorf("risk conditions mismatch (-want +got):\n%s", diff) - } - }) - } -} - func Test_evaluateConditionalUpdates(t *testing.T) { t1, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") if err != nil { @@ -1419,6 +1053,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { shouldReconcileAcceptRisks func() bool updates []configv1.Release conditionUpdates []configv1.ConditionalUpdate + risks risk.Source expected []configv1.Release expectedConditionUpdates []configv1.ConditionalUpdate expectedRiskConditions map[string][]metav1.Condition @@ -1434,6 +1069,22 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Version: "4.21.2", }, }, + risks: &riskmock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }}, + }, expectedConditionUpdates: []configv1.ConditionalUpdate{ { Release: configv1.Release{ @@ -1443,7 +1094,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { { Name: "TestAlert", Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + URL: "https://example.com/testAlert", MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, Conditions: []metav1.Condition{{ Type: "Applies", @@ -1458,7 +1109,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Type: "Recommended", Status: "False", Reason: "TestAlert", - Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + Message: "Test summary. https://example.com/testAlert", LastTransitionTime: metav1.NewTime(t1), }}, }, @@ -1470,7 +1121,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { { Name: "TestAlert", Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + URL: "https://example.com/testAlert", MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, Conditions: []metav1.Condition{{ Type: "Applies", @@ -1485,7 +1136,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Type: "Recommended", Status: "False", Reason: "TestAlert", - Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + Message: "Test summary. https://example.com/testAlert", LastTransitionTime: metav1.NewTime(t1), }}, }, @@ -1507,9 +1158,9 @@ func Test_evaluateConditionalUpdates(t *testing.T) { ConditionalUpdates: tt.conditionUpdates, ShouldReconcileAcceptRisks: tt.shouldReconcileAcceptRisks, ConditionRegistry: conditionRegistry, - AlertGetter: &mockAlertGetter{t: t, jsonFile: filepath.Join("testdata", "alerts.json")}, + risks: tt.risks, } - u.evaluateConditionalUpdates(context.TODO()) + u.evaluateConditionalUpdates(context.Background()) if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { t.Errorf("available updates mismatch (-want +got):\n%s", diff) } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 9866fe80c3..e9eb1d3f3a 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -43,7 +43,6 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcebuilder" "github.com/openshift/cluster-version-operator/lib/validation" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" - "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" "github.com/openshift/cluster-version-operator/pkg/customsignaturestore" "github.com/openshift/cluster-version-operator/pkg/cvo/configuration" @@ -54,6 +53,8 @@ import ( "github.com/openshift/cluster-version-operator/pkg/payload" "github.com/openshift/cluster-version-operator/pkg/payload/precondition" preconditioncv "github.com/openshift/cluster-version-operator/pkg/payload/precondition/clusterversion" + "github.com/openshift/cluster-version-operator/pkg/risk" + "github.com/openshift/cluster-version-operator/pkg/risk/alert" ) const ( @@ -209,7 +210,9 @@ type Operator struct { // configuration, if enabled, reconciles the ClusterVersionOperator configuration. configuration *configuration.ClusterVersionOperatorConfiguration - AlertGetter AlertGetter + // risks holds update-risk source (in-cluster alerts, etc.) + // that will be aggregated into conditional update risks. + risks risk.Source } // New returns a new cluster version operator. @@ -270,7 +273,7 @@ func New( exclude: exclude, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), - AlertGetter: promql.NewAlertGetter(promqlTarget), + risks: alert.New("Alert", promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, requiredFeatureSet: featureSet, diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 5a8ba48e16..5a287f70cd 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -2783,7 +2783,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) { } } - if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks")); diff != "" { + if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks", "risks")); diff != "" { t.Fatalf("unexpected: %s", diff) } if (optr.queue.Len() > 0) != (optr.availableUpdates != nil) { diff --git a/pkg/risk/alert/alert.go b/pkg/risk/alert/alert.go new file mode 100644 index 0000000000..496d54d70b --- /dev/null +++ b/pkg/risk/alert/alert.go @@ -0,0 +1,186 @@ +package alert + +import ( + "context" + "fmt" + "sort" + "strings" + + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" + "github.com/openshift/cluster-version-operator/pkg/internal" + "github.com/openshift/cluster-version-operator/pkg/risk" +) + +func New(name string, promQLTarget clusterconditions.PromQLTarget) risk.Source { + return &alertRisk{ + name: name, + getter: promql.NewAlertGetter(promQLTarget), + } +} + +type alertRisk struct { + name string + getter promql.Getter +} + +func (a *alertRisk) Name() string { + return a.name +} + +func (a *alertRisk) Risks(ctx context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) { + alerts := a.getter.Get(ctx) + risks := alertsToRisks(alerts.Alerts) + versionMap := make(map[string][]string, len(risks)) + for _, risk := range risks { + versionMap[risk.Name] = versions + } + return risks, versionMap, nil +} + +func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { + klog.V(2).Infof("Found %d alerts", len(alerts)) + risks := map[string]configv1.ConditionalUpdateRisk{} + for _, alert := range alerts { + var alertName string + if alertName = string(alert.Labels["alertname"]); alertName == "" { + continue + } + if alert.State == "pending" { + continue + } + + var summary string + if summary = string(alert.Annotations["summary"]); summary == "" { + summary = alertName + } + if !strings.HasSuffix(summary, ".") { + summary += "." + } + + var description string + alertMessage := string(alert.Annotations["message"]) + alertDescription := string(alert.Annotations["description"]) + switch { + case alertMessage != "" && alertDescription != "": + description += " The alert description is: " + alertDescription + " | " + alertMessage + case alertDescription != "": + description += " The alert description is: " + alertDescription + case alertMessage != "": + description += " The alert description is: " + alertMessage + default: + description += " The alert has no description." + } + + var runbook string + alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound" + if runbook = string(alert.Annotations["runbook_url"]); runbook == "" { + runbook = "" + } else { + alertURL = runbook + } + + details := fmt.Sprintf("%s%s %s", summary, description, runbook) + + severity := string(alert.Labels["severity"]) + if severity == "critical" { + if alertName == internal.AlertNamePodDisruptionBudgetLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + } + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if internal.HavePullWaiting.Has(alertName) || + internal.HaveNodes.Has(alertName) || + alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || + alertName == internal.AlertNameVMCannotBeEvicted { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) + if updatePrecheck == "true" { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + } + + klog.V(2).Infof("Got %d risks", len(risks)) + if len(risks) == 0 { + return nil + } + + var ret []configv1.ConditionalUpdateRisk + var keys []string + for k := range risks { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + ret = append(ret, risks[k]) + } + return ret +} + +func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk { + risk, ok := risks[riskName] + if !ok { + return configv1.ConditionalUpdateRisk{ + Name: riskName, + Message: message, + URL: url, + // Always as the alert is firing + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{condition}, + } + } + + if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { + c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) + if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { + c.LastTransitionTime = condition.LastTransitionTime + } + meta.SetStatusCondition(&risk.Conditions, *c) + } + + return risk +} diff --git a/pkg/risk/alert/alert_test.go b/pkg/risk/alert/alert_test.go new file mode 100644 index 0000000000..9a12724d57 --- /dev/null +++ b/pkg/risk/alert/alert_test.go @@ -0,0 +1,198 @@ +package alert + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" +) + +type mockAlertGetter struct { + ret prometheusv1.AlertsResult + jsonFile string + t *testing.T +} + +func (m *mockAlertGetter) Get(_ context.Context) prometheusv1.AlertsResult { + var ret prometheusv1.AlertsResult + if m.jsonFile != "" { + data, err := os.ReadFile(m.jsonFile) + if err != nil { + m.t.Fatal(err) + } + err = json.Unmarshal(data, &ret) + if err != nil { + m.t.Fatal(err) + } + return ret + } + return m.ret +} + +func Test_alertRisk(t *testing.T) { + t1 := time.Now() + t2 := time.Now().Add(-3 * time.Minute) + t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } + tests := []struct { + name string + getter promql.Getter + expectedErr error + expectedAlertRisks []configv1.ConditionalUpdateRisk + }{ + { + name: "basic case", + getter: &mockAlertGetter{ + t: t, + ret: prometheusv1.AlertsResult{ + Alerts: []prometheusv1.Alert{ + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), + model.LabelName("severity"): model.LabelValue("critical"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("not-important"), + }, + State: prometheusv1.AlertStatePending, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t2, + }, + }, + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/bbb.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t2), + }}, + }, + { + Name: "PodDisruptionBudgetLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/abc.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + }, + { + name: "from file", + getter: &mockAlertGetter{ + t: t, + jsonFile: filepath.Join("testdata", "alerts.json"), + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t3), + }}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + source := &alertRisk{ + name: "Alert", + getter: tt.getter, + } + allVersions := []string{"1.2.3"} + risks, versions, err := source.Risks(context.Background(), allVersions) + + if diff := cmp.Diff(tt.expectedAlertRisks, risks); diff != "" { + t.Errorf("Alert Risks mismatch (-want +got):\n%s", diff) + } + + expectedVersions := make(map[string][]string, len(risks)) + for _, risk := range risks { + expectedVersions[risk.Name] = allVersions + } + if diff := cmp.Diff(expectedVersions, versions); diff != "" { + t.Errorf("Alert Risks version mismatch (-want +got):\n%s", diff) + } + + if err != nil { + t.Errorf("Alert Risks evaluation error: %v", err) + } + }) + } +} diff --git a/pkg/cvo/testdata/alerts.json b/pkg/risk/alert/testdata/alerts.json similarity index 100% rename from pkg/cvo/testdata/alerts.json rename to pkg/risk/alert/testdata/alerts.json diff --git a/pkg/risk/mock/mock.go b/pkg/risk/mock/mock.go new file mode 100644 index 0000000000..b5c6e88a03 --- /dev/null +++ b/pkg/risk/mock/mock.go @@ -0,0 +1,46 @@ +// Package mock implements an update risk source with mock responses, +// for convenient testing. +package mock + +import ( + "context" + "errors" + + configv1 "github.com/openshift/api/config/v1" +) + +// Mock implements an update risk source with mock responses. +type Mock struct { + // InternalName is the source name. + InternalName string + + // InternalRisks are the detected update risks. + InternalRisks []configv1.ConditionalUpdateRisk + + // InternalVersions maps the detected risks to version + // strings. If unset, all internal risks will be reported as + // applicable to all known versions. + InternalVersions map[string][]string +} + +// Name returns the source's name. +func (m *Mock) Name() string { + return m.InternalName +} + +// Risks returns the current set of risks the source is aware of. +func (m *Mock) Risks(_ context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) { + if m.InternalRisks == nil { + return nil, nil, errors.New("failed to calculate risks") + } + var versionMap map[string][]string + if m.InternalVersions != nil { + versionMap = m.InternalVersions + } else { + versionMap = make(map[string][]string, len(m.InternalRisks)) + for _, risk := range m.InternalRisks { + versionMap[risk.Name] = versions + } + } + return m.InternalRisks, versionMap, nil +} diff --git a/pkg/risk/risk.go b/pkg/risk/risk.go new file mode 100644 index 0000000000..065999a0ba --- /dev/null +++ b/pkg/risk/risk.go @@ -0,0 +1,163 @@ +// Package risk implements conditional update risk detection from a +// generic source. For example, risks could come from the update +// service's conditionalEdges. Or from firing in-cluster alerts. Or +// from other sources. This package makes it easy to extend the +// cluster-version operator with additional risk-detection sources over +// time. +package risk + +import ( + "context" + "errors" + "fmt" + "regexp" + "slices" + "sort" + "strings" + + "github.com/blang/semver/v4" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + configv1 "github.com/openshift/api/config/v1" +) + +// validName enforces the non-empty CamelCase risk-naming convention. +var validName = regexp.MustCompile(`^[A-Z][A-Za-z0-9]*$`) + +// Source represents a risk source, like the update service's +// conditionalEdges. Or firing in-cluster alerts. Or another source. +type Source interface { + // Name returns the source's name, which can be prefixed to + // the returned risk names to avoid collisions when aggregating + // in a single map. + Name() string + + // Risks returns the current set of risks the source is aware of. + // Implementations are encouraged to make this an inexpensive call, + // for example by caching expensive precursors, so the caller does + // not have to worry about the load generated by frequent calls. + // Arguments are a context to allow cancellation, and a slice of all + // update versions (for sources that are reporting on current cluster + // state, and want any detected risks to apply to all versions). + // Returns a slice of detected risks, and a map from each risk name + // to the version strings that risk is associated with. + Risks(ctx context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) +} + +// Merge updates the input updates and conditionalUpdates by merging +// additional risks. The versions map has risk names as the key, and +// a slice of version strings that risk should be merged into as the +// value. It returns the modified updates and conditionalUpdates slices. +func Merge(updates []configv1.Release, conditionalUpdates []configv1.ConditionalUpdate, risks []configv1.ConditionalUpdateRisk, versions map[string][]string) ([]configv1.Release, []configv1.ConditionalUpdate, error) { + var errs []error + riskMap := make(map[string]configv1.ConditionalUpdateRisk, len(risks)) + invalidRiskMap := make(map[string]struct{}, len(risks)) + for _, risk := range risks { + if !validName.MatchString(risk.Name) { + invalidRiskMap[risk.Name] = struct{}{} + errs = append(errs, fmt.Errorf("invalid risk name %q does not match %q", risk.Name, validName)) + continue + } + + existingRisk, ok := riskMap[risk.Name] + if ok { + if diff := cmp.Diff(existingRisk, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { + errs = append(errs, fmt.Errorf("divergent definitions of risk %q from the merging source:\n%s", risk.Name, diff)) + } + continue + } + riskMap[risk.Name] = risk + } + + riskMissMap := make(map[string]struct{}, len(versions)) + versionRiskNames := make([]string, 0, len(versions)) + for riskName := range versions { + if _, ok := invalidRiskMap[riskName]; ok { + continue // we're already complaining about these + } + versionRiskNames = append(versionRiskNames, riskName) + } + sort.Strings(versionRiskNames) + for _, riskName := range versionRiskNames { + risk, ok := riskMap[riskName] + if !ok { + riskMissMap[riskName] = struct{}{} + continue + } + for _, version := range versions[riskName] { + found := false + for i, update := range updates { + if update.Version != version { + continue + } + found = true + conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{ + Release: update, + Risks: []configv1.ConditionalUpdateRisk{risk}, + }) + updates = append(updates[:i], updates[i+1:]...) + if len(updates) == 0 { + updates = nil + } + break + } + if found { + continue + } + for i, conditionalUpdate := range conditionalUpdates { + if conditionalUpdate.Release.Version != version { + continue + } + found = true + riskFound := false + for _, existingRisk := range conditionalUpdate.Risks { + if existingRisk.Name != risk.Name { + continue + } + riskFound = true + if diff := cmp.Diff(existingRisk, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { + errs = append(errs, fmt.Errorf("divergent definitions of risk %q:\n%s", risk.Name, diff)) + continue + } + break + } + if !riskFound { + conditionalUpdates[i].Risks = append(conditionalUpdate.Risks, risk) + } + } + if !found { + errs = append(errs, fmt.Errorf("cannot merge risk %q into unknown release version %q", risk.Name, version)) + } + } + } + if len(riskMissMap) > 0 { + riskMisses := make([]string, 0, len(riskMissMap)) + for riskName := range riskMissMap { + riskMisses = append(riskMisses, riskName) + } + sort.Strings(riskMisses) + errs = append(errs, fmt.Errorf("merge versions referenced unknown risks: %s", strings.Join(riskMisses, ", "))) + } + + slices.SortFunc(conditionalUpdates, func(a, b configv1.ConditionalUpdate) int { + va, errA := semver.Parse(a.Release.Version) + vb, errB := semver.Parse(b.Release.Version) + if errA == nil && errB == nil { + return va.Compare(vb) + } else if errA == nil { + return 1 + } else if errB == nil { + return -1 + } + return strings.Compare(a.Release.Version, b.Release.Version) + }) + + for i := range conditionalUpdates { + slices.SortFunc(conditionalUpdates[i].Risks, func(a, b configv1.ConditionalUpdateRisk) int { + return strings.Compare(a.Name, b.Name) + }) + } + + return updates, conditionalUpdates, errors.Join(errs...) +} diff --git a/pkg/risk/risk_test.go b/pkg/risk/risk_test.go new file mode 100644 index 0000000000..c0b3638a50 --- /dev/null +++ b/pkg/risk/risk_test.go @@ -0,0 +1,262 @@ +package risk_test + +import ( + "context" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/risk" + "github.com/openshift/cluster-version-operator/pkg/risk/mock" +) + +func Test_Merge(t *testing.T) { + tests := []struct { + name string + updates []configv1.Release + conditionalUpdates []configv1.ConditionalUpdate + risks risk.Source + expectedUpdates []configv1.Release + expectedConditionalUpdates []configv1.ConditionalUpdate + expectedError string + }{ + { + name: "basic case", + updates: []configv1.Release{ + { + Version: "4.21.1", + }, + { + Version: "4.21.2", + }, + }, + conditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "ExistingRisk", + }, + }, + }, + }, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }}, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "ExistingRisk", + }, + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + }, + }, + { + name: "invalid risk name", + updates: []configv1.Release{{Version: "1.2.3"}}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }}, + }, + expectedUpdates: []configv1.Release{{Version: "1.2.3"}}, + expectedError: `invalid risk name "" does not match "^[A-Z][A-Za-z0-9]*$"`, + }, + { + name: "single-source collision", + updates: []configv1.Release{{Version: "1.2.3"}}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + { + Name: "RiskA", + Message: "Test summary 2.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + expectedError: `divergent definitions of risk "RiskA" from the merging source: + v1.ConditionalUpdateRisk{ + ... // 1 ignored field + URL: "https://example.com/testAlert", + Name: "RiskA", +- Message: "Test summary 1.", ++ Message: "Test summary 2.", + MatchingRules: {{Type: "Always"}}, + } +`, + }, + { + name: "multi-source collision", + conditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 2.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + expectedError: `divergent definitions of risk "RiskA": + v1.ConditionalUpdateRisk{ + ... // 1 ignored field + URL: "https://example.com/testAlert", + Name: "RiskA", +- Message: "Test summary 1.", ++ Message: "Test summary 2.", + MatchingRules: {{Type: "Always"}}, + } +`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + allVersions := make([]string, 0, len(tt.updates)+len(tt.conditionalUpdates)) + for _, u := range tt.updates { + allVersions = append(allVersions, u.Version) + } + for _, u := range tt.conditionalUpdates { + allVersions = append(allVersions, u.Release.Version) + } + risks, versions, err := tt.risks.Risks(context.Background(), allVersions) + if err != nil { + t.Fatalf("failed to get risks from %s: %v", tt.risks.Name(), err) + } + updates, conditionalUpdates, err := risk.Merge(tt.updates, tt.conditionalUpdates, risks, versions) + initialPassSuccess := true + if diff := cmp.Diff(tt.expectedUpdates, updates); diff != "" { + initialPassSuccess = false + t.Errorf("available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionalUpdates, conditionalUpdates); diff != "" { + initialPassSuccess = false + t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) + } + var actualError string + if err != nil { + actualError = strings.ReplaceAll(err.Error(), "\u00a0", " ") + } + if diff := cmp.Diff(tt.expectedError, actualError); diff != "" { + initialPassSuccess = false + t.Errorf("merge error mismatch (-want +got):\n%s", diff) + } + if !initialPassSuccess { + return // we already have enough errors to work through; skip the idempotency test + } + updates, conditionalUpdates, err = risk.Merge(updates, conditionalUpdates, risks, versions) + if diff := cmp.Diff(tt.expectedUpdates, updates); diff != "" { + t.Errorf("idempotent available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionalUpdates, conditionalUpdates); diff != "" { + t.Errorf("idempotent conditional updates mismatch (-want +got):\n%s", diff) + } + actualError = "" + if err != nil { + actualError = strings.ReplaceAll(err.Error(), "\u00a0", " ") + } + if diff := cmp.Diff(tt.expectedError, actualError); diff != "" { + t.Errorf("idempotent merge error mismatch (-want +got):\n%s", diff) + } + }) + } +} From 913e324924e7f1028974c87544e1b51bea276b6d Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Fri, 3 Apr 2026 20:52:40 -0700 Subject: [PATCH 2/3] pkg/cvo/availableupdates: Cache original upstream updates We've been mixing in alerts since 3b365d0e7b (OTA-1813: Populate risks from alerts, 2026-02-27, #1329). But when the alerts get resolved, we want the cluster to notice quickly. With this change, we keep the original upstream data isolated in new properties, so an evaluation round will be able to clear out resolved alerts, even if needFreshFetch is false because available updates were recently retrieved. --- pkg/cvo/availableupdates.go | 56 ++++++++++++++++++++++++-------- pkg/cvo/availableupdates_test.go | 1 + pkg/cvo/cvo_test.go | 3 +- 3 files changed, 44 insertions(+), 16 deletions(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index c96abaa924..fd767941bf 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -153,11 +153,13 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 condition.Reason == "ResponseInvalid")) if !responseFailed || (responseFailed && !preserveCacheOnFailure) { optrAvailableUpdates.Current = current - optrAvailableUpdates.Updates = updates - optrAvailableUpdates.ConditionalUpdates = conditionalUpdates + optrAvailableUpdates.upstreamUpdates = updates + optrAvailableUpdates.upstreamConditionalUpdates = conditionalUpdates } } + optrAvailableUpdates.Updates = deepCopyReleases(optrAvailableUpdates.upstreamUpdates) + optrAvailableUpdates.ConditionalUpdates = deepCopyConditionalUpdates(optrAvailableUpdates.upstreamConditionalUpdates) optrAvailableUpdates.AcceptRisks = acceptRisks optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks optrAvailableUpdates.risks = optr.risks @@ -205,10 +207,12 @@ type availableUpdates struct { // slice was empty. LastSyncOrConfigChange time.Time - Current configv1.Release - Updates []configv1.Release - ConditionalUpdates []configv1.ConditionalUpdate - ConditionRegistry clusterconditions.ConditionRegistry + Current configv1.Release + Updates []configv1.Release + upstreamUpdates []configv1.Release + ConditionalUpdates []configv1.ConditionalUpdate + upstreamConditionalUpdates []configv1.ConditionalUpdate + ConditionRegistry clusterconditions.ConditionRegistry Condition configv1.ClusterOperatorStatusCondition @@ -332,22 +336,46 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { } if optr.availableUpdates.Updates != nil { - u.Updates = make([]configv1.Release, 0, len(optr.availableUpdates.Updates)) - for _, update := range optr.availableUpdates.Updates { - u.Updates = append(u.Updates, *update.DeepCopy()) - } + u.Updates = deepCopyReleases(optr.availableUpdates.Updates) + } + + if optr.availableUpdates.upstreamUpdates != nil { + u.upstreamUpdates = deepCopyReleases(optr.availableUpdates.upstreamUpdates) } if optr.availableUpdates.ConditionalUpdates != nil { - u.ConditionalUpdates = make([]configv1.ConditionalUpdate, 0, len(optr.availableUpdates.ConditionalUpdates)) - for _, conditionalUpdate := range optr.availableUpdates.ConditionalUpdates { - u.ConditionalUpdates = append(u.ConditionalUpdates, *conditionalUpdate.DeepCopy()) - } + u.ConditionalUpdates = deepCopyConditionalUpdates(optr.availableUpdates.ConditionalUpdates) + } + + if optr.availableUpdates.upstreamConditionalUpdates != nil { + u.upstreamConditionalUpdates = deepCopyConditionalUpdates(optr.availableUpdates.upstreamConditionalUpdates) } return u } +func deepCopyReleases(releases []configv1.Release) []configv1.Release { + if releases == nil { + return nil + } + c := make([]configv1.Release, 0, len(releases)) + for _, update := range releases { + c = append(c, *update.DeepCopy()) + } + return c +} + +func deepCopyConditionalUpdates(updates []configv1.ConditionalUpdate) []configv1.ConditionalUpdate { + if updates == nil { + return nil + } + c := make([]configv1.ConditionalUpdate, 0, len(updates)) + for _, update := range updates { + c = append(c, *update.DeepCopy()) + } + return c +} + func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[string]riskWithVersion, conditionRegistry clusterconditions.ConditionRegistry) map[string][]metav1.Condition { riskConditions := map[string][]metav1.Condition{} for _, riskName := range risks { diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 86b6b6c4f2..3d07c11f1e 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -213,6 +213,7 @@ var cvFixture = &configv1.ClusterVersion{ } var availableUpdatesCmpOpts = []cmp.Option{ + cmpopts.IgnoreUnexported(availableUpdates{}), cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks"), cmpopts.IgnoreTypes(time.Time{}), cmpopts.IgnoreInterfaces(struct { diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 5a287f70cd..f789b98635 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -16,7 +16,6 @@ import ( "github.com/davecgh/go-spew/spew" "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" appsv1 "k8s.io/api/apps/v1" @@ -2783,7 +2782,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) { } } - if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks", "risks")); diff != "" { + if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, availableUpdatesCmpOpts...); diff != "" { t.Fatalf("unexpected: %s", diff) } if (optr.queue.Len() > 0) != (optr.availableUpdates != nil) { From d9fe47bd7988ec9eb49cce6b805fdf9186a9da82 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Tue, 7 Apr 2026 22:03:24 -0700 Subject: [PATCH 3/3] pkg/cvo/availableupdates: Log target version when explaining risk-acceptance Because [1]: 1. create a cluster which has 1 recommend version and 1 non-recommend version 2. create alert PodDisruptionBudgetAtLimit with warning severity 3. oc adm upgrade accept PodDisruptionBudgetAtLimit had been creating fairly noisy output like: I0408 03:58:45.443626 1 alerts.go:87] refreshed: 10 alerts I0408 03:58:45.443753 1 availableupdates.go:534] Found 10 alerts I0408 03:58:45.443818 1 availableupdates.go:631] Got 0 risks I0408 03:58:45.445403 1 availableupdates.go:868] Risk with name "PodDisruptionBudgetAtLimit" is accepted by the cluster admin and thus not in the evaluation of conditional update I0408 03:58:45.445421 1 availableupdates.go:868] Risk with name "PodDisruptionBudgetAtLimit" is accepted by the cluster admin and thus not in the evaluation of conditional update I0408 03:58:45.445429 1 availableupdates.go:868] Risk with name "PodDisruptionBudgetAtLimit" is accepted by the cluster admin and thus not in the evaluation of conditional update ... I0408 03:58:45.446455 1 availableupdates.go:868] Risk with name "PodDisruptionBudgetAtLimit" is accepted by the cluster admin and thus not in the evaluation of conditional update I0408 03:58:45.446460 1 availableupdates.go:868] Risk with name "PodDisruptionBudgetAtLimit" is accepted by the cluster admin and thus not in the evaluation of conditional update I0408 03:58:45.446475 1 availableupdates.go:177] Requeue available-update evaluation, because "4.22.0-ec.4" is Recommended=Unknown: EvaluationFailed: Could not evaluate exposure to update risk SomeInvokerThing (invalid PromQL result length must be one, but is 0) SomeInvokerThing description: On clusters on default invoker user, this imaginary bug can happen. SomeInvokerThing URL: https://bug.example.com/a I0408 03:58:45.449976 1 cvo.go:816] Finished syncing available updates "openshift-cluster-version/version" (159.68958ms) Logging the version should help distinguish confusedly looping on a single risk for a single target version, from looping over that same risks for multiple different target versions. [1]: https://redhat.atlassian.net/browse/OTA-1813?focusedCommentId=16643738 --- pkg/cvo/availableupdates.go | 8 ++++---- pkg/cvo/availableupdates_test.go | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index fd767941bf..9f1f88e9c9 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -580,7 +580,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry) for i, conditionalUpdate := range u.ConditionalUpdates { - condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) + condition := evaluateConditionalUpdate(conditionalUpdate, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) if condition.Status == metav1.ConditionTrue { u.addUpdate(conditionalUpdate.Release) @@ -666,7 +666,7 @@ func newRecommendedReason(now, want string) string { } func evaluateConditionalUpdate( - risks []configv1.ConditionalUpdateRisk, + conditionalUpdate configv1.ConditionalUpdate, acceptRisks sets.Set[string], shouldReconcileAcceptRisks func() bool, riskConditions map[string][]metav1.Condition, @@ -680,7 +680,7 @@ func evaluateConditionalUpdate( } var errorMessages []string - for _, risk := range risks { + for _, risk := range conditionalUpdate.Risks { riskCondition := meta.FindStatusCondition(riskConditions[risk.Name], internal.ConditionalUpdateRiskConditionTypeApplies) if riskCondition == nil { // This should never happen @@ -700,7 +700,7 @@ func evaluateConditionalUpdate( recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionTrue) recommended.Reason = newRecommendedReason(recommended.Reason, recommendedReasonAllExposedRisksAccepted) recommended.Message = "The update is recommended, because either risk does not apply to this cluster or it is accepted by cluster admins." - klog.V(2).Infof("Risk with name %q is accepted by the cluster admin and thus not in the evaluation of conditional update", risk.Name) + klog.V(2).Infof("Risk with name %q is accepted by the cluster admin and thus not in the evaluation of conditional update %s", risk.Name, conditionalUpdate.Release.Version) } else { recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionFalse) wantReason := recommendedReasonExposed diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 3d07c11f1e..77d3c2a2b6 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -517,7 +517,8 @@ func TestEvaluateConditionalUpdate(t *testing.T) { if tc.riskConditions == nil { tc.riskConditions = map[string][]metav1.Condition{} } - actual := evaluateConditionalUpdate(tc.risks, tc.acceptRisks, tc.shouldReconcileAcceptRisks, tc.riskConditions) + conditionalUpdate := configv1.ConditionalUpdate{Risks: tc.risks} + actual := evaluateConditionalUpdate(conditionalUpdate, tc.acceptRisks, tc.shouldReconcileAcceptRisks, tc.riskConditions) if diff := cmp.Diff(tc.expected, actual); diff != "" { t.Errorf("actual condition differs from expected:\n%s", diff) }