diff --git a/pkg/cvo/availableupdates.go b/pkg/cvo/availableupdates.go index 178fd380ec..9f1f88e9c9 100644 --- a/pkg/cvo/availableupdates.go +++ b/pkg/cvo/availableupdates.go @@ -12,15 +12,11 @@ import ( "time" "github.com/blang/semver/v4" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" - prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/klog/v2" @@ -30,6 +26,7 @@ import ( "github.com/openshift/cluster-version-operator/pkg/cincinnati" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" "github.com/openshift/cluster-version-operator/pkg/internal" + "github.com/openshift/cluster-version-operator/pkg/risk" ) const noArchitecture string = "NoArchitecture" @@ -103,7 +100,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) { needsConditionalUpdateEval = true } - // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval if !needsConditionalUpdateEval { klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339)) return nil @@ -157,14 +153,16 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1 condition.Reason == "ResponseInvalid")) if !responseFailed || (responseFailed && !preserveCacheOnFailure) { optrAvailableUpdates.Current = current - optrAvailableUpdates.Updates = updates - optrAvailableUpdates.ConditionalUpdates = conditionalUpdates + optrAvailableUpdates.upstreamUpdates = updates + optrAvailableUpdates.upstreamConditionalUpdates = conditionalUpdates } } + optrAvailableUpdates.Updates = deepCopyReleases(optrAvailableUpdates.upstreamUpdates) + optrAvailableUpdates.ConditionalUpdates = deepCopyConditionalUpdates(optrAvailableUpdates.upstreamConditionalUpdates) optrAvailableUpdates.AcceptRisks = acceptRisks optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks - optrAvailableUpdates.AlertGetter = optr.AlertGetter + optrAvailableUpdates.risks = optr.risks optrAvailableUpdates.evaluateConditionalUpdates(ctx) queueKey := optr.queueKey() @@ -209,17 +207,21 @@ type availableUpdates struct { // slice was empty. LastSyncOrConfigChange time.Time - Current configv1.Release - Updates []configv1.Release - ConditionalUpdates []configv1.ConditionalUpdate - ConditionRegistry clusterconditions.ConditionRegistry + Current configv1.Release + Updates []configv1.Release + upstreamUpdates []configv1.Release + ConditionalUpdates []configv1.ConditionalUpdate + upstreamConditionalUpdates []configv1.ConditionalUpdate + ConditionRegistry clusterconditions.ConditionRegistry Condition configv1.ClusterOperatorStatusCondition // RiskConditions stores the condition for every risk (name, url, message, matchingRules). RiskConditions map[string][]metav1.Condition - AlertGetter AlertGetter + // risks holds update-risk source (in-cluster alerts, etc.) + // that will be aggregated into conditional update risks. + risks risk.Source } func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool { @@ -325,7 +327,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks, AcceptRisks: optr.availableUpdates.AcceptRisks, RiskConditions: optr.availableUpdates.RiskConditions, - AlertGetter: optr.availableUpdates.AlertGetter, + risks: optr.risks, LastAttempt: optr.availableUpdates.LastAttempt, LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange, Current: *optr.availableUpdates.Current.DeepCopy(), @@ -334,22 +336,46 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates { } if optr.availableUpdates.Updates != nil { - u.Updates = make([]configv1.Release, 0, len(optr.availableUpdates.Updates)) - for _, update := range optr.availableUpdates.Updates { - u.Updates = append(u.Updates, *update.DeepCopy()) - } + u.Updates = deepCopyReleases(optr.availableUpdates.Updates) + } + + if optr.availableUpdates.upstreamUpdates != nil { + u.upstreamUpdates = deepCopyReleases(optr.availableUpdates.upstreamUpdates) } if optr.availableUpdates.ConditionalUpdates != nil { - u.ConditionalUpdates = make([]configv1.ConditionalUpdate, 0, len(optr.availableUpdates.ConditionalUpdates)) - for _, conditionalUpdate := range optr.availableUpdates.ConditionalUpdates { - u.ConditionalUpdates = append(u.ConditionalUpdates, *conditionalUpdate.DeepCopy()) - } + u.ConditionalUpdates = deepCopyConditionalUpdates(optr.availableUpdates.ConditionalUpdates) + } + + if optr.availableUpdates.upstreamConditionalUpdates != nil { + u.upstreamConditionalUpdates = deepCopyConditionalUpdates(optr.availableUpdates.upstreamConditionalUpdates) } return u } +func deepCopyReleases(releases []configv1.Release) []configv1.Release { + if releases == nil { + return nil + } + c := make([]configv1.Release, 0, len(releases)) + for _, update := range releases { + c = append(c, *update.DeepCopy()) + } + return c +} + +func deepCopyConditionalUpdates(updates []configv1.ConditionalUpdate) []configv1.ConditionalUpdate { + if updates == nil { + return nil + } + c := make([]configv1.ConditionalUpdate, 0, len(updates)) + for _, update := range updates { + c = append(c, *update.DeepCopy()) + } + return c +} + func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[string]riskWithVersion, conditionRegistry clusterconditions.ConditionRegistry) map[string][]metav1.Condition { riskConditions := map[string][]metav1.Condition{} for _, riskName := range risks { @@ -364,9 +390,14 @@ func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[st riskCondition.Status = metav1.ConditionUnknown riskCondition.Reason = riskConditionReasonEvaluationFailed riskCondition.Message = msg - } else if match { - riskCondition.Status = metav1.ConditionTrue - riskCondition.Reason = riskConditionReasonMatch + } else { + if match { + riskCondition.Status = metav1.ConditionTrue + riskCondition.Reason = riskConditionReasonMatch + } + if existingCondition := meta.FindStatusCondition(risk.Conditions, riskCondition.Type); existingCondition != nil && existingCondition.Status == riskCondition.Status { + riskCondition = *existingCondition // preserve more-detailed information, LastTransitionTime, etc., from the source + } } riskConditions[risk.Name] = []metav1.Condition{riskCondition} } @@ -518,176 +549,38 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran } } -type AlertGetter interface { - Get(ctx context.Context) prometheusv1.AlertsResult -} - -func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) []configv1.ConditionalUpdateRisk { - if u == nil || u.AlertGetter == nil { - return nil +func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { + if u == nil { + return } - alertsResult := u.AlertGetter.Get(ctx) - return alertsToRisks(alertsResult.Alerts) -} - -func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { - klog.V(2).Infof("Found %d alerts", len(alerts)) - risks := map[string]configv1.ConditionalUpdateRisk{} - for _, alert := range alerts { - var alertName string - if alertName = string(alert.Labels["alertname"]); alertName == "" { - continue - } - if alert.State == "pending" { - continue - } - - var summary string - if summary = string(alert.Annotations["summary"]); summary == "" { - summary = alertName - } - if !strings.HasSuffix(summary, ".") { - summary += "." - } - - var description string - alertMessage := string(alert.Annotations["message"]) - alertDescription := string(alert.Annotations["description"]) - switch { - case alertMessage != "" && alertDescription != "": - description += " The alert description is: " + alertDescription + " | " + alertMessage - case alertDescription != "": - description += " The alert description is: " + alertDescription - case alertMessage != "": - description += " The alert description is: " + alertMessage - default: - description += " The alert has no description." - } - - var runbook string - alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound" - if runbook = string(alert.Annotations["runbook_url"]); runbook == "" { - runbook = "" - } else { - alertURL = runbook - } - - details := fmt.Sprintf("%s%s %s", summary, description, runbook) - severity := string(alert.Labels["severity"]) - if severity == "critical" { - if alertName == internal.AlertNamePodDisruptionBudgetLimit { - details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + if u.ShouldReconcileAcceptRisks() { + allVersions := make([]string, 0, len(u.Updates)+len(u.ConditionalUpdates)) + for _, u := range u.Updates { + allVersions = append(allVersions, u.Version) + } + for _, u := range u.ConditionalUpdates { + allVersions = append(allVersions, u.Release.Version) + } + if u.risks != nil { + name := u.risks.Name() + risks, versions, err := u.risks.Risks(ctx, allVersions) + if err != nil { + klog.Errorf("detecting risks from %s failed, which might impact risk evaluation: %v", name, err) + } + u.Updates, u.ConditionalUpdates, err = risk.Merge(u.Updates, u.ConditionalUpdates, risks, versions) + if err != nil { + klog.Errorf("merging risks from %s failed, which might impact risk evaluation: %v", name, err) } - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: fmt.Sprintf("Alert:%s", alert.State), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { - details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: internal.AlertConditionReason(string(alert.State)), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - if internal.HavePullWaiting.Has(alertName) || - internal.HaveNodes.Has(alertName) || - alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || - alertName == internal.AlertNameVMCannotBeEvicted { - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: internal.AlertConditionReason(string(alert.State)), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - - updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) - if updatePrecheck == "true" { - risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ - Type: internal.ConditionalUpdateRiskConditionTypeApplies, - Status: metav1.ConditionTrue, - Reason: fmt.Sprintf("Alert:%s", alert.State), - Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), - LastTransitionTime: metav1.NewTime(alert.ActiveAt), - }) - continue - } - } - - klog.V(2).Infof("Got %d risks", len(risks)) - if len(risks) == 0 { - return nil - } - - var ret []configv1.ConditionalUpdateRisk - var keys []string - for k := range risks { - keys = append(keys, k) - } - sort.Strings(keys) - for _, k := range keys { - ret = append(ret, risks[k]) - } - return ret -} - -func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk { - risk, ok := risks[riskName] - if !ok { - return configv1.ConditionalUpdateRisk{ - Name: riskName, - Message: message, - URL: url, - // Always as the alert is firing - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{condition}, - } - } - - if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { - c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) - if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { - c.LastTransitionTime = condition.LastTransitionTime } - meta.SetStatusCondition(&risk.Conditions, *c) - } - - return risk -} - -func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { - if u == nil { - return } riskVersions := loadRiskVersions(u.ConditionalUpdates) risks := risksInOrder(riskVersions) u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry) - if u.ShouldReconcileAcceptRisks() { - u.attachAlertRisksToUpdates(u.evaluateAlertRisks(ctx)) - } - - if err := sanityCheck(u.ConditionalUpdates); err != nil { - klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err) - } - for i, conditionalUpdate := range u.ConditionalUpdates { - condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) + condition := evaluateConditionalUpdate(conditionalUpdate, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions) if condition.Status == metav1.ConditionTrue { u.addUpdate(conditionalUpdate.Release) @@ -701,25 +594,6 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) { } } -func sanityCheck(updates []configv1.ConditionalUpdate) error { - risks := map[string]configv1.ConditionalUpdateRisk{} - var errs []error - for _, update := range updates { - for _, risk := range update.Risks { - if v, ok := risks[risk.Name]; ok { - if diff := cmp.Diff(v, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { - errs = append(errs, fmt.Errorf("found collision on risk %s: %v and %v", risk.Name, v, risk)) - } - } else if trimmed := strings.TrimSpace(risk.Name); trimmed == "" { - errs = append(errs, fmt.Errorf("found invalid name on risk %v", risk)) - } else { - risks[risk.Name] = risk - } - } - } - return utilerrors.NewAggregate(errs) -} - func (u *availableUpdates) addUpdate(release configv1.Release) { for _, update := range u.Updates { if update.Image == release.Image { @@ -738,45 +612,6 @@ func (u *availableUpdates) removeUpdate(image string) { } } -func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) { - if u == nil || len(alertRisks) == 0 { - return - } - if u.RiskConditions == nil { - u.RiskConditions = map[string][]metav1.Condition{} - } - for _, alertRisk := range alertRisks { - u.RiskConditions[alertRisk.Name] = alertRisk.Conditions - } - var conditionalUpdates []configv1.ConditionalUpdate - for _, update := range u.Updates { - conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{ - Release: update, - Risks: alertRisks, - }) - } - u.Updates = nil - for _, conditionalUpdate := range u.ConditionalUpdates { - for _, alertRisk := range alertRisks { - var found bool - for _, risk := range conditionalUpdate.Risks { - if alertRisk.Name == risk.Name { - found = true - break - } - } - if !found { - conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk) - } - } - conditionalUpdates = append(conditionalUpdates, conditionalUpdate) - } - sort.Slice(conditionalUpdates, func(i, j int) bool { - return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version - }) - u.ConditionalUpdates = conditionalUpdates -} - func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string { template := `Could not evaluate exposure to update risk %s (%v) %s description: %s @@ -831,7 +666,7 @@ func newRecommendedReason(now, want string) string { } func evaluateConditionalUpdate( - risks []configv1.ConditionalUpdateRisk, + conditionalUpdate configv1.ConditionalUpdate, acceptRisks sets.Set[string], shouldReconcileAcceptRisks func() bool, riskConditions map[string][]metav1.Condition, @@ -845,7 +680,7 @@ func evaluateConditionalUpdate( } var errorMessages []string - for _, risk := range risks { + for _, risk := range conditionalUpdate.Risks { riskCondition := meta.FindStatusCondition(riskConditions[risk.Name], internal.ConditionalUpdateRiskConditionTypeApplies) if riskCondition == nil { // This should never happen @@ -865,7 +700,7 @@ func evaluateConditionalUpdate( recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionTrue) recommended.Reason = newRecommendedReason(recommended.Reason, recommendedReasonAllExposedRisksAccepted) recommended.Message = "The update is recommended, because either risk does not apply to this cluster or it is accepted by cluster admins." - klog.V(2).Infof("Risk with name %q is accepted by the cluster admin and thus not in the evaluation of conditional update", risk.Name) + klog.V(2).Infof("Risk with name %q is accepted by the cluster admin and thus not in the evaluation of conditional update %s", risk.Name, conditionalUpdate.Release.Version) } else { recommended.Status = newRecommendedStatus(recommended.Status, metav1.ConditionFalse) wantReason := recommendedReasonExposed diff --git a/pkg/cvo/availableupdates_test.go b/pkg/cvo/availableupdates_test.go index 400f8a5185..77d3c2a2b6 100644 --- a/pkg/cvo/availableupdates_test.go +++ b/pkg/cvo/availableupdates_test.go @@ -2,14 +2,11 @@ package cvo import ( "context" - "encoding/json" "errors" "fmt" "net/http" "net/http/httptest" "net/url" - "os" - "path/filepath" "runtime" "testing" "time" @@ -17,15 +14,12 @@ import ( "github.com/blang/semver/v4" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" - "github.com/prometheus/common/model" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime/schema" - utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/workqueue" @@ -35,6 +29,8 @@ import ( "github.com/openshift/cluster-version-operator/pkg/clusterconditions/always" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/mock" "github.com/openshift/cluster-version-operator/pkg/featuregates" + "github.com/openshift/cluster-version-operator/pkg/risk" + riskmock "github.com/openshift/cluster-version-operator/pkg/risk/mock" ) // notFoundProxyLister is a stub for ProxyLister @@ -217,11 +213,12 @@ var cvFixture = &configv1.ClusterVersion{ } var availableUpdatesCmpOpts = []cmp.Option{ + cmpopts.IgnoreUnexported(availableUpdates{}), cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks"), cmpopts.IgnoreTypes(time.Time{}), cmpopts.IgnoreInterfaces(struct { clusterconditions.ConditionRegistry - AlertGetter + risk.Source }{}), } @@ -520,7 +517,8 @@ func TestEvaluateConditionalUpdate(t *testing.T) { if tc.riskConditions == nil { tc.riskConditions = map[string][]metav1.Condition{} } - actual := evaluateConditionalUpdate(tc.risks, tc.acceptRisks, tc.shouldReconcileAcceptRisks, tc.riskConditions) + conditionalUpdate := configv1.ConditionalUpdate{Risks: tc.risks} + actual := evaluateConditionalUpdate(conditionalUpdate, tc.acceptRisks, tc.shouldReconcileAcceptRisks, tc.riskConditions) if diff := cmp.Diff(tc.expected, actual); diff != "" { t.Errorf("actual condition differs from expected:\n%s", diff) } @@ -784,58 +782,6 @@ func TestSyncAvailableUpdatesDesiredUpdate(t *testing.T) { } } -func Test_sanityCheck(t *testing.T) { - tests := []struct { - name string - updates []configv1.ConditionalUpdate - expected error - }{ - { - name: "good", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskB"}}}, - }, - }, - { - name: "invalid risk name", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}, {Name: "", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found invalid name on risk {[] some []}")}), - }, - { - name: "bad in one update", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}, {Name: "riskA", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found collision on risk riskA: {[] riskA []} and {[] some riskA []}")}), - }, - { - name: "bad in two updates", - updates: []configv1.ConditionalUpdate{ - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA"}}}, - {Risks: []configv1.ConditionalUpdateRisk{{Name: "riskA", URL: "some"}}}, - }, - expected: utilerrors.NewAggregate([]error{fmt.Errorf("found collision on risk riskA: {[] riskA []} and {[] some riskA []}")}), - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - actual := sanityCheck(tt.updates) - if diff := cmp.Diff(tt.expected, actual, cmp.Comparer(func(x, y error) bool { - if x == nil || y == nil { - return x == nil && y == nil - } - return x.Error() == y.Error() - })); diff != "" { - t.Errorf("sanityCheck() mismatch (-want +got):\n%s", diff) - } - - }) - } -} - func Test_loadRiskVersions(t *testing.T) { testcases := []struct { name string @@ -957,170 +903,6 @@ func Test_loadRiskConditions(t *testing.T) { } } -type mockAlertGetter struct { - ret prometheusv1.AlertsResult - jsonFile string - t *testing.T -} - -func (m *mockAlertGetter) Get(_ context.Context) prometheusv1.AlertsResult { - var ret prometheusv1.AlertsResult - if m.jsonFile != "" { - data, err := os.ReadFile(m.jsonFile) - if err != nil { - m.t.Fatal(err) - } - err = json.Unmarshal(data, &ret) - if err != nil { - m.t.Fatal(err) - } - return ret - } - return m.ret -} - -func Test_evaluateAlertConditions(t *testing.T) { - t1 := time.Now() - t2 := time.Now().Add(-3 * time.Minute) - t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") - if err != nil { - t.Fatalf("failed to parse time: %v", err) - } - tests := []struct { - name string - u *availableUpdates - expectedErr error - expectedAlertRisks []configv1.ConditionalUpdateRisk - }{ - { - name: "basic case", - u: &availableUpdates{ - AlertGetter: &mockAlertGetter{ - t: t, - ret: prometheusv1.AlertsResult{ - Alerts: []prometheusv1.Alert{ - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), - model.LabelName("severity"): model.LabelValue("critical"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), - }, - ActiveAt: t1, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("not-important"), - }, - State: prometheusv1.AlertStatePending, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), - model.LabelName("severity"): model.LabelValue("severity"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), - }, - ActiveAt: t1, - }, - { - Labels: map[model.LabelName]model.LabelValue{ - model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), - model.LabelName("severity"): model.LabelValue("severity"), - model.LabelName("namespace"): model.LabelValue("namespace"), - model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), - }, - State: prometheusv1.AlertStateFiring, - Annotations: map[model.LabelName]model.LabelValue{ - model.LabelName("summary"): model.LabelValue("summary"), - model.LabelName("description"): model.LabelValue("description"), - model.LabelName("message"): model.LabelValue("message"), - model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), - }, - ActiveAt: t2, - }, - }, - }, - }, - }, - expectedAlertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "PodDisruptionBudgetAtLimit", - Message: "summary.", - URL: "http://runbook.example.com/runbooks/bbb.md", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", - LastTransitionTime: metav1.NewTime(t2), - }}, - }, - { - Name: "PodDisruptionBudgetLimit", - Message: "summary.", - URL: "http://runbook.example.com/runbooks/abc.md", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", - LastTransitionTime: metav1.NewTime(t1), - }}, - }, - }, - }, - { - name: "from file", - u: &availableUpdates{ - AlertGetter: &mockAlertGetter{ - t: t, - jsonFile: filepath.Join("testdata", "alerts.json"), - }, - }, - expectedAlertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: metav1.NewTime(t3), - }}, - }, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - actual := tt.u.evaluateAlertRisks(context.TODO()) - if diff := cmp.Diff(tt.expectedAlertRisks, actual); diff != "" { - t.Errorf("AlertRisks mismatch (-want +got):\n%s", diff) - } - }) - } -} - func Test_newRecommendedReason(t *testing.T) { tests := []struct { name string @@ -1259,152 +1041,6 @@ func Test_newRecommendedReason(t *testing.T) { } } -func Test_attachAlertRisksToUpdates(t *testing.T) { - now := metav1.Now() - tests := []struct { - name string - alertRisks []configv1.ConditionalUpdateRisk - updates []configv1.Release - conditionUpdates []configv1.ConditionalUpdate - expected []configv1.Release - expectedConditionUpdates []configv1.ConditionalUpdate - expectedRiskConditions map[string][]metav1.Condition - }{ - { - name: "no alert risks", - }, - { - name: "basic case", - alertRisks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - updates: []configv1.Release{ - { - Version: "4.21.1", - }, - { - Version: "4.21.2", - }, - }, - conditionUpdates: []configv1.ConditionalUpdate{ - { - Release: configv1.Release{ - Version: "4.21.3", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "Risk1", - }, - }, - }, - }, - expectedConditionUpdates: []configv1.ConditionalUpdate{ - { - Release: configv1.Release{ - Version: "4.21.1", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - { - Release: configv1.Release{ - Version: "4.21.2", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - { - Release: configv1.Release{ - Version: "4.21.3", - }, - Risks: []configv1.ConditionalUpdateRisk{ - { - Name: "Risk1", - }, - { - Name: "TestAlert", - Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", - MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, - Conditions: []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - }, - }, - expectedRiskConditions: map[string][]metav1.Condition{ - "TestAlert": []metav1.Condition{{ - Type: "Applies", - Status: "True", - Reason: "Alert:firing", - Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", - LastTransitionTime: now, - }}, - }, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - u := &availableUpdates{Updates: tt.updates, ConditionalUpdates: tt.conditionUpdates} - u.attachAlertRisksToUpdates(tt.alertRisks) - // attaching is idempotent - u.attachAlertRisksToUpdates(tt.alertRisks) - if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { - t.Errorf("available updates mismatch (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tt.expectedConditionUpdates, u.ConditionalUpdates); diff != "" { - t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) - } - if diff := cmp.Diff(tt.expectedRiskConditions, u.RiskConditions); diff != "" { - t.Errorf("risk conditions mismatch (-want +got):\n%s", diff) - } - }) - } -} - func Test_evaluateConditionalUpdates(t *testing.T) { t1, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") if err != nil { @@ -1419,6 +1055,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { shouldReconcileAcceptRisks func() bool updates []configv1.Release conditionUpdates []configv1.ConditionalUpdate + risks risk.Source expected []configv1.Release expectedConditionUpdates []configv1.ConditionalUpdate expectedRiskConditions map[string][]metav1.Condition @@ -1434,6 +1071,22 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Version: "4.21.2", }, }, + risks: &riskmock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t1), + }}, + }}, + }, expectedConditionUpdates: []configv1.ConditionalUpdate{ { Release: configv1.Release{ @@ -1443,7 +1096,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { { Name: "TestAlert", Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + URL: "https://example.com/testAlert", MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, Conditions: []metav1.Condition{{ Type: "Applies", @@ -1458,7 +1111,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Type: "Recommended", Status: "False", Reason: "TestAlert", - Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + Message: "Test summary. https://example.com/testAlert", LastTransitionTime: metav1.NewTime(t1), }}, }, @@ -1470,7 +1123,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { { Name: "TestAlert", Message: "Test summary.", - URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + URL: "https://example.com/testAlert", MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, Conditions: []metav1.Condition{{ Type: "Applies", @@ -1485,7 +1138,7 @@ func Test_evaluateConditionalUpdates(t *testing.T) { Type: "Recommended", Status: "False", Reason: "TestAlert", - Message: "Test summary. https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + Message: "Test summary. https://example.com/testAlert", LastTransitionTime: metav1.NewTime(t1), }}, }, @@ -1507,9 +1160,9 @@ func Test_evaluateConditionalUpdates(t *testing.T) { ConditionalUpdates: tt.conditionUpdates, ShouldReconcileAcceptRisks: tt.shouldReconcileAcceptRisks, ConditionRegistry: conditionRegistry, - AlertGetter: &mockAlertGetter{t: t, jsonFile: filepath.Join("testdata", "alerts.json")}, + risks: tt.risks, } - u.evaluateConditionalUpdates(context.TODO()) + u.evaluateConditionalUpdates(context.Background()) if diff := cmp.Diff(tt.expected, u.Updates); diff != "" { t.Errorf("available updates mismatch (-want +got):\n%s", diff) } diff --git a/pkg/cvo/cvo.go b/pkg/cvo/cvo.go index 9866fe80c3..e9eb1d3f3a 100644 --- a/pkg/cvo/cvo.go +++ b/pkg/cvo/cvo.go @@ -43,7 +43,6 @@ import ( "github.com/openshift/cluster-version-operator/lib/resourcebuilder" "github.com/openshift/cluster-version-operator/lib/validation" "github.com/openshift/cluster-version-operator/pkg/clusterconditions" - "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" "github.com/openshift/cluster-version-operator/pkg/clusterconditions/standard" "github.com/openshift/cluster-version-operator/pkg/customsignaturestore" "github.com/openshift/cluster-version-operator/pkg/cvo/configuration" @@ -54,6 +53,8 @@ import ( "github.com/openshift/cluster-version-operator/pkg/payload" "github.com/openshift/cluster-version-operator/pkg/payload/precondition" preconditioncv "github.com/openshift/cluster-version-operator/pkg/payload/precondition/clusterversion" + "github.com/openshift/cluster-version-operator/pkg/risk" + "github.com/openshift/cluster-version-operator/pkg/risk/alert" ) const ( @@ -209,7 +210,9 @@ type Operator struct { // configuration, if enabled, reconciles the ClusterVersionOperator configuration. configuration *configuration.ClusterVersionOperatorConfiguration - AlertGetter AlertGetter + // risks holds update-risk source (in-cluster alerts, etc.) + // that will be aggregated into conditional update risks. + risks risk.Source } // New returns a new cluster version operator. @@ -270,7 +273,7 @@ func New( exclude: exclude, clusterProfile: clusterProfile, conditionRegistry: standard.NewConditionRegistry(promqlTarget), - AlertGetter: promql.NewAlertGetter(promqlTarget), + risks: alert.New("Alert", promqlTarget), injectClusterIdIntoPromQL: injectClusterIdIntoPromQL, requiredFeatureSet: featureSet, diff --git a/pkg/cvo/cvo_test.go b/pkg/cvo/cvo_test.go index 5a8ba48e16..f789b98635 100644 --- a/pkg/cvo/cvo_test.go +++ b/pkg/cvo/cvo_test.go @@ -16,7 +16,6 @@ import ( "github.com/davecgh/go-spew/spew" "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "github.com/google/uuid" appsv1 "k8s.io/api/apps/v1" @@ -2783,7 +2782,7 @@ func TestOperator_availableUpdatesSync(t *testing.T) { } } - if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, cmpopts.IgnoreFields(availableUpdates{}, "ShouldReconcileAcceptRisks")); diff != "" { + if diff := cmp.Diff(tt.wantUpdates, optr.availableUpdates, availableUpdatesCmpOpts...); diff != "" { t.Fatalf("unexpected: %s", diff) } if (optr.queue.Len() > 0) != (optr.availableUpdates != nil) { diff --git a/pkg/risk/alert/alert.go b/pkg/risk/alert/alert.go new file mode 100644 index 0000000000..496d54d70b --- /dev/null +++ b/pkg/risk/alert/alert.go @@ -0,0 +1,186 @@ +package alert + +import ( + "context" + "fmt" + "sort" + "strings" + + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/klog/v2" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions" + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" + "github.com/openshift/cluster-version-operator/pkg/internal" + "github.com/openshift/cluster-version-operator/pkg/risk" +) + +func New(name string, promQLTarget clusterconditions.PromQLTarget) risk.Source { + return &alertRisk{ + name: name, + getter: promql.NewAlertGetter(promQLTarget), + } +} + +type alertRisk struct { + name string + getter promql.Getter +} + +func (a *alertRisk) Name() string { + return a.name +} + +func (a *alertRisk) Risks(ctx context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) { + alerts := a.getter.Get(ctx) + risks := alertsToRisks(alerts.Alerts) + versionMap := make(map[string][]string, len(risks)) + for _, risk := range risks { + versionMap[risk.Name] = versions + } + return risks, versionMap, nil +} + +func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk { + klog.V(2).Infof("Found %d alerts", len(alerts)) + risks := map[string]configv1.ConditionalUpdateRisk{} + for _, alert := range alerts { + var alertName string + if alertName = string(alert.Labels["alertname"]); alertName == "" { + continue + } + if alert.State == "pending" { + continue + } + + var summary string + if summary = string(alert.Annotations["summary"]); summary == "" { + summary = alertName + } + if !strings.HasSuffix(summary, ".") { + summary += "." + } + + var description string + alertMessage := string(alert.Annotations["message"]) + alertDescription := string(alert.Annotations["description"]) + switch { + case alertMessage != "" && alertDescription != "": + description += " The alert description is: " + alertDescription + " | " + alertMessage + case alertDescription != "": + description += " The alert description is: " + alertDescription + case alertMessage != "": + description += " The alert description is: " + alertMessage + default: + description += " The alert has no description." + } + + var runbook string + alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound" + if runbook = string(alert.Annotations["runbook_url"]); runbook == "" { + runbook = "" + } else { + alertURL = runbook + } + + details := fmt.Sprintf("%s%s %s", summary, description, runbook) + + severity := string(alert.Labels["severity"]) + if severity == "critical" { + if alertName == internal.AlertNamePodDisruptionBudgetLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + } + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if alertName == internal.AlertNamePodDisruptionBudgetAtLimit { + details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details) + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + if internal.HavePullWaiting.Has(alertName) || + internal.HaveNodes.Has(alertName) || + alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing || + alertName == internal.AlertNameVMCannotBeEvicted { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: internal.AlertConditionReason(string(alert.State)), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + + updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"]) + if updatePrecheck == "true" { + risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{ + Type: internal.ConditionalUpdateRiskConditionTypeApplies, + Status: metav1.ConditionTrue, + Reason: fmt.Sprintf("Alert:%s", alert.State), + Message: internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details), + LastTransitionTime: metav1.NewTime(alert.ActiveAt), + }) + continue + } + } + + klog.V(2).Infof("Got %d risks", len(risks)) + if len(risks) == 0 { + return nil + } + + var ret []configv1.ConditionalUpdateRisk + var keys []string + for k := range risks { + keys = append(keys, k) + } + sort.Strings(keys) + for _, k := range keys { + ret = append(ret, risks[k]) + } + return ret +} + +func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk { + risk, ok := risks[riskName] + if !ok { + return configv1.ConditionalUpdateRisk{ + Name: riskName, + Message: message, + URL: url, + // Always as the alert is firing + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{condition}, + } + } + + if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil { + c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message) + if c.LastTransitionTime.After(condition.LastTransitionTime.Time) { + c.LastTransitionTime = condition.LastTransitionTime + } + meta.SetStatusCondition(&risk.Conditions, *c) + } + + return risk +} diff --git a/pkg/risk/alert/alert_test.go b/pkg/risk/alert/alert_test.go new file mode 100644 index 0000000000..9a12724d57 --- /dev/null +++ b/pkg/risk/alert/alert_test.go @@ -0,0 +1,198 @@ +package alert + +import ( + "context" + "encoding/json" + "os" + "path/filepath" + "testing" + "time" + + "github.com/google/go-cmp/cmp" + prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/clusterconditions/promql" +) + +type mockAlertGetter struct { + ret prometheusv1.AlertsResult + jsonFile string + t *testing.T +} + +func (m *mockAlertGetter) Get(_ context.Context) prometheusv1.AlertsResult { + var ret prometheusv1.AlertsResult + if m.jsonFile != "" { + data, err := os.ReadFile(m.jsonFile) + if err != nil { + m.t.Fatal(err) + } + err = json.Unmarshal(data, &ret) + if err != nil { + m.t.Fatal(err) + } + return ret + } + return m.ret +} + +func Test_alertRisk(t *testing.T) { + t1 := time.Now() + t2 := time.Now().Add(-3 * time.Minute) + t3, err := time.Parse(time.RFC3339, "2026-03-04T00:38:19.02109776Z") + if err != nil { + t.Fatalf("failed to parse time: %v", err) + } + tests := []struct { + name string + getter promql.Getter + expectedErr error + expectedAlertRisks []configv1.ConditionalUpdateRisk + }{ + { + name: "basic case", + getter: &mockAlertGetter{ + t: t, + ret: prometheusv1.AlertsResult{ + Alerts: []prometheusv1.Alert{ + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetLimit"), + model.LabelName("severity"): model.LabelValue("critical"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/abc.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("not-important"), + }, + State: prometheusv1.AlertStatePending, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("some-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t1, + }, + { + Labels: map[model.LabelName]model.LabelValue{ + model.LabelName("alertname"): model.LabelValue("PodDisruptionBudgetAtLimit"), + model.LabelName("severity"): model.LabelValue("severity"), + model.LabelName("namespace"): model.LabelValue("namespace"), + model.LabelName("poddisruptionbudget"): model.LabelValue("another-pdb"), + }, + State: prometheusv1.AlertStateFiring, + Annotations: map[model.LabelName]model.LabelValue{ + model.LabelName("summary"): model.LabelValue("summary"), + model.LabelName("description"): model.LabelValue("description"), + model.LabelName("message"): model.LabelValue("message"), + model.LabelName("runbook_url"): model.LabelValue("http://runbook.example.com/runbooks/bbb.md"), + }, + ActiveAt: t2, + }, + }, + }, + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "PodDisruptionBudgetAtLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/bbb.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md; severity alert PodDisruptionBudgetAtLimit firing, which might slow node drains. Namespace=namespace, PodDisruptionBudget=another-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/bbb.md", + LastTransitionTime: metav1.NewTime(t2), + }}, + }, + { + Name: "PodDisruptionBudgetLimit", + Message: "summary.", + URL: "http://runbook.example.com/runbooks/abc.md", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert PodDisruptionBudgetLimit firing, suggesting significant cluster issues worth investigating. Namespace=namespace, PodDisruptionBudget=some-pdb. summary. The alert description is: description | message http://runbook.example.com/runbooks/abc.md", + LastTransitionTime: metav1.NewTime(t1), + }}, + }, + }, + }, + { + name: "from file", + getter: &mockAlertGetter{ + t: t, + jsonFile: filepath.Join("testdata", "alerts.json"), + }, + expectedAlertRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + Conditions: []metav1.Condition{{ + Type: "Applies", + Status: "True", + Reason: "Alert:firing", + Message: "critical alert TestAlert firing, suggesting significant cluster issues worth investigating. Test summary. The alert description is: Test description. ", + LastTransitionTime: metav1.NewTime(t3), + }}, + }, + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + source := &alertRisk{ + name: "Alert", + getter: tt.getter, + } + allVersions := []string{"1.2.3"} + risks, versions, err := source.Risks(context.Background(), allVersions) + + if diff := cmp.Diff(tt.expectedAlertRisks, risks); diff != "" { + t.Errorf("Alert Risks mismatch (-want +got):\n%s", diff) + } + + expectedVersions := make(map[string][]string, len(risks)) + for _, risk := range risks { + expectedVersions[risk.Name] = allVersions + } + if diff := cmp.Diff(expectedVersions, versions); diff != "" { + t.Errorf("Alert Risks version mismatch (-want +got):\n%s", diff) + } + + if err != nil { + t.Errorf("Alert Risks evaluation error: %v", err) + } + }) + } +} diff --git a/pkg/cvo/testdata/alerts.json b/pkg/risk/alert/testdata/alerts.json similarity index 100% rename from pkg/cvo/testdata/alerts.json rename to pkg/risk/alert/testdata/alerts.json diff --git a/pkg/risk/mock/mock.go b/pkg/risk/mock/mock.go new file mode 100644 index 0000000000..b5c6e88a03 --- /dev/null +++ b/pkg/risk/mock/mock.go @@ -0,0 +1,46 @@ +// Package mock implements an update risk source with mock responses, +// for convenient testing. +package mock + +import ( + "context" + "errors" + + configv1 "github.com/openshift/api/config/v1" +) + +// Mock implements an update risk source with mock responses. +type Mock struct { + // InternalName is the source name. + InternalName string + + // InternalRisks are the detected update risks. + InternalRisks []configv1.ConditionalUpdateRisk + + // InternalVersions maps the detected risks to version + // strings. If unset, all internal risks will be reported as + // applicable to all known versions. + InternalVersions map[string][]string +} + +// Name returns the source's name. +func (m *Mock) Name() string { + return m.InternalName +} + +// Risks returns the current set of risks the source is aware of. +func (m *Mock) Risks(_ context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) { + if m.InternalRisks == nil { + return nil, nil, errors.New("failed to calculate risks") + } + var versionMap map[string][]string + if m.InternalVersions != nil { + versionMap = m.InternalVersions + } else { + versionMap = make(map[string][]string, len(m.InternalRisks)) + for _, risk := range m.InternalRisks { + versionMap[risk.Name] = versions + } + } + return m.InternalRisks, versionMap, nil +} diff --git a/pkg/risk/risk.go b/pkg/risk/risk.go new file mode 100644 index 0000000000..065999a0ba --- /dev/null +++ b/pkg/risk/risk.go @@ -0,0 +1,163 @@ +// Package risk implements conditional update risk detection from a +// generic source. For example, risks could come from the update +// service's conditionalEdges. Or from firing in-cluster alerts. Or +// from other sources. This package makes it easy to extend the +// cluster-version operator with additional risk-detection sources over +// time. +package risk + +import ( + "context" + "errors" + "fmt" + "regexp" + "slices" + "sort" + "strings" + + "github.com/blang/semver/v4" + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + + configv1 "github.com/openshift/api/config/v1" +) + +// validName enforces the non-empty CamelCase risk-naming convention. +var validName = regexp.MustCompile(`^[A-Z][A-Za-z0-9]*$`) + +// Source represents a risk source, like the update service's +// conditionalEdges. Or firing in-cluster alerts. Or another source. +type Source interface { + // Name returns the source's name, which can be prefixed to + // the returned risk names to avoid collisions when aggregating + // in a single map. + Name() string + + // Risks returns the current set of risks the source is aware of. + // Implementations are encouraged to make this an inexpensive call, + // for example by caching expensive precursors, so the caller does + // not have to worry about the load generated by frequent calls. + // Arguments are a context to allow cancellation, and a slice of all + // update versions (for sources that are reporting on current cluster + // state, and want any detected risks to apply to all versions). + // Returns a slice of detected risks, and a map from each risk name + // to the version strings that risk is associated with. + Risks(ctx context.Context, versions []string) ([]configv1.ConditionalUpdateRisk, map[string][]string, error) +} + +// Merge updates the input updates and conditionalUpdates by merging +// additional risks. The versions map has risk names as the key, and +// a slice of version strings that risk should be merged into as the +// value. It returns the modified updates and conditionalUpdates slices. +func Merge(updates []configv1.Release, conditionalUpdates []configv1.ConditionalUpdate, risks []configv1.ConditionalUpdateRisk, versions map[string][]string) ([]configv1.Release, []configv1.ConditionalUpdate, error) { + var errs []error + riskMap := make(map[string]configv1.ConditionalUpdateRisk, len(risks)) + invalidRiskMap := make(map[string]struct{}, len(risks)) + for _, risk := range risks { + if !validName.MatchString(risk.Name) { + invalidRiskMap[risk.Name] = struct{}{} + errs = append(errs, fmt.Errorf("invalid risk name %q does not match %q", risk.Name, validName)) + continue + } + + existingRisk, ok := riskMap[risk.Name] + if ok { + if diff := cmp.Diff(existingRisk, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { + errs = append(errs, fmt.Errorf("divergent definitions of risk %q from the merging source:\n%s", risk.Name, diff)) + } + continue + } + riskMap[risk.Name] = risk + } + + riskMissMap := make(map[string]struct{}, len(versions)) + versionRiskNames := make([]string, 0, len(versions)) + for riskName := range versions { + if _, ok := invalidRiskMap[riskName]; ok { + continue // we're already complaining about these + } + versionRiskNames = append(versionRiskNames, riskName) + } + sort.Strings(versionRiskNames) + for _, riskName := range versionRiskNames { + risk, ok := riskMap[riskName] + if !ok { + riskMissMap[riskName] = struct{}{} + continue + } + for _, version := range versions[riskName] { + found := false + for i, update := range updates { + if update.Version != version { + continue + } + found = true + conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{ + Release: update, + Risks: []configv1.ConditionalUpdateRisk{risk}, + }) + updates = append(updates[:i], updates[i+1:]...) + if len(updates) == 0 { + updates = nil + } + break + } + if found { + continue + } + for i, conditionalUpdate := range conditionalUpdates { + if conditionalUpdate.Release.Version != version { + continue + } + found = true + riskFound := false + for _, existingRisk := range conditionalUpdate.Risks { + if existingRisk.Name != risk.Name { + continue + } + riskFound = true + if diff := cmp.Diff(existingRisk, risk, cmpopts.IgnoreFields(configv1.ConditionalUpdateRisk{}, "Conditions")); diff != "" { + errs = append(errs, fmt.Errorf("divergent definitions of risk %q:\n%s", risk.Name, diff)) + continue + } + break + } + if !riskFound { + conditionalUpdates[i].Risks = append(conditionalUpdate.Risks, risk) + } + } + if !found { + errs = append(errs, fmt.Errorf("cannot merge risk %q into unknown release version %q", risk.Name, version)) + } + } + } + if len(riskMissMap) > 0 { + riskMisses := make([]string, 0, len(riskMissMap)) + for riskName := range riskMissMap { + riskMisses = append(riskMisses, riskName) + } + sort.Strings(riskMisses) + errs = append(errs, fmt.Errorf("merge versions referenced unknown risks: %s", strings.Join(riskMisses, ", "))) + } + + slices.SortFunc(conditionalUpdates, func(a, b configv1.ConditionalUpdate) int { + va, errA := semver.Parse(a.Release.Version) + vb, errB := semver.Parse(b.Release.Version) + if errA == nil && errB == nil { + return va.Compare(vb) + } else if errA == nil { + return 1 + } else if errB == nil { + return -1 + } + return strings.Compare(a.Release.Version, b.Release.Version) + }) + + for i := range conditionalUpdates { + slices.SortFunc(conditionalUpdates[i].Risks, func(a, b configv1.ConditionalUpdateRisk) int { + return strings.Compare(a.Name, b.Name) + }) + } + + return updates, conditionalUpdates, errors.Join(errs...) +} diff --git a/pkg/risk/risk_test.go b/pkg/risk/risk_test.go new file mode 100644 index 0000000000..c0b3638a50 --- /dev/null +++ b/pkg/risk/risk_test.go @@ -0,0 +1,262 @@ +package risk_test + +import ( + "context" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + + configv1 "github.com/openshift/api/config/v1" + + "github.com/openshift/cluster-version-operator/pkg/risk" + "github.com/openshift/cluster-version-operator/pkg/risk/mock" +) + +func Test_Merge(t *testing.T) { + tests := []struct { + name string + updates []configv1.Release + conditionalUpdates []configv1.ConditionalUpdate + risks risk.Source + expectedUpdates []configv1.Release + expectedConditionalUpdates []configv1.ConditionalUpdate + expectedError string + }{ + { + name: "basic case", + updates: []configv1.Release{ + { + Version: "4.21.1", + }, + { + Version: "4.21.2", + }, + }, + conditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "ExistingRisk", + }, + }, + }, + }, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }}, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{ + { + Release: configv1.Release{ + Version: "4.21.1", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.2", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + { + Release: configv1.Release{ + Version: "4.21.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "ExistingRisk", + }, + { + Name: "TestAlert", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + }, + }, + { + name: "invalid risk name", + updates: []configv1.Release{{Version: "1.2.3"}}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{{ + Name: "", + Message: "Test summary.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }}, + }, + expectedUpdates: []configv1.Release{{Version: "1.2.3"}}, + expectedError: `invalid risk name "" does not match "^[A-Z][A-Za-z0-9]*$"`, + }, + { + name: "single-source collision", + updates: []configv1.Release{{Version: "1.2.3"}}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + { + Name: "RiskA", + Message: "Test summary 2.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + expectedError: `divergent definitions of risk "RiskA" from the merging source: + v1.ConditionalUpdateRisk{ + ... // 1 ignored field + URL: "https://example.com/testAlert", + Name: "RiskA", +- Message: "Test summary 1.", ++ Message: "Test summary 2.", + MatchingRules: {{Type: "Always"}}, + } +`, + }, + { + name: "multi-source collision", + conditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + risks: &mock.Mock{ + InternalName: "Mock", + InternalRisks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 2.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }, + expectedConditionalUpdates: []configv1.ConditionalUpdate{{ + Release: configv1.Release{ + Version: "1.2.3", + }, + Risks: []configv1.ConditionalUpdateRisk{ + { + Name: "RiskA", + Message: "Test summary 1.", + URL: "https://example.com/testAlert", + MatchingRules: []configv1.ClusterCondition{{Type: "Always"}}, + }, + }, + }}, + expectedError: `divergent definitions of risk "RiskA": + v1.ConditionalUpdateRisk{ + ... // 1 ignored field + URL: "https://example.com/testAlert", + Name: "RiskA", +- Message: "Test summary 1.", ++ Message: "Test summary 2.", + MatchingRules: {{Type: "Always"}}, + } +`, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + allVersions := make([]string, 0, len(tt.updates)+len(tt.conditionalUpdates)) + for _, u := range tt.updates { + allVersions = append(allVersions, u.Version) + } + for _, u := range tt.conditionalUpdates { + allVersions = append(allVersions, u.Release.Version) + } + risks, versions, err := tt.risks.Risks(context.Background(), allVersions) + if err != nil { + t.Fatalf("failed to get risks from %s: %v", tt.risks.Name(), err) + } + updates, conditionalUpdates, err := risk.Merge(tt.updates, tt.conditionalUpdates, risks, versions) + initialPassSuccess := true + if diff := cmp.Diff(tt.expectedUpdates, updates); diff != "" { + initialPassSuccess = false + t.Errorf("available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionalUpdates, conditionalUpdates); diff != "" { + initialPassSuccess = false + t.Errorf("conditional updates mismatch (-want +got):\n%s", diff) + } + var actualError string + if err != nil { + actualError = strings.ReplaceAll(err.Error(), "\u00a0", " ") + } + if diff := cmp.Diff(tt.expectedError, actualError); diff != "" { + initialPassSuccess = false + t.Errorf("merge error mismatch (-want +got):\n%s", diff) + } + if !initialPassSuccess { + return // we already have enough errors to work through; skip the idempotency test + } + updates, conditionalUpdates, err = risk.Merge(updates, conditionalUpdates, risks, versions) + if diff := cmp.Diff(tt.expectedUpdates, updates); diff != "" { + t.Errorf("idempotent available updates mismatch (-want +got):\n%s", diff) + } + if diff := cmp.Diff(tt.expectedConditionalUpdates, conditionalUpdates); diff != "" { + t.Errorf("idempotent conditional updates mismatch (-want +got):\n%s", diff) + } + actualError = "" + if err != nil { + actualError = strings.ReplaceAll(err.Error(), "\u00a0", " ") + } + if diff := cmp.Diff(tt.expectedError, actualError); diff != "" { + t.Errorf("idempotent merge error mismatch (-want +got):\n%s", diff) + } + }) + } +}