@@ -12,15 +12,11 @@ import (
1212 "time"
1313
1414 "github.com/blang/semver/v4"
15- "github.com/google/go-cmp/cmp"
16- "github.com/google/go-cmp/cmp/cmpopts"
1715 "github.com/google/uuid"
18- prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1916
2017 "k8s.io/apimachinery/pkg/api/equality"
2118 "k8s.io/apimachinery/pkg/api/meta"
2219 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23- utilerrors "k8s.io/apimachinery/pkg/util/errors"
2420 "k8s.io/apimachinery/pkg/util/sets"
2521 "k8s.io/klog/v2"
2622
@@ -30,6 +26,7 @@ import (
3026 "github.com/openshift/cluster-version-operator/pkg/cincinnati"
3127 "github.com/openshift/cluster-version-operator/pkg/clusterconditions"
3228 "github.com/openshift/cluster-version-operator/pkg/internal"
29+ "github.com/openshift/cluster-version-operator/pkg/risk"
3330)
3431
3532const noArchitecture string = "NoArchitecture"
@@ -103,7 +100,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
103100 if ! acceptRisks .Equal (optrAvailableUpdates .AcceptRisks ) {
104101 needsConditionalUpdateEval = true
105102 }
106- // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
107103 if ! needsConditionalUpdateEval {
108104 klog .V (2 ).Infof ("Available updates were recently retrieved, with less than %s elapsed since %s, will try later." , optr .minimumUpdateCheckInterval , optrAvailableUpdates .LastAttempt .Format (time .RFC3339 ))
109105 return nil
@@ -164,7 +160,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
164160
165161 optrAvailableUpdates .AcceptRisks = acceptRisks
166162 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
167- optrAvailableUpdates .AlertGetter = optr .AlertGetter
163+ optrAvailableUpdates .risks = optr .risks
168164 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
169165
170166 queueKey := optr .queueKey ()
@@ -219,7 +215,9 @@ type availableUpdates struct {
219215 // RiskConditions stores the condition for every risk (name, url, message, matchingRules).
220216 RiskConditions map [string ][]metav1.Condition
221217
222- AlertGetter AlertGetter
218+ // risks holds update-risk source (in-cluster alerts, etc.)
219+ // that will be aggregated into conditional update risks.
220+ risks risk.Source
223221}
224222
225223func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -325,7 +323,7 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
325323 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
326324 AcceptRisks : optr .availableUpdates .AcceptRisks ,
327325 RiskConditions : optr .availableUpdates .RiskConditions ,
328- AlertGetter : optr .availableUpdates . AlertGetter ,
326+ risks : optr .risks ,
329327 LastAttempt : optr .availableUpdates .LastAttempt ,
330328 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
331329 Current : * optr .availableUpdates .Current .DeepCopy (),
@@ -364,9 +362,14 @@ func loadRiskConditions(ctx context.Context, risks []string, riskVersions map[st
364362 riskCondition .Status = metav1 .ConditionUnknown
365363 riskCondition .Reason = riskConditionReasonEvaluationFailed
366364 riskCondition .Message = msg
367- } else if match {
368- riskCondition .Status = metav1 .ConditionTrue
369- riskCondition .Reason = riskConditionReasonMatch
365+ } else {
366+ if match {
367+ riskCondition .Status = metav1 .ConditionTrue
368+ riskCondition .Reason = riskConditionReasonMatch
369+ }
370+ if existingCondition := meta .FindStatusCondition (risk .Conditions , riskCondition .Type ); existingCondition != nil && existingCondition .Status == riskCondition .Status {
371+ riskCondition = * existingCondition // preserve more-detailed information, LastTransitionTime, etc., from the source
372+ }
370373 }
371374 riskConditions [risk .Name ] = []metav1.Condition {riskCondition }
372375 }
@@ -518,174 +521,36 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
518521 }
519522}
520523
521- type AlertGetter interface {
522- Get (ctx context.Context ) prometheusv1.AlertsResult
523- }
524-
525- func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) []configv1.ConditionalUpdateRisk {
526- if u == nil || u .AlertGetter == nil {
527- return nil
524+ func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
525+ if u == nil {
526+ return
528527 }
529- alertsResult := u .AlertGetter .Get (ctx )
530- return alertsToRisks (alertsResult .Alerts )
531- }
532-
533- func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
534- klog .V (2 ).Infof ("Found %d alerts" , len (alerts ))
535- risks := map [string ]configv1.ConditionalUpdateRisk {}
536- for _ , alert := range alerts {
537- var alertName string
538- if alertName = string (alert .Labels ["alertname" ]); alertName == "" {
539- continue
540- }
541- if alert .State == "pending" {
542- continue
543- }
544-
545- var summary string
546- if summary = string (alert .Annotations ["summary" ]); summary == "" {
547- summary = alertName
548- }
549- if ! strings .HasSuffix (summary , "." ) {
550- summary += "."
551- }
552-
553- var description string
554- alertMessage := string (alert .Annotations ["message" ])
555- alertDescription := string (alert .Annotations ["description" ])
556- switch {
557- case alertMessage != "" && alertDescription != "" :
558- description += " The alert description is: " + alertDescription + " | " + alertMessage
559- case alertDescription != "" :
560- description += " The alert description is: " + alertDescription
561- case alertMessage != "" :
562- description += " The alert description is: " + alertMessage
563- default :
564- description += " The alert has no description."
565- }
566528
567- var runbook string
568- alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
569- if runbook = string (alert .Annotations ["runbook_url" ]); runbook == "" {
570- runbook = "<alert does not have a runbook_url annotation>"
571- } else {
572- alertURL = runbook
573- }
574-
575- details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
576-
577- severity := string (alert .Labels ["severity" ])
578- if severity == "critical" {
579- if alertName == internal .AlertNamePodDisruptionBudgetLimit {
580- details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
529+ if u .ShouldReconcileAcceptRisks () {
530+ allVersions := make ([]string , 0 , len (u .Updates )+ len (u .ConditionalUpdates ))
531+ for _ , u := range u .Updates {
532+ allVersions = append (allVersions , u .Version )
533+ }
534+ for _ , u := range u .ConditionalUpdates {
535+ allVersions = append (allVersions , u .Release .Version )
536+ }
537+ if u .risks != nil {
538+ name := u .risks .Name ()
539+ risks , versions , err := u .risks .Risks (ctx , allVersions )
540+ if err != nil {
541+ klog .Errorf ("detecting risks from %s failed, which might impact risk evaluation: %v" , name , err )
542+ }
543+ u .Updates , u .ConditionalUpdates , err = risk .Merge (u .Updates , u .ConditionalUpdates , risks , versions )
544+ if err != nil {
545+ klog .Errorf ("merging risks from %s failed, which might impact risk evaluation: %v" , name , err )
581546 }
582- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
583- Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
584- Status : metav1 .ConditionTrue ,
585- Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
586- Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting significant cluster issues worth investigating" , details ),
587- LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
588- })
589- continue
590- }
591-
592- if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
593- details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
594- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
595- Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
596- Status : metav1 .ConditionTrue ,
597- Reason : internal .AlertConditionReason (string (alert .State )),
598- Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which might slow node drains" , details ),
599- LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
600- })
601- continue
602- }
603-
604- if internal .HavePullWaiting .Has (alertName ) ||
605- internal .HaveNodes .Has (alertName ) ||
606- alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
607- alertName == internal .AlertNameVMCannotBeEvicted {
608- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
609- Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
610- Status : metav1 .ConditionTrue ,
611- Reason : internal .AlertConditionReason (string (alert .State )),
612- Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which may slow workload redistribution during rolling node updates" , details ),
613- LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
614- })
615- continue
616- }
617-
618- updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
619- if updatePrecheck == "true" {
620- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
621- Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
622- Status : metav1 .ConditionTrue ,
623- Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
624- Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting issues worth investigating before updating the cluster" , details ),
625- LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
626- })
627- continue
628- }
629- }
630-
631- klog .V (2 ).Infof ("Got %d risks" , len (risks ))
632- if len (risks ) == 0 {
633- return nil
634- }
635-
636- var ret []configv1.ConditionalUpdateRisk
637- var keys []string
638- for k := range risks {
639- keys = append (keys , k )
640- }
641- sort .Strings (keys )
642- for _ , k := range keys {
643- ret = append (ret , risks [k ])
644- }
645- return ret
646- }
647-
648- func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
649- risk , ok := risks [riskName ]
650- if ! ok {
651- return configv1.ConditionalUpdateRisk {
652- Name : riskName ,
653- Message : message ,
654- URL : url ,
655- // Always as the alert is firing
656- MatchingRules : []configv1.ClusterCondition {{Type : "Always" }},
657- Conditions : []metav1.Condition {condition },
658- }
659- }
660-
661- if c := meta .FindStatusCondition (risk .Conditions , condition .Type ); c != nil {
662- c .Message = fmt .Sprintf ("%s; %s" , c .Message , condition .Message )
663- if c .LastTransitionTime .After (condition .LastTransitionTime .Time ) {
664- c .LastTransitionTime = condition .LastTransitionTime
665547 }
666- meta .SetStatusCondition (& risk .Conditions , * c )
667- }
668-
669- return risk
670- }
671-
672- func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
673- if u == nil {
674- return
675548 }
676549
677550 riskVersions := loadRiskVersions (u .ConditionalUpdates )
678551 risks := risksInOrder (riskVersions )
679552 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
680553
681- if u .ShouldReconcileAcceptRisks () {
682- u .attachAlertRisksToUpdates (u .evaluateAlertRisks (ctx ))
683- }
684-
685- if err := sanityCheck (u .ConditionalUpdates ); err != nil {
686- klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
687- }
688-
689554 for i , conditionalUpdate := range u .ConditionalUpdates {
690555 condition := evaluateConditionalUpdate (conditionalUpdate .Risks , u .AcceptRisks , u .ShouldReconcileAcceptRisks , u .RiskConditions )
691556
@@ -701,25 +566,6 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
701566 }
702567}
703568
704- func sanityCheck (updates []configv1.ConditionalUpdate ) error {
705- risks := map [string ]configv1.ConditionalUpdateRisk {}
706- var errs []error
707- for _ , update := range updates {
708- for _ , risk := range update .Risks {
709- if v , ok := risks [risk .Name ]; ok {
710- if diff := cmp .Diff (v , risk , cmpopts .IgnoreFields (configv1.ConditionalUpdateRisk {}, "Conditions" )); diff != "" {
711- errs = append (errs , fmt .Errorf ("found collision on risk %s: %v and %v" , risk .Name , v , risk ))
712- }
713- } else if trimmed := strings .TrimSpace (risk .Name ); trimmed == "" {
714- errs = append (errs , fmt .Errorf ("found invalid name on risk %v" , risk ))
715- } else {
716- risks [risk .Name ] = risk
717- }
718- }
719- }
720- return utilerrors .NewAggregate (errs )
721- }
722-
723569func (u * availableUpdates ) addUpdate (release configv1.Release ) {
724570 for _ , update := range u .Updates {
725571 if update .Image == release .Image {
@@ -738,45 +584,6 @@ func (u *availableUpdates) removeUpdate(image string) {
738584 }
739585}
740586
741- func (u * availableUpdates ) attachAlertRisksToUpdates (alertRisks []configv1.ConditionalUpdateRisk ) {
742- if u == nil || len (alertRisks ) == 0 {
743- return
744- }
745- if u .RiskConditions == nil {
746- u .RiskConditions = map [string ][]metav1.Condition {}
747- }
748- for _ , alertRisk := range alertRisks {
749- u .RiskConditions [alertRisk .Name ] = alertRisk .Conditions
750- }
751- var conditionalUpdates []configv1.ConditionalUpdate
752- for _ , update := range u .Updates {
753- conditionalUpdates = append (conditionalUpdates , configv1.ConditionalUpdate {
754- Release : update ,
755- Risks : alertRisks ,
756- })
757- }
758- u .Updates = nil
759- for _ , conditionalUpdate := range u .ConditionalUpdates {
760- for _ , alertRisk := range alertRisks {
761- var found bool
762- for _ , risk := range conditionalUpdate .Risks {
763- if alertRisk .Name == risk .Name {
764- found = true
765- break
766- }
767- }
768- if ! found {
769- conditionalUpdate .Risks = append (conditionalUpdate .Risks , alertRisk )
770- }
771- }
772- conditionalUpdates = append (conditionalUpdates , conditionalUpdate )
773- }
774- sort .Slice (conditionalUpdates , func (i , j int ) bool {
775- return conditionalUpdates [i ].Release .Version < conditionalUpdates [j ].Release .Version
776- })
777- u .ConditionalUpdates = conditionalUpdates
778- }
779-
780587func unknownExposureMessage (risk configv1.ConditionalUpdateRisk , err error ) string {
781588 template := `Could not evaluate exposure to update risk %s (%v)
782589 %s description: %s
0 commit comments