Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 30 additions & 47 deletions pkg/test/ginkgo/retries.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,41 @@
package ginkgo

import (
"context"
_ "embed"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"time"

"github.com/openshift/origin/pkg/dataloader"
"github.com/sirupsen/logrus"
"sigs.k8s.io/yaml"
)

//go:embed retry_allowed_tests.yaml
var retryAllowedTestsYAML []byte

// retryAllowedTestsConfig represents the YAML structure for the retry allowlist.
type retryAllowedTestsConfig struct {
Tests []string `json:"tests"`
}

// loadRetryAllowedTests parses the embedded YAML and returns the set of test names
// that are permitted to retry on failure.
func loadRetryAllowedTests() map[string]bool {
var config retryAllowedTestsConfig
if err := yaml.Unmarshal(retryAllowedTestsYAML, &config); err != nil {
logrus.WithError(err).Error("Failed to parse retry_allowed_tests.yaml, no tests will be allowed to retry")
return map[string]bool{}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should've used the kube set type, FWIW

APIGroups sets.Set[string] `json:"-"`

}
allowed := make(map[string]bool, len(config.Tests))
for _, test := range config.Tests {
allowed[test] = true
}
return allowed
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.

const (
defaultRetryStrategy = "once"

Expand Down Expand Up @@ -60,12 +83,12 @@ type RetryStrategy interface {
}

type RetryOnceStrategy struct {
PermittedRetryImageTags []string
AllowedRetryTests map[string]bool
}

func NewRetryOnceStrategy() *RetryOnceStrategy {
return &RetryOnceStrategy{
PermittedRetryImageTags: []string{"tests"}, // tests = openshift-tests image
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stbenjam this looks safe to me, but would appreciate if you can confirm. Mat is closing the retry gap at last, with an explicit list of tests that need retries only allowed.

AllowedRetryTests: loadRetryAllowedTests(),
}
}

Expand Down Expand Up @@ -112,51 +135,11 @@ func (s *RetryOnceStrategy) DecideOutcome(attempts []*testCase) RetryOutcome {
}

func (s *RetryOnceStrategy) shouldRetryTest(test *testCase) bool {
// Internal tests (no binary) are eligible for retry, we shouldn't really have any of these
// now that origin is also an extension.
if test.binary == nil {
if s.AllowedRetryTests[test.name] {
logrus.WithField("test", test.name).Debug("Test is in the retry allowlist, permitting retry")
return true
}

tlog := logrus.WithField("test", test.name)

// Test retries were disabled for some suites when they moved to OTE. This exposed small numbers of tests that
// were actually flaky and nobody knew. We attempted to fix these, a few did not make it in time. Restore
// retries for specific test names so the overall suite can continue to not retry.
retryTestNames := []string{
"[sig-instrumentation] Metrics should grab all metrics from kubelet /metrics/resource endpoint [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57477
"[sig-network] Services should be rejected for evicted pods (no endpoints exist) [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57665
"[sig-node] Pods Extended Pod Container lifecycle evicted pods should be terminal [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-57658
"[sig-cli] Kubectl logs all pod logs the Deployment has 2 replicas and each pod has 2 containers should get logs from each pod and each container in Deployment [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61287
"[sig-cli] Kubectl Port forwarding Shutdown client connection while the remote stream is writing data to the port-forward connection port-forward should keep working after detect broken connection [Suite:openshift/conformance/parallel] [Suite:k8s]", // https://issues.redhat.com/browse/OCPBUGS-61734
"[sig-storage] OCP CSI Volumes [Driver: csi-hostpath-groupsnapshot] [OCPFeatureGate:VolumeGroupSnapshot] [Testpattern: (delete policy)] volumegroupsnapshottable [Feature:volumegroupsnapshot] VolumeGroupSnapshottable should create snapshots for multiple volumes in a pod", // https://issues.redhat.com/browse/OCPBUGS-66967
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please check if all these are in the list with the new query I added in the PR, if not we should add them.

Also could you ensure the list is sorted, the query I provided isn't. :)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The list is now sorted. The "migrated" whitelisted tests are there. After running your revised query we are down to ~200 tests only.

}
for _, rtn := range retryTestNames {
if test.name == rtn {
tlog.Debug("test has an exception allowing retry")
return true
}
}

// Get extension info to check if it's from a permitted image
info, err := test.binary.Info(context.Background())
if err != nil {
tlog.WithError(err).
Debug("Failed to get binary info, skipping retry")
return false
}

// Check if the test's source image is in the permitted retry list
for _, permittedTag := range s.PermittedRetryImageTags {
if strings.Contains(info.Source.SourceImage, permittedTag) {
tlog.WithField("image", info.Source.SourceImage).
Debug("Permitting retry")
return true
}
}

tlog.WithField("image", info.Source.SourceImage).
Debug("Test not eligible for retry based on image tag")
logrus.WithField("test", test.name).Debug("Test is not in the retry allowlist, retry not permitted")
return false
}

Expand Down
52 changes: 52 additions & 0 deletions pkg/test/ginkgo/retries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,58 @@ import (
"time"
)

func TestRetryOnceStrategy_ShouldRetryTest_Allowlist(t *testing.T) {
tests := []struct {
name string
allowedTests map[string]bool
testName string
wantRetry bool
}{
{
name: "test_on_allowlist_gets_retry",
allowedTests: map[string]bool{"[sig-network] some flaky test": true},
testName: "[sig-network] some flaky test",
wantRetry: true,
},
{
name: "test_not_on_allowlist_gets_no_retry",
allowedTests: map[string]bool{"[sig-network] some flaky test": true},
testName: "[sig-node] some other test",
wantRetry: false,
},
{
name: "empty_allowlist_blocks_all",
allowedTests: map[string]bool{},
testName: "[sig-network] some flaky test",
wantRetry: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
strategy := &RetryOnceStrategy{AllowedRetryTests: tt.allowedTests}
tc := &testCase{name: tt.testName, failed: true}

if got := strategy.GetMaxRetries(tc); (got == 1) != tt.wantRetry {
t.Errorf("GetMaxRetries() = %d, wantRetry = %v", got, tt.wantRetry)
}

attempts := []*testCase{tc}
if got := strategy.ShouldContinue(tc, attempts, 2); got != tt.wantRetry {
t.Errorf("ShouldContinue() = %v, wantRetry = %v", got, tt.wantRetry)
}
})
}
}

func TestLoadRetryAllowedTests(t *testing.T) {
// Verify the embedded YAML file can be parsed without error.
tests := loadRetryAllowedTests()
if tests == nil {
t.Error("loadRetryAllowedTests returned nil, expected an initialized map")
}
}

// createAttempts creates a slice of test attempts with the specified number of successes and failures
func createAttempts(successes, failures int) []*testCase {
var attempts []*testCase
Expand Down
Loading