From f1b4d90c60ddbbd675101c881dba571bb925ec86 Mon Sep 17 00:00:00 2001 From: xueqzhan Date: Wed, 20 May 2026 13:26:07 -0400 Subject: [PATCH 1/5] Try to remove a crashing pod to test cpu situation in tp jobs --- pkg/test/ginkgo/cmd_runsuite.go | 68 +++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go index 7d011e9c725e..e9955c0cafb6 100644 --- a/pkg/test/ginkgo/cmd_runsuite.go +++ b/pkg/test/ginkgo/cmd_runsuite.go @@ -25,6 +25,7 @@ import ( "github.com/spf13/pflag" "golang.org/x/mod/semver" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/cli-runtime/pkg/genericclioptions" "k8s.io/client-go/kubernetes" @@ -722,12 +723,22 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc tests = append(tests, openshiftTestsCopy...) // run the must-gather tests after parallel tests to reduce resource contention + // Scale down crash-looping CAPI deployments to avoid CPU pressure during must-gather + previousCAPIReplicas, capiErr := scaleCAPIDeployments(ctx, restConfig, 0) + if capiErr != nil { + logrus.Errorf("Failed to scale down CAPI deployments: %v", capiErr) + } + mustGatherTestsCopy := copyTests(mustGatherTests) mustGatherIntervalID, mustGatherStartTime := recordTestBucketInterval(monitorEventRecorder, "MustGather") q.Execute(testCtx, mustGatherTestsCopy, parallelism, testOutputConfig, abortFn) monitorEventRecorder.EndInterval(mustGatherIntervalID, time.Now()) logrus.Infof("Completed MustGather test bucket in %v", time.Since(mustGatherStartTime)) tests = append(tests, mustGatherTestsCopy...) + + if len(previousCAPIReplicas) > 0 { + restoreCAPIDeployments(ctx, restConfig, previousCAPIReplicas) + } } // TODO: will move to the monitor @@ -1422,6 +1433,63 @@ func getClusterNodeCounts(ctx context.Context, config *rest.Config) (int, int, e return totalNodes, workerNodes, nil } +func scaleCAPIDeployments(ctx context.Context, config *rest.Config, targetReplicas int32) (map[string]map[string]int32, error) { + kubeClient, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, err + } + + capiDeployments := map[string][]string{ + "openshift-cluster-api-operator": {"capi-operator"}, + "openshift-cluster-api": {"capi-controller-manager", "capa-controller-manager"}, + } + + previousReplicas := map[string]map[string]int32{} + + for ns, deployNames := range capiDeployments { + for _, name := range deployNames { + dep, err := kubeClient.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + logrus.Infof("CAPI deployment %s/%s not found, skipping: %v", ns, name, err) + continue + } + + if previousReplicas[ns] == nil { + previousReplicas[ns] = map[string]int32{} + } + previousReplicas[ns][name] = *dep.Spec.Replicas + + patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, targetReplicas)) + if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil { + logrus.Errorf("Failed to scale %s/%s to %d: %v", ns, name, targetReplicas, err) + continue + } + logrus.Infof("Scaled CAPI deployment %s/%s from %d to %d", ns, name, *dep.Spec.Replicas, targetReplicas) + } + } + + return previousReplicas, nil +} + +func restoreCAPIDeployments(ctx context.Context, config *rest.Config, previousReplicas map[string]map[string]int32) { + kubeClient, err := kubernetes.NewForConfig(config) + if err != nil { + logrus.Errorf("Failed to create client for CAPI restore: %v", err) + return + } + + for ns, deployments := range previousReplicas { + for name, replicas := range deployments { + patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, replicas)) + if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil { + logrus.Errorf("Failed to restore %s/%s to %d replicas: %v", ns, name, replicas, err) + continue + } + logrus.Infof("Restored CAPI deployment %s/%s to %d replicas", ns, name, replicas) + } + } +} + // recordTestBucketInterval creates and starts an interval for tracking test bucket execution func recordTestBucketInterval(monitorEventRecorder monitorapi.Recorder, bucketName string) (int, time.Time) { startTime := time.Now() From 79d3d089ff1c752abbfbea125479f44d0e7a5e4c Mon Sep 17 00:00:00 2001 From: xueqzhan Date: Wed, 20 May 2026 18:00:28 -0400 Subject: [PATCH 2/5] revert the change to see the differences --- pkg/test/ginkgo/cmd_runsuite.go | 68 --------------------------------- 1 file changed, 68 deletions(-) diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go index e9955c0cafb6..7d011e9c725e 100644 --- a/pkg/test/ginkgo/cmd_runsuite.go +++ b/pkg/test/ginkgo/cmd_runsuite.go @@ -25,7 +25,6 @@ import ( "github.com/spf13/pflag" "golang.org/x/mod/semver" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/cli-runtime/pkg/genericclioptions" "k8s.io/client-go/kubernetes" @@ -723,22 +722,12 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc tests = append(tests, openshiftTestsCopy...) // run the must-gather tests after parallel tests to reduce resource contention - // Scale down crash-looping CAPI deployments to avoid CPU pressure during must-gather - previousCAPIReplicas, capiErr := scaleCAPIDeployments(ctx, restConfig, 0) - if capiErr != nil { - logrus.Errorf("Failed to scale down CAPI deployments: %v", capiErr) - } - mustGatherTestsCopy := copyTests(mustGatherTests) mustGatherIntervalID, mustGatherStartTime := recordTestBucketInterval(monitorEventRecorder, "MustGather") q.Execute(testCtx, mustGatherTestsCopy, parallelism, testOutputConfig, abortFn) monitorEventRecorder.EndInterval(mustGatherIntervalID, time.Now()) logrus.Infof("Completed MustGather test bucket in %v", time.Since(mustGatherStartTime)) tests = append(tests, mustGatherTestsCopy...) - - if len(previousCAPIReplicas) > 0 { - restoreCAPIDeployments(ctx, restConfig, previousCAPIReplicas) - } } // TODO: will move to the monitor @@ -1433,63 +1422,6 @@ func getClusterNodeCounts(ctx context.Context, config *rest.Config) (int, int, e return totalNodes, workerNodes, nil } -func scaleCAPIDeployments(ctx context.Context, config *rest.Config, targetReplicas int32) (map[string]map[string]int32, error) { - kubeClient, err := kubernetes.NewForConfig(config) - if err != nil { - return nil, err - } - - capiDeployments := map[string][]string{ - "openshift-cluster-api-operator": {"capi-operator"}, - "openshift-cluster-api": {"capi-controller-manager", "capa-controller-manager"}, - } - - previousReplicas := map[string]map[string]int32{} - - for ns, deployNames := range capiDeployments { - for _, name := range deployNames { - dep, err := kubeClient.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{}) - if err != nil { - logrus.Infof("CAPI deployment %s/%s not found, skipping: %v", ns, name, err) - continue - } - - if previousReplicas[ns] == nil { - previousReplicas[ns] = map[string]int32{} - } - previousReplicas[ns][name] = *dep.Spec.Replicas - - patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, targetReplicas)) - if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil { - logrus.Errorf("Failed to scale %s/%s to %d: %v", ns, name, targetReplicas, err) - continue - } - logrus.Infof("Scaled CAPI deployment %s/%s from %d to %d", ns, name, *dep.Spec.Replicas, targetReplicas) - } - } - - return previousReplicas, nil -} - -func restoreCAPIDeployments(ctx context.Context, config *rest.Config, previousReplicas map[string]map[string]int32) { - kubeClient, err := kubernetes.NewForConfig(config) - if err != nil { - logrus.Errorf("Failed to create client for CAPI restore: %v", err) - return - } - - for ns, deployments := range previousReplicas { - for name, replicas := range deployments { - patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, replicas)) - if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil { - logrus.Errorf("Failed to restore %s/%s to %d replicas: %v", ns, name, replicas, err) - continue - } - logrus.Infof("Restored CAPI deployment %s/%s to %d replicas", ns, name, replicas) - } - } -} - // recordTestBucketInterval creates and starts an interval for tracking test bucket execution func recordTestBucketInterval(monitorEventRecorder monitorapi.Recorder, bucketName string) (int, time.Time) { startTime := time.Now() From f3601f57124f4ee00d8013840553017cd465d9e6 Mon Sep 17 00:00:00 2001 From: xueqzhan Date: Thu, 21 May 2026 16:49:26 -0400 Subject: [PATCH 3/5] Refactor must gather test to minimize parallel load on the apiserver --- test/extended/cli/mustgather.go | 149 ++++++++++---------------------- 1 file changed, 45 insertions(+), 104 deletions(-) diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go index 6865c75bb932..66f8dc04a897 100644 --- a/test/extended/cli/mustgather.go +++ b/test/extended/cli/mustgather.go @@ -129,7 +129,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { }) g.It("runs successfully for audit logs [apigroup:config.openshift.io][apigroup:oauth.openshift.io]", func() { - // On External clusters, events will not be part of the output, since audit logs do not include control plane logs. controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) o.Expect(err).NotTo(o.HaveOccurred()) @@ -182,10 +181,12 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { pluginOutputDir := GetPluginOutputDir(tempDir) + // Validate audit event counts and JSON format + g.By("Validating audit log event counts and JSON format") expectedDirectoriesToExpectedCount := map[string]int{ path.Join(pluginOutputDir, "audit_logs", "kube-apiserver"): 1000, - path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, // openshift apiservers don't necessarily get much traffic. Especially early in a run - path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"): 10, // oauth apiservers don't necessarily get much traffic. Especially early in a run + path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, + path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"): 10, } expectedFiles := [][]string{ @@ -194,13 +195,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { {pluginOutputDir, "audit_logs", "oauth-apiserver.audit_logs_listing"}, } - // for some crazy reason, it seems that the files from must-gather take time to appear on disk for reading. I don't understand why - // but this was in a previous commit and I don't want to immediately flake: https://github.com/openshift/origin/commit/006745a535848e84dcbcdd1c83ae86deddd3a229#diff-ad1c47fa4213de16d8b3237df5d71724R168 - // so we're going to try to get a pass every 10 seconds for a minute. If we pass, great. If we don't, we report the - // last error we had. var lastErr error o.Eventually(func() bool { - // make sure we do not log OAuth tokens for auditDirectory, expectedNumberOfAuditEntries := range expectedDirectoriesToExpectedCount { eventsChecked := 0 err := filepath.Walk(auditDirectory, func(path string, info os.FileInfo, err error) error { @@ -215,8 +211,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { return nil } - // at this point, we expect only audit files with json events, one per line - readFile := false file, err := os.Open(path) @@ -226,9 +220,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { fi, err := file.Stat() o.Expect(err).NotTo(o.HaveOccurred()) - // it will happen that the audit files are sometimes empty, we can - // safely ignore these files since they don't provide valuable information - // TODO this doesn't seem right. It should be really unlikely, but we'll deal with later if fi.Size() == 0 { return nil } @@ -240,15 +231,13 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { for scanner.Scan() { text := scanner.Text() if !strings.HasSuffix(text, "}") { - continue // ignore truncated data + continue } o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`)) readFile = true eventsChecked++ } - // ignore this error as we usually fail to read the whole GZ file - // o.Expect(scanner.Err()).NotTo(o.HaveOccurred()) o.Expect(readFile).To(o.BeTrue()) return nil @@ -260,14 +249,12 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { return false } - // reset lastErr if we succeeded. lastErr = nil } - // if we get here, it means both directories checked out ok return true }, 1*time.Minute, 10*time.Second).Should(o.BeTrue()) - o.Expect(lastErr).NotTo(o.HaveOccurred()) // print the last error first if we have one + o.Expect(lastErr).NotTo(o.HaveOccurred()) emptyFiles := []string{} for _, expectedFile := range expectedFiles { @@ -282,63 +269,52 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { if len(emptyFiles) > 0 { o.Expect(fmt.Errorf("expected files should not be empty: %s", strings.Join(emptyFiles, ","))).NotTo(o.HaveOccurred()) } - }) - g.When("looking at the audit logs [apigroup:config.openshift.io]", func() { - g.Describe("[sig-node] kubelet", func() { - g.It("runs apiserver processes strictly sequentially in order to not risk audit log corruption", func() { - controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) + // Check apiserver processes ran strictly sequentially (lock.log check) + g.By("Checking apiserver processes ran strictly sequentially") + expectedAuditSubDirs := []string{"kube-apiserver", "openshift-apiserver", "oauth-apiserver"} + seen := sets.String{} + for _, apiserver := range expectedAuditSubDirs { + err := filepath.Walk(filepath.Join(pluginOutputDir, "audit_logs", apiserver), func(path string, info os.FileInfo, err error) error { + g.By(path) o.Expect(err).NotTo(o.HaveOccurred()) - - // On External clusters, events will not be part of the output, since audit logs do not include control plane logs. - if *controlPlaneTopology == configv1.ExternalTopologyMode { - g.Skip("External clusters don't have control plane audit logs") + if info.IsDir() { + return nil } - tempDir, err := os.MkdirTemp("", "test.oc-adm-must-gather.") - o.Expect(err).NotTo(o.HaveOccurred()) - defer os.RemoveAll(tempDir) + seen.Insert(apiserver) - args := []string{ - "--dest-dir", tempDir, - "--volume-percentage=100", - "--", - "/usr/bin/gather_audit_logs", + if filepath.Base(path) != "lock.log" { + return nil } - o.Expect(oc.Run("adm", "must-gather").Args(args...).Execute()).To(o.Succeed()) - - pluginOutputDir := GetPluginOutputDir(tempDir) - expectedAuditSubDirs := []string{"kube-apiserver", "openshift-apiserver", "oauth-apiserver"} - - seen := sets.String{} - for _, apiserver := range expectedAuditSubDirs { - err := filepath.Walk(filepath.Join(pluginOutputDir, "audit_logs", apiserver), func(path string, info os.FileInfo, err error) error { - g.By(path) - o.Expect(err).NotTo(o.HaveOccurred()) - if info.IsDir() { - return nil - } - - seen.Insert(apiserver) - - if filepath.Base(path) != "lock.log" { - return nil - } - - lockLog, err := os.ReadFile(path) - o.Expect(err).NotTo(o.HaveOccurred()) - - // TODO: turn this into a failure as soon as kubelet is fixed - result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog)) - return nil - }) - o.Expect(err).NotTo(o.HaveOccurred()) - } + lockLog, err := os.ReadFile(path) + o.Expect(err).NotTo(o.HaveOccurred()) - o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue()) + result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog)) + return nil }) - }) + o.Expect(err).NotTo(o.HaveOccurred()) + } + o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue()) + + // Validate OAuth audit logs contain auditID + g.By("Validating OAuth audit logs contain auditID") + oauthAuditFiles := getOauthAudit(tempDir) + for _, file := range oauthAuditFiles { + f, err := os.Open(file) + o.Expect(err).NotTo(o.HaveOccurred()) + defer f.Close() + gzipReader, err := gzip.NewReader(f) + o.Expect(err).NotTo(o.HaveOccurred()) + defer gzipReader.Close() + scanner := bufio.NewScanner(gzipReader) + var headContent string + if scanner.Scan() { + headContent = scanner.Text() + } + o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs") + } }) // author: yinzhou@redhat.com @@ -398,41 +374,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { }, 600*time.Second, 10*time.Second).Should(o.MatchRegexp("No resources found"), "Still find the must-gather pod in own namespace even wait for 10 mins") }) - // author: yinzhou@redhat.com - g.It("Fetch audit logs of login attempts via oc commands timeout [apigroup:config.openshift.io][Timeout:20m]", func() { - controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) - o.Expect(err).NotTo(o.HaveOccurred()) - - // Skip on hypershift (External topology) - if *controlPlaneTopology == configv1.ExternalTopologyMode { - g.Skip("Hypershift clusters don't have oauth-server audit logs in must-gather") - } - - g.By("run the must-gather") - tempDir, err := os.MkdirTemp("", "test.oc-adm-must-gather.") - o.Expect(err).NotTo(o.HaveOccurred()) - defer os.RemoveAll(tempDir) - - o.Expect(oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute()).To(o.Succeed()) - - g.By("check the must-gather result") - oauth_audit_files := getOauthAudit(tempDir) - for _, file := range oauth_audit_files { - f, err := os.Open(file) - defer f.Close() - o.Expect(err).NotTo(o.HaveOccurred()) - gzipReader, err := gzip.NewReader(f) - defer gzipReader.Close() - o.Expect(err).NotTo(o.HaveOccurred()) - scanner := bufio.NewScanner(gzipReader) - var headContent string - if scanner.Scan() { - headContent = scanner.Text() - } - o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs") - } - }) - // author: yinzhou@redhat.com g.It("must-gather support since and since-time flags timeout [apigroup:config.openshift.io][Timeout:20m]", func() { controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) @@ -550,7 +491,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { o.Expect(err).NotTo(o.HaveOccurred()) defer os.RemoveAll(tempDir) - err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute() + err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute() o.Expect(err).NotTo(o.HaveOccurred()) g.By("check the must-gather and verify that oc binary version is included") @@ -589,7 +530,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { o.Expect(err).NotTo(o.HaveOccurred()) defer os.RemoveAll(tempDir2) - err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute() + err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute() o.Expect(err).NotTo(o.HaveOccurred()) g.By("check logs generated are included in the must-gather directory when must-gather is run") From de9582250acdfd4e65f008e4609590474fdf585a Mon Sep 17 00:00:00 2001 From: xueqzhan Date: Tue, 26 May 2026 14:55:24 -0400 Subject: [PATCH 4/5] Refactor must gather audit log tests to avoid repeated audit download --- test/extended/cli/mustgather.go | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go index 66f8dc04a897..644d2ba5890b 100644 --- a/test/extended/cli/mustgather.go +++ b/test/extended/cli/mustgather.go @@ -129,6 +129,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { }) g.It("runs successfully for audit logs [apigroup:config.openshift.io][apigroup:oauth.openshift.io]", func() { + // On External clusters, events will not be part of the output, since audit logs do not include control plane logs. controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc) o.Expect(err).NotTo(o.HaveOccurred()) @@ -185,8 +186,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { g.By("Validating audit log event counts and JSON format") expectedDirectoriesToExpectedCount := map[string]int{ path.Join(pluginOutputDir, "audit_logs", "kube-apiserver"): 1000, - path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, - path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"): 10, + path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, // openshift apiservers don't necessarily get much traffic. Especially early in a run + path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"): 10, // oauth apiservers don't necessarily get much traffic. Especially early in a run } expectedFiles := [][]string{ @@ -195,8 +196,13 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { {pluginOutputDir, "audit_logs", "oauth-apiserver.audit_logs_listing"}, } + // for some crazy reason, it seems that the files from must-gather take time to appear on disk for reading. I don't understand why + // but this was in a previous commit and I don't want to immediately flake: https://github.com/openshift/origin/commit/006745a535848e84dcbcdd1c83ae86deddd3a229#diff-ad1c47fa4213de16d8b3237df5d71724R168 + // so we're going to try to get a pass every 10 seconds for a minute. If we pass, great. If we don't, we report the + // last error we had. var lastErr error o.Eventually(func() bool { + // make sure we do not log OAuth tokens for auditDirectory, expectedNumberOfAuditEntries := range expectedDirectoriesToExpectedCount { eventsChecked := 0 err := filepath.Walk(auditDirectory, func(path string, info os.FileInfo, err error) error { @@ -211,6 +217,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { return nil } + // at this point, we expect only audit files with json events, one per line + readFile := false file, err := os.Open(path) @@ -220,6 +228,9 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { fi, err := file.Stat() o.Expect(err).NotTo(o.HaveOccurred()) + // it will happen that the audit files are sometimes empty, we can + // safely ignore these files since they don't provide valuable information + // TODO this doesn't seem right. It should be really unlikely, but we'll deal with later if fi.Size() == 0 { return nil } @@ -231,13 +242,15 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { for scanner.Scan() { text := scanner.Text() if !strings.HasSuffix(text, "}") { - continue + continue // ignore truncated data } o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`)) readFile = true eventsChecked++ } + // ignore this error as we usually fail to read the whole GZ file + // o.Expect(scanner.Err()).NotTo(o.HaveOccurred()) o.Expect(readFile).To(o.BeTrue()) return nil @@ -249,12 +262,14 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { return false } + // reset lastErr if we succeeded. lastErr = nil } + // if we get here, it means both directories checked out ok return true }, 1*time.Minute, 10*time.Second).Should(o.BeTrue()) - o.Expect(lastErr).NotTo(o.HaveOccurred()) + o.Expect(lastErr).NotTo(o.HaveOccurred()) // print the last error first if we have one emptyFiles := []string{} for _, expectedFile := range expectedFiles { @@ -291,12 +306,15 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { lockLog, err := os.ReadFile(path) o.Expect(err).NotTo(o.HaveOccurred()) - result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog)) + // TODO: turn this into a failure as soon as kubelet is fixed + if strings.TrimSpace(string(lockLog)) != "" { + result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog)) + } return nil }) o.Expect(err).NotTo(o.HaveOccurred()) } - o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue()) + o.Expect(seen.HasAll(expectedAuditSubDirs...)).To(o.BeTrue()) // Validate OAuth audit logs contain auditID g.By("Validating OAuth audit logs contain auditID") From 2477c1ba27f8f3ea336d82fee3ee1e5d015d0bfd Mon Sep 17 00:00:00 2001 From: xueqzhan Date: Wed, 27 May 2026 11:57:44 -0400 Subject: [PATCH 5/5] Address review comments to further simplify must-gather tests --- test/extended/cli/mustgather.go | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go index 644d2ba5890b..bca1be429cb5 100644 --- a/test/extended/cli/mustgather.go +++ b/test/extended/cli/mustgather.go @@ -320,18 +320,20 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { g.By("Validating OAuth audit logs contain auditID") oauthAuditFiles := getOauthAudit(tempDir) for _, file := range oauthAuditFiles { - f, err := os.Open(file) - o.Expect(err).NotTo(o.HaveOccurred()) - defer f.Close() - gzipReader, err := gzip.NewReader(f) - o.Expect(err).NotTo(o.HaveOccurred()) - defer gzipReader.Close() - scanner := bufio.NewScanner(gzipReader) - var headContent string - if scanner.Scan() { - headContent = scanner.Text() - } - o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs") + func() { + f, err := os.Open(file) + o.Expect(err).NotTo(o.HaveOccurred()) + defer f.Close() + gzipReader, err := gzip.NewReader(f) + o.Expect(err).NotTo(o.HaveOccurred()) + defer gzipReader.Close() + scanner := bufio.NewScanner(gzipReader) + var headContent string + if scanner.Scan() { + headContent = scanner.Text() + } + o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs") + }() } }) @@ -509,7 +511,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() { o.Expect(err).NotTo(o.HaveOccurred()) defer os.RemoveAll(tempDir) - err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute() + err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/bin/true").Execute() o.Expect(err).NotTo(o.HaveOccurred()) g.By("check the must-gather and verify that oc binary version is included")