From f1b4d90c60ddbbd675101c881dba571bb925ec86 Mon Sep 17 00:00:00 2001
From: xueqzhan <kenzhang@redhat.com>
Date: Wed, 20 May 2026 13:26:07 -0400
Subject: [PATCH 1/5] Try to remove a crashing pod to test cpu situation in tp
 jobs

---
 pkg/test/ginkgo/cmd_runsuite.go | 68 +++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
index 7d011e9c725e..e9955c0cafb6 100644
--- a/pkg/test/ginkgo/cmd_runsuite.go
+++ b/pkg/test/ginkgo/cmd_runsuite.go
@@ -25,6 +25,7 @@ import (
 	"github.com/spf13/pflag"
 	"golang.org/x/mod/semver"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/cli-runtime/pkg/genericclioptions"
 	"k8s.io/client-go/kubernetes"
@@ -722,12 +723,22 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
 		tests = append(tests, openshiftTestsCopy...)
 
 		// run the must-gather tests after parallel tests to reduce resource contention
+		// Scale down crash-looping CAPI deployments to avoid CPU pressure during must-gather
+		previousCAPIReplicas, capiErr := scaleCAPIDeployments(ctx, restConfig, 0)
+		if capiErr != nil {
+			logrus.Errorf("Failed to scale down CAPI deployments: %v", capiErr)
+		}
+
 		mustGatherTestsCopy := copyTests(mustGatherTests)
 		mustGatherIntervalID, mustGatherStartTime := recordTestBucketInterval(monitorEventRecorder, "MustGather")
 		q.Execute(testCtx, mustGatherTestsCopy, parallelism, testOutputConfig, abortFn)
 		monitorEventRecorder.EndInterval(mustGatherIntervalID, time.Now())
 		logrus.Infof("Completed MustGather test bucket in %v", time.Since(mustGatherStartTime))
 		tests = append(tests, mustGatherTestsCopy...)
+
+		if len(previousCAPIReplicas) > 0 {
+			restoreCAPIDeployments(ctx, restConfig, previousCAPIReplicas)
+		}
 	}
 
 	// TODO: will move to the monitor
@@ -1422,6 +1433,63 @@ func getClusterNodeCounts(ctx context.Context, config *rest.Config) (int, int, e
 	return totalNodes, workerNodes, nil
 }
 
+func scaleCAPIDeployments(ctx context.Context, config *rest.Config, targetReplicas int32) (map[string]map[string]int32, error) {
+	kubeClient, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		return nil, err
+	}
+
+	capiDeployments := map[string][]string{
+		"openshift-cluster-api-operator": {"capi-operator"},
+		"openshift-cluster-api":          {"capi-controller-manager", "capa-controller-manager"},
+	}
+
+	previousReplicas := map[string]map[string]int32{}
+
+	for ns, deployNames := range capiDeployments {
+		for _, name := range deployNames {
+			dep, err := kubeClient.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{})
+			if err != nil {
+				logrus.Infof("CAPI deployment %s/%s not found, skipping: %v", ns, name, err)
+				continue
+			}
+
+			if previousReplicas[ns] == nil {
+				previousReplicas[ns] = map[string]int32{}
+			}
+			previousReplicas[ns][name] = *dep.Spec.Replicas
+
+			patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, targetReplicas))
+			if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
+				logrus.Errorf("Failed to scale %s/%s to %d: %v", ns, name, targetReplicas, err)
+				continue
+			}
+			logrus.Infof("Scaled CAPI deployment %s/%s from %d to %d", ns, name, *dep.Spec.Replicas, targetReplicas)
+		}
+	}
+
+	return previousReplicas, nil
+}
+
+func restoreCAPIDeployments(ctx context.Context, config *rest.Config, previousReplicas map[string]map[string]int32) {
+	kubeClient, err := kubernetes.NewForConfig(config)
+	if err != nil {
+		logrus.Errorf("Failed to create client for CAPI restore: %v", err)
+		return
+	}
+
+	for ns, deployments := range previousReplicas {
+		for name, replicas := range deployments {
+			patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, replicas))
+			if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
+				logrus.Errorf("Failed to restore %s/%s to %d replicas: %v", ns, name, replicas, err)
+				continue
+			}
+			logrus.Infof("Restored CAPI deployment %s/%s to %d replicas", ns, name, replicas)
+		}
+	}
+}
+
 // recordTestBucketInterval creates and starts an interval for tracking test bucket execution
 func recordTestBucketInterval(monitorEventRecorder monitorapi.Recorder, bucketName string) (int, time.Time) {
 	startTime := time.Now()

From 79d3d089ff1c752abbfbea125479f44d0e7a5e4c Mon Sep 17 00:00:00 2001
From: xueqzhan <kenzhang@redhat.com>
Date: Wed, 20 May 2026 18:00:28 -0400
Subject: [PATCH 2/5] revert the change to see the differences

---
 pkg/test/ginkgo/cmd_runsuite.go | 68 ---------------------------------
 1 file changed, 68 deletions(-)

diff --git a/pkg/test/ginkgo/cmd_runsuite.go b/pkg/test/ginkgo/cmd_runsuite.go
index e9955c0cafb6..7d011e9c725e 100644
--- a/pkg/test/ginkgo/cmd_runsuite.go
+++ b/pkg/test/ginkgo/cmd_runsuite.go
@@ -25,7 +25,6 @@ import (
 	"github.com/spf13/pflag"
 	"golang.org/x/mod/semver"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
-	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/cli-runtime/pkg/genericclioptions"
 	"k8s.io/client-go/kubernetes"
@@ -723,22 +722,12 @@ func (o *GinkgoRunSuiteOptions) Run(suite *TestSuite, clusterConfig *clusterdisc
 		tests = append(tests, openshiftTestsCopy...)
 
 		// run the must-gather tests after parallel tests to reduce resource contention
-		// Scale down crash-looping CAPI deployments to avoid CPU pressure during must-gather
-		previousCAPIReplicas, capiErr := scaleCAPIDeployments(ctx, restConfig, 0)
-		if capiErr != nil {
-			logrus.Errorf("Failed to scale down CAPI deployments: %v", capiErr)
-		}
-
 		mustGatherTestsCopy := copyTests(mustGatherTests)
 		mustGatherIntervalID, mustGatherStartTime := recordTestBucketInterval(monitorEventRecorder, "MustGather")
 		q.Execute(testCtx, mustGatherTestsCopy, parallelism, testOutputConfig, abortFn)
 		monitorEventRecorder.EndInterval(mustGatherIntervalID, time.Now())
 		logrus.Infof("Completed MustGather test bucket in %v", time.Since(mustGatherStartTime))
 		tests = append(tests, mustGatherTestsCopy...)
-
-		if len(previousCAPIReplicas) > 0 {
-			restoreCAPIDeployments(ctx, restConfig, previousCAPIReplicas)
-		}
 	}
 
 	// TODO: will move to the monitor
@@ -1433,63 +1422,6 @@ func getClusterNodeCounts(ctx context.Context, config *rest.Config) (int, int, e
 	return totalNodes, workerNodes, nil
 }
 
-func scaleCAPIDeployments(ctx context.Context, config *rest.Config, targetReplicas int32) (map[string]map[string]int32, error) {
-	kubeClient, err := kubernetes.NewForConfig(config)
-	if err != nil {
-		return nil, err
-	}
-
-	capiDeployments := map[string][]string{
-		"openshift-cluster-api-operator": {"capi-operator"},
-		"openshift-cluster-api":          {"capi-controller-manager", "capa-controller-manager"},
-	}
-
-	previousReplicas := map[string]map[string]int32{}
-
-	for ns, deployNames := range capiDeployments {
-		for _, name := range deployNames {
-			dep, err := kubeClient.AppsV1().Deployments(ns).Get(ctx, name, metav1.GetOptions{})
-			if err != nil {
-				logrus.Infof("CAPI deployment %s/%s not found, skipping: %v", ns, name, err)
-				continue
-			}
-
-			if previousReplicas[ns] == nil {
-				previousReplicas[ns] = map[string]int32{}
-			}
-			previousReplicas[ns][name] = *dep.Spec.Replicas
-
-			patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, targetReplicas))
-			if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
-				logrus.Errorf("Failed to scale %s/%s to %d: %v", ns, name, targetReplicas, err)
-				continue
-			}
-			logrus.Infof("Scaled CAPI deployment %s/%s from %d to %d", ns, name, *dep.Spec.Replicas, targetReplicas)
-		}
-	}
-
-	return previousReplicas, nil
-}
-
-func restoreCAPIDeployments(ctx context.Context, config *rest.Config, previousReplicas map[string]map[string]int32) {
-	kubeClient, err := kubernetes.NewForConfig(config)
-	if err != nil {
-		logrus.Errorf("Failed to create client for CAPI restore: %v", err)
-		return
-	}
-
-	for ns, deployments := range previousReplicas {
-		for name, replicas := range deployments {
-			patch := []byte(fmt.Sprintf(`{"spec":{"replicas":%d}}`, replicas))
-			if _, err := kubeClient.AppsV1().Deployments(ns).Patch(ctx, name, types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
-				logrus.Errorf("Failed to restore %s/%s to %d replicas: %v", ns, name, replicas, err)
-				continue
-			}
-			logrus.Infof("Restored CAPI deployment %s/%s to %d replicas", ns, name, replicas)
-		}
-	}
-}
-
 // recordTestBucketInterval creates and starts an interval for tracking test bucket execution
 func recordTestBucketInterval(monitorEventRecorder monitorapi.Recorder, bucketName string) (int, time.Time) {
 	startTime := time.Now()

From f3601f57124f4ee00d8013840553017cd465d9e6 Mon Sep 17 00:00:00 2001
From: xueqzhan <kenzhang@redhat.com>
Date: Thu, 21 May 2026 16:49:26 -0400
Subject: [PATCH 3/5] Refactor must gather test to minimize parallel load on
 the apiserver

---
 test/extended/cli/mustgather.go | 149 ++++++++++----------------------
 1 file changed, 45 insertions(+), 104 deletions(-)

diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go
index 6865c75bb932..66f8dc04a897 100644
--- a/test/extended/cli/mustgather.go
+++ b/test/extended/cli/mustgather.go
@@ -129,7 +129,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 	})
 
 	g.It("runs successfully for audit logs [apigroup:config.openshift.io][apigroup:oauth.openshift.io]", func() {
-		// On External clusters, events will not be part of the output, since audit logs do not include control plane logs.
 		controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc)
 		o.Expect(err).NotTo(o.HaveOccurred())
 
@@ -182,10 +181,12 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 
 		pluginOutputDir := GetPluginOutputDir(tempDir)
 
+		// Validate audit event counts and JSON format
+		g.By("Validating audit log event counts and JSON format")
 		expectedDirectoriesToExpectedCount := map[string]int{
 			path.Join(pluginOutputDir, "audit_logs", "kube-apiserver"):      1000,
-			path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, // openshift apiservers don't necessarily get much traffic.  Especially early in a run
-			path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"):     10, // oauth apiservers don't necessarily get much traffic.  Especially early in a run
+			path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10,
+			path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"):     10,
 		}
 
 		expectedFiles := [][]string{
@@ -194,13 +195,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 			{pluginOutputDir, "audit_logs", "oauth-apiserver.audit_logs_listing"},
 		}
 
-		// for some crazy reason, it seems that the files from must-gather take time to appear on disk for reading.  I don't understand why
-		// but this was in a previous commit and I don't want to immediately flake: https://github.com/openshift/origin/commit/006745a535848e84dcbcdd1c83ae86deddd3a229#diff-ad1c47fa4213de16d8b3237df5d71724R168
-		// so we're going to try to get a pass every 10 seconds for a minute.  If we pass, great.  If we don't, we report the
-		// last error we had.
 		var lastErr error
 		o.Eventually(func() bool {
-			// make sure we do not log OAuth tokens
 			for auditDirectory, expectedNumberOfAuditEntries := range expectedDirectoriesToExpectedCount {
 				eventsChecked := 0
 				err := filepath.Walk(auditDirectory, func(path string, info os.FileInfo, err error) error {
@@ -215,8 +211,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 						return nil
 					}
 
-					// at this point, we expect only audit files with json events, one per line
-
 					readFile := false
 
 					file, err := os.Open(path)
@@ -226,9 +220,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					fi, err := file.Stat()
 					o.Expect(err).NotTo(o.HaveOccurred())
 
-					// it will happen that the audit files are sometimes empty, we can
-					// safely ignore these files since they don't provide valuable information
-					// TODO this doesn't seem right.  It should be really unlikely, but we'll deal with later
 					if fi.Size() == 0 {
 						return nil
 					}
@@ -240,15 +231,13 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					for scanner.Scan() {
 						text := scanner.Text()
 						if !strings.HasSuffix(text, "}") {
-							continue // ignore truncated data
+							continue
 						}
 						o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`))
 
 						readFile = true
 						eventsChecked++
 					}
-					// ignore this error as we usually fail to read the whole GZ file
-					// o.Expect(scanner.Err()).NotTo(o.HaveOccurred())
 					o.Expect(readFile).To(o.BeTrue())
 
 					return nil
@@ -260,14 +249,12 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					return false
 				}
 
-				// reset lastErr if we succeeded.
 				lastErr = nil
 			}
 
-			// if we get here, it means both directories checked out ok
 			return true
 		}, 1*time.Minute, 10*time.Second).Should(o.BeTrue())
-		o.Expect(lastErr).NotTo(o.HaveOccurred()) // print the last error first if we have one
+		o.Expect(lastErr).NotTo(o.HaveOccurred())
 
 		emptyFiles := []string{}
 		for _, expectedFile := range expectedFiles {
@@ -282,63 +269,52 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		if len(emptyFiles) > 0 {
 			o.Expect(fmt.Errorf("expected files should not be empty: %s", strings.Join(emptyFiles, ","))).NotTo(o.HaveOccurred())
 		}
-	})
 
-	g.When("looking at the audit logs [apigroup:config.openshift.io]", func() {
-		g.Describe("[sig-node] kubelet", func() {
-			g.It("runs apiserver processes strictly sequentially in order to not risk audit log corruption", func() {
-				controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc)
+		// Check apiserver processes ran strictly sequentially (lock.log check)
+		g.By("Checking apiserver processes ran strictly sequentially")
+		expectedAuditSubDirs := []string{"kube-apiserver", "openshift-apiserver", "oauth-apiserver"}
+		seen := sets.String{}
+		for _, apiserver := range expectedAuditSubDirs {
+			err := filepath.Walk(filepath.Join(pluginOutputDir, "audit_logs", apiserver), func(path string, info os.FileInfo, err error) error {
+				g.By(path)
 				o.Expect(err).NotTo(o.HaveOccurred())
-
-				// On External clusters, events will not be part of the output, since audit logs do not include control plane logs.
-				if *controlPlaneTopology == configv1.ExternalTopologyMode {
-					g.Skip("External clusters don't have control plane audit logs")
+				if info.IsDir() {
+					return nil
 				}
 
-				tempDir, err := os.MkdirTemp("", "test.oc-adm-must-gather.")
-				o.Expect(err).NotTo(o.HaveOccurred())
-				defer os.RemoveAll(tempDir)
+				seen.Insert(apiserver)
 
-				args := []string{
-					"--dest-dir", tempDir,
-					"--volume-percentage=100",
-					"--",
-					"/usr/bin/gather_audit_logs",
+				if filepath.Base(path) != "lock.log" {
+					return nil
 				}
 
-				o.Expect(oc.Run("adm", "must-gather").Args(args...).Execute()).To(o.Succeed())
-
-				pluginOutputDir := GetPluginOutputDir(tempDir)
-				expectedAuditSubDirs := []string{"kube-apiserver", "openshift-apiserver", "oauth-apiserver"}
-
-				seen := sets.String{}
-				for _, apiserver := range expectedAuditSubDirs {
-					err := filepath.Walk(filepath.Join(pluginOutputDir, "audit_logs", apiserver), func(path string, info os.FileInfo, err error) error {
-						g.By(path)
-						o.Expect(err).NotTo(o.HaveOccurred())
-						if info.IsDir() {
-							return nil
-						}
-
-						seen.Insert(apiserver)
-
-						if filepath.Base(path) != "lock.log" {
-							return nil
-						}
-
-						lockLog, err := os.ReadFile(path)
-						o.Expect(err).NotTo(o.HaveOccurred())
-
-						// TODO: turn this into a failure as soon as kubelet is fixed
-						result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog))
-						return nil
-					})
-					o.Expect(err).NotTo(o.HaveOccurred())
-				}
+				lockLog, err := os.ReadFile(path)
+				o.Expect(err).NotTo(o.HaveOccurred())
 
-				o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue())
+				result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog))
+				return nil
 			})
-		})
+			o.Expect(err).NotTo(o.HaveOccurred())
+		}
+		o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue())
+
+		// Validate OAuth audit logs contain auditID
+		g.By("Validating OAuth audit logs contain auditID")
+		oauthAuditFiles := getOauthAudit(tempDir)
+		for _, file := range oauthAuditFiles {
+			f, err := os.Open(file)
+			o.Expect(err).NotTo(o.HaveOccurred())
+			defer f.Close()
+			gzipReader, err := gzip.NewReader(f)
+			o.Expect(err).NotTo(o.HaveOccurred())
+			defer gzipReader.Close()
+			scanner := bufio.NewScanner(gzipReader)
+			var headContent string
+			if scanner.Scan() {
+				headContent = scanner.Text()
+			}
+			o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs")
+		}
 	})
 
 	// author: yinzhou@redhat.com
@@ -398,41 +374,6 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		}, 600*time.Second, 10*time.Second).Should(o.MatchRegexp("No resources found"), "Still find the must-gather pod in own namespace even wait for 10 mins")
 	})
 
-	// author: yinzhou@redhat.com
-	g.It("Fetch audit logs of login attempts via oc commands timeout [apigroup:config.openshift.io][Timeout:20m]", func() {
-		controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc)
-		o.Expect(err).NotTo(o.HaveOccurred())
-
-		// Skip on hypershift (External topology)
-		if *controlPlaneTopology == configv1.ExternalTopologyMode {
-			g.Skip("Hypershift clusters don't have oauth-server audit logs in must-gather")
-		}
-
-		g.By("run the must-gather")
-		tempDir, err := os.MkdirTemp("", "test.oc-adm-must-gather.")
-		o.Expect(err).NotTo(o.HaveOccurred())
-		defer os.RemoveAll(tempDir)
-
-		o.Expect(oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute()).To(o.Succeed())
-
-		g.By("check the must-gather result")
-		oauth_audit_files := getOauthAudit(tempDir)
-		for _, file := range oauth_audit_files {
-			f, err := os.Open(file)
-			defer f.Close()
-			o.Expect(err).NotTo(o.HaveOccurred())
-			gzipReader, err := gzip.NewReader(f)
-			defer gzipReader.Close()
-			o.Expect(err).NotTo(o.HaveOccurred())
-			scanner := bufio.NewScanner(gzipReader)
-			var headContent string
-			if scanner.Scan() {
-				headContent = scanner.Text()
-			}
-			o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs")
-		}
-	})
-
 	// author: yinzhou@redhat.com
 	g.It("must-gather support since and since-time flags timeout [apigroup:config.openshift.io][Timeout:20m]", func() {
 		controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc)
@@ -550,7 +491,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		o.Expect(err).NotTo(o.HaveOccurred())
 		defer os.RemoveAll(tempDir)
 
-		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute()
+		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute()
 		o.Expect(err).NotTo(o.HaveOccurred())
 
 		g.By("check the must-gather and verify that oc binary version is included")
@@ -589,7 +530,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		o.Expect(err).NotTo(o.HaveOccurred())
 		defer os.RemoveAll(tempDir2)
 
-		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/usr/bin/gather_audit_logs").Execute()
+		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute()
 		o.Expect(err).NotTo(o.HaveOccurred())
 
 		g.By("check logs generated are included in the must-gather directory when must-gather is run")

From de9582250acdfd4e65f008e4609590474fdf585a Mon Sep 17 00:00:00 2001
From: xueqzhan <kenzhang@redhat.com>
Date: Tue, 26 May 2026 14:55:24 -0400
Subject: [PATCH 4/5] Refactor must gather audit log tests to avoid repeated
 audit download

---
 test/extended/cli/mustgather.go | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go
index 66f8dc04a897..644d2ba5890b 100644
--- a/test/extended/cli/mustgather.go
+++ b/test/extended/cli/mustgather.go
@@ -129,6 +129,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 	})
 
 	g.It("runs successfully for audit logs [apigroup:config.openshift.io][apigroup:oauth.openshift.io]", func() {
+		// On External clusters, events will not be part of the output, since audit logs do not include control plane logs.
 		controlPlaneTopology, err := exutil.GetControlPlaneTopology(oc)
 		o.Expect(err).NotTo(o.HaveOccurred())
 
@@ -185,8 +186,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		g.By("Validating audit log event counts and JSON format")
 		expectedDirectoriesToExpectedCount := map[string]int{
 			path.Join(pluginOutputDir, "audit_logs", "kube-apiserver"):      1000,
-			path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10,
-			path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"):     10,
+			path.Join(pluginOutputDir, "audit_logs", "openshift-apiserver"): 10, // openshift apiservers don't necessarily get much traffic.  Especially early in a run
+			path.Join(pluginOutputDir, "audit_logs", "oauth-apiserver"):     10, // oauth apiservers don't necessarily get much traffic.  Especially early in a run
 		}
 
 		expectedFiles := [][]string{
@@ -195,8 +196,13 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 			{pluginOutputDir, "audit_logs", "oauth-apiserver.audit_logs_listing"},
 		}
 
+		// for some crazy reason, it seems that the files from must-gather take time to appear on disk for reading.  I don't understand why
+		// but this was in a previous commit and I don't want to immediately flake: https://github.com/openshift/origin/commit/006745a535848e84dcbcdd1c83ae86deddd3a229#diff-ad1c47fa4213de16d8b3237df5d71724R168
+		// so we're going to try to get a pass every 10 seconds for a minute.  If we pass, great.  If we don't, we report the
+		// last error we had.
 		var lastErr error
 		o.Eventually(func() bool {
+			// make sure we do not log OAuth tokens
 			for auditDirectory, expectedNumberOfAuditEntries := range expectedDirectoriesToExpectedCount {
 				eventsChecked := 0
 				err := filepath.Walk(auditDirectory, func(path string, info os.FileInfo, err error) error {
@@ -211,6 +217,8 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 						return nil
 					}
 
+					// at this point, we expect only audit files with json events, one per line
+
 					readFile := false
 
 					file, err := os.Open(path)
@@ -220,6 +228,9 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					fi, err := file.Stat()
 					o.Expect(err).NotTo(o.HaveOccurred())
 
+					// it will happen that the audit files are sometimes empty, we can
+					// safely ignore these files since they don't provide valuable information
+					// TODO this doesn't seem right.  It should be really unlikely, but we'll deal with later
 					if fi.Size() == 0 {
 						return nil
 					}
@@ -231,13 +242,15 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					for scanner.Scan() {
 						text := scanner.Text()
 						if !strings.HasSuffix(text, "}") {
-							continue
+							continue // ignore truncated data
 						}
 						o.Expect(text).To(o.HavePrefix(`{"kind":"Event",`))
 
 						readFile = true
 						eventsChecked++
 					}
+					// ignore this error as we usually fail to read the whole GZ file
+					// o.Expect(scanner.Err()).NotTo(o.HaveOccurred())
 					o.Expect(readFile).To(o.BeTrue())
 
 					return nil
@@ -249,12 +262,14 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 					return false
 				}
 
+				// reset lastErr if we succeeded.
 				lastErr = nil
 			}
 
+			// if we get here, it means both directories checked out ok
 			return true
 		}, 1*time.Minute, 10*time.Second).Should(o.BeTrue())
-		o.Expect(lastErr).NotTo(o.HaveOccurred())
+		o.Expect(lastErr).NotTo(o.HaveOccurred()) // print the last error first if we have one
 
 		emptyFiles := []string{}
 		for _, expectedFile := range expectedFiles {
@@ -291,12 +306,15 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 				lockLog, err := os.ReadFile(path)
 				o.Expect(err).NotTo(o.HaveOccurred())
 
-				result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog))
+				// TODO: turn this into a failure as soon as kubelet is fixed
+				if strings.TrimSpace(string(lockLog)) != "" {
+					result.Flakef("kubelet launched %s without waiting for the old process to terminate (lock was still hold): \n\n%s", apiserver, string(lockLog))
+				}
 				return nil
 			})
 			o.Expect(err).NotTo(o.HaveOccurred())
 		}
-		o.Expect(seen.HasAll(expectedAuditSubDirs...), o.BeTrue())
+		o.Expect(seen.HasAll(expectedAuditSubDirs...)).To(o.BeTrue())
 
 		// Validate OAuth audit logs contain auditID
 		g.By("Validating OAuth audit logs contain auditID")

From 2477c1ba27f8f3ea336d82fee3ee1e5d015d0bfd Mon Sep 17 00:00:00 2001
From: xueqzhan <kenzhang@redhat.com>
Date: Wed, 27 May 2026 11:57:44 -0400
Subject: [PATCH 5/5] Address review comments to further simplify must-gather
 tests

---
 test/extended/cli/mustgather.go | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/test/extended/cli/mustgather.go b/test/extended/cli/mustgather.go
index 644d2ba5890b..bca1be429cb5 100644
--- a/test/extended/cli/mustgather.go
+++ b/test/extended/cli/mustgather.go
@@ -320,18 +320,20 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		g.By("Validating OAuth audit logs contain auditID")
 		oauthAuditFiles := getOauthAudit(tempDir)
 		for _, file := range oauthAuditFiles {
-			f, err := os.Open(file)
-			o.Expect(err).NotTo(o.HaveOccurred())
-			defer f.Close()
-			gzipReader, err := gzip.NewReader(f)
-			o.Expect(err).NotTo(o.HaveOccurred())
-			defer gzipReader.Close()
-			scanner := bufio.NewScanner(gzipReader)
-			var headContent string
-			if scanner.Scan() {
-				headContent = scanner.Text()
-			}
-			o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs")
+			func() {
+				f, err := os.Open(file)
+				o.Expect(err).NotTo(o.HaveOccurred())
+				defer f.Close()
+				gzipReader, err := gzip.NewReader(f)
+				o.Expect(err).NotTo(o.HaveOccurred())
+				defer gzipReader.Close()
+				scanner := bufio.NewScanner(gzipReader)
+				var headContent string
+				if scanner.Scan() {
+					headContent = scanner.Text()
+				}
+				o.Expect(headContent).To(o.ContainSubstring("auditID"), "Failed to read the oauth audit logs")
+			}()
 		}
 	})
 
@@ -509,7 +511,7 @@ var _ = g.Describe("[sig-cli] oc adm must-gather", func() {
 		o.Expect(err).NotTo(o.HaveOccurred())
 		defer os.RemoveAll(tempDir)
 
-		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir).Execute()
+		err = oc.AsAdmin().WithoutNamespace().Run("adm").Args("must-gather", "--dest-dir="+tempDir, "--", "/bin/true").Execute()
 		o.Expect(err).NotTo(o.HaveOccurred())
 
 		g.By("check the must-gather and verify that oc binary version is included")