Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ import (
"github.com/sirupsen/logrus"

corev1 "k8s.io/api/core/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
metricsclient "k8s.io/metrics/pkg/client/clientset/versioned"
Expand Down Expand Up @@ -231,6 +233,34 @@ func (ma *MetricsAgent) StoreMachinesSnapshot(obj ctrlruntimeclient.Object) {
ma.Record(&MachinesEvent{Workload: workload})
}

// StoreMachinesSnapshotForBuildPod asynchronously polls for the named build pod
// and, once the pod exists and carries the ci-workload label, records a machines
// snapshot for it. The poll is bounded by a 60-minute timeout (and by ctx).
func (ma *MetricsAgent) StoreMachinesSnapshotForBuildPod(ctx context.Context, namespace, podName string, podClient ctrlruntimeclient.Client) {
	// No-op when metrics collection is disabled or no client was provided.
	if ma == nil || ma.machinesPlugin == nil || podClient == nil {
		return
	}
	go func() {
		pollCtx, stop := context.WithTimeout(ctx, 60*time.Minute)
		defer stop()

		// condition reports true once the pod is labeled and its snapshot is stored.
		condition := func(ctx context.Context) (bool, error) {
			pod := corev1.Pod{}
			err := podClient.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: namespace, Name: podName}, &pod)
			switch {
			case kerrors.IsNotFound(err):
				// Pod not created yet: keep polling.
				return false, nil
			case err != nil:
				// Any other API error aborts the poll.
				return false, err
			case pod.Labels[CIWorkloadLabel] == "":
				// Pod exists but is not yet labeled: keep polling.
				return false, nil
			}
			ma.StoreMachinesSnapshot(&pod)
			return true, nil
		}

		if err := wait.PollUntilContextCancel(pollCtx, 5*time.Second, true, condition); err != nil {
			ma.logger.WithError(err).Debugf("Failed waiting for pod %s/%s to store machines snapshot", namespace, podName)
		}
	}()
}

// RecordStepEvent records a single Finished event for a step, including objects metadata.
func (ma *MetricsAgent) RecordStepEvent(step api.Step, objects []ctrlruntimeclient.Object, start, finish time.Time, runErr error) {
if ma == nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/steps/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -598,6 +598,7 @@ func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.P
}

client.MetricsAgent().AddNodeWorkload(ctx, ns, fmt.Sprintf("%s-build", name), name, podClient)
client.MetricsAgent().StoreMachinesSnapshotForBuildPod(ctx, ns, fmt.Sprintf("%s-build", name), podClient)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Start the build-pod snapshot watcher only once per build attempt set.

Line 601 is inside the retry loop, so each retry starts another 60-minute polling goroutine for the same pod. That can inflate API calls and duplicate machine snapshot events.

Suggested fix
 func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.PodClient, build buildapi.Build, needsMultiArchWorkaround func() bool) error {
 	const attempts = 5
 	ns, name := build.Namespace, build.Name
 	var errs []error
+	snapshotWatcherStarted := false
 	if err := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) {
 		var attempt buildapi.Build
@@
 		client.MetricsAgent().AddNodeWorkload(ctx, ns, fmt.Sprintf("%s-build", name), name, podClient)
-		client.MetricsAgent().StoreMachinesSnapshotForBuildPod(ctx, ns, fmt.Sprintf("%s-build", name), podClient)
+		if !snapshotWatcherStarted {
+			client.MetricsAgent().StoreMachinesSnapshotForBuildPod(ctx, ns, fmt.Sprintf("%s-build", name), podClient)
+			snapshotWatcherStarted = true
+		}
 		if err := waitForBuildOrTimeout(ctx, client, podClient, ns, name); err != nil {
 			errs = append(errs, err)
 			return false, handleFailedBuild(ctx, client, ns, name, err)
 		}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/steps/source.go` at line 601, The call to
client.MetricsAgent().StoreMachinesSnapshotForBuildPod(...) is being started
inside the retry loop and thus spawns a new 60-minute watcher on every retry;
move this call so the snapshot watcher is started only once per build attempt
set—either relocate the invocation outside the retry loop that contains the pod
start/retry logic or protect it with a sync.Once (or a boolean on the build
attempt struct) so client.MetricsAgent().StoreMachinesSnapshotForBuildPod(ctx,
ns, fmt.Sprintf("%s-build", name), podClient) is invoked a single time using the
established podClient/ns/name for that attempt.

if err := waitForBuildOrTimeout(ctx, client, podClient, ns, name); err != nil {
errs = append(errs, err)
return false, handleFailedBuild(ctx, client, ns, name, err)
Expand Down