Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 86 additions & 1 deletion pkg/steps/release/import_release.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package release

import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
Expand Down Expand Up @@ -30,6 +32,16 @@ import (
"github.com/openshift/ci-tools/pkg/util"
)

const (
	// releaseImportLockPrefix prefixes the name of the ConfigMap that acts as a
	// namespace-scoped mutex: the first ci-operator to Create it performs the
	// release import; peers that see AlreadyExists wait on the done marker.
	releaseImportLockPrefix = "ci-operator-release-import-lock-"
	// releaseImportDonePrefix prefixes the name of the ConfigMap written after a
	// successful import; its "pullspec" data field records which release pull
	// spec the completed import corresponds to.
	releaseImportDonePrefix = "ci-operator-release-import-done-"
)

// releaseImportResourceSuffix derives a short, deterministic identifier from a
// release pull spec so that coordination resources (lock/done ConfigMaps) get
// stable, collision-resistant names. It returns the first 12 hex characters of
// the SHA-256 digest of pullSpec.
func releaseImportResourceSuffix(pullSpec string) string {
	digest := sha256.Sum256([]byte(pullSpec))
	encoded := hex.EncodeToString(digest[:])
	return encoded[:12]
}

// importReleaseStep is responsible for importing release images from
// external image streams for use with tests that need to install or
// upgrade a cluster. It uses the `cli` image within the image stream to
Expand Down Expand Up @@ -106,13 +118,86 @@ func (s *importReleaseStep) run(ctx context.Context) error {
return fmt.Errorf("could not create stable imagestream: %w", err)
}

// tag the release image in and let it import
pullSpec, err := s.source.PullSpec(ctx)
if err != nil {
return err
}
logrus.WithField("name", s.name).Debugf("setting originalPullSpec to: %s for multi-stage steps to reference", pullSpec)
s.originalPullSpec = pullSpec

ns := s.jobSpec.Namespace()
suffix := releaseImportResourceSuffix(pullSpec)
lockName := releaseImportLockPrefix + s.name + "-" + suffix
doneName := releaseImportDonePrefix + s.name + "-" + suffix

var existingDone coreapi.ConfigMap
if err := s.client.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: ns, Name: doneName}, &existingDone); err == nil {
if existingDone.Data["pullspec"] == pullSpec {
logrus.Infof("release %s import already completed in this namespace (shared aggregate run), skipping duplicate import", s.name)
return nil
}
} else if !kerrors.IsNotFound(err) {
return fmt.Errorf("get release import completion marker: %w", err)
}

lockCM := &coreapi.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Namespace: ns,
Name: lockName,
Labels: map[string]string{"ci.openshift.io/release-import-coordination": "true"},
},
Data: map[string]string{"holder": s.jobSpec.BuildID},
}
if err := s.client.Create(ctx, lockCM); err != nil {
if !kerrors.IsAlreadyExists(err) {
return fmt.Errorf("create release import lock: %w", err)
}
logrus.Infof("waiting for another ci-operator to finish release %s import in this namespace", s.name)
if err := waitForReleaseImportDone(ctx, s.client, ns, doneName, pullSpec, utils.DefaultImageImportTimeout); err != nil {
return err
}
return nil
Comment on lines +151 to +159
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Retry lock acquisition when the current importer exits early.

This path only waits for doneName. If the lock holder returns any error before creating the done ConfigMap, every peer that already saw AlreadyExists will sit there until DefaultImageImportTimeout even after the lock is gone. That turns one transient importer failure into a namespace-wide timeout. Re-check the lock while waiting and retry Create() when it disappears instead of waiting solely on doneName.

Also applies to: 182-198

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/steps/release/import_release.go` around lines 151 - 159, The current
branch only waits for doneName after seeing kerrors.IsAlreadyExists on
s.client.Create(ctx, lockCM), which can hang peers if the lock holder exits
without creating doneName; modify the logic in the Create()/AlreadyExists
handling (and the analogous block around lines 182-198) to loop-retry Create:
when Create returns AlreadyExists, poll the lock ConfigMap via s.client.Get (or
equivalent) with a short backoff until either the lock CM is gone or
DefaultImageImportTimeout elapses; if the lock is removed, retry
s.client.Create(ctx, lockCM); if doneName appears while waiting, return as
before; propagate errors appropriately and ensure waitForReleaseImportDone is
still used for the normal “wait for completion” case.

}
defer func() {
del := &coreapi.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: lockName}}
if err := s.client.Delete(ctx, del); err != nil && !kerrors.IsNotFound(err) {
logrus.WithError(err).Warn("failed to delete release import lock configmap")
}
}()
Comment on lines +161 to +166
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Use a fresh context for deferred lock cleanup.

The deferred delete reuses ctx, so a cancellation or deadline from runImportBody() will usually leave the lock ConfigMap behind. Subsequent jobs then keep hitting AlreadyExists and wait on a done marker that will never be written.

Suggested fix
 	defer func() {
+		cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancel()
 		del := &coreapi.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: lockName}}
-		if err := s.client.Delete(ctx, del); err != nil && !kerrors.IsNotFound(err) {
+		if err := s.client.Delete(cleanupCtx, del); err != nil && !kerrors.IsNotFound(err) {
 			logrus.WithError(err).Warn("failed to delete release import lock configmap")
 		}
 	}()
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
defer func() {
del := &coreapi.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: lockName}}
if err := s.client.Delete(ctx, del); err != nil && !kerrors.IsNotFound(err) {
logrus.WithError(err).Warn("failed to delete release import lock configmap")
}
}()
defer func() {
cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
del := &coreapi.ConfigMap{ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: lockName}}
if err := s.client.Delete(cleanupCtx, del); err != nil && !kerrors.IsNotFound(err) {
logrus.WithError(err).Warn("failed to delete release import lock configmap")
}
}()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/steps/release/import_release.go` around lines 161 - 166, The deferred
lock cleanup currently reuses ctx (which may be canceled by runImportBody()),
causing the ConfigMap lock to be left behind; change the defer to call
s.client.Delete using a fresh context (e.g. context.Background() or a short
context.WithTimeout) instead of ctx when deleting the ConfigMap (the block that
constructs del with Namespace: ns, Name: lockName and calls s.client.Delete).
Keep the existing kerrors.IsNotFound check and logging (logrus.WithError) but
ensure the delete uses the newCtx so cancellation of runImportBody() won't
prevent lock cleanup.


if err := s.runImportBody(ctx, streamName, pullSpec, startTime); err != nil {
return err
}

doneCM := &coreapi.ConfigMap{
ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: doneName},
Data: map[string]string{"pullspec": pullSpec},
}
if err := s.client.Create(ctx, doneCM); err != nil && !kerrors.IsAlreadyExists(err) {
return fmt.Errorf("mark release import done: %w", err)
}
return nil
}

// waitForReleaseImportDone blocks until the ConfigMap named doneName exists in
// namespace ns and its "pullspec" data field equals wantPullSpec, signaling
// that another ci-operator in the namespace finished importing the release.
// It polls every two seconds up to the supplied timeout. It returns nil on
// success, or an error if the marker never appears in time, the context is
// canceled, or the API server returns an unexpected error.
func waitForReleaseImportDone(ctx context.Context, client ctrlruntimeclient.Client, ns, doneName, wantPullSpec string, timeout time.Duration) error {
	err := wait.PollUntilContextTimeout(ctx, 2*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
		cm := &coreapi.ConfigMap{}
		err := client.Get(ctx, ctrlruntimeclient.ObjectKey{Namespace: ns, Name: doneName}, cm)
		if kerrors.IsNotFound(err) {
			// Marker not written yet: keep polling.
			return false, nil
		}
		if err != nil {
			// Any other API error aborts the wait immediately.
			return false, err
		}
		// Only succeed when the marker records the pull spec we expect; a
		// marker for a different release keeps us waiting.
		return cm.Data["pullspec"] == wantPullSpec, nil
	})
	if err != nil {
		// The wrapped error may be a poll timeout, a context cancellation, or
		// an API failure — do not claim it is necessarily a timeout.
		return fmt.Errorf("waiting for release import completion marker %s in namespace %s: %w", doneName, ns, err)
	}
	return nil
}

func (s *importReleaseStep) runImportBody(ctx context.Context, streamName string, pullSpec string, startTime time.Time) error {
// retry importing the image a few times because we might race against establishing credentials/roles
// and be unable to import images on the same cluster
if newPullSpec, err := utils.ImportTagWithRetries(ctx, s.client, s.jobSpec.Namespace(), "release", s.name, pullSpec, api.ImageStreamImportRetries, s.client.MetricsAgent()); err != nil {
Expand Down