Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/steps/bundle_source.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (s *bundleSourceStep) run(ctx context.Context) error {
// Bundle images are not multi-arch by design. Here we build it without creating a manifest-listed image.
// Note that we are not configuring a node selector here, so the build will be scheduled on any available
// node no matter the architecture.
return handleBuild(ctx, s.client, s.podClient, *build, func() bool { return false })
return handleBuild(ctx, s.client, s.podClient, *build)
}

func replaceCommand(pullSpec, with string) string {
Expand Down
2 changes: 1 addition & 1 deletion pkg/steps/project_image.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func (s *projectDirectoryImageBuildStep) run(ctx context.Context) error {

// Bundle images are not multi-arch by design. No manifest list is needed. Here we spawn a single build.
if s.config.IsBundleImage() {
return handleBuild(ctx, s.client, s.podClient, *build, func() bool { return false })
return handleBuild(ctx, s.client, s.podClient, *build)
}

return handleBuilds(ctx, s.client, s.podClient, *build, s.metricsAgent, newImageBuildOptions(s.architectures.UnsortedList()))
Expand Down
42 changes: 5 additions & 37 deletions pkg/steps/source.go
Original file line number Diff line number Diff line change
Expand Up @@ -467,31 +467,13 @@ func isBuildPhaseTerminated(phase buildapi.BuildPhase) bool {
}

type ImageBuildOptions struct {
Architectures []string
NeedsMultiArchWorkaround func() bool
Architectures []string
}

func newImageBuildOptions(archs []string) ImageBuildOptions {
return ImageBuildOptions{Architectures: archs}
}

// multiArchRepos are repos that need the sleep workaround for multi-arch builds.
var multiArchRepos = sets.New[string](
"openshift/ci-tools",
"openshift/kueue-operator",
"openshift/loki",
"openshift/multiarch-tuning-operator",
"openshift/oadp-must-gather",
"openshift/oadp-operator",
"openshift/openshift-velero-plugin",
"openshift/velero",
"openshift/velero-plugin-for-aws",
"openshift/velero-plugin-for-gcp",
"openshift/velero-plugin-for-legacy-aws",
"openshift/velero-plugin-for-microsoft-azure",
"openshift-eng/baremetal-qe-infra",
)

func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubernetes.PodClient, build buildapi.Build, metricsAgent *metrics.MetricsAgent, opts ...ImageBuildOptions) error {
var wg sync.WaitGroup

Expand All @@ -500,17 +482,6 @@ func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubern
o = opts[0]
}

// Create closure that checks if this build is for a multi-arch repo
needsMultiArchWorkaround := o.NeedsMultiArchWorkaround
if needsMultiArchWorkaround == nil {
needsMultiArchWorkaround = func() bool {
if org, repo := build.Labels[LabelMetadataOrg], build.Labels[LabelMetadataRepo]; org != "" && repo != "" {
return multiArchRepos.Has(fmt.Sprintf("%s/%s", org, repo))
}
return false
}
}

builds := constructMultiArchBuilds(build, o.Architectures)
errChan := make(chan error, len(builds))

Expand All @@ -519,7 +490,7 @@ func handleBuilds(ctx context.Context, buildClient BuildClient, podClient kubern
go func(b buildapi.Build) {
defer wg.Done()
metricsAgent.AddNodeWorkload(ctx, b.Namespace, fmt.Sprintf("%s-build", b.Name), b.Name, podClient)
if err := handleBuild(ctx, buildClient, podClient, b, needsMultiArchWorkaround); err != nil {
if err := handleBuild(ctx, buildClient, podClient, b); err != nil {
errChan <- fmt.Errorf("error occurred handling build %s: %w", b.Name, err)
}
metricsAgent.RemoveNodeWorkload(b.Name)
Expand Down Expand Up @@ -577,17 +548,14 @@ func constructMultiArchBuilds(build buildapi.Build, stepArchitectures []string)
return ret
}

func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.PodClient, build buildapi.Build, needsMultiArchWorkaround func() bool) error {
func handleBuild(ctx context.Context, client BuildClient, podClient kubernetes.PodClient, build buildapi.Build) error {
const attempts = 5
ns, name := build.Namespace, build.Name
var errs []error
if err := wait.ExponentialBackoff(wait.Backoff{Duration: time.Minute, Factor: 1.5, Steps: attempts}, func() (bool, error) {
var attempt buildapi.Build
// TODO: This is a workaround to avoid race condition with multiple ci-operator instances building different architectures
// See https://issues.redhat.com/browse/OCPBUGS-65845
if needsMultiArchWorkaround != nil && needsMultiArchWorkaround() {
time.Sleep(5 * time.Minute)
}
// builds may pick up an older src image; add a wait to avoid the race condition
time.Sleep(1 * time.Minute)
build.DeepCopyInto(&attempt)
Comment on lines +551 to 559
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Commented-out race condition fix is a concern if merged.

Lines 557-558 contain dead code with a comment acknowledging a race condition ("builds are using older src image"). The sleep that previously mitigated this is now commented out.

Given the PR title "DO NOT MERGE" and /hold label, this appears intentional for investigation. However, if this were to be merged:

  • The race condition described in the comment would remain unaddressed
  • The commented-out code should be removed entirely or replaced with a proper fix

Consider removing the dead code and comment, or implementing a proper solution before any future merge.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/steps/source.go` around lines 551 - 559, In handleBuild, remove the
commented-out sleep and its explanatory comment (the dead code around the
"builds are using older src image" sleep) or replace it with a deterministic
wait: implement a proper retry/wait that checks the build/pod image readiness
before proceeding (e.g., use wait.PollImmediate or extend the existing
ExponentialBackoff to poll the relevant image/version field on the attempt/build
or associated pod via podClient until it reflects the expected src image),
ensuring you update the logic around build.DeepCopyInto(&attempt) accordingly.

if err := client.Create(ctx, &attempt); err == nil {
logrus.Infof("Created build %q", name)
Expand Down