From 45ebf7ab87507501427af2600b763ebaf3bc68f1 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Thu, 11 Jun 2026 13:26:56 +0000 Subject: [PATCH 01/10] daemon: add Stage and Reboot methods to bootc executor Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/bootc/executor.go | 52 ++++++++++++++++++++++++++++-------- internal/daemon/fake_test.go | 8 ++++++ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/internal/bootc/executor.go b/internal/bootc/executor.go index 013d0ce..83ff77f 100644 --- a/internal/bootc/executor.go +++ b/internal/bootc/executor.go @@ -6,6 +6,9 @@ import ( "context" "fmt" "os/exec" + "strings" + + logf "sigs.k8s.io/controller-runtime/pkg/log" ) // Executor abstracts the execution of bootc commands on the host. @@ -13,6 +16,8 @@ import ( // PID namespaces. Tests can provide a fake implementation. type Executor interface { Status(ctx context.Context) ([]byte, error) + Stage(ctx context.Context, image string) error + Reboot(ctx context.Context) error } // HostExecutor runs bootc commands on the host via nsenter. @@ -23,21 +28,46 @@ func NewHostExecutor() *HostExecutor { return &HostExecutor{} } -func (e *HostExecutor) Status(ctx context.Context) ([]byte, error) { - cmd := exec.CommandContext(ctx, - "nsenter", +func (e *HostExecutor) nsenterCmd(ctx context.Context, args ...string) *exec.Cmd { + base := []string{ "--target", "1", - "--mount", - "--pid", - "--setuid", "0", - "--setgid", "0", - "--env", - "--", - "bootc", "status", "--json", "--format-version", "1", - ) + "--mount", "--pid", + "--setuid", "0", "--setgid", "0", + "--env", "--", + } + return exec.CommandContext(ctx, "nsenter", append(base, args...)...) +} + +func (e *HostExecutor) Status(ctx context.Context) ([]byte, error) { + cmd := e.nsenterCmd(ctx, "bootc", "status", "--json", "--format-version", "1") out, err := cmd.Output() if err != nil { return nil, fmt.Errorf("running bootc status: %w", err) } return out, nil } + +func (e *HostExecutor) Stage(ctx context.Context, image string) error { + log := logf.FromContext(ctx) + + // TODO: use --download-only once available (https://github.com/bootc-dev/bootc/issues/2137) + cmd := e.nsenterCmd(ctx, "bootc", "switch", image) + log.Info("Executing", "cmd", strings.Join(cmd.Args, " ")) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("running bootc switch: %s: %w", out, err) + } + return nil +} + +func (e *HostExecutor) Reboot(ctx context.Context) error { + log := logf.FromContext(ctx) + + cmd := e.nsenterCmd(ctx, "systemctl", "reboot") + log.Info("Executing", "cmd", strings.Join(cmd.Args, " ")) + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("running systemctl reboot: %s: %w", out, err) + } + return nil +} diff --git a/internal/daemon/fake_test.go b/internal/daemon/fake_test.go index b6ad398..df703eb 100644 --- a/internal/daemon/fake_test.go +++ b/internal/daemon/fake_test.go @@ -19,6 +19,14 @@ func (f *fakeExecutor) Status(_ context.Context) ([]byte, error) { return f.data, f.err } +func (f *fakeExecutor) Stage(_ context.Context, _ string) error { + return nil +} + +func (f *fakeExecutor) Reboot(_ context.Context) error { + return nil +} + func (f *fakeExecutor) set(data []byte, err error) { f.mu.Lock() defer f.mu.Unlock() From 1825c892194f9546742c44c785cbd24f69315d9d Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Thu, 11 Jun 2026 14:13:20 +0000 Subject: [PATCH 02/10] daemon: implement state machine in BootcNode reconciler Rewrite the reconciler to detect image mismatches between spec.desiredImage and the booted image, stage via bootc switch in a background goroutine. Once, it finished to staged the image, the termination of the goroutine triggers once more the reconciliation loop which will detect that the system requires a reboot. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/daemon/reconciler.go | 320 ++++++++++++++++++++++++++++++---- 1 file changed, 283 insertions(+), 37 deletions(-) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index 084a329..cecd60b 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -5,32 +5,68 @@ package daemon import ( "context" "fmt" + "github.com/distribution/reference" + "reflect" + "sync" + "time" + "github.com/go-logr/logr" apierrors "k8s.io/apimachinery/pkg/api/errors" apimeta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/source" bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1" "github.com/jlebon/bootc-operator/internal/bootc" ) +const ( + stageBackoffMin = 5 * time.Second + // caps exponential backoff at 5m20s + stageMaxBackoffExponent = 7 +) + +// stageOp tracks the state of an in-flight bootc stage operation. +type stageOp struct { + mu sync.Mutex + // runMu serializes run() calls so that at most one bootc switch + // process is active at a time, even if the previous one is still + // exiting after context cancellation. + runMu sync.Mutex + image string + cancel context.CancelFunc + err error + retries int +} + // BootcNodeReconciler reconciles the BootcNode for the node this daemon -// runs on. It reads bootc status from the host and writes it into the -// BootcNode's status subresource. +// runs on. It reads bootc status from the host, detects image mismatches, +// and drives updates via bootc stage. type BootcNodeReconciler struct { client.Client Scheme *runtime.Scheme NodeName string Executor bootc.Executor + + inflight stageOp + stageDone chan event.GenericEvent + // rebootIssued tracks whether a reboot has been issued so classifyAction + // can distinguish the Staged→Rebooting. + rebootIssued bool } func (r *BootcNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { + r.stageDone = make(chan event.GenericEvent, 1) + return ctrl.NewControllerManagedBy(mgr). For(&bootcv1alpha1.BootcNode{}). + WatchesRawSource(source.Channel(r.stageDone, &handler.EnqueueRequestForObject{})). Named("bootcnode"). Complete(r) } @@ -51,66 +87,276 @@ func (r *BootcNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) ( return ctrl.Result{}, fmt.Errorf("fetching BootcNode: %w", err) } - patch := client.MergeFrom(bn.DeepCopy()) - - if err := r.populateStatus(ctx, &bn); err != nil { - log.Error(err, "Failed to populate bootc status") - } + orig := bn.DeepCopy() + bn.Status.ObservedGeneration = bn.Generation - if err := r.Status().Patch(ctx, &bn, patch); err != nil { - return ctrl.Result{}, fmt.Errorf("patching BootcNode status: %w", err) - } + apimeta.SetStatusCondition(&bn.Status.Conditions, metav1.Condition{ + Type: bootcv1alpha1.NodeIdle, + Status: metav1.ConditionTrue, + Reason: bootcv1alpha1.NodeReasonIdle, + ObservedGeneration: bn.Generation, + }) - log.Info("Patched BootcNode status from bootc") - return ctrl.Result{}, nil -} + res, reconcileErr := r.reconcileBootcNode(ctx, &bn) -func (r *BootcNodeReconciler) populateStatus(ctx context.Context, bn *bootcv1alpha1.BootcNode) error { - data, err := r.Executor.Status(ctx) - if err != nil { + if res.degradedMsg != "" { apimeta.SetStatusCondition(&bn.Status.Conditions, metav1.Condition{ Type: bootcv1alpha1.NodeDegraded, Status: metav1.ConditionTrue, Reason: bootcv1alpha1.NodeReasonError, - Message: fmt.Sprintf("failed to get bootc status: %v", err), + Message: res.degradedMsg, ObservedGeneration: bn.Generation, }) - return fmt.Errorf("getting bootc status: %w", err) - } - - status, err := bootc.ParseStatus(data) - if err != nil { + } else { apimeta.SetStatusCondition(&bn.Status.Conditions, metav1.Condition{ Type: bootcv1alpha1.NodeDegraded, - Status: metav1.ConditionTrue, - Reason: bootcv1alpha1.NodeReasonError, - Message: fmt.Sprintf("failed to parse bootc status: %v", err), + Status: metav1.ConditionFalse, + Reason: bootcv1alpha1.NodeReasonHealthy, ObservedGeneration: bn.Generation, }) - return fmt.Errorf("parsing bootc status: %w", err) } - bn.Status.ObservedGeneration = bn.Generation - bn.Status.Booted = convertBootEntry(status.Status.Booted) - bn.Status.Staged = convertBootEntry(status.Status.Staged) - bn.Status.Rollback = convertBootEntry(status.Status.Rollback) + if !reflect.DeepEqual(bn.Status, orig.Status) { + if patchErr := r.Status().Patch(ctx, &bn, client.MergeFrom(orig)); patchErr != nil { + return ctrl.Result{}, fmt.Errorf("patching BootcNode status: %w", patchErr) + } + } + + // Reboot after the status patch so the Rebooting condition is persisted before the node goes down. + if res.needsReboot { + log.Info("Starting reboot") + if err := r.Executor.Reboot(ctx); err != nil { + return ctrl.Result{}, fmt.Errorf("reboot: %w", err) + } + // Record if the reboot was issued in this way we can transition from Staged to Rebooting + r.rebootIssued = true + } + + return res.result, reconcileErr +} + +type reconcileResult struct { + result ctrl.Result + degradedMsg string + needsReboot bool +} + +// reconcileBootcNode defines the result of the reconcile of the bootc nodes. It returns the results for the reconcile, +// the degraded message and eventual errors. We distinguish the degraded message from a reconcile error since we want to +// implement an exponential back-off if the staging failed. +func (r *BootcNodeReconciler) reconcileBootcNode(ctx context.Context, bn *bootcv1alpha1.BootcNode) (reconcileResult, error) { + log := logf.FromContext(ctx).WithValues("node", r.NodeName) + + if err := r.populateBootcFields(ctx, bn); err != nil { + degradedErr := fmt.Errorf("populating bootc fields: %w", err) + return reconcileResult{degradedMsg: degradedErr.Error()}, degradedErr + } + + if bn.Status.Booted == nil { + degradedErr := fmt.Errorf("bootc status has no booted entry") + return reconcileResult{degradedMsg: degradedErr.Error()}, degradedErr + } + + desiredRef, err := reference.ParseNamed(bn.Spec.DesiredImage) + if err != nil { + // Image is invalid don't trigger another reconcile iteration + return reconcileResult{ + degradedMsg: fmt.Sprintf("invalid image ref %q: %v", bn.Spec.DesiredImage, err), + }, nil + } + // The controller always resolves tags to digests at the pool level. + digested, ok := desiredRef.(reference.Digested) + if !ok { + return reconcileResult{ + degradedMsg: fmt.Sprintf("image ref %q has no digest", bn.Spec.DesiredImage), + }, nil + } + + // Nothing to do the desired image matches the booted ones. + // Rest the reconciler to start from a clean state + if digested.Digest().String() == bn.Status.Booted.ImageDigest { + r.reset() + return reconcileResult{}, nil + } + + stageErr := r.inflight.takeErr() + + if stageErr != nil { + // Set delay for the requeue. If the error is set, then the requeue delay is ignored. For this reason, in this + // case we set the degraded message but not the reconcile error. + return reconcileResult{ + result: ctrl.Result{RequeueAfter: r.inflight.backoff()}, + degradedMsg: fmt.Sprintf("bootc stage failed: %v", stageErr), + }, nil + } + + desiredImage := desiredRef.String() + + action := r.classifyAction(bn, digested, desiredImage) + + res := reconcileResult{} + var reason string + switch action { + case actionStage: + reason = bootcv1alpha1.NodeReasonStaging + switchCtx, cancel := context.WithCancel(context.Background()) + r.inflight.acquire(log, desiredImage, cancel) + log.Info("Starting staging", "image", desiredImage) + go r.inflight.run(switchCtx, r.NodeName, desiredImage, r.Executor, r.stageDone) + + case actionAwaitStage: + reason = bootcv1alpha1.NodeReasonStaging + + case actionReboot: + reason = bootcv1alpha1.NodeReasonRebooting + res.needsReboot = true + + case actionAwaitReboot: + reason = bootcv1alpha1.NodeReasonRebooting + + case actionAwaitBooted: + reason = bootcv1alpha1.NodeReasonStaged + log.Info("Image staged", "image", desiredImage) + } apimeta.SetStatusCondition(&bn.Status.Conditions, metav1.Condition{ Type: bootcv1alpha1.NodeIdle, - Status: metav1.ConditionTrue, - Reason: bootcv1alpha1.NodeReasonIdle, - ObservedGeneration: bn.Generation, - }) - apimeta.SetStatusCondition(&bn.Status.Conditions, metav1.Condition{ - Type: bootcv1alpha1.NodeDegraded, Status: metav1.ConditionFalse, - Reason: bootcv1alpha1.NodeReasonHealthy, + Reason: reason, ObservedGeneration: bn.Generation, }) + return res, nil +} + +func (s *stageOp) takeErr() error { + s.mu.Lock() + defer s.mu.Unlock() + err := s.err + s.err = nil + if err != nil && s.retries < stageMaxBackoffExponent { + s.retries++ + } + return err +} + +func (s *stageOp) backoff() time.Duration { + s.mu.Lock() + defer s.mu.Unlock() + return stageBackoffMin << (s.retries - 1) +} + +func (s *stageOp) isInFlight(image string) bool { + s.mu.Lock() + defer s.mu.Unlock() + return s.image == image +} + +func (s *stageOp) acquire(log logr.Logger, image string, cancel context.CancelFunc) { + s.mu.Lock() + defer s.mu.Unlock() + + if s.cancel != nil { + log.Info("Cancelling in-flight stage", "old", s.image, "new", image) + s.cancel() + s.retries = 0 + } + s.image = image + s.cancel = cancel + s.err = nil +} + +func (r *BootcNodeReconciler) reset() { + r.rebootIssued = false + r.inflight.mu.Lock() + defer r.inflight.mu.Unlock() + r.inflight.retries = 0 +} + +// run executes bootc stage in a goroutine. The results are delivered via the done channel. +func (s *stageOp) run(ctx context.Context, nodeName, image string, executor bootc.Executor, done chan<- event.GenericEvent) { + s.runMu.Lock() + defer s.runMu.Unlock() + + log := logf.FromContext(context.Background()).WithValues("node", nodeName, "image", image) + + // TODO: exec bootc switch async and select on the cancel channel to send SIGINT for graceful shutdown. + err := executor.Stage(ctx, image) + + s.mu.Lock() + if ctx.Err() != nil { + log.Info("Stage cancelled") + s.mu.Unlock() + return + } + if err != nil { + log.Error(err, "Stage failed") + s.err = err + } + s.image = "" + s.cancel = nil + s.mu.Unlock() + + done <- event.GenericEvent{ + Object: &bootcv1alpha1.BootcNode{ + ObjectMeta: metav1.ObjectMeta{Name: nodeName}, + }, + } +} + +func (r *BootcNodeReconciler) populateBootcFields(ctx context.Context, bn *bootcv1alpha1.BootcNode) error { + data, err := r.Executor.Status(ctx) + if err != nil { + return fmt.Errorf("getting bootc status: %w", err) + } + + status, err := bootc.ParseStatus(data) + if err != nil { + return fmt.Errorf("failed to parse bootc status: %w", err) + } + + bn.Status.Booted = convertBootEntry(status.Status.Booted) + bn.Status.Staged = convertBootEntry(status.Status.Staged) + bn.Status.Rollback = convertBootEntry(status.Status.Rollback) + return nil } +// updateAction represents the next step the daemon should take for an +// in-progress update. Classified once, then used to drive the stage. +type updateAction int + +const ( + actionStage updateAction = iota // desired image not yet staged + actionAwaitStage // stage in-flight, waiting for completion + actionAwaitBooted // staged, waiting for reboot approval + actionReboot // staged + approved, issue reboot + actionAwaitReboot // reboot issued, waiting for completion +) + +func (r *BootcNodeReconciler) classifyAction(bn *bootcv1alpha1.BootcNode, digested reference.Digested, desiredImage string) updateAction { + desiredDigest := digested.Digest().String() + alreadyStaged := bn.Status.Staged != nil && bn.Status.Staged.ImageDigest == desiredDigest + if !alreadyStaged { + if r.inflight.isInFlight(desiredImage) { + return actionAwaitStage + } + return actionStage + } + + if bn.Spec.DesiredImageState != bootcv1alpha1.DesiredImageStateBooted { + return actionAwaitBooted + } + + // rebootIssued is volatile: if the daemon restarts it resets to false. + // That is safe because either the reboot already landed (booted digest + // matches and we return idle earlier) or it hasn't and we re-issue it. + if r.rebootIssued { + return actionAwaitReboot + } + return actionReboot +} + func convertBootEntry(entry *bootc.BootEntry) *bootcv1alpha1.ImageInfo { if entry == nil || entry.Image == nil { return nil From a05f7b842cb2c3c91c9cc4ab6ebd633e3d27f946 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Fri, 19 Jun 2026 08:02:53 +0000 Subject: [PATCH 03/10] daemon: refactor fakeExecutor to model bootc state Replace raw JSON bytes with a bootc.Status struct in the test fake. Status() serializes the struct via json.Marshal, and Stage() auto-mutates the status (staging sets Staged). Reboot() records the call for test assertions. Add newBootcStatus() and newBootEntry() helpers to build test state without verbose JSON constants. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/daemon/fake_test.go | 115 ++++++++++++++++++++++++++--- internal/daemon/reconciler_test.go | 33 ++------- 2 files changed, 111 insertions(+), 37 deletions(-) diff --git a/internal/daemon/fake_test.go b/internal/daemon/fake_test.go index df703eb..eb7f8fc 100644 --- a/internal/daemon/fake_test.go +++ b/internal/daemon/fake_test.go @@ -4,32 +4,129 @@ package daemon import ( "context" + "encoding/json" + "strings" "sync" + + testutil "github.com/jlebon/bootc-operator/test/util" + + "github.com/jlebon/bootc-operator/internal/bootc" ) type fakeExecutor struct { - mu sync.Mutex - data []byte - err error + mu sync.Mutex + status bootc.Status + statusErr error + + stageErr error + stageImg string + stageHook func() + + rebootCalled bool + rebootErr error } func (f *fakeExecutor) Status(_ context.Context) ([]byte, error) { f.mu.Lock() defer f.mu.Unlock() - return f.data, f.err + if f.statusErr != nil { + return nil, f.statusErr + } + data, err := json.Marshal(f.status) + if err != nil { + return nil, err + } + return data, nil } -func (f *fakeExecutor) Stage(_ context.Context, _ string) error { +func (f *fakeExecutor) Stage(_ context.Context, image string) error { + f.mu.Lock() + f.stageImg = image + f.mu.Unlock() + + if f.stageHook != nil { + f.stageHook() + } + if f.stageErr != nil { + return f.stageErr + } + + f.mu.Lock() + defer f.mu.Unlock() + _, digest, _ := strings.Cut(image, "@") + f.status.Status.Staged = newBootEntry(image, digest) return nil } func (f *fakeExecutor) Reboot(_ context.Context) error { - return nil + f.mu.Lock() + defer f.mu.Unlock() + f.rebootCalled = true + return f.rebootErr +} + +func (f *fakeExecutor) setStatusErr(err error) { + f.mu.Lock() + defer f.mu.Unlock() + f.statusErr = err +} + +func (f *fakeExecutor) setStageErr(err error) { + f.mu.Lock() + defer f.mu.Unlock() + f.stageErr = err +} + +func (f *fakeExecutor) setStageHook(hook func()) { + f.mu.Lock() + defer f.mu.Unlock() + f.stageHook = hook +} + +func (f *fakeExecutor) getStageImg() string { + f.mu.Lock() + defer f.mu.Unlock() + return f.stageImg +} + +func (f *fakeExecutor) getRebootCalled() bool { + f.mu.Lock() + defer f.mu.Unlock() + return f.rebootCalled } -func (f *fakeExecutor) set(data []byte, err error) { +func (f *fakeExecutor) reset() { f.mu.Lock() defer f.mu.Unlock() - f.data = data - f.err = err + f.status = bootc.Status{} + f.statusErr = nil + f.stageErr = nil + f.stageImg = "" + f.stageHook = nil + f.rebootCalled = false + f.rebootErr = nil +} + +func newBootEntry(image, digest string) *bootc.BootEntry { + return &bootc.BootEntry{ + Image: &bootc.ImageStatus{ + Image: bootc.ImageReference{Image: image, Transport: "registry"}, + ImageDigest: digest, + Architecture: "amd64", + }, + } +} + +func newBootcStatus(bootedDigest string) bootc.Status { + return bootc.Status{ + APIVersion: "org.containers.bootc/v1alpha1", + Kind: "BootcHost", + Spec: bootc.StatusSpec{ + Image: &bootc.ImageReference{Image: testutil.ImageTaggedRef, Transport: "registry"}, + BootOrder: "default", + }, + Status: bootc.StatusBody{ + Booted: newBootEntry(testutil.ImageTaggedRef, bootedDigest), + }, + } } diff --git a/internal/daemon/reconciler_test.go b/internal/daemon/reconciler_test.go index e1fdc82..d965fb7 100644 --- a/internal/daemon/reconciler_test.go +++ b/internal/daemon/reconciler_test.go @@ -4,6 +4,7 @@ package daemon import ( "context" + "encoding/json" "fmt" "testing" "time" @@ -77,7 +78,9 @@ func TestReconcilePopulatesStatus(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.set([]byte(bootcStatusFull), nil) + if err := json.Unmarshal([]byte(bootcStatusFull), &fake.status); err != nil { + t.Fatal(err) + } bn := testutil.NewNode(testNodeName, testImageRef) g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) @@ -123,7 +126,7 @@ func TestReconcileBootcStatusError(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.set(nil, fmt.Errorf("bootc status failed")) + fake.setStatusErr(fmt.Errorf("bootc status failed")) bn := testutil.NewNode(testNodeName, testImageRef) g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) @@ -142,29 +145,3 @@ func TestReconcileBootcStatusError(t *testing.T) { ))) }).Should(Succeed()) } - -func TestReconcileInvalidJSON(t *testing.T) { - g := NewWithT(t) - g.SetDefaultEventuallyTimeout(pollTimeout) - g.SetDefaultEventuallyPollingInterval(pollInterval) - ctx := context.Background() - - fake.set([]byte(`{invalid json`), nil) - - bn := testutil.NewNode(testNodeName, testImageRef) - g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) - t.Cleanup(func() { - _ = k8sClient.Delete(ctx, bn) - }) - - g.Eventually(func(g Gomega) { - var got bootcv1alpha1.BootcNode - g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) - g.Expect(got.Status.Conditions).To(ContainElement(And( - HaveField("Type", bootcv1alpha1.NodeDegraded), - HaveField("Status", metav1.ConditionTrue), - HaveField("Reason", bootcv1alpha1.NodeReasonError), - HaveField("Message", ContainSubstring("parse")), - ))) - }).Should(Succeed()) -} From c8dc49ce77dc5825dcb1f58a71a365e263b49327 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Fri, 19 Jun 2026 08:03:10 +0000 Subject: [PATCH 04/10] daemon: use bootc helpers in reconciler tests Replace the bootcStatusFull JSON constant with newBootcStatus() struct construction. Tighten the error assertion to match the exact error chain. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/daemon/reconciler_test.go | 91 +++++++++++------------------- 1 file changed, 32 insertions(+), 59 deletions(-) diff --git a/internal/daemon/reconciler_test.go b/internal/daemon/reconciler_test.go index d965fb7..189381b 100644 --- a/internal/daemon/reconciler_test.go +++ b/internal/daemon/reconciler_test.go @@ -4,7 +4,7 @@ package daemon import ( "context" - "encoding/json" + "errors" "fmt" "testing" "time" @@ -14,6 +14,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1" + "github.com/jlebon/bootc-operator/internal/bootc" testutil "github.com/jlebon/bootc-operator/test/util" ) @@ -21,55 +22,7 @@ const ( pollInterval = 200 * time.Millisecond pollTimeout = 10 * time.Second - testImageRef = testutil.ImageDigestRefA - - bootcStatusFull = `{ - "apiVersion": "org.containers.bootc/v1alpha1", - "kind": "BootcHost", - "spec": { - "image": {"image": "quay.io/example/myos:latest", "transport": "registry"}, - "bootOrder": "default" - }, - "status": { - "booted": { - "image": { - "image": {"image": "quay.io/example/myos:latest", "transport": "registry"}, - "imageDigest": "` + testutil.DigestA + `", - "version": "1.0", - "architecture": "amd64" - }, - "incompatible": false, - "pinned": false, - "softRebootCapable": false, - "downloadOnly": false - }, - "staged": { - "image": { - "image": {"image": "quay.io/example/myos:latest", "transport": "registry"}, - "imageDigest": "` + testutil.DigestB + `", - "version": "2.0", - "architecture": "amd64" - }, - "incompatible": false, - "pinned": false, - "softRebootCapable": true, - "downloadOnly": false - }, - "rollback": { - "image": { - "image": {"image": "quay.io/example/myos:latest", "transport": "registry"}, - "imageDigest": "` + testutil.DigestC + `", - "version": "0.9", - "architecture": "amd64" - }, - "incompatible": false, - "pinned": false, - "softRebootCapable": false, - "downloadOnly": false - }, - "rollbackQueued": false - } -}` + bootcStatusErrMsg = "bootc status failed" ) func TestReconcilePopulatesStatus(t *testing.T) { @@ -78,11 +31,30 @@ func TestReconcilePopulatesStatus(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - if err := json.Unmarshal([]byte(bootcStatusFull), &fake.status); err != nil { - t.Fatal(err) + v1 := "v1" + v2 := "v2" + v3 := "v3" + fake.status = newBootcStatus(testutil.DigestA) + fake.status.Status.Booted.Image.Version = &v1 + fake.status.Status.Staged = &bootc.BootEntry{ + Image: &bootc.ImageStatus{ + Image: bootc.ImageReference{Image: testutil.ImageTaggedRef, Transport: "registry"}, + ImageDigest: testutil.DigestB, + Version: &v2, + Architecture: "amd64", + }, + SoftRebootCapable: true, + } + fake.status.Status.Rollback = &bootc.BootEntry{ + Image: &bootc.ImageStatus{ + Image: bootc.ImageReference{Image: testutil.ImageTaggedRef, Transport: "registry"}, + ImageDigest: testutil.DigestC, + Version: &v3, + Architecture: "amd64", + }, } - bn := testutil.NewNode(testNodeName, testImageRef) + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefA) g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) t.Cleanup(func() { _ = k8sClient.Delete(ctx, bn) @@ -95,17 +67,17 @@ func TestReconcilePopulatesStatus(t *testing.T) { g.Expect(got.Status.Booted).NotTo(BeNil()) g.Expect(got.Status.Booted.Image).To(Equal(testutil.ImageTaggedRef)) g.Expect(got.Status.Booted.ImageDigest).To(Equal(testutil.DigestA)) - g.Expect(got.Status.Booted.Version).To(Equal("1.0")) + g.Expect(got.Status.Booted.Version).To(Equal(v1)) g.Expect(got.Status.Booted.Architecture).To(Equal("amd64")) g.Expect(got.Status.Staged).NotTo(BeNil()) g.Expect(got.Status.Staged.ImageDigest).To(Equal(testutil.DigestB)) - g.Expect(got.Status.Staged.Version).To(Equal("2.0")) + g.Expect(got.Status.Staged.Version).To(Equal(v2)) g.Expect(got.Status.Staged.SoftRebootCapable).To(BeTrue()) g.Expect(got.Status.Rollback).NotTo(BeNil()) g.Expect(got.Status.Rollback.ImageDigest).To(Equal(testutil.DigestC)) - g.Expect(got.Status.Rollback.Version).To(Equal("0.9")) + g.Expect(got.Status.Rollback.Version).To(Equal(v3)) g.Expect(got.Status.Conditions).To(ContainElement(And( HaveField("Type", bootcv1alpha1.NodeIdle), @@ -126,9 +98,10 @@ func TestReconcileBootcStatusError(t *testing.T) { g.SetDefaultEventuallyPollingInterval(pollInterval) ctx := context.Background() - fake.setStatusErr(fmt.Errorf("bootc status failed")) + fake.reset() + fake.setStatusErr(errors.New(bootcStatusErrMsg)) - bn := testutil.NewNode(testNodeName, testImageRef) + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefA) g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) t.Cleanup(func() { _ = k8sClient.Delete(ctx, bn) @@ -141,7 +114,7 @@ func TestReconcileBootcStatusError(t *testing.T) { HaveField("Type", bootcv1alpha1.NodeDegraded), HaveField("Status", metav1.ConditionTrue), HaveField("Reason", bootcv1alpha1.NodeReasonError), - HaveField("Message", ContainSubstring("bootc status")), + HaveField("Message", Equal(fmt.Sprintf("failed to get bootc status: getting bootc status: %s", bootcStatusErrMsg))), ))) }).Should(Succeed()) } From 18c717f3173710274277fa1cc6c609f5560f8822 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Thu, 11 Jun 2026 14:13:37 +0000 Subject: [PATCH 05/10] daemon: add state machine unit tests Add envtest cases for the daemon reconciler state machine: - TestStagingTriggered: image mismatch triggers bootc stage - TestStagingError: stage failure sets Degraded condition - TestAlreadyStaged: skip stage when image already staged - TestRebootingSet: reboot triggered when desiredImageState is Booted - TestRollback: restage when desired image changes - TestCancelInflightStage: spec change cancels in-flight stage Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/daemon/fake_test.go | 14 +- internal/daemon/reconciler_test.go | 201 ++++++++++++++++++++++++++++- test/util/builders.go | 7 + 3 files changed, 213 insertions(+), 9 deletions(-) diff --git a/internal/daemon/fake_test.go b/internal/daemon/fake_test.go index eb7f8fc..9ee3817 100644 --- a/internal/daemon/fake_test.go +++ b/internal/daemon/fake_test.go @@ -22,8 +22,7 @@ type fakeExecutor struct { stageImg string stageHook func() - rebootCalled bool - rebootErr error + rebooted bool } func (f *fakeExecutor) Status(_ context.Context) ([]byte, error) { @@ -61,8 +60,8 @@ func (f *fakeExecutor) Stage(_ context.Context, image string) error { func (f *fakeExecutor) Reboot(_ context.Context) error { f.mu.Lock() defer f.mu.Unlock() - f.rebootCalled = true - return f.rebootErr + f.rebooted = true + return nil } func (f *fakeExecutor) setStatusErr(err error) { @@ -89,10 +88,10 @@ func (f *fakeExecutor) getStageImg() string { return f.stageImg } -func (f *fakeExecutor) getRebootCalled() bool { +func (f *fakeExecutor) getRebooted() bool { f.mu.Lock() defer f.mu.Unlock() - return f.rebootCalled + return f.rebooted } func (f *fakeExecutor) reset() { @@ -103,8 +102,7 @@ func (f *fakeExecutor) reset() { f.stageErr = nil f.stageImg = "" f.stageHook = nil - f.rebootCalled = false - f.rebootErr = nil + f.rebooted = false } func newBootEntry(image, digest string) *bootc.BootEntry { diff --git a/internal/daemon/reconciler_test.go b/internal/daemon/reconciler_test.go index 189381b..b8f2985 100644 --- a/internal/daemon/reconciler_test.go +++ b/internal/daemon/reconciler_test.go @@ -22,6 +22,7 @@ const ( pollInterval = 200 * time.Millisecond pollTimeout = 10 * time.Second + stageErrMsg = "stage failed: pull error" bootcStatusErrMsg = "bootc status failed" ) @@ -114,7 +115,205 @@ func TestReconcileBootcStatusError(t *testing.T) { HaveField("Type", bootcv1alpha1.NodeDegraded), HaveField("Status", metav1.ConditionTrue), HaveField("Reason", bootcv1alpha1.NodeReasonError), - HaveField("Message", Equal(fmt.Sprintf("failed to get bootc status: getting bootc status: %s", bootcStatusErrMsg))), + HaveField("Message", Equal(fmt.Sprintf("populating bootc fields: getting bootc status: %s", bootcStatusErrMsg))), ))) }).Should(Succeed()) } + +func TestStagingTriggered(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func(g Gomega) { + var got bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) + + g.Expect(got.Status.Staged).NotTo(BeNil()) + g.Expect(got.Status.Staged.ImageDigest).To(Equal(testutil.DigestB)) + + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonStaged), + ))) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeDegraded), + HaveField("Status", metav1.ConditionFalse), + ))) + }).Should(Succeed()) + + g.Expect(fake.getStageImg()).To(Equal(testutil.ImageDigestRefB)) + g.Expect(fake.getRebooted()).To(BeFalse()) +} + +func TestStagingError(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + fake.setStageErr(errors.New(stageErrMsg)) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func(g Gomega) { + var got bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", bootcv1alpha1.NodeReasonIdle), + ))) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeDegraded), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", bootcv1alpha1.NodeReasonError), + HaveField("Message", Equal(fmt.Sprintf("bootc stage failed: %s", stageErrMsg))), + ))) + }).Should(Succeed()) +} + +func TestAlreadyStaged(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func(g Gomega) { + var got bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonStaged), + ))) + }).Should(Succeed()) + + g.Expect(fake.getStageImg()).To(BeEmpty()) +} + +func TestRebootingSet(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB, testutil.WithDesiredImageState(bootcv1alpha1.DesiredImageStateBooted)) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func(g Gomega) { + var got bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonRebooting), + ))) + }).Should(Succeed()) + + g.Expect(fake.getRebooted()).To(BeTrue()) +} + +func TestRollback(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + fake.status.Status.Staged = newBootEntry(testutil.ImageDigestRefB, testutil.DigestB) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefC) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func(g Gomega) { + var got bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &got)).To(Succeed()) + g.Expect(got.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonStaged), + ))) + g.Expect(got.Status.Staged).NotTo(BeNil()) + g.Expect(got.Status.Staged.ImageDigest).To(Equal(testutil.DigestC)) + }).Should(Succeed()) + + g.Expect(fake.getRebooted()).To(BeFalse()) +} + +func TestCancelInflightStage(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + ctx := context.Background() + + fake.reset() + fake.status = newBootcStatus(testutil.DigestA) + + firstBlock := make(chan struct{}) + fake.setStageHook(func() { + <-firstBlock + }) + + bn := testutil.NewNode(testNodeName, testutil.ImageDigestRefB) + g.Expect(k8sClient.Create(ctx, bn)).To(Succeed()) + t.Cleanup(func() { + _ = k8sClient.Delete(ctx, bn) + }) + + g.Eventually(func() string { + return fake.getStageImg() + }).Should(Equal(testutil.ImageDigestRefB)) + + fake.setStageHook(nil) + close(firstBlock) + + g.Eventually(func(g Gomega) { + var latest bootcv1alpha1.BootcNode + g.Expect(k8sClient.Get(ctx, client.ObjectKeyFromObject(bn), &latest)).To(Succeed()) + latest.Spec.DesiredImage = testutil.ImageDigestRefC + g.Expect(k8sClient.Update(ctx, &latest)).To(Succeed()) + }).Should(Succeed()) + + g.Eventually(func() string { + return fake.getStageImg() + }).Should(Equal(testutil.ImageDigestRefC)) +} diff --git a/test/util/builders.go b/test/util/builders.go index 2fbce6e..0787351 100644 --- a/test/util/builders.go +++ b/test/util/builders.go @@ -184,6 +184,13 @@ func WithNodeAnnotation(key, value string) NodeOption { } } +// WithDesiredImageState overrides the default DesiredImageState on a node. +func WithDesiredImageState(state bootcv1alpha1.DesiredImageState) NodeOption { + return func(node *bootcv1alpha1.BootcNode) { + node.Spec.DesiredImageState = state + } +} + // WithNodePullSecret sets the pull secret reference and hash on a node. func WithNodePullSecret(name, namespace, hash string) NodeOption { return func(node *bootcv1alpha1.BootcNode) { From 1ccfeebf1396d0b4f598e9718d0b436ec24fad55 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Fri, 12 Jun 2026 11:46:44 +0000 Subject: [PATCH 06/10] e2e: add NodeImageUpdateDigestedPullSpec to e2e env Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- Makefile | 1 + test/e2e/e2eutil/env.go | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 4245b3c..2207444 100644 --- a/Makefile +++ b/Makefile @@ -68,6 +68,7 @@ e2e: ## Run e2e tests (requires: make deploy-bink). V=1 for verbose. RUN= BINK_LOCAL_REGISTRY_NODE_IMAGE=$(BINK_LOCAL_REGISTRY_NODE_IMAGE) \ ARTIFACTS=$(ARTIFACTS) \ BINK_NODE_IMAGE_DIGEST=$$(skopeo inspect --tls-verify=false --format '{{.Digest}}' docker://localhost:5000/node:latest) \ + BINK_NODE_IMAGE_UPDATE_DIGEST=$$(skopeo inspect --tls-verify=false docker://localhost:5000/node:update | jq -r '.Digest') \ go test -timeout 10m -count=1 $(if $(V),-v) $(if $(RUN),-run $(RUN)) . ##@ Build diff --git a/test/e2e/e2eutil/env.go b/test/e2e/e2eutil/env.go index 423ca19..65d2311 100644 --- a/test/e2e/e2eutil/env.go +++ b/test/e2e/e2eutil/env.go @@ -60,6 +60,10 @@ type Env struct { // nodeImageRegistry is the in-cluster registry path for the seeded node image // (e.g. "registry.cluster.local:5000/node"). Empty when not seeded. nodeImageRegistry string + + // nodeImageUpdateDigest is the manifest digest of the update image + // (e.g. "sha256:def456..."). Empty when not built. + nodeImageUpdateDigest string } // New connects to an existing bink cluster and returns an Env ready @@ -79,16 +83,27 @@ func New(t *testing.T) *Env { } nodeImageDigest := os.Getenv("BINK_NODE_IMAGE_DIGEST") + if nodeImageDigest == "" { + t.Fatal("BINK_NODE_IMAGE_DIGEST must be set") + } nodeImageRegistry := os.Getenv("BINK_LOCAL_REGISTRY_NODE_IMAGE") + if nodeImageRegistry == "" { + t.Fatal("BINK_LOCAL_REGISTRY_NODE_IMAGE must be set") + } + nodeImageUpdateDigest := os.Getenv("BINK_NODE_IMAGE_UPDATE_DIGEST") + if nodeImageUpdateDigest == "" { + t.Fatal("BINK_NODE_IMAGE_UPDATE_DIGEST must be set") + } k8sClient := buildClient(t, kubeconfigPath) env := &Env{ - Client: k8sClient, - clusterName: clusterName, - testID: sanitizeTestName(t.Name()), - nodeImageDigest: nodeImageDigest, - nodeImageRegistry: nodeImageRegistry, + Client: k8sClient, + clusterName: clusterName, + testID: sanitizeTestName(t.Name()), + nodeImageDigest: nodeImageDigest, + nodeImageRegistry: nodeImageRegistry, + nodeImageUpdateDigest: nodeImageUpdateDigest, } t.Cleanup(func() { @@ -214,6 +229,20 @@ func (e *Env) NodeImageDigest() string { return e.nodeImageDigest } +// NodeImageUpdateDigestedPullSpec returns the digest-qualified reference for the +// update image (e.g. "registry.cluster.local:5000/node@sha256:def456"). +func (e *Env) NodeImageUpdateDigestedPullSpec() string { + if e.nodeImageRegistry == "" || e.nodeImageUpdateDigest == "" { + return "" + } + return e.nodeImageRegistry + "@" + e.nodeImageUpdateDigest +} + +// NodeImageUpdateDigest returns the manifest digest of the update image. +func (e *Env) NodeImageUpdateDigest() string { + return e.nodeImageUpdateDigest +} + // cleanup gathers diagnostic logs, then deletes test-scoped resources // and bink nodes. func (e *Env) cleanup(t *testing.T) { From b268d48969f5fcb77c3f3c5d288ec246198ebab0 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Fri, 12 Jun 2026 11:46:47 +0000 Subject: [PATCH 07/10] e2e: add TestUpdateReboot for full update lifecycle The TestUpdateReboot verifies that the upgrades to the new image is successfully performed. It starts by patching the desiredImage to a new one. Then, the BootcNode should have the Rebooting condition. After the reboot the node should be schedulable, proving that the uncordon was successful. Additionally, the test verifies that the image is the one we built for upgrades by checking the existance of the file /usr/share/update-marker. We don't detect the Staged condition since the transition between the phase Rebooting can happen very quickly. This is already covered by the env test and it can be verified when the rollout is paused. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- test/e2e/bootcnode_test.go | 111 +++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/test/e2e/bootcnode_test.go b/test/e2e/bootcnode_test.go index 221f113..9ae7c42 100644 --- a/test/e2e/bootcnode_test.go +++ b/test/e2e/bootcnode_test.go @@ -4,6 +4,9 @@ package e2e import ( "context" + "fmt" + "os" + "os/exec" "testing" "time" @@ -87,3 +90,111 @@ func TestControllerMembership(t *testing.T) { ))) }).WithTimeout(3 * time.Minute).Should(Succeed()) } + +// TestUpdateReboot provisions a worker node, creates a pool with the +// original image, then updates the pool to a new image and verifies the +// full update lifecycle: staging, reboot, and idle with the new image. +func TestUpdateReboot(t *testing.T) { + g := NewWithT(t) + g.SetDefaultEventuallyTimeout(pollTimeout) + g.SetDefaultEventuallyPollingInterval(pollInterval) + + env := e2eutil.New(t) + nodeName := env.AddNode(t) + + ctx := context.Background() + + // Phase 1: Create pool with original image and wait for Idle. + pool := env.NewPool("workers", env.NodeImageDigestedPullSpec()) + g.Expect(env.Client.Create(ctx, pool)).To(Succeed()) + + var bn bootcv1alpha1.BootcNode + g.Eventually(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &bn)).To(Succeed()) + g.Expect(bn.Status.Booted).NotTo(BeNil()) + g.Expect(bn.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", bootcv1alpha1.NodeReasonIdle), + ))) + }).WithTimeout(3 * time.Minute).Should(Succeed()) + + t.Logf("Node %q is Idle with original image", nodeName) + + // Phase 2: Patch pool to update image. + updateRef := env.NodeImageUpdateDigestedPullSpec() + + modified := pool.DeepCopy() + modified.Spec.Image.Ref = updateRef + g.Expect(env.Client.Patch(ctx, modified, client.MergeFrom(pool))).To(Succeed()) + *pool = *modified + + t.Logf("Patched pool to update image %s", updateRef) + + // Phase 3: Wait for Rebooting — proves image was staged and reboot started. + // TODO: the Staged phase will be tested separately when the rollout paused. + g.Eventually(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &bn)).To(Succeed()) + g.Expect(bn.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionFalse), + HaveField("Reason", bootcv1alpha1.NodeReasonRebooting), + ))) + }).WithTimeout(5*time.Minute).Should(Succeed(), "expected node to reach Rebooting state") + + t.Logf("Node %q reached Rebooting state", nodeName) + + // Phase 4: Wait for Idle with the update digest — proves reboot completed. + g.Eventually(func(g Gomega) { + g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &bn)).To(Succeed()) + g.Expect(bn.Status.Booted).NotTo(BeNil()) + g.Expect(bn.Status.Booted.ImageDigest).To(Equal(env.NodeImageUpdateDigest()), + "expected booted digest to match update image") + g.Expect(bn.Status.Conditions).To(ContainElement(And( + HaveField("Type", bootcv1alpha1.NodeIdle), + HaveField("Status", metav1.ConditionTrue), + HaveField("Reason", bootcv1alpha1.NodeReasonIdle), + ))) + }).WithTimeout(5*time.Minute).Should(Succeed(), "expected node to reach Idle with update image after reboot") + + t.Logf("Node %q is Idle with update image", nodeName) + + // Phase 5: Verify node is schedulable (uncordoned after reboot). + var node corev1.Node + g.Eventually(func(g Gomega) bool { + g.Expect(env.Client.Get(ctx, client.ObjectKey{Name: nodeName}, &node)).To(Succeed()) + return node.Spec.Unschedulable + }).WithTimeout(3*time.Minute).Should(BeFalse(), "expected node to be schedulable after update") + + // Phase 6: Verify update marker exists on the host via daemon pod exec. + var daemonPod corev1.Pod + g.Eventually(func(g Gomega) { + var pods corev1.PodList + g.Expect(env.Client.List(ctx, &pods, + client.InNamespace("bootc-operator"), + client.MatchingLabels{ + "app.kubernetes.io/name": "bootc-operator", + "app.kubernetes.io/component": "daemon", + }, + )).To(Succeed()) + var matched []corev1.Pod + for _, p := range pods.Items { + if p.Spec.NodeName == nodeName { + matched = append(matched, p) + } + } + g.Expect(matched).To(HaveLen(1), "expected exactly one daemon pod on %s", nodeName) + g.Expect(matched[0].Status.Phase).To(Equal(corev1.PodRunning)) + daemonPod = matched[0] + }).WithTimeout(1*time.Minute).Should(Succeed(), "expected running daemon pod on %s", nodeName) + + kubeconfigPath := os.Getenv("KUBECONFIG") + cmd := exec.CommandContext(ctx, "kubectl", "--kubeconfig", kubeconfigPath, + "-n", "bootc-operator", "exec", daemonPod.Name, "--", + "stat", "/proc/1/root/usr/share/update-marker") + out, err := cmd.CombinedOutput() + g.Expect(err).NotTo(HaveOccurred(), + fmt.Sprintf("expected update-marker to exist on host, kubectl exec output: %s", string(out))) + + t.Logf("Verified update-marker exists on host via daemon pod") +} From 31a4c84420abc93d84af3984fe31b4572080e56b Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Fri, 12 Jun 2026 11:46:50 +0000 Subject: [PATCH 08/10] daemon: increase memory resource The daemon runs bootc switch which downloads the images. This operations requires consumes additional memory limits, otherwise it gets OOM killed. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- config/daemon/daemon.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/daemon/daemon.yaml b/config/daemon/daemon.yaml index fefaa14..af5d7c8 100644 --- a/config/daemon/daemon.yaml +++ b/config/daemon/daemon.yaml @@ -38,7 +38,7 @@ spec: resources: limits: cpu: 500m - memory: 128Mi + memory: 512Mi requests: cpu: 10m memory: 64Mi From 22c2df2c42582f0f70da8ff188d637893d575579 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Wed, 3 Jun 2026 14:20:52 +0000 Subject: [PATCH 09/10] daemon: add StatusWatcher for fsnotify + polling Add a StatusWatcher component that detects external bootc status changes via fsnotify on /proc/1/root/ostree/bootc (with fallback to /proc/1/root/sysroot/state/deploy for composefs), plus a configurable polling interval as a safety net. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- internal/daemon/watcher.go | 133 +++++++++++++++++++++++++++++++ internal/daemon/watcher_test.go | 137 ++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 internal/daemon/watcher.go create mode 100644 internal/daemon/watcher_test.go diff --git a/internal/daemon/watcher.go b/internal/daemon/watcher.go new file mode 100644 index 0000000..6bc6e0e --- /dev/null +++ b/internal/daemon/watcher.go @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: Apache-2.0 + +package daemon + +import ( + "context" + "os" + "time" + + "github.com/fsnotify/fsnotify" + "github.com/go-logr/logr" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1" + "sigs.k8s.io/controller-runtime/pkg/event" +) + +const ( + // ostree backend + DefaultPrimaryPath = "/proc/1/root/ostree/bootc" + // composefs backend + DefaultFallbackPath = "/proc/1/root/sysroot/state/deploy" +) + +type StatusWatcher struct { + PollInterval time.Duration + PrimaryPath string + FallbackPath string + Events chan event.GenericEvent + NodeName string + Ready chan struct{} +} + +func (w *StatusWatcher) Start(ctx context.Context) error { + log := logf.FromContext(ctx).WithName("status-watcher") + + watchPath := w.resolveWatchPath() + + fsWatcher := w.setupFsnotify(log, watchPath) + + if fsWatcher != nil { + defer func() { _ = fsWatcher.Close() }() + } + + ticker := time.NewTicker(w.PollInterval) + defer ticker.Stop() + + if w.Ready != nil { + close(w.Ready) + } + + for { + select { + case <-ctx.Done(): + return nil + case ev, ok := <-w.fsEvents(fsWatcher): + if !ok { + return nil + } + if ev.Has(fsnotify.Chmod) { + log.V(1).Info("Detected bootc status change via fsnotify") + w.sendEvent() + } + case err, ok := <-w.fsErrors(fsWatcher): + if !ok { + return nil + } + log.Error(err, "fsnotify error") + case <-ticker.C: + log.V(1).Info("Polling bootc status") + w.sendEvent() + } + } +} + +func (w *StatusWatcher) setupFsnotify(log logr.Logger, watchPath string) *fsnotify.Watcher { + if watchPath == "" { + log.Info("No bootc status path found, using polling only") + return nil + } + + fsWatcher, err := fsnotify.NewWatcher() + if err != nil { + log.Error(err, "Failed to create fsnotify watcher, falling back to polling") + return nil + } + + if err := fsWatcher.Add(watchPath); err != nil { + log.Error(err, "Failed to watch path, falling back to polling", "path", watchPath) + _ = fsWatcher.Close() + return nil + } + + log.Info("Watching path for bootc status changes", "path", watchPath) + return fsWatcher +} + +func (w *StatusWatcher) resolveWatchPath() string { + if _, err := os.Stat(w.PrimaryPath); err == nil { + return w.PrimaryPath + } + if _, err := os.Stat(w.FallbackPath); err == nil { + return w.FallbackPath + } + return "" +} + +func (w *StatusWatcher) sendEvent() { + ev := event.GenericEvent{ + Object: &bootcv1alpha1.BootcNode{ + ObjectMeta: metav1.ObjectMeta{Name: w.NodeName}, + }, + } + select { + case w.Events <- ev: + default: + } +} + +func (w *StatusWatcher) fsEvents(watcher *fsnotify.Watcher) <-chan fsnotify.Event { + if watcher == nil { + return nil + } + return watcher.Events +} + +func (w *StatusWatcher) fsErrors(watcher *fsnotify.Watcher) <-chan error { + if watcher == nil { + return nil + } + return watcher.Errors +} diff --git a/internal/daemon/watcher_test.go b/internal/daemon/watcher_test.go new file mode 100644 index 0000000..1cb8550 --- /dev/null +++ b/internal/daemon/watcher_test.go @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: Apache-2.0 + +package daemon + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "sigs.k8s.io/controller-runtime/pkg/event" +) + +func startWatcher(t *testing.T, w *StatusWatcher) (done <-chan error, cancel context.CancelFunc) { + t.Helper() + ctx, cancel := context.WithCancel(context.Background()) + ch := make(chan error, 1) + go func() { ch <- w.Start(ctx) }() + <-w.Ready + return ch, cancel +} + +func TestWatcherEvents(t *testing.T) { + tests := []struct { + name string + mkPrimary bool + mkFallback bool + touchPrimary bool + touchFallback bool + pollInterval time.Duration + }{ + { + name: "Fsnotify", + mkPrimary: true, + touchPrimary: true, + pollInterval: 10 * time.Minute, + }, + { + name: "FallbackPath", + mkFallback: true, + touchFallback: true, + pollInterval: 10 * time.Minute, + }, + { + name: "PollOnly", + pollInterval: 200 * time.Millisecond, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + dir := t.TempDir() + primaryPath := filepath.Join(dir, "bootc") + fallbackPath := filepath.Join(dir, "deploy") + + if tt.mkPrimary { + if err := os.Mkdir(primaryPath, 0o755); err != nil { + t.Fatal(err) + } + } + if tt.mkFallback { + if err := os.Mkdir(fallbackPath, 0o755); err != nil { + t.Fatal(err) + } + } + + events := make(chan event.GenericEvent, 1) + w := &StatusWatcher{ + PollInterval: tt.pollInterval, + PrimaryPath: primaryPath, + FallbackPath: fallbackPath, + Events: events, + NodeName: "test-node", + Ready: make(chan struct{}), + } + + done, cancel := startWatcher(t, w) + defer cancel() + + now := time.Now() + if tt.touchPrimary { + if err := os.Chtimes(primaryPath, now, now); err != nil { + t.Fatal(err) + } + } + if tt.touchFallback { + if err := os.Chtimes(fallbackPath, now, now); err != nil { + t.Fatal(err) + } + } + + select { + case ev := <-events: + if ev.Object.GetName() != "test-node" { + t.Errorf("expected node name test-node, got %s", ev.Object.GetName()) + } + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for event") + } + + cancel() + if err := <-done; err != nil { + t.Fatalf("watcher returned error: %v", err) + } + }) + } +} + +func TestWatcherShutdown(t *testing.T) { + dir := t.TempDir() + watchDir := filepath.Join(dir, "bootc") + if err := os.Mkdir(watchDir, 0o755); err != nil { + t.Fatal(err) + } + + w := &StatusWatcher{ + PollInterval: 10 * time.Minute, + PrimaryPath: watchDir, + FallbackPath: filepath.Join(dir, "nonexistent"), + Events: make(chan event.GenericEvent, 1), + NodeName: "test-node", + Ready: make(chan struct{}), + } + + done, cancel := startWatcher(t, w) + cancel() + + select { + case err := <-done: + if err != nil { + t.Fatalf("watcher returned error on shutdown: %v", err) + } + case <-time.After(5 * time.Second): + t.Fatal("timed out waiting for watcher to shut down") + } +} From db134dff6b218f5a9a1e66c142af8334e4c25759 Mon Sep 17 00:00:00 2001 From: Alice Frosi Date: Wed, 3 Jun 2026 14:20:59 +0000 Subject: [PATCH 10/10] daemon: wire StatusWatcher into reconciler and daemon Add --poll-interval flag to the daemon binary and wire the StatusWatcher channel into the reconciler as a second WatchesRawSource alongside switchDone. Assisted-by: Claude Opus 4.6 (1M context) Signed-off-by: Alice Frosi --- cmd/daemon/main.go | 30 +++++++++++++++++++++++++----- internal/daemon/reconciler.go | 14 ++++++++++---- internal/daemon/watcher_test.go | 10 +++++----- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/cmd/daemon/main.go b/cmd/daemon/main.go index d99f5c6..c605c59 100644 --- a/cmd/daemon/main.go +++ b/cmd/daemon/main.go @@ -6,6 +6,7 @@ import ( "flag" "fmt" "os" + "time" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" @@ -14,6 +15,7 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log/zap" bootcv1alpha1 "github.com/jlebon/bootc-operator/api/v1alpha1" @@ -32,6 +34,9 @@ func init() { } func main() { + var pollInterval time.Duration + flag.DurationVar(&pollInterval, "poll-interval", 5*time.Minute, "Interval for polling bootc status as a fallback to fsnotify") + opts := zap.Options{ Development: true, } @@ -62,17 +67,32 @@ func main() { os.Exit(1) } + statusChanged := make(chan event.GenericEvent, 1) + if err := (&daemon.BootcNodeReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - NodeName: nodeName, - Executor: bootc.NewHostExecutor(), + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + NodeName: nodeName, + Executor: bootc.NewHostExecutor(), + StatusChanged: statusChanged, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "Failed to create controller", "controller", "bootcnode") os.Exit(1) } - setupLog.Info("Starting daemon", "node", nodeName) + watcher := &daemon.StatusWatcher{ + PollInterval: pollInterval, + PrimaryPath: daemon.DefaultPrimaryPath, + FallbackPath: daemon.DefaultFallbackPath, + Events: statusChanged, + NodeName: nodeName, + } + if err := mgr.Add(watcher); err != nil { + setupLog.Error(err, "Failed to add status watcher") + os.Exit(1) + } + + setupLog.Info("Starting daemon", "node", nodeName, "pollInterval", pollInterval) if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { setupLog.Error(err, "Failed to run daemon") os.Exit(1) diff --git a/internal/daemon/reconciler.go b/internal/daemon/reconciler.go index cecd60b..889cad6 100644 --- a/internal/daemon/reconciler.go +++ b/internal/daemon/reconciler.go @@ -58,17 +58,23 @@ type BootcNodeReconciler struct { stageDone chan event.GenericEvent // rebootIssued tracks whether a reboot has been issued so classifyAction // can distinguish the Staged→Rebooting. - rebootIssued bool + rebootIssued bool + StatusChanged chan event.GenericEvent } func (r *BootcNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { r.stageDone = make(chan event.GenericEvent, 1) - return ctrl.NewControllerManagedBy(mgr). + builder := ctrl.NewControllerManagedBy(mgr). For(&bootcv1alpha1.BootcNode{}). WatchesRawSource(source.Channel(r.stageDone, &handler.EnqueueRequestForObject{})). - Named("bootcnode"). - Complete(r) + Named("bootcnode") + + if r.StatusChanged != nil { + builder = builder.WatchesRawSource(source.Channel(r.StatusChanged, &handler.EnqueueRequestForObject{})) + } + + return builder.Complete(r) } func (r *BootcNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { diff --git a/internal/daemon/watcher_test.go b/internal/daemon/watcher_test.go index 1cb8550..29de499 100644 --- a/internal/daemon/watcher_test.go +++ b/internal/daemon/watcher_test.go @@ -23,12 +23,12 @@ func startWatcher(t *testing.T, w *StatusWatcher) (done <-chan error, cancel con func TestWatcherEvents(t *testing.T) { tests := []struct { - name string - mkPrimary bool - mkFallback bool - touchPrimary bool + name string + mkPrimary bool + mkFallback bool + touchPrimary bool touchFallback bool - pollInterval time.Duration + pollInterval time.Duration }{ { name: "Fsnotify",