From a71fb7d3035cee34885d2911f7874ebc48907830 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20B=C4=85k?= Date: Thu, 28 May 2026 12:04:32 +0200 Subject: [PATCH] Fix StatefulSet recreate failure handling Bug: when restart fallback scaled a host StatefulSet down, a timed-out StatefulSet delete was ignored and the following create failure was converted into ErrCRUDRecreate, which the create path also ignored. The reconcile could therefore report success while the old StatefulSet was still deleting and the replacement pod was not created. Fix: abort recreate when StatefulSet deletion fails, treat create API failures as aborts, and defensively abort if ErrCRUDRecreate reaches the create completion path. Tests: add focused statefulset reconciler unit coverage for create errors, unexpected recreate actions in the create path, and delete failures during recreate. --- .../statefulset/statefulset-reconciler.go | 15 +- .../statefulset-reconciler_test.go | 142 ++++++++++++++++++ 2 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 pkg/controller/common/statefulset/statefulset-reconciler_test.go diff --git a/pkg/controller/common/statefulset/statefulset-reconciler.go b/pkg/controller/common/statefulset/statefulset-reconciler.go index c2e571753..6650e5738 100644 --- a/pkg/controller/common/statefulset/statefulset-reconciler.go +++ b/pkg/controller/common/statefulset/statefulset-reconciler.go @@ -192,7 +192,9 @@ func (r *Reconciler) recreateStatefulSet(ctx context.Context, host *api.Host, re r.a.V(2).M(host).S().Info(util.NamespaceNameString(host.GetCR())) defer r.a.V(2).M(host).E().Info(util.NamespaceNameString(host.GetCR())) - _ = r.doDeleteStatefulSet(ctx, host) + if err := r.doDeleteStatefulSet(ctx, host); err != nil { + return common.ErrCRUDAbort + } _ = r.storage.ReconcilePVCs(ctx, host, api.DesiredStatefulSet) return r.createStatefulSet(ctx, host, register, opts) } @@ -386,9 +388,10 @@ func (r *Reconciler) shouldAbortOrContinueCreateStatefulSet(action error, host * return nil case common.ErrCRUDRecreate: - // Continue - r.a.V(1).M(host).Warning("Got recreate action. Ignore and continue for now") - return nil + // Recreate is not actionable from the create path. Treat it as failed + // instead of marking an uncreated StatefulSet as reconciled. + r.a.V(1).M(host).Warning("Got recreate action while creating StatefulSet. Abort") + return common.ErrCRUDAbort case common.ErrCRUDUnexpectedFlow: // Continue @@ -410,7 +413,7 @@ func (r *Reconciler) doCreateStatefulSet(ctx context.Context, host *api.Host, op log.V(1).Info("Create StatefulSet: %s", util.NamespaceNameString(statefulSet)) if _, err := r.sts.Create(ctx, statefulSet); err != nil { log.V(1).M(host).F().Error("StatefulSet create failed. err: %v", err) - return common.ErrCRUDRecreate + return common.ErrCRUDAbort } // StatefulSet created, wait until host is launched @@ -514,6 +517,7 @@ func (r *Reconciler) doDeleteStatefulSet(ctx context.Context, host *api.Host) er // Unable to fetch cur StatefulSet, but this is not necessarily an error yet if apiErrors.IsNotFound(err) { log.V(1).M(host).Info("NEUTRAL not found StatefulSet %s/%s", namespace, name) + return nil } else { log.V(1).M(host).F().Error("FAIL get StatefulSet %s/%s err:%v", namespace, name, err) } @@ -539,6 +543,7 @@ func (r *Reconciler) doDeleteStatefulSet(ctx context.Context, host *api.Host) er log.V(1).M(host).Info("NEUTRAL not found StatefulSet %s/%s", namespace, name) } else { log.V(1).M(host).F().Error("FAIL delete StatefulSet %s/%s err: %v", namespace, name, err) + return err } return nil diff --git a/pkg/controller/common/statefulset/statefulset-reconciler_test.go b/pkg/controller/common/statefulset/statefulset-reconciler_test.go new file mode 100644 index 000000000..f5efe6a75 --- /dev/null +++ b/pkg/controller/common/statefulset/statefulset-reconciler_test.go @@ -0,0 +1,142 @@ +// Copyright 2019 Altinity Ltd and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package statefulset + +import ( + "context" + "errors" + "testing" + + "github.com/stretchr/testify/require" + apps "k8s.io/api/apps/v1" + apiErrors "k8s.io/apimachinery/pkg/api/errors" + meta "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + + api "github.com/altinity/clickhouse-operator/pkg/apis/clickhouse.altinity.com/v1" + "github.com/altinity/clickhouse-operator/pkg/controller/common" + "github.com/altinity/clickhouse-operator/pkg/interfaces" +) + +type fakeSTS struct { + cur *apps.StatefulSet + createErr error + deleteErr error +} + +func (s *fakeSTS) Get(context.Context, ...any) (*apps.StatefulSet, error) { + return s.cur, nil +} + +func (s *fakeSTS) Create(context.Context, *apps.StatefulSet) (*apps.StatefulSet, error) { + return nil, s.createErr +} + +func (s *fakeSTS) Update(context.Context, *apps.StatefulSet) (*apps.StatefulSet, error) { + return s.cur, nil +} + +func (s *fakeSTS) Delete(context.Context, string, string) error { + return s.deleteErr +} + +func (s *fakeSTS) List(context.Context, string, meta.ListOptions) ([]apps.StatefulSet, error) { + return nil, nil +} + +type testNamer struct{} + +func (testNamer) Names(interfaces.NameType, ...any) []string { + return nil +} + +func (testNamer) Name(interfaces.NameType, ...any) string { + return "chi-test-cluster-0-0" +} + +type noopHostObjectsPoller struct{} + +func (noopHostObjectsPoller) WaitHostStatefulSetReady(context.Context, *api.Host) error { + return nil +} + +func (noopHostObjectsPoller) WaitHostPodStarted(context.Context, *api.Host) error { + return nil +} + +func testHostWithDesiredSTS() *api.Host { + host := &api.Host{} + host.Runtime.SetCR(&api.ClickHouseInstallation{ + ObjectMeta: meta.ObjectMeta{ + Namespace: "default", + Name: "test", + }, + }) + host.Runtime.Address.Namespace = "default" + host.Runtime.DesiredStatefulSet = &apps.StatefulSet{ + ObjectMeta: meta.ObjectMeta{ + Namespace: "default", + Name: "chi-test-cluster-0-0", + }, + } + return host +} + +func TestDoCreateStatefulSetCreateErrorAborts(t *testing.T) { + err := apiErrors.NewAlreadyExists( + schema.GroupResource{Group: "apps", Resource: "statefulsets"}, + "chi-test-cluster-0-0", + ) + reconciler := &Reconciler{ + sts: &fakeSTS{createErr: err}, + } + + action := reconciler.doCreateStatefulSet(context.Background(), testHostWithDesiredSTS(), nil) + + require.ErrorIs(t, action, common.ErrCRUDAbort) +} + +func TestShouldAbortOrContinueCreateStatefulSetRecreateActionAborts(t *testing.T) { + reconciler := &Reconciler{} + + err := reconciler.shouldAbortOrContinueCreateStatefulSet(common.ErrCRUDRecreate, testHostWithDesiredSTS()) + + require.ErrorIs(t, err, common.ErrCRUDAbort) +} + +func TestRecreateStatefulSetDeleteErrorAborts(t *testing.T) { + replicas := int32(1) + errDelete := errors.New("delete timed out") + reconciler := &Reconciler{ + hostObjectsPoller: noopHostObjectsPoller{}, + namer: testNamer{}, + sts: &fakeSTS{ + cur: &apps.StatefulSet{ + ObjectMeta: meta.ObjectMeta{ + Namespace: "default", + Name: "chi-test-cluster-0-0", + }, + Spec: apps.StatefulSetSpec{ + Replicas: &replicas, + }, + }, + deleteErr: errDelete, + }, + } + + err := reconciler.recreateStatefulSet(context.Background(), testHostWithDesiredSTS(), true, nil) + + require.ErrorIs(t, err, common.ErrCRUDAbort) +}