Skip to content

Commit 28a3038

Browse files
committed
Onboarding: Timeout on active instance eventually
The code currently assumes that an instance will eventually report the expected string, which in turn assumes that the metadata service lookup works. The active state only reflects that the hypervisor could boot the VM. As the instance usually boots within seconds, a five-minute timeout seems safe enough to catch this eventuality.
1 parent 40ef4bb commit 28a3038

File tree

3 files changed

+90
-5
lines changed

3 files changed

+90
-5
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ require (
104104
k8s.io/component-base v0.35.0 // indirect
105105
k8s.io/klog/v2 v2.130.1 // indirect
106106
k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
107-
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
107+
k8s.io/utils v0.0.0-20251002143259-bc988d571ff4
108108
sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.33.0 // indirect
109109
sigs.k8s.io/gateway-api v1.4.0 // indirect
110110
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect

internal/controller/onboarding_controller.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import (
3131
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3232
"k8s.io/apimachinery/pkg/runtime"
3333
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
3435
ctrl "sigs.k8s.io/controller-runtime"
3536
k8sclient "sigs.k8s.io/controller-runtime/pkg/client"
3637
logger "sigs.k8s.io/controller-runtime/pkg/log"
@@ -51,6 +52,7 @@ var errRequeue = errors.New("requeue requested")
5152

5253
const (
5354
defaultWaitTime = 1 * time.Minute
55+
smokeTestTimeout = 5 * time.Minute
5456
testProjectName = "test"
5557
testDomainName = "cc3test"
5658
testImageName = "cirros-kvm"
@@ -63,6 +65,7 @@ type OnboardingController struct {
6365
k8sclient.Client
6466
Scheme *runtime.Scheme
6567
TestFlavorID string
68+
Clock clock.Clock
6669
computeClient *gophercloud.ServiceClient
6770
testComputeClient *gophercloud.ServiceClient
6871
testImageClient *gophercloud.ServiceClient
@@ -286,6 +289,21 @@ func (r *OnboardingController) smokeTest(ctx context.Context, hv *kvmv1.Hypervis
286289
}
287290

288291
if !strings.Contains(consoleOutput, server.Name) {
292+
if !server.LaunchedAt.IsZero() && r.Clock.Now().After(server.LaunchedAt.Add(smokeTestTimeout)) {
293+
base := hv.DeepCopy()
294+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
295+
Type: kvmv1.ConditionTypeOnboarding,
296+
Status: metav1.ConditionTrue,
297+
Reason: kvmv1.ConditionReasonTesting,
298+
Message: fmt.Sprintf("timeout waiting for console output since %v", server.LaunchedAt),
299+
})
300+
if err := r.patchStatus(ctx, hv, base); err != nil {
301+
return ctrl.Result{}, err
302+
}
303+
if err = servers.Delete(ctx, r.testComputeClient, server.ID).ExtractErr(); err != nil {
304+
return ctrl.Result{}, fmt.Errorf("failed to delete timed out test instance %v: %w", server.ID, err)
305+
}
306+
}
289307
return ctrl.Result{RequeueAfter: defaultWaitTime}, nil
290308
}
291309

@@ -613,6 +631,8 @@ func (r *OnboardingController) SetupWithManager(mgr ctrl.Manager) error {
613631
}
614632
r.testNetworkClient.ResourceBase = fmt.Sprintf("%vv2.0/", r.testNetworkClient.Endpoint)
615633

634+
r.Clock = clock.RealClock{}
635+
616636
return ctrl.NewControllerManagedBy(mgr).
617637
Named(OnboardingControllerName).
618638
For(&kvmv1.Hypervisor{}).

internal/controller/onboarding_controller_test.go

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"fmt"
2222
"net/http"
2323
"os"
24+
"time"
2425

2526
"github.com/gophercloud/gophercloud/v2/testhelper"
2627
"github.com/gophercloud/gophercloud/v2/testhelper/client"
@@ -30,6 +31,8 @@ import (
3031
"k8s.io/apimachinery/pkg/api/meta"
3132
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3233
"k8s.io/apimachinery/pkg/types"
34+
"k8s.io/utils/clock"
35+
clocktesting "k8s.io/utils/clock/testing"
3336
ctrl "sigs.k8s.io/controller-runtime"
3437

3538
kvmv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1"
@@ -162,7 +165,8 @@ var _ = Describe("Onboarding Controller", func() {
162165
createServerBody = `{
163166
"server": {
164167
"id": "server-id",
165-
"status": "ACTIVE"
168+
"status": "ACTIVE",
169+
"OS-SRV-USG:launched_at": "2025-01-01T12:00:00.000000"
166170
}
167171
}`
168172
)
@@ -221,6 +225,7 @@ var _ = Describe("Onboarding Controller", func() {
221225
Client: k8sClient,
222226
Scheme: k8sClient.Scheme(),
223227
TestFlavorID: "1",
228+
Clock: clock.RealClock{},
224229
computeClient: client.ServiceClient(fakeServer),
225230
testComputeClient: client.ServiceClient(fakeServer),
226231
testImageClient: client.ServiceClient(fakeServer),
@@ -340,6 +345,11 @@ var _ = Describe("Onboarding Controller", func() {
340345
})
341346

342347
Context("running tests after initial setup", func() {
348+
var (
349+
serverActionHandler func(http.ResponseWriter, *http.Request)
350+
serverDeleteHandler func(http.ResponseWriter, *http.Request)
351+
)
352+
343353
BeforeEach(func(ctx SpecContext) {
344354
hv := &kvmv1.Hypervisor{}
345355
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
@@ -406,15 +416,21 @@ var _ = Describe("Onboarding Controller", func() {
406416
Expect(err).NotTo(HaveOccurred())
407417
})
408418

409-
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
419+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
410420
w.Header().Add("Content-Type", "application/json")
411421
w.WriteHeader(http.StatusOK)
412422
_, err := fmt.Fprintf(w, `{"output": "FAKE CONSOLE OUTPUT\nANOTHER\nLAST LINE\nohooc--%v-%v\n"}`, hv.Name, hv.UID)
413423
Expect(err).NotTo(HaveOccurred())
414-
424+
}
425+
fakeServer.Mux.HandleFunc("POST /servers/server-id/action", func(w http.ResponseWriter, r *http.Request) {
426+
serverActionHandler(w, r)
415427
})
416-
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
428+
429+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
417430
w.WriteHeader(http.StatusAccepted)
431+
}
432+
fakeServer.Mux.HandleFunc("DELETE /servers/server-id", func(w http.ResponseWriter, r *http.Request) {
433+
serverDeleteHandler(w, r)
418434
})
419435
})
420436

@@ -497,6 +513,7 @@ var _ = Describe("Onboarding Controller", func() {
497513
))
498514
})
499515
})
516+
500517
When("SkipTests is set to false", func() {
501518
BeforeEach(func(ctx SpecContext) {
502519
hv := &kvmv1.Hypervisor{}
@@ -571,5 +588,53 @@ var _ = Describe("Onboarding Controller", func() {
571588
})
572589
})
573590

591+
When("smoke test times out waiting for console output", func() {
592+
var serverDeletedCalled bool
593+
594+
BeforeEach(func(ctx SpecContext) {
595+
By("Overriding HV status to Testing state")
596+
hv := &kvmv1.Hypervisor{}
597+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
598+
meta.SetStatusCondition(&hv.Status.Conditions, metav1.Condition{
599+
Type: kvmv1.ConditionTypeOnboarding,
600+
Status: metav1.ConditionTrue,
601+
Reason: kvmv1.ConditionReasonTesting,
602+
})
603+
Expect(k8sClient.Status().Update(ctx, hv)).To(Succeed())
604+
605+
By("Setting fake clock past the smoke test timeout")
606+
// createServerBody has LaunchedAt "2025-01-01T12:00:00", so 6 minutes later is past the 5-minute deadline
607+
onboardingReconciler.Clock = clocktesting.NewFakeClock(time.Date(2025, 1, 1, 12, 6, 0, 0, time.UTC))
608+
serverDeletedCalled = false
609+
610+
serverActionHandler = func(w http.ResponseWriter, _ *http.Request) {
611+
w.Header().Add("Content-Type", "application/json")
612+
w.WriteHeader(http.StatusOK)
613+
_, err := fmt.Fprint(w, `{"output": "some unrelated console output\n"}`)
614+
Expect(err).NotTo(HaveOccurred())
615+
}
616+
serverDeleteHandler = func(w http.ResponseWriter, _ *http.Request) {
617+
serverDeletedCalled = true
618+
w.WriteHeader(http.StatusAccepted)
619+
}
620+
})
621+
622+
It("should delete the stalled server and record a timeout in the status", func(ctx SpecContext) {
623+
By("Reconciling smoke test past the timeout deadline")
624+
_, err := onboardingReconciler.Reconcile(ctx, reconcileReq)
625+
Expect(err).NotTo(HaveOccurred())
626+
627+
By("Verifying the timed-out server was deleted")
628+
Expect(serverDeletedCalled).To(BeTrue())
629+
630+
By("Verifying the onboarding condition message indicates a timeout")
631+
hv := &kvmv1.Hypervisor{}
632+
Expect(k8sClient.Get(ctx, namespacedName, hv)).To(Succeed())
633+
onboardingCond := meta.FindStatusCondition(hv.Status.Conditions, kvmv1.ConditionTypeOnboarding)
634+
Expect(onboardingCond).NotTo(BeNil())
635+
Expect(onboardingCond.Reason).To(Equal(kvmv1.ConditionReasonTesting))
636+
Expect(onboardingCond.Message).To(ContainSubstring("timeout"))
637+
})
638+
})
574639
})
575640
})

0 commit comments

Comments
 (0)