Skip to content

Commit f40df6e

Browse files
authored
feat: PoC failover reservation controller (#572)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

## Release Notes

* **New Features**
  * Added a failover reservations system for automatic VM failover handling
  * Added new scheduling pipelines for failover reservation creation and reuse
  * Added a visualization tool for monitoring reservations and VM allocations
* **Documentation**
  * Added comprehensive failover reservations system documentation
* **Improvements**
  * Updated reservation domain label keys to the cloud domain (`reservations.cortex.sap.com/...` is now `reservations.cortex.cloud/...`)
  * Extended reservation status with timestamp tracking for changes and acknowledgments

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
1 parent c957ff7 commit f40df6e

28 files changed

Lines changed: 9571 additions & 31 deletions

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
*.dll
55
*.so
66
*.dylib
7+
*.tgz
8+
79
build/**
810

911
# Test binary, built with `go test -c`

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ GOBIN=$(shell go env GOBIN)
66
endif
77

88
.PHONY: all
9-
all: crds deepcopy lint test
9+
all: crds deepcopy lint-fix format lint test
1010

1111
.PHONY: help
1212
help: ## Display this help.

api/v1alpha1/reservation_types.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const (
2929

3030
// LabelReservationType identifies the type of reservation.
3131
// This label is present on all reservations to enable type-based filtering.
32-
LabelReservationType = "reservations.cortex.sap.com/type"
32+
LabelReservationType = "reservations.cortex.cloud/type"
3333

3434
// Reservation type label values
3535
ReservationTypeLabelCommittedResource = "committed-resource"
@@ -152,6 +152,18 @@ type FailoverReservationStatus struct {
152152
// Key: VM/instance UUID, Value: Host name where the VM is currently running.
153153
// +kubebuilder:validation:Optional
154154
Allocations map[string]string `json:"allocations,omitempty"`
155+
156+
// LastChanged tracks when the reservation was last modified.
157+
// This is used to track pending changes that need acknowledgment.
158+
// +kubebuilder:validation:Optional
159+
LastChanged *metav1.Time `json:"lastChanged,omitempty"`
160+
161+
// AcknowledgedAt is the timestamp when the last change was acknowledged.
162+
// When nil, the reservation is in a pending state awaiting acknowledgment.
163+
// This does not affect the Ready condition - reservations are still considered
164+
// ready even when not yet acknowledged.
165+
// +kubebuilder:validation:Optional
166+
AcknowledgedAt *metav1.Time `json:"acknowledgedAt,omitempty"`
155167
}
156168

157169
// ReservationStatus defines the observed state of Reservation.
@@ -189,7 +201,7 @@ type ReservationStatus struct {
189201
// +kubebuilder:object:root=true
190202
// +kubebuilder:subresource:status
191203
// +kubebuilder:resource:scope=Cluster
192-
// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.sap\\.com/type']"
204+
// +kubebuilder:printcolumn:name="Type",type="string",JSONPath=".metadata.labels['reservations\\.cortex\\.cloud/type']"
193205
// +kubebuilder:printcolumn:name="Host",type="string",JSONPath=".status.host"
194206
// +kubebuilder:printcolumn:name="Ready",type="string",JSONPath=".status.conditions[?(@.type=='Ready')].status"
195207

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

cmd/main.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,13 +42,16 @@ import (
4242
"github.com/cobaltcore-dev/cortex/internal/knowledge/kpis"
4343
"github.com/cobaltcore-dev/cortex/internal/scheduling/cinder"
4444
"github.com/cobaltcore-dev/cortex/internal/scheduling/explanation"
45+
"github.com/cobaltcore-dev/cortex/internal/scheduling/external"
4546
schedulinglib "github.com/cobaltcore-dev/cortex/internal/scheduling/lib"
4647
"github.com/cobaltcore-dev/cortex/internal/scheduling/machines"
4748
"github.com/cobaltcore-dev/cortex/internal/scheduling/manila"
4849
"github.com/cobaltcore-dev/cortex/internal/scheduling/nova"
4950
"github.com/cobaltcore-dev/cortex/internal/scheduling/pods"
51+
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
5052
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments"
5153
reservationscontroller "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/controller"
54+
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover"
5255
"github.com/cobaltcore-dev/cortex/pkg/conf"
5356
"github.com/cobaltcore-dev/cortex/pkg/monitoring"
5457
"github.com/cobaltcore-dev/cortex/pkg/multicluster"
@@ -142,6 +145,12 @@ func main() {
142145

143146
ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))
144147

148+
// Log the main configuration
149+
setupLog.Info("loaded main configuration",
150+
"enabledControllers", mainConfig.EnabledControllers,
151+
"enabledTasks", mainConfig.EnabledTasks,
152+
"leaderElectionID", mainConfig.LeaderElectionID)
153+
145154
// if the enable-http2 flag is false (the default), http/2 should be disabled
146155
// due to its vulnerabilities. More specifically, disabling http/2 will
147156
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and
@@ -350,6 +359,7 @@ func main() {
350359
}
351360
}
352361
if slices.Contains(mainConfig.EnabledControllers, "nova-deschedulings-executor") {
362+
setupLog.Info("enabling controller", "controller", "nova-deschedulings-executor")
353363
executorConfig := conf.GetConfigOrDie[nova.DeschedulingsExecutorConfig]()
354364
novaClient := nova.NewNovaClient()
355365
novaClientConfig := conf.GetConfigOrDie[nova.NovaClientConfig]()
@@ -379,6 +389,7 @@ func main() {
379389
}
380390
}
381391
if slices.Contains(mainConfig.EnabledControllers, "manila-decisions-pipeline-controller") {
392+
setupLog.Info("enabling controller", "controller", "manila-decisions-pipeline-controller")
382393
controller := &manila.FilterWeigherPipelineController{
383394
Monitor: filterWeigherPipelineMonitor,
384395
}
@@ -398,6 +409,7 @@ func main() {
398409
}
399410
}
400411
if slices.Contains(mainConfig.EnabledControllers, "cinder-decisions-pipeline-controller") {
412+
setupLog.Info("enabling controller", "controller", "cinder-decisions-pipeline-controller")
401413
controller := &cinder.FilterWeigherPipelineController{
402414
Monitor: filterWeigherPipelineMonitor,
403415
}
@@ -417,6 +429,7 @@ func main() {
417429
}
418430
}
419431
if slices.Contains(mainConfig.EnabledControllers, "ironcore-decisions-pipeline-controller") {
432+
setupLog.Info("enabling controller", "controller", "ironcore-decisions-pipeline-controller")
420433
controller := &machines.FilterWeigherPipelineController{
421434
Monitor: filterWeigherPipelineMonitor,
422435
}
@@ -435,6 +448,7 @@ func main() {
435448
}
436449
}
437450
if slices.Contains(mainConfig.EnabledControllers, "pods-decisions-pipeline-controller") {
451+
setupLog.Info("enabling controller", "controller", "pods-decisions-pipeline-controller")
438452
controller := &pods.FilterWeigherPipelineController{
439453
Monitor: filterWeigherPipelineMonitor,
440454
}
@@ -453,6 +467,7 @@ func main() {
453467
}
454468
}
455469
if slices.Contains(mainConfig.EnabledControllers, "explanation-controller") {
470+
setupLog.Info("enabling controller", "controller", "explanation-controller")
456471
// Setup a controller which will reconcile the history and explanation for
457472
// decision resources.
458473
explanationControllerConfig := conf.GetConfigOrDie[explanation.ControllerConfig]()
@@ -466,6 +481,7 @@ func main() {
466481
}
467482
}
468483
if slices.Contains(mainConfig.EnabledControllers, "reservations-controller") {
484+
setupLog.Info("enabling controller", "controller", "reservations-controller")
469485
monitor := reservationscontroller.NewControllerMonitor(multiclusterClient)
470486
metrics.Registry.MustRegister(&monitor)
471487
reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]()
@@ -480,6 +496,7 @@ func main() {
480496
}
481497
}
482498
if slices.Contains(mainConfig.EnabledControllers, "datasource-controllers") {
499+
setupLog.Info("enabling controller", "controller", "datasource-controllers")
483500
monitor := datasources.NewMonitor()
484501
metrics.Registry.MustRegister(&monitor)
485502
if err := (&openstack.OpenStackDatasourceReconciler{
@@ -502,6 +519,7 @@ func main() {
502519
}
503520
}
504521
if slices.Contains(mainConfig.EnabledControllers, "knowledge-controllers") {
522+
setupLog.Info("enabling controller", "controller", "knowledge-controllers")
505523
monitor := extractor.NewMonitor()
506524
metrics.Registry.MustRegister(&monitor)
507525
if err := (&extractor.KnowledgeReconciler{
@@ -523,6 +541,7 @@ func main() {
523541
}
524542
}
525543
if slices.Contains(mainConfig.EnabledControllers, "kpis-controller") {
544+
setupLog.Info("enabling controller", "controller", "kpis-controller")
526545
kpisControllerConfig := conf.GetConfigOrDie[kpis.ControllerConfig]()
527546
if err := (&kpis.Controller{
528547
Client: multiclusterClient,
@@ -532,6 +551,93 @@ func main() {
532551
os.Exit(1)
533552
}
534553
}
554+
if slices.Contains(mainConfig.EnabledControllers, "failover-reservations-controller") {
555+
setupLog.Info("enabling controller", "controller", "failover-reservations-controller")
556+
failoverConfig := conf.GetConfigOrDie[failover.FailoverConfig]()
557+
558+
// Apply defaults for unset values
559+
defaults := failover.DefaultConfig()
560+
if failoverConfig.DatasourceName == "" {
561+
failoverConfig.DatasourceName = defaults.DatasourceName
562+
}
563+
if failoverConfig.SchedulerURL == "" {
564+
failoverConfig.SchedulerURL = defaults.SchedulerURL
565+
}
566+
if failoverConfig.ReconcileInterval == 0 {
567+
failoverConfig.ReconcileInterval = defaults.ReconcileInterval
568+
}
569+
if failoverConfig.Creator == "" {
570+
failoverConfig.Creator = defaults.Creator
571+
}
572+
if failoverConfig.FlavorFailoverRequirements == nil {
573+
failoverConfig.FlavorFailoverRequirements = defaults.FlavorFailoverRequirements
574+
}
575+
if failoverConfig.RevalidationInterval == 0 {
576+
failoverConfig.RevalidationInterval = defaults.RevalidationInterval
577+
}
578+
579+
// DatasourceName is still required - check after applying defaults
580+
if failoverConfig.DatasourceName == "" {
581+
setupLog.Error(nil, "failover-reservations-controller requires datasourceName to be configured")
582+
os.Exit(1)
583+
}
584+
585+
// The scheduler client calls the nova external scheduler API to get placement decisions
586+
schedulerClient := reservations.NewSchedulerClient(failoverConfig.SchedulerURL)
587+
588+
// Defer the initialization of PostgresReader until the manager starts
589+
// because the cache is not ready during setup
590+
if err := mgr.Add(manager.RunnableFunc(func(ctx context.Context) error {
591+
// Create PostgresReader from the configured Datasource CRD
592+
// This runs after the cache is started
593+
postgresReader, err := external.NewPostgresReader(ctx, multiclusterClient, failoverConfig.DatasourceName)
594+
if err != nil {
595+
setupLog.Error(err, "unable to create postgres reader for failover controller",
596+
"datasourceName", failoverConfig.DatasourceName)
597+
return err
598+
}
599+
600+
// Create NovaReader and DBVMSource
601+
novaReader := external.NewNovaReader(postgresReader)
602+
vmSource := failover.NewDBVMSource(novaReader)
603+
604+
// Create the unified failover controller
605+
// It handles both:
606+
// 1. Watch-based per-reservation reconciliation (acknowledgment, validation)
607+
// 2. Periodic bulk VM processing (creating/assigning reservations)
608+
failoverController := failover.NewFailoverReservationController(
609+
multiclusterClient,
610+
vmSource,
611+
failoverConfig,
612+
schedulerClient,
613+
)
614+
615+
// Set up the watch-based reconciler for per-reservation reconciliation
616+
if err := failoverController.SetupWithManager(mgr, multiclusterClient); err != nil {
617+
setupLog.Error(err, "unable to set up failover reservation controller")
618+
return err
619+
}
620+
621+
setupLog.Info("failover-reservations-controller starting",
622+
"datasourceName", failoverConfig.DatasourceName,
623+
"schedulerURL", failoverConfig.SchedulerURL,
624+
"reconcileInterval", failoverConfig.ReconcileInterval,
625+
"revalidationInterval", failoverConfig.RevalidationInterval)
626+
627+
// Start the controller's periodic reconciliation loop
628+
return failoverController.Start(ctx)
629+
})); err != nil {
630+
setupLog.Error(err, "unable to add failover controller to manager")
631+
os.Exit(1)
632+
}
633+
setupLog.Info("failover-reservations-controller registered",
634+
"datasourceName", failoverConfig.DatasourceName,
635+
"schedulerURL", failoverConfig.SchedulerURL,
636+
"reconcileInterval", failoverConfig.ReconcileInterval,
637+
"revalidationInterval", failoverConfig.RevalidationInterval,
638+
"trustHypervisorLocation", failoverConfig.TrustHypervisorLocation,
639+
"maxVMsToProcess", failoverConfig.MaxVMsToProcess)
640+
}
535641

536642
// +kubebuilder:scaffold:builder
537643

0 commit comments

Comments
 (0)