Skip to content

Commit ca6b214

Browse files
authored
fix: commitments package refactoring/pipeline config (#601)
## Changes

- commitment UUID format enforced
- reservation deletion considers allocations and targetHost
- commitments use configurable pipeline for scheduling
- set validation time
- refactor package and controller
- refactor logging
1 parent a24c1e5 commit ca6b214

15 files changed

Lines changed: 864 additions & 380 deletions

File tree

cmd/main.go

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ import (
5151
"github.com/cobaltcore-dev/cortex/internal/scheduling/pods"
5252
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations"
5353
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/commitments"
54-
reservationscontroller "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/controller"
5554
"github.com/cobaltcore-dev/cortex/internal/scheduling/reservations/failover"
5655
"github.com/cobaltcore-dev/cortex/pkg/conf"
5756
"github.com/cobaltcore-dev/cortex/pkg/monitoring"
@@ -487,18 +486,37 @@ func main() {
487486
os.Exit(1)
488487
}
489488
}
490-
if slices.Contains(mainConfig.EnabledControllers, "reservations-controller") {
491-
setupLog.Info("enabling controller", "controller", "reservations-controller")
492-
monitor := reservationscontroller.NewControllerMonitor(multiclusterClient)
489+
if slices.Contains(mainConfig.EnabledControllers, "committed-resource-reservations-controller") {
490+
setupLog.Info("enabling controller", "controller", "committed-resource-reservations-controller")
491+
monitor := reservations.NewMonitor(multiclusterClient)
493492
metrics.Registry.MustRegister(&monitor)
494-
reservationsControllerConfig := conf.GetConfigOrDie[reservationscontroller.Config]()
493+
commitmentsConfig := conf.GetConfigOrDie[commitments.Config]()
494+
commitmentsDefaults := commitments.DefaultConfig()
495+
if commitmentsConfig.RequeueIntervalActive == 0 {
496+
commitmentsConfig.RequeueIntervalActive = commitmentsDefaults.RequeueIntervalActive
497+
}
498+
if commitmentsConfig.RequeueIntervalRetry == 0 {
499+
commitmentsConfig.RequeueIntervalRetry = commitmentsDefaults.RequeueIntervalRetry
500+
}
501+
if commitmentsConfig.PipelineDefault == "" {
502+
commitmentsConfig.PipelineDefault = commitmentsDefaults.PipelineDefault
503+
}
504+
if commitmentsConfig.SchedulerURL == "" {
505+
commitmentsConfig.SchedulerURL = commitmentsDefaults.SchedulerURL
506+
}
507+
if commitmentsConfig.ChangeAPIWatchReservationsTimeout == 0 {
508+
commitmentsConfig.ChangeAPIWatchReservationsTimeout = commitmentsDefaults.ChangeAPIWatchReservationsTimeout
509+
}
510+
if commitmentsConfig.ChangeAPIWatchReservationsPollInterval == 0 {
511+
commitmentsConfig.ChangeAPIWatchReservationsPollInterval = commitmentsDefaults.ChangeAPIWatchReservationsPollInterval
512+
}
495513

496-
if err := (&reservationscontroller.ReservationReconciler{
514+
if err := (&commitments.CommitmentReservationController{
497515
Client: multiclusterClient,
498516
Scheme: mgr.GetScheme(),
499-
Conf: reservationsControllerConfig,
517+
Conf: commitmentsConfig,
500518
}).SetupWithManager(mgr, multiclusterClient); err != nil {
501-
setupLog.Error(err, "unable to create controller", "controller", "Reservation")
519+
setupLog.Error(err, "unable to create controller", "controller", "CommitmentReservation")
502520
os.Exit(1)
503521
}
504522
}
@@ -677,9 +695,13 @@ func main() {
677695
setupLog.Info("starting commitments syncer")
678696
syncer := commitments.NewSyncer(multiclusterClient)
679697
syncerConfig := conf.GetConfigOrDie[commitments.SyncerConfig]()
698+
syncerDefaults := commitments.DefaultSyncerConfig()
699+
if syncerConfig.SyncInterval == 0 {
700+
syncerConfig.SyncInterval = syncerDefaults.SyncInterval
701+
}
680702
if err := (&task.Runner{
681703
Client: multiclusterClient,
682-
Interval: time.Hour,
704+
Interval: syncerConfig.SyncInterval,
683705
Name: "commitments-sync-task",
684706
Run: func(ctx context.Context) error { return syncer.SyncReservations(ctx) },
685707
Init: func(ctx context.Context) error { return syncer.Init(ctx, syncerConfig) },

helm/bundles/cortex-nova/templates/pipelines_kvm.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ spec:
236236
237237
This is the pipeline used for KVM hypervisors (qemu and cloud-hypervisor).
238238
Specifically, this pipeline is used for general purpose workloads.
239+
It is also used for committed-resource (CR) and HA reservation requests.
239240
type: filter-weigher
240241
createDecisions: false
241242
# Fetch all placement candidates, ignoring nova's preselection.

helm/bundles/cortex-nova/values.yaml

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -135,13 +135,33 @@ cortex-scheduling-controllers:
135135
- nova-deschedulings-executor
136136
- hypervisor-overcommit-controller
137137
- explanation-controller
138-
- reservations-controller
138+
- committed-resource-reservations-controller
139139
- failover-reservations-controller
140140
enabledTasks:
141141
- nova-decisions-cleanup-task
142-
# Endpoints configuration for reservations controller
143-
endpoints:
144-
novaExternalScheduler: "http://localhost:8080/scheduler/nova/external"
142+
# CommittedResourceFlavorGroupPipelines maps flavor group IDs to pipeline names for CR reservations
143+
# This allows different scheduling strategies per flavor group (e.g., HANA vs GP)
144+
committedResourceFlavorGroupPipelines:
145+
"2152": "kvm-hana-bin-packing-all-filters-enabled" # HANA flavor group
146+
"2101": "kvm-general-purpose-load-balancing-all-filters-enabled" # General Purpose flavor group
147+
"*": "kvm-general-purpose-load-balancing-all-filters-enabled" # Catch-all fallback
148+
# Default pipeline for CR reservations when no CommittedResourceFlavorGroupPipelines entry matches
149+
committedResourcePipelineDefault: "kvm-general-purpose-load-balancing-all-filters-enabled"
150+
# How often to re-verify active reservations
151+
# 5m = 300000000000 nanoseconds
152+
committedResourceRequeueIntervalActive: 300000000000
153+
# How often to retry when knowledge is not ready
154+
# 1m = 60000000000 nanoseconds
155+
committedResourceRequeueIntervalRetry: 60000000000
156+
# Timeout for watching reservations to become ready before rolling back
157+
# 10s = 10000000000 nanoseconds
158+
committedResourceChangeAPIWatchReservationsTimeout: 10000000000
159+
# How often to poll reservation status during watch
160+
# 500ms = 500000000 nanoseconds
161+
committedResourceChangeAPIWatchReservationsPollInterval: 500000000
162+
# Whether the change-commitments API endpoint is active
163+
# When false, the endpoint returns HTTP 503. The info endpoint remains available.
164+
committedResourceEnableChangeCommitmentsAPI: true
145165
# OvercommitMappings is a list of mappings that map hypervisor traits to
146166
# overcommit ratios. Note that this list is applied in order, so if there
147167
# are multiple mappings applying to the same hypervisors, the last mapping
@@ -189,6 +209,9 @@ cortex-knowledge-controllers:
189209
- datasource-controllers
190210
- knowledge-controllers
191211
- kpis-controller
212+
# How often the commitments syncer reconciles Limes commitments to Reservation CRDs
213+
# 1h = 3600000000000 nanoseconds
214+
committedResourceSyncInterval: 3600000000000
192215
enabledTasks:
193216
- commitments-sync-task
194217

0 commit comments

Comments
 (0)