Skip to content

Commit df59572

Browse files
authored
Failover controller misc: (#656)
* more logging * better debug cli tool * default config is more active
1 parent d493f0c commit df59572

3 files changed

Lines changed: 417 additions & 69 deletions

File tree

helm/bundles/cortex-nova/values.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,12 @@ cortex-scheduling-controllers:
186186
# Used for the first reconcile and whenever maxVMsToProcess limits processing, to allow faster catch-up
187187
shortReconcileInterval: 1m
188188
# Maximum number of VMs to process in one periodic reconciliation loop
189-
maxVMsToProcess: 25
189+
maxVMsToProcess: 50
190+
# How often to rotate VM selection offset when maxVMsToProcess limits processing
191+
# Every N reconcile cycles, the offset rotates to process different VMs
192+
vmSelectionRotationInterval: 3
190193
# Minimum successful reservations to use short interval
191-
minSuccessForShortInterval: 1
194+
minSuccessForShortInterval: 0
192195
# Maximum failures allowed to still use short interval
193196
maxFailuresForShortInterval: 99
194197
# If true, uses hypervisor CRD as source of truth for VM location instead of postgres

internal/scheduling/reservations/failover/controller.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,7 @@ func (c *FailoverReservationController) validateReservation(ctx context.Context,
228228

229229
// reconcileSummary holds statistics from the reconciliation cycle.
230230
type reconcileSummary struct {
231+
vmsMissingFailover int
231232
vmsProcessed int
232233
reservationsNeeded int
233234
totalReused int
@@ -268,6 +269,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
268269
}
269270
logger.V(1).Info("found VMs from source", "count", len(vms))
270271

272+
// TODO: vms contains VMs from all AZs; we should consider processing them per AZ (sequentially or in parallel) rather than mixing them together
271273
// List only failover reservations using label selector
272274
var reservationList v1alpha1.ReservationList
273275
if err := c.List(ctx, &reservationList, client.MatchingLabels{
@@ -313,6 +315,7 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
313315

314316
// 6. Create and assign reservations for VMs that need them
315317
assignSummary, hitMaxVMsLimit := c.reconcileCreateAndAssignReservations(ctx, vms, failoverReservations, allHypervisors)
318+
summary.vmsMissingFailover = assignSummary.vmsMissingFailover
316319
summary.vmsProcessed = assignSummary.vmsProcessed
317320
summary.reservationsNeeded = assignSummary.reservationsNeeded
318321
summary.totalReused = assignSummary.totalReused
@@ -332,6 +335,9 @@ func (c *FailoverReservationController) ReconcilePeriodic(ctx context.Context) (
332335
"reconcileCount", c.reconcileCount,
333336
"duration", duration.Round(time.Millisecond),
334337
"requeueAfter", requeueAfter,
338+
"totalVMs", len(vms),
339+
"totalReservations", len(failoverReservations),
340+
"vmsMissingFailover", summary.vmsMissingFailover,
335341
"vmsProcessed", summary.vmsProcessed,
336342
"reservationsNeeded", summary.reservationsNeeded,
337343
"reused", summary.totalReused,
@@ -557,11 +563,12 @@ func (c *FailoverReservationController) reconcileCreateAndAssignReservations(
557563
vmsMissingFailover := c.calculateVMsMissingFailover(ctx, vms, failoverReservations)
558564
logger.V(1).Info("VMs missing failover reservations", "count", len(vmsMissingFailover))
559565

566+
totalVMsMissingFailover := len(vmsMissingFailover)
560567
vmsMissingFailover, hitMaxVMsLimit := c.selectVMsToProcess(ctx, vmsMissingFailover, c.Config.MaxVMsToProcess)
561568

562569
logger.V(1).Info("found hypervisors and vm missing failover reservation",
563570
"countHypervisors", len(allHypervisors),
564-
"countVMsMissingFailover", len(vmsMissingFailover))
571+
"countVMsMissingFailover", totalVMsMissingFailover)
565572

566573
totalReservationsNeeded := 0
567574
for _, need := range vmsMissingFailover {
@@ -649,6 +656,7 @@ func (c *FailoverReservationController) reconcileCreateAndAssignReservations(
649656
}
650657

651658
return reconcileSummary{
659+
vmsMissingFailover: totalVMsMissingFailover,
652660
vmsProcessed: len(vmsMissingFailover),
653661
reservationsNeeded: totalReservationsNeeded,
654662
totalReused: totalReused,

0 commit comments

Comments
 (0)