Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 34 additions & 19 deletions pkg/controller/machineset.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,23 +331,38 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
return nil
}

var staleMachines, machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine
var (
staleMachines []*v1alpha1.Machine
numNonTerminatingMachines int
machinesWithoutUpdateSuccessfulLabel []*v1alpha1.Machine
)
for _, m := range allMachines {
if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful {
machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m)
// Machines that are terminating are skipped when computing the diff for the required
// machineSet replicas since either:
// 1. There was a scale-down event that led to the machine being marked for removal,
// in which case there's no need to process it for further iterations; or
// 2. This was a `failed` machine that was terminated in which case the machineSet
// replicas won't change and hence there's a new machine that should be spawned
// in place of the failed one.
if m.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating {
numNonTerminatingMachines++
if m.Labels[v1alpha1.LabelKeyNodeUpdateResult] != v1alpha1.LabelValueNodeUpdateSuccessful {
machinesWithoutUpdateSuccessfulLabel = append(machinesWithoutUpdateSuccessfulLabel, m)
}
}
}
allMachinesDiff := len(allMachines) - int(machineSet.Spec.Replicas)
// During in-place updates, in the newMachineSet, it can happen that a machine has come from the oldMachineSet
// but the ReplicaCount of newMachineSet has not increased yet.
// In such cases, we should not delete the machine immediately,
// otherwise it can cause unnecessary machine deletion and creation during in-place updates.
nonTerminatingMachinesDiff := numNonTerminatingMachines - int(machineSet.Spec.Replicas)
// During in-place updates, in the newMachineSet, it can happen that a machine has come from the
// oldMachineSet but the ReplicaCount of newMachineSet has not increased yet. In such cases, we
// should not delete the machine immediately, otherwise it can cause unnecessary machine
// deletion and creation during in-place updates. So the below diff is checked to find if there
// are machines that require removal.
machinesWithoutUpdateSuccessfulLabelDiff := len(machinesWithoutUpdateSuccessfulLabel) - int(machineSet.Spec.Replicas)

// During in-place updates, ScaleUps are disabled in the oldMachineSet and
// in newMachineSet, its ReplicaCount would never increase before a machine from oldMachineSet is moved.
// So no need for any special case during in-place updates.
if allMachinesDiff < 0 {
// During in-place updates, ScaleUps are disabled in the oldMachineSet and the newMachineSet's
// ReplicaCount would never increase before a machine from oldMachineSet is moved.
// So no need for any special case during in-place updates when checking if machines need to be created.
if nonTerminatingMachinesDiff < 0 {
// If MachineSet is frozen and no deletion timestamp, don't process it
if machineSet.Labels["freeze"] == "True" && machineSet.DeletionTimestamp == nil {
klog.V(2).Infof("MachineSet %q is frozen, and hence not processing", machineSet.Name)
Expand All @@ -361,20 +376,20 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
return nil
}

allMachinesDiff *= -1
if allMachinesDiff > BurstReplicas {
allMachinesDiff = BurstReplicas
nonTerminatingMachinesDiff *= -1
if nonTerminatingMachinesDiff > BurstReplicas {
nonTerminatingMachinesDiff = BurstReplicas
}
// TODO: Track UIDs of creates just like deletes. The problem currently
// is we'd need to wait on the result of a create to record the machine's
// UID, which would require locking *across* the create, which will turn
// into a performance bottleneck. We should generate a UID for the machine
// beforehand and store it via ExpectCreations.
if err := c.expectations.ExpectCreations(machineSetKey, allMachinesDiff); err != nil {
if err := c.expectations.ExpectCreations(machineSetKey, nonTerminatingMachinesDiff); err != nil {
// TODO: proper error handling needs to happen here
klog.Errorf("failed expect creations for machineset %s: %v", machineSet.Name, err)
}
klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), allMachinesDiff)
klog.V(2).Infof("Too few replicas for MachineSet %s, need %d, creating %d", machineSet.Name, (machineSet.Spec.Replicas), nonTerminatingMachinesDiff)
// Batch the machine creates. Batch sizes start at SlowStartInitialBatchSize
// and double with each successful iteration in a kind of "slow start".
// This handles attempts to start large numbers of machines that would
Expand All @@ -383,7 +398,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
// prevented from spamming the API service with the machine create requests
// after one of its machines fails. Conveniently, this also prevents the
// event spam that those failures would generate.
successfulCreations, err := slowStartBatch(allMachinesDiff, SlowStartInitialBatchSize, func() error {
successfulCreations, err := slowStartBatch(nonTerminatingMachinesDiff, SlowStartInitialBatchSize, func() error {
boolPtr := func(b bool) *bool { return &b }
controllerRef := &metav1.OwnerReference{
APIVersion: controllerKindMachineSet.GroupVersion().String(), // #ToCheck
Expand All @@ -410,7 +425,7 @@ func (c *controller) manageReplicas(ctx context.Context, allMachines []*v1alpha1
// Any skipped machines that we never attempted to start shouldn't be expected.
// The skipped machines will be retried later. The next controller resync will
// retry the slow start process.
if skippedMachines := allMachinesDiff - successfulCreations; skippedMachines > 0 {
if skippedMachines := nonTerminatingMachinesDiff - successfulCreations; skippedMachines > 0 {
klog.V(2).Infof("Slow-start failure. Skipping creation of %d machines, decrementing expectations for %v %v/%v", skippedMachines, machineSet.Kind, machineSet.Namespace, machineSet.Name)
for range skippedMachines {
// Decrement the expected number of creates because the informer won't observe this machine
Expand Down
Loading