Skip to content

Commit 5bbde50

Browse files
authored
Committed resources syncer with more alerts (#631)
1 parent 1b0dddb commit 5bbde50

6 files changed

Lines changed: 335 additions & 67 deletions

File tree

helm/bundles/cortex-nova/alerts/nova.alerts.yaml

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,3 +483,115 @@ groups:
483483
The committed resource capacity API (Limes LIQUID integration) is experiencing
484484
high latency (p95 > 5s). This may indicate slow database queries or knowledge
485485
CRD retrieval. Limes scrapes may time out, affecting capacity reporting.
486+
487+
# Committed Resource Syncer Alerts
488+
- alert: CortexNovaCommittedResourceSyncerNotRunning
489+
expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0
490+
for: 5m
491+
labels:
492+
context: committed-resource-syncer
493+
dashboard: cortex/cortex
494+
service: cortex
495+
severity: warning
496+
support_group: workload-management
497+
annotations:
498+
summary: "Committed Resource syncer not running"
499+
description: >
500+
The committed resource syncer has not run in the last 2 hours. This indicates
501+
that the syncer may have stopped or is encountering errors. Check the syncer logs for errors.
502+
503+
- alert: CortexNovaCommittedResourceSyncerErrorsHigh
504+
expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3
505+
for: 5m
506+
labels:
507+
context: committed-resource-syncer
508+
dashboard: cortex/cortex
509+
service: cortex
510+
severity: warning
511+
support_group: workload-management
512+
annotations:
513+
summary: "Committed Resource syncer experiencing errors"
514+
description: >
515+
The committed resource syncer has encountered multiple errors in the last hour.
516+
This may indicate connectivity issues with Limes. Check the syncer logs for error details.
517+
518+
- alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh
519+
expr: |
520+
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])
521+
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05
522+
for: 15m
523+
labels:
524+
context: committed-resource-syncer
525+
dashboard: cortex/cortex
526+
service: cortex
527+
severity: warning
528+
support_group: workload-management
529+
annotations:
530+
summary: "Committed Resource syncer unit mismatch rate >5%"
531+
description: >
532+
More than 5% of commitments are being skipped due to unit mismatches between
533+
Limes and Cortex flavor groups. This happens when Limes has not yet been
534+
updated to use the new unit format after a flavor group change. The affected
535+
commitments will keep their existing reservations until Limes notices the update.
536+
Check the logs if this error persists for a longer time.
537+
538+
- alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh
539+
expr: |
540+
rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])
541+
/ rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0
542+
for: 15m
543+
labels:
544+
context: committed-resource-syncer
545+
dashboard: cortex/cortex
546+
service: cortex
547+
severity: warning
548+
support_group: workload-management
549+
annotations:
550+
summary: "Committed Resource syncer unknown flavor group rate >0%"
551+
description: >
552+
Some commitments reference flavor groups that don't exist in
553+
Cortex Knowledge (or no longer exist there). This may indicate that flavor group configuration is
554+
out of sync between Limes and Cortex, or that Knowledge extraction is failing.
555+
Check the flavor group Knowledge CRD and history to see what was changed.
556+
557+
- alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh
558+
expr: |
559+
(
560+
rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) +
561+
rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) +
562+
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
563+
) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01
564+
for: 15m
565+
labels:
566+
context: committed-resource-syncer
567+
dashboard: cortex/cortex
568+
service: cortex
569+
severity: warning
570+
support_group: workload-management
571+
annotations:
572+
summary: "Committed Resource syncer local change rate >1%"
573+
description: >
574+
More than 1% of synced commitments require reservation changes
575+
(creates, deletes, or repairs). This is higher than expected for steady-state
576+
operation and may indicate data inconsistencies, external modifications to
577+
reservations, or issues with the CRDs. Check Cortex logs for details.
578+
579+
- alert: CortexNovaCommittedResourceSyncerRepairRateHigh
580+
expr: |
581+
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h])
582+
/ rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0
583+
for: 15m
584+
labels:
585+
context: committed-resource-syncer
586+
dashboard: cortex/cortex
587+
service: cortex
588+
severity: warning
589+
support_group: workload-management
590+
annotations:
591+
summary: "Committed Resource syncer repair rate >0%"
592+
description: >
593+
Some commitments have reservations that needed repair
594+
(wrong metadata like project ID or flavor group). This may indicate data
595+
corruption, bugs in reservation creation, or external modifications.
596+
Reservations are automatically repaired, but the root cause should be
597+
investigated if this alert persists.

internal/scheduling/reservations/commitments/api_change_commitments.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -253,15 +253,15 @@ ProcessLoop:
253253

254254
logger.V(1).Info("applying commitment state change", "commitmentUUID", commitment.UUID, "oldMemory", stateBefore.TotalMemoryBytes, "desiredMemory", stateDesired.TotalMemoryBytes)
255255

256-
touchedReservations, deletedReservations, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi")
256+
applyResult, err := manager.ApplyCommitmentState(ctx, logger, stateDesired, flavorGroups, "changeCommitmentsApi")
257257
if err != nil {
258258
failedCommitments[string(commitment.UUID)] = "failed to apply commitment state"
259259
logger.Info("failed to apply commitment state for commitment", "commitmentUUID", commitment.UUID, "error", err)
260260
requireRollback = true
261261
break ProcessLoop
262262
}
263-
logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(touchedReservations), "deletedReservations", len(deletedReservations))
264-
reservationsToWatch = append(reservationsToWatch, touchedReservations...)
263+
logger.V(1).Info("applied commitment state change", "commitmentUUID", commitment.UUID, "touchedReservations", len(applyResult.TouchedReservations), "deletedReservations", len(applyResult.RemovedReservations))
264+
reservationsToWatch = append(reservationsToWatch, applyResult.TouchedReservations...)
265265
}
266266
}
267267
}
@@ -305,7 +305,7 @@ ProcessLoop:
305305
for commitmentUUID, state := range statesBefore {
306306
// Rollback to statesBefore for this commitment
307307
logger.Info("applying rollback for commitment", "commitmentUUID", commitmentUUID, "stateBefore", state)
308-
_, _, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback")
308+
_, err := manager.ApplyCommitmentState(ctx, logger, state, flavorGroups, "changeCommitmentsApiRollback")
309309
if err != nil {
310310
logger.Info("failed to apply rollback state for commitment", "commitmentUUID", commitmentUUID, "error", err)
311311
// continue with best-effort rollback for the remaining commitments

internal/scheduling/reservations/commitments/reservation_manager.go

Lines changed: 39 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,20 @@ import (
1717
"sigs.k8s.io/controller-runtime/pkg/client"
1818
)
1919

20+
// ApplyResult contains the result of applying a commitment state.
21+
type ApplyResult struct {
22+
// Created is the number of reservations created
23+
Created int
24+
// Deleted is the number of reservations deleted
25+
Deleted int
26+
// Repaired is the number of reservations repaired (metadata sync or recreated due to wrong config)
27+
Repaired int
28+
// TouchedReservations are reservations that were created or updated
29+
TouchedReservations []v1alpha1.Reservation
30+
// RemovedReservations are reservations that were deleted
31+
RemovedReservations []v1alpha1.Reservation
32+
}
33+
2034
// ReservationManager handles CRUD operations for Reservation CRDs.
2135
type ReservationManager struct {
2236
client.Client
@@ -42,14 +56,16 @@ func NewReservationManager(k8sClient client.Client) *ReservationManager {
4256
// - Deleting unused/excess slots when capacity decreases
4357
// - Syncing reservation metadata for all remaining slots
4458
//
45-
// Returns touched reservations (created/updated) and removed reservations for caller tracking.
59+
// Returns ApplyResult containing touched/removed reservations and counts for metrics.
4660
func (m *ReservationManager) ApplyCommitmentState(
4761
ctx context.Context,
4862
log logr.Logger,
4963
desiredState *CommitmentState,
5064
flavorGroups map[string]compute.FlavorGroupFeature,
5165
creator string,
52-
) (touchedReservations, removedReservations []v1alpha1.Reservation, err error) {
66+
) (*ApplyResult, error) {
67+
68+
result := &ApplyResult{}
5369

5470
log = log.WithName("ReservationManager")
5571

@@ -58,7 +74,7 @@ func (m *ReservationManager) ApplyCommitmentState(
5874
if err := m.List(ctx, &allReservations, client.MatchingLabels{
5975
v1alpha1.LabelReservationType: v1alpha1.ReservationTypeLabelCommittedResource,
6076
}); err != nil {
61-
return nil, nil, fmt.Errorf("failed to list reservations: %w", err)
77+
return nil, fmt.Errorf("failed to list reservations: %w", err)
6278
}
6379

6480
// Filter by name prefix to find reservations for this commitment
@@ -74,7 +90,7 @@ func (m *ReservationManager) ApplyCommitmentState(
7490
flavorGroup, exists := flavorGroups[desiredState.FlavorGroupName]
7591

7692
if !exists {
77-
return nil, nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName)
93+
return nil, fmt.Errorf("flavor group not found: %s", desiredState.FlavorGroupName)
7894
}
7995
deltaMemoryBytes := desiredState.TotalMemoryBytes
8096
for _, res := range existing {
@@ -90,7 +106,6 @@ func (m *ReservationManager) ApplyCommitmentState(
90106
// Phase 3 (DELETE): Delete inconsistent reservations (wrong flavor group/project)
91107
// They will be recreated with correct metadata in subsequent phases.
92108
var validReservations []v1alpha1.Reservation
93-
var repairedCount int
94109
for _, res := range existing {
95110
if res.Spec.CommittedResourceReservation.ResourceGroup != desiredState.FlavorGroupName ||
96111
res.Spec.CommittedResourceReservation.ProjectID != desiredState.ProjectID {
@@ -101,13 +116,13 @@ func (m *ReservationManager) ApplyCommitmentState(
101116
"actualFlavorGroup", res.Spec.CommittedResourceReservation.ResourceGroup,
102117
"expectedProjectID", desiredState.ProjectID,
103118
"actualProjectID", res.Spec.CommittedResourceReservation.ProjectID)
104-
repairedCount++
105-
removedReservations = append(removedReservations, res)
119+
result.Repaired++
120+
result.RemovedReservations = append(result.RemovedReservations, res)
106121
memValue := res.Spec.Resources[hv1.ResourceMemory]
107122
deltaMemoryBytes += memValue.Value()
108123

109124
if err := m.Delete(ctx, &res); err != nil {
110-
return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
125+
return result, fmt.Errorf("failed to delete reservation %s: %w", res.Name, err)
111126
}
112127
} else {
113128
validReservations = append(validReservations, res)
@@ -139,33 +154,33 @@ func (m *ReservationManager) ApplyCommitmentState(
139154
reservationToDelete = &existing[len(existing)-1]
140155
existing = existing[:len(existing)-1] // remove from existing list
141156
}
142-
removedReservations = append(removedReservations, *reservationToDelete)
157+
result.RemovedReservations = append(result.RemovedReservations, *reservationToDelete)
158+
result.Deleted++
143159
memValue := reservationToDelete.Spec.Resources[hv1.ResourceMemory]
144160
deltaMemoryBytes += memValue.Value()
145161

146162
if err := m.Delete(ctx, reservationToDelete); err != nil {
147-
return touchedReservations, removedReservations, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err)
163+
return result, fmt.Errorf("failed to delete reservation %s: %w", reservationToDelete.Name, err)
148164
}
149165
}
150166

151167
// Phase 5 (CREATE): Create new reservations (capacity increased)
152-
var createdCount int
153168
for deltaMemoryBytes > 0 {
154169
// Need to create new reservation slots, always prefer largest flavor within the group
155170
// TODO more sophisticated flavor selection, especially with flavors of different cpu/memory ratio
156171
reservation := m.newReservation(desiredState, nextSlotIndex, deltaMemoryBytes, flavorGroup, creator)
157-
touchedReservations = append(touchedReservations, *reservation)
172+
result.TouchedReservations = append(result.TouchedReservations, *reservation)
158173
memValue := reservation.Spec.Resources[hv1.ResourceMemory]
159174
deltaMemoryBytes -= memValue.Value()
160-
createdCount++
175+
result.Created++
161176

162177
if err := m.Create(ctx, reservation); err != nil {
163178
if apierrors.IsAlreadyExists(err) {
164-
return touchedReservations, removedReservations, fmt.Errorf(
179+
return result, fmt.Errorf(
165180
"reservation %s already exists (collision detected): %w",
166181
reservation.Name, err)
167182
}
168-
return touchedReservations, removedReservations, fmt.Errorf(
183+
return result, fmt.Errorf(
169184
"failed to create reservation slot %d: %w",
170185
nextSlotIndex, err)
171186
}
@@ -177,24 +192,25 @@ func (m *ReservationManager) ApplyCommitmentState(
177192
for i := range existing {
178193
updated, err := m.syncReservationMetadata(ctx, log, &existing[i], desiredState)
179194
if err != nil {
180-
return touchedReservations, removedReservations, err
195+
return result, err
181196
}
182197
if updated != nil {
183-
touchedReservations = append(touchedReservations, *updated)
198+
result.TouchedReservations = append(result.TouchedReservations, *updated)
199+
result.Repaired++
184200
}
185201
}
186202

187203
// Only log if there were actual changes
188-
if hasChanges || createdCount > 0 || len(removedReservations) > 0 || repairedCount > 0 {
204+
if hasChanges || result.Created > 0 || len(result.RemovedReservations) > 0 || result.Repaired > 0 {
189205
log.Info("commitment state sync completed",
190206
"commitmentUUID", desiredState.CommitmentUUID,
191-
"created", createdCount,
192-
"deleted", len(removedReservations),
193-
"repaired", repairedCount,
194-
"total", len(existing)+createdCount)
207+
"created", result.Created,
208+
"deleted", result.Deleted,
209+
"repaired", result.Repaired,
210+
"total", len(existing)+result.Created)
195211
}
196212

197-
return touchedReservations, removedReservations, nil
213+
return result, nil
198214
}
199215

200216
// syncReservationMetadata updates reservation metadata if it differs from desired state.

0 commit comments

Comments
 (0)